Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

105
LINES

< > BotCompany Repo | #1000846 // HTML parsing

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (568L/5K/14K).

!747

m {
  // Shared parser state. Everything is static because the whole program
  // runs from the static main method and its static helpers.

  // Tokens of the HTML document, as returned by htmlcoarsetok in main.
  // The helpers below look for tags at the odd indices only.
  static L<S> tok; // list of tokens in HTML document
  // Sub-list view of tok covering the table located by findTable
  // (stays null until a table has been found).
  static L<S> table; // list of tokens in table
  // One entry per <tr>...</tr> pair found by findRows; each entry is a
  // sub-list view of table.
  static L<L<S>> rows; // for every row, list of tokens in row
  // Parallel to rows: the cell contents extracted by getData for each row.
  static L<L<S>> data; // for every row, for every cell, inner data

  // Program entry point: loads an HTML snippet, tokenizes it and runs a
  // small built-in command script against the tokens.
  // The snippet ID defaults to #3000025 and can be overridden by the
  // first command line argument.
  p {
    S htmlID = args.length != 0 ? args[0] : "#3000025";
    
    // The script is one command per line; currently only "find table".
    S script = [[
      find table
    ]];
    
    tok = htmlcoarsetok(loadSnippet(htmlID));
    
    for (S line : splitScript(script)) {
      L<S> cmdTok = javaTok(line);
      
      // A token list of size 1 means the line held no command; skip it.
      if (cmdTok.size() != 1) {
        print("cmd: " + structure(cmdTok));
        
        boolean wantsTable = cmdMatch("find table", cmdTok) != null;
        if (wantsTable) {
          findTable();
          if (table == null)
            fail("No table");
          //print(fromLines(table));
        }
      }
    }
  }
 
  // Splits a script into one string per line, so that each line can be
  // tokenized as a separate command.
  //
  // The script is tokenized with javaTok first so that a line break
  // inside a token (e.g. a multi-line string) does not end the command.
  // Only whitespace tokens containing a newline act as separators; such
  // a token is dropped and starts the next command. All other tokens are
  // appended verbatim to the current command.
  //
  // NOTE(review): this relies on javaTok returning whitespace at the even
  // indices and code tokens at the odd indices - confirm against javaTok.
  //
  // Returns at least one element; an empty script yields a single "".
  static L<S> splitScript(S script) {
    L<S> tok = javaTok(script);
    new L<S> result;
    StringBuilder line = new StringBuilder();
    for (int i = 0; i < tok.size(); i++) {
      S t = tok.get(i);
      // Previously every token containing "\n" was treated as a separator,
      // which silently discarded multi-line string tokens.
      boolean separator = i % 2 == 0 && t.indexOf('\n') >= 0;
      if (separator) {
        result.add(line.toString());
        line.setLength(0);
      } else
        line.append(t);
    }
    // Flush the last (possibly empty) command.
    result.add(line.toString());
    return result;
  }
  
  // Matches an already tokenized command against a pattern string.
  // The pattern is tokenized with javaTok and both token lists are handed
  // to match2, whose result is returned unchanged. The caller in main
  // treats a non-null result as "command matches".
  static S[] cmdMatch(S pat, L<S> cmd) {
    L<S> patTok = javaTok(pat);
    return match2(patTok, cmd);
  }
  
  // Locates the first table in tok and stores it in the static field table,
  // then lets findRows split it into rows.
  //
  // The table spans from the token before the first <table> tag up to and
  // including the token after the first </table> tag that follows it.
  // Opening and closing tags are not balanced, so with nested tables the
  // span ends at the first inner </table>.
  // If no such pair exists, table is left untouched.
  static void findTable() {
    print("Finding table.");
    
    // Tags are looked up at the odd token indices only.
    int open = 1;
    while (open < tok.size() && !isTag(tok.get(open), "table"))
      open += 2;
    if (open >= tok.size())
      return;
    
    int close = open + 2;
    while (close < tok.size() && !isTag(tok.get(close), "/table"))
      close += 2;
    if (close >= tok.size())
      return;
    
    print("Table found!");
    table = tok.subList(open-1, close+2);
    findRows();
  }
  
  // Splits the current table into rows and extracts the cell data.
  //
  // Fills the static fields rows (one sub-list of table per <tr>...</tr>
  // pair, including the token before <tr> and the token after </tr>) and
  // data (the result of getData for each row). Prints the number of rows
  // and, if there is one, the content of the first cell of the first row.
  //
  // Expects table to have been set by findTable.
  static void findRows() {
    rows = new ArrayList<List<S>>();
    data = new ArrayList<List<S>>();
    
    // Index of the currently open <tr>. Tags sit at odd indices, so 0 can
    // serve as the "not inside a row" marker.
    int rowStart = 0;
    
    for (int i = 1; i < table.size(); i += 2) {
      S t = table.get(i);
      if (isTag(t, "tr"))
        rowStart = i;
      else if (isTag(t, "/tr") && rowStart != 0) {
        L<S> row = table.subList(rowStart-1, i+2);
        rows.add(row);
        data.add(getData(row));
        // Close the row so a stray </tr> cannot add it a second time.
        rowStart = 0;
      }
    }
    
    print(rows.size() + " row(s)");
    // A table without rows, or a first row without cells, has no top left
    // cell; accessing it unconditionally would throw.
    if (!data.isEmpty() && !data.get(0).isEmpty())
      print("Top left cell: " + data.get(0).get(0));
  }
  
  // Tells whether an HTML token is the tag with the given name.
  //
  // token: a raw token, e.g. "<td class=x>" or "</tr>".
  // tag:   the tag name without brackets; closing tags are passed with
  //        their slash, e.g. "/tr".
  //
  // The name is compared case-insensitively and has to be followed by
  // '>', '/' or any whitespace character (space, tab, line break), so
  // that "<table\n class=x>" and "<td/>" are recognized while "<tbody>"
  // does not count as "<t...>". A null token or tag is never a match.
  static boolean isTag(String token, String tag) {
    if (token == null || tag == null) return false;
    
    // Position of the character right after "<" + tag.
    int nameEnd = tag.length() + 1;
    if (token.length() <= nameEnd) return false;
    
    if (token.charAt(0) != '<') return false;
    if (!token.regionMatches(true, 1, tag, 0, tag.length())) return false;
    
    char next = token.charAt(nameEnd);
    return next == '>' || next == '/' || Character.isWhitespace(next);
  }
  
  // Extracts the cell contents of one table row.
  //
  // row: the tokens of a single row, tags at the odd indices.
  //
  // Returns one string per <td>/<th> ... </td>/</th> pair, in document
  // order. A cell's content is the concatenation (via join) of all tokens
  // strictly between its opening and closing tag, so markup nested inside
  // the cell is kept as is. A row without cells yields an empty list.
  static L<S> getData(L<S> row) {
    // Index of the currently open cell tag. Tags sit at odd indices, so 0
    // can serve as the "not inside a cell" marker.
    int colStart = 0;
    new L<S> cols;
    
    for (int i = 1; i < row.size(); i += 2) {
      S t = row.get(i);
      if (isTag(t, "td") || isTag(t, "th"))
        colStart = i;
      else if ((isTag(t, "/td") || isTag(t, "/th")) && colStart != 0) {
        cols.add(join(row.subList(colStart+1, i)));
        // Close the cell so a stray closing tag cannot add it again.
        colStart = 0;
      }
    }
    return cols;
  }
}

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000846
Snippet name: HTML parsing
Eternal ID of this version: #1000846/1
Text MD5: 08f81fe49d16e7511c8ffd9608af0452
Transpilation MD5: 8bee9b6fe52304604253ae97de7d5535
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-08-30 16:11:16
Source code size: 2892 bytes / 105 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 591 / 653
Referenced in: [show references]