Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

105
LINES

< > BotCompany Repo | #1000846 // HTML parsing

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (568L/5K/14K).

1  
!747
2  
3  
m {
4  
  static L<S> tok; // tokens of the HTML document; assigned in main from htmlcoarsetok()
5  
  static L<S> table; // slice of tok holding the table; assigned by findTable(), null until then
6  
  static L<L<S>> rows; // for every row, the slice of table holding that row; rebuilt by findRows()
7  
  static L<L<S>> data; // for every row, for every cell, inner data; rebuilt by findRows() via getData()
8  
9  
  p {
10  
    S htmlID = "#3000025";
11  
    if (args.length != 0) htmlID = args[0];
12  
    
13  
    S script = [[
14  
      find table
15  
    ]];
16  
    
17  
    S html = loadSnippet(htmlID);
18  
    tok = htmlcoarsetok(html);
19  
    
20  
    for (S cmd : splitScript(script)) {
21  
      L<S> c = javaTok(cmd);
22  
      if (c.size() == 1) continue;
23  
      
24  
      print("cmd: " + structure(c));
25  
      
26  
      if (cmdMatch("find table", c) != null) {
27  
        findTable();
28  
        if (table == null) fail("No table");
29  
        //print(fromLines(table));
30  
      }
31  
    }
32  
  }
33  
 
34  
  // split at newline, but also take into account multi-line strings 
35  
  static L<S> splitScript(S script) {
36  
    L<S> tok = javaTok(script);
37  
    new L<S> result;
38  
    result.add("");
39  
    for (int i = 0; i < tok.size(); i++) {
40  
      boolean nl = tok.get(i).indexOf("\n") >= 0;
41  
      if (nl)
42  
        result.add("");
43  
      else
44  
        result.set(result.size()-1, result.get(result.size()-1) + tok.get(i));
45  
    }
46  
    return result;
47  
  }
48  
  
49  
  static S[] cmdMatch(S pat, L<S> cmd) {
50  
    return match2(javaTok(pat), cmd);
51  
  }
52  
  
53  
  static void findTable() {
54  
    print("Finding table.");
55  
    for (int i = 1; i < tok.size(); i += 2)
56  
      if (isTag(tok.get(i), "table"))
57  
        for (int j = i+2; j < tok.size(); j += 2)
58  
          if (isTag(tok.get(j), "/table")) {
59  
            print("Table found!");
60  
            table = tok.subList(i-1, j+2);
61  
            findRows();
62  
            return;
63  
          }
64  
  }
65  
  
66  
  static void findRows() {
67  
    L<S> tok = table;
68  
    rows = new ArrayList<List<S>>();
69  
    data = new ArrayList<List<S>>();
70  
    int rowStart = 0;
71  
    
72  
    for (int i = 1; i < table.size(); i += 2) {
73  
      //print(tok.get(i));
74  
      if (isTag(tok.get(i), "tr"))
75  
        rowStart = i;
76  
      else if (isTag(tok.get(i), "/tr") && rowStart != 0) {
77  
        L<S> row = table.subList(rowStart-1, i+2);
78  
        rows.add(row);
79  
        data.add(getData(row));
80  
      }
81  
    }
82  
    
83  
    print(rows.size() + " row(s)");
84  
    print("Top left cell: " + data.get(0).get(0));
85  
  }
86  
  
87  
  static boolean isTag(S token, S tag) {
88  
    return token.regionMatches(true, 0, "<" + tag + " ", 0, tag.length()+2)
89  
      || token.regionMatches(true, 0, "<" + tag + ">", 0, tag.length()+2);
90  
  }
91  
  
92  
  static L<S> getData(L<S> row) {
93  
    int colStart = 0;
94  
    new L<S> cols;
95  
    
96  
    for (int i = 1; i < row.size(); i += 2) {
97  
      S t = row.get(i);
98  
      if (isTag(t, "td") || isTag(t, "th"))
99  
        colStart = i;
100  
      else if ((isTag(t, "/td") || isTag(t, "/th")) && colStart != 0)
101  
        cols.add(join(row.subList(colStart+1, i)));
102  
    }
103  
    return cols;
104  
  }
105  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000846
Snippet name: HTML parsing
Eternal ID of this version: #1000846/1
Text MD5: 08f81fe49d16e7511c8ffd9608af0452
Transpilation MD5: 8bee9b6fe52304604253ae97de7d5535
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-08-30 16:11:16
Source code size: 2892 bytes / 105 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 537 / 585
Referenced in: [show references]