Libraryless. Click here for Pure Java version (568L/5K/14K).
1 | !747 |
2 | |
3 | m { |
4 | static L<S> tok; // list of tokens in HTML document |
5 | static L<S> table; // list of tokens in table |
6 | static L<L<S>> rows; // for every row, list of tokens in row |
7 | static L<L<S>> data; // for every row, for every cell, inner data |
8 | |
9 | p { |
10 | S htmlID = "#3000025"; |
11 | if (args.length != 0) htmlID = args[0]; |
12 | |
13 | S script = [[ |
14 | find table |
15 | ]]; |
16 | |
17 | S html = loadSnippet(htmlID); |
18 | tok = htmlcoarsetok(html); |
19 | |
20 | for (S cmd : splitScript(script)) { |
21 | L<S> c = javaTok(cmd); |
22 | if (c.size() == 1) continue; |
23 | |
24 | print("cmd: " + structure(c)); |
25 | |
26 | if (cmdMatch("find table", c) != null) { |
27 | findTable(); |
28 | if (table == null) fail("No table"); |
29 | //print(fromLines(table)); |
30 | } |
31 | } |
32 | } |
33 | |
34 | // split at newline, but also take into account multi-line strings |
35 | static L<S> splitScript(S script) { |
36 | L<S> tok = javaTok(script); |
37 | new L<S> result; |
38 | result.add(""); |
39 | for (int i = 0; i < tok.size(); i++) { |
40 | boolean nl = tok.get(i).indexOf("\n") >= 0; |
41 | if (nl) |
42 | result.add(""); |
43 | else |
44 | result.set(result.size()-1, result.get(result.size()-1) + tok.get(i)); |
45 | } |
46 | return result; |
47 | } |
48 | |
49 | static S[] cmdMatch(S pat, L<S> cmd) { |
50 | return match2(javaTok(pat), cmd); |
51 | } |
52 | |
53 | static void findTable() { |
54 | print("Finding table."); |
55 | for (int i = 1; i < tok.size(); i += 2) |
56 | if (isTag(tok.get(i), "table")) |
57 | for (int j = i+2; j < tok.size(); j += 2) |
58 | if (isTag(tok.get(j), "/table")) { |
59 | print("Table found!"); |
60 | table = tok.subList(i-1, j+2); |
61 | findRows(); |
62 | return; |
63 | } |
64 | } |
65 | |
66 | static void findRows() { |
67 | L<S> tok = table; |
68 | rows = new ArrayList<List<S>>(); |
69 | data = new ArrayList<List<S>>(); |
70 | int rowStart = 0; |
71 | |
72 | for (int i = 1; i < table.size(); i += 2) { |
73 | //print(tok.get(i)); |
74 | if (isTag(tok.get(i), "tr")) |
75 | rowStart = i; |
76 | else if (isTag(tok.get(i), "/tr") && rowStart != 0) { |
77 | L<S> row = table.subList(rowStart-1, i+2); |
78 | rows.add(row); |
79 | data.add(getData(row)); |
80 | } |
81 | } |
82 | |
83 | print(rows.size() + " row(s)"); |
84 | print("Top left cell: " + data.get(0).get(0)); |
85 | } |
86 | |
87 | static boolean isTag(S token, S tag) { |
88 | return token.regionMatches(true, 0, "<" + tag + " ", 0, tag.length()+2) |
89 | || token.regionMatches(true, 0, "<" + tag + ">", 0, tag.length()+2); |
90 | } |
91 | |
92 | static L<S> getData(L<S> row) { |
93 | int colStart = 0; |
94 | new L<S> cols; |
95 | |
96 | for (int i = 1; i < row.size(); i += 2) { |
97 | S t = row.get(i); |
98 | if (isTag(t, "td") || isTag(t, "th")) |
99 | colStart = i; |
100 | else if ((isTag(t, "/td") || isTag(t, "/th")) && colStart != 0) |
101 | cols.add(join(row.subList(colStart+1, i))); |
102 | } |
103 | return cols; |
104 | } |
105 | } |
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1000846 |
Snippet name: | HTML parsing |
Eternal ID of this version: | #1000846/1 |
Text MD5: | 08f81fe49d16e7511c8ffd9608af0452 |
Transpilation MD5: | 8bee9b6fe52304604253ae97de7d5535 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-08-30 16:11:16 |
Source code size: | 2892 bytes / 105 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 592 / 654 |
Referenced in: | [show references] |