Libraryless. Click here for Pure Java version (568L/5K/14K).
!747 m { static L<S> tok; // list of tokens in HTML document static L<S> table; // list of tokens in table static L<L<S>> rows; // for every row, list of tokens in row static L<L<S>> data; // for every row, for every cell, inner data p { S htmlID = "#3000025"; if (args.length != 0) htmlID = args[0]; S script = [[ find table ]]; S html = loadSnippet(htmlID); tok = htmlcoarsetok(html); for (S cmd : splitScript(script)) { L<S> c = javaTok(cmd); if (c.size() == 1) continue; print("cmd: " + structure(c)); if (cmdMatch("find table", c) != null) { findTable(); if (table == null) fail("No table"); //print(fromLines(table)); } } } // split at newline, but also take into account multi-line strings static L<S> splitScript(S script) { L<S> tok = javaTok(script); new L<S> result; result.add(""); for (int i = 0; i < tok.size(); i++) { boolean nl = tok.get(i).indexOf("\n") >= 0; if (nl) result.add(""); else result.set(result.size()-1, result.get(result.size()-1) + tok.get(i)); } return result; } static S[] cmdMatch(S pat, L<S> cmd) { return match2(javaTok(pat), cmd); } static void findTable() { print("Finding table."); for (int i = 1; i < tok.size(); i += 2) if (isTag(tok.get(i), "table")) for (int j = i+2; j < tok.size(); j += 2) if (isTag(tok.get(j), "/table")) { print("Table found!"); table = tok.subList(i-1, j+2); findRows(); return; } } static void findRows() { L<S> tok = table; rows = new ArrayList<List<S>>(); data = new ArrayList<List<S>>(); int rowStart = 0; for (int i = 1; i < table.size(); i += 2) { //print(tok.get(i)); if (isTag(tok.get(i), "tr")) rowStart = i; else if (isTag(tok.get(i), "/tr") && rowStart != 0) { L<S> row = table.subList(rowStart-1, i+2); rows.add(row); data.add(getData(row)); } } print(rows.size() + " row(s)"); print("Top left cell: " + data.get(0).get(0)); } static boolean isTag(S token, S tag) { return token.regionMatches(true, 0, "<" + tag + " ", 0, tag.length()+2) || token.regionMatches(true, 0, "<" + tag + ">", 0, tag.length()+2); } static L<S> 
getData(L<S> row) { int colStart = 0; new L<S> cols; for (int i = 1; i < row.size(); i += 2) { S t = row.get(i); if (isTag(t, "td") || isTag(t, "th")) colStart = i; else if ((isTag(t, "/td") || isTag(t, "/th")) && colStart != 0) cols.add(join(row.subList(colStart+1, i))); } return cols; } }
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1000846 |
Snippet name: | HTML parsing |
Eternal ID of this version: | #1000846/1 |
Text MD5: | 08f81fe49d16e7511c8ffd9608af0452 |
Transpilation MD5: | 8bee9b6fe52304604253ae97de7d5535 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-08-30 16:11:16 |
Source code size: | 2892 bytes / 105 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 593 / 654 |
Referenced in: | #1000849 - Get names of some parties from pouet (with links, static); #3000382 - Answer for ferdie (>> t = 1, f = 0); #3000383 - Answer for funkoverflow (>> t=1, f=0 okay) |