// Convenience entry point: runs a fresh SentencesFromHTML parser over
// the given HTML and returns the sentences it collected.
static L<S> sentencesFromHTML(S html) {
  new SentencesFromHTML x;
  x.parseHTML(html);
  ret x.sentences;
}

// Collects sentences from HTML text.
// Usage: call parseHTML(html), then read the "sentences" field.
sclass SentencesFromHTML {
  // Output: every sentence accepted so far, in document order.
  new L<S> sentences;

  // When true, "?" ends a sentence in addition to ".".
  bool withQuestions = true;

  // Tokenizes the HTML, strips scripts and a few inline tags, then
  // feeds every non-empty text run to interpretLine.
  void parseHTML(S html) {
    // Tokenize and clean up the HTML. The code below treats even
    // indices as text and odd indices as tags.
    // NOTE(review): assumes htmlcoarsetok produces that layout - confirm.
    L<S> tok = htmlcoarsetok(html);
    tok = removeScripts(tok);

    // Only inline tags are dropped, so the text around them merges
    // into one run. Dropping ALL tags (dropAllTags) was too much.
    tok = dropTags(tok, "span");
    tok = dropTags(tok, "a");
    tok = dropTags(tok, "b");
    tok = dropTags(tok, "small");

    for (int i = 0; i < tok.size(); i += 2) {
      S line = tok.get(i).trim();
      if (line.startsWith("<!--")) continue; // HTML comment
      if (line.length() == 0) continue;
      interpretLine(line);
    }
  }

  // Returns true iff s contains at least one letter.
  static boolean hasCharacters(S s) {
    for (int i = 0; i < s.length(); i++)
      if (Character.isLetter(s.charAt(i)))
        ret true;
    ret false;
  }

  // Splits one text run into sentences and stores the plausible ones.
  // A candidate is skipped when it starts with a lower-case letter or
  // one of ",;:=", or when it contains no letter at all.
  void interpretLine(S s) {
    for (S sentence : getSentences(s)) {
      // Safe: getSentences only returns strings longer than 1 char.
      char first = sentence.charAt(0);
      if (Character.isLowerCase(first) || ",;:=".indexOf(first) >= 0)
        continue;
      if (!hasCharacters(sentence)) continue;
      // Entities are decoded only after filtering, so the checks
      // above see the raw (still encoded) text.
      sentence = htmldecode(sentence);
      sentences.add(sentence);
    }
  }

  // Cuts s into pieces that each end with "." (or "?" when
  // withQuestions is set). Text after the last terminator is not
  // returned. Pieces of length <= 1 after trimming are dropped.
  L<S> getSentences(S s) {
    L<S> tok = javaTok(s); // To parse quoted things
    fixSpaces(tok);
    new L<S> list;
    int i = 0;
    while (true) {
      int j = i;
      do {
        // NOTE(review): presumably yields a value >= l(tok) when no
        // further terminator exists - confirm against smartIndexOfMulti.
        j = smartIndexOfMulti(tok, withQuestions ? ll(".", "?") : ll("."), j+1);
        if (j >= l(tok)) return list;
        // Keep searching while nothing separates the terminator from
        // the next token: matches stuff like "9.5".
      } while (j+1 < tok.size()-1 && tok.get(j+1).equals(""));

      S sentence = join(tok.subList(i, j+1)).trim();
      if (sentence.length() > 1)
        list.add(sentence);
      i = j+1;
    }
  }

  // So everything's on one line: every even-index token that is not
  // empty is replaced by a single space; empty ones stay empty.
  static void fixSpaces(L<S> tok) {
    for (int i = 0; i < l(tok); i += 2)
      tok.set(i, tok.get(i).equals("") ? "" : " ");
  }

  // Returns a copy of tok without script elements. The argument itself
  // is not modified.
  //
  // For every odd-index token recognised as a "script" tag, the span up
  // to and including the matching "/script" tag is removed. The text
  // before the script is kept and the text after it is appended to it,
  // separated by one space so the two runs do not fuse into a single
  // sentence. If the script is never closed, everything from the script
  // tag to the end of the list is removed.
  static L<S> removeScripts(L<S> tok) {
    tok = new ArrayList<S>(tok);
    for (int i = 1; i < tok.size(); ) {
      if (!tagIs(tok.get(i), "script")) {
        i += 2;
        continue;
      }

      // Find the closing tag, stepping over tag positions only.
      int j = i;
      while (j < tok.size() && !tagIs(tok.get(j), "/script"))
        j += 2;

      if (j+1 < tok.size()) {
        // Closed script with text after it: merge surrounding text,
        // then remove script tag .. text after the closing tag.
        tok.set(i-1, tok.get(i-1) + " " + tok.get(j+1));
        tok.subList(i, j+2).clear();
      } else {
        // Unclosed script (or nothing after the closing tag):
        // cut the list off at the script tag.
        tok.subList(i, tok.size()).clear();
      }
      // i is not advanced: it now points at the next tag, if any.
    }
    return tok;
  }

  // Returns a new list without the opening and closing tags named
  // "tag". The text following a dropped tag is appended to the text
  // before it, so the content between the tags is kept.
  static L<S> dropTags(L<S> tok, S tag) {
    new L<S> list;
    for (int i = 0; i < tok.size(); i++) {
      S t = tok.get(i);
      if (tagIs(t, tag) || tagIs(t, "/" + tag)) {
        // Guard both ends: the tag may be the last token, or the
        // first one emitted.
        S following = i+1 < tok.size() ? tok.get(i+1) : "";
        if (list.isEmpty())
          list.add(following);
        else {
          int last = list.size()-1;
          list.set(last, list.get(last) + following);
        }
        ++i; // the following text has been consumed
      } else
        list.add(t);
    }
    return list;
  }
}
Began life as a copy of #1000858
download show line numbers debug dex old transpilations
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1005238 |
Snippet name: | sentencesFromHTML |
Eternal ID of this version: | #1005238/1 |
Text MD5: | 81d9b637720506fe3b03c5d7c2a9d0b3 |
Author: | stefan |
Category: | javax / text analysis |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-10-26 17:08:44 |
Source code size: | 2934 bytes / 104 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 520 / 548 |
Referenced in: | [show references] |