1 | static L<S> sentencesFromHTML(S html) { |
2 | new SentencesFromHTML x; |
3 | x.parseHTML(html); |
4 | ret x.sentences; |
5 | } |
6 | |
7 | sclass SentencesFromHTML { |
8 | new L<S> sentences; |
9 | bool withQuestions = true; |
10 | |
11 | void parseHTML(S html) { |
12 | // tokenize and clean up html |
13 | L<S> tok = htmlcoarsetok(html); |
14 | tok = removeScripts(tok); |
15 | tok = dropTags(tok, "span"); |
16 | tok = dropTags(tok, "a"); |
17 | tok = dropTags(tok, "b"); |
18 | tok = dropTags(tok, "small"); |
19 | |
20 | // tok = dropAllTags(tok); // Too much! |
21 | //print(structure(tok)); |
22 | |
23 | for (int i = 0; i < tok.size(); i += 2) { |
24 | S line = tok.get(i).trim(); |
25 | if (line.startsWith("<!--")) continue; // HTML comment |
26 | if (line.length() == 0) continue; |
27 | //print("Line: " + quote(line)); |
28 | interpretLine(line); |
29 | } |
30 | } |
31 | |
32 | static boolean hasCharacters(S s) { |
33 | for (int i = 0; i < s.length(); i++) |
34 | if (Character.isLetter(s.charAt(i))) |
35 | ret true; |
36 | ret false; |
37 | } |
38 | |
39 | void interpretLine(S s) { |
40 | for (S sentence : getSentences(s)) { |
41 | char first = sentence.charAt(0); |
42 | if (Character.isLowerCase(first) || ",;:=".indexOf(first) >= 0) continue; |
43 | if (!hasCharacters(sentence)) continue; |
44 | //L<S> tok = javaTok(sentence); |
45 | //print(structure(tok)); |
46 | sentence = htmldecode(sentence); |
47 | //print("Sentence found: " + quote(sentence)); |
48 | sentences.add(sentence); |
49 | } |
50 | } |
51 | |
52 | L<S> getSentences(S s) { |
53 | L<S> tok = javaTok(s); // To parse quoted things |
54 | fixSpaces(tok); |
55 | new L<S> list; |
56 | int i = 0; |
57 | while (true) { |
58 | int j = i; |
59 | do { |
60 | j = smartIndexOfMulti(tok, withQuestions ? ll(".", "?") : ll("."), j+1); |
61 | if (j >= l(tok)) return list; |
62 | } while (j+1 < tok.size()-1 && tok.get(j+1).equals("")); // matches stuff like "9.5" |
63 | |
64 | S sentence = join(tok.subList(i, j+1)).trim(); |
65 | if (sentence.length() > 1) |
66 | list.add(sentence); |
67 | i = j+1; |
68 | } |
69 | } |
70 | |
71 | // So everything's on one line. |
72 | static void fixSpaces(L<S> tok) { |
73 | for (int i = 0; i < l(tok); i += 2) |
74 | tok.set(i, tok.get(i).equals("") ? "" : " "); |
75 | } |
76 | |
77 | static L<S> removeScripts(L<S> tok) { |
78 | tok = new ArrayList<S>(tok); |
79 | for (int i = 1; i < tok.size(); ) |
80 | if (tagIs(tok.get(i), "script")) { |
81 | int j = i; |
82 | while (j < tok.size() && !tagIs(tok.get(j), "/script")) |
83 | j += 2; |
84 | while (j+1 > i-1) { // Yeah it's inefficient... |
85 | tok.remove(i-1); --j; |
86 | } |
87 | } else |
88 | i += 2; |
89 | return tok; |
90 | } |
91 | |
92 | static L<S> dropTags(L<S> tok, S tag) { |
93 | new L<S> list; |
94 | for (int i = 0; i < tok.size(); i++) { |
95 | S t = tok.get(i); |
96 | if (tagIs(t, tag) || tagIs(t, "/" + tag)) { |
97 | list.set(list.size()-1, list.get(list.size()-1) + tok.get(i+1)); |
98 | ++i; |
99 | } else |
100 | list.add(tok.get(i)); |
101 | } |
102 | return list; |
103 | } |
104 | } |
Began life as a copy of #1000858
download show line numbers debug dex old transpilations
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1005238 |
Snippet name: | sentencesFromHTML |
Eternal ID of this version: | #1005238/1 |
Text MD5: | 81d9b637720506fe3b03c5d7c2a9d0b3 |
Author: | stefan |
Category: | javax / text analysis |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-10-26 17:08:44 |
Source code size: | 2934 bytes / 104 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 522 / 550 |
Referenced in: | [show references] |