Libraryless. Click here for Pure Java version (2351L/17K/55K).
1 | !752 |
2 | |
3 | static S html; |
4 | static S keyword; |
5 | static new List<S> sentences; |
6 | static S source; |
7 | |
8 | p { |
9 | if (args.length != 0) { |
10 | if (isSnippetID(args[0])) |
11 | html = loadSnippet(args[0]); |
12 | else if (args[0].startsWith("http:") || args[0].startsWith("https://")) |
13 | html = loadPage(source = args[0]); |
14 | else { |
15 | keyword = join(" ", args); |
16 | html = loadPage(source = "https://en.wikipedia.org/wiki/" + keyword); |
17 | } |
18 | } else |
19 | html = loadSnippet(source = "#3000030"); // A text about Java |
20 | print(); |
21 | |
22 | // tokenize and clean up html |
23 | L<S> tok = htmlcoarsetok(html); |
24 | tok = removeScripts(tok); |
25 | tok = dropTags(tok, "span"); |
26 | tok = dropTags(tok, "a"); |
27 | tok = dropTags(tok, "b"); |
28 | tok = dropTags(tok, "small"); |
29 | |
30 | // tok = dropAllTags(tok); // Too much! |
31 | //print(structure(tok)); |
32 | |
33 | for (int i = 0; i < tok.size(); i += 2) { |
34 | S line = tok.get(i).trim(); |
35 | if (line.startsWith("<!--")) continue; // HTML comment |
36 | if (line.length() == 0) continue; |
37 | //print("Line: " + quote(line)); |
38 | interpretLine(line); |
39 | } |
40 | |
41 | print(); |
42 | print(sentences.size() + " sentences found."); |
43 | print(); |
44 | |
45 | ntUpload(getProgramID(), "Sentences from " + source, makeText()); |
46 | |
47 | if (keyword != null) { |
48 | S shortenedKeyword = keyword.replaceAll(" \\(.*$", "").trim().toLowerCase(); |
49 | print("Here are those about " + quote(keyword) + ":"); |
50 | print(); |
51 | for (S s : sentences) { |
52 | S x = s.toLowerCase(); |
53 | if (x.startsWith(shortenedKeyword) || x.startsWith("The " + shortenedKeyword) || x.startsWith("A " + shortenedKeyword) || x.startsWith("An " + shortenedKeyword)) |
54 | print(s); |
55 | } |
56 | } |
57 | } |
58 | |
59 | static S makeText() { |
60 | ret numberLines(fromLines(sentences)); |
61 | } |
62 | |
63 | static boolean hasCharacters(S s) { |
64 | for (int i = 0; i < s.length(); i++) |
65 | if (Character.isLetter(s.charAt(i))) |
66 | ret true; |
67 | ret false; |
68 | } |
69 | |
70 | static void interpretLine(S s) { |
71 | for (S sentence : getSentences(s)) { |
72 | char first = sentence.charAt(0); |
73 | if (Character.isLowerCase(first) || ",;:=".indexOf(first) >= 0) continue; |
74 | if (!hasCharacters(sentence)) continue; |
75 | //L<S> tok = javaTok(sentence); |
76 | //print(structure(tok)); |
77 | sentence = htmldecode(sentence); |
78 | if (!isAndroid()) |
79 | print("Sentence found: " + quote(sentence)); |
80 | sentences.add(sentence); |
81 | } |
82 | } |
83 | |
84 | static L<S> getSentences(S s) { |
85 | L<S> tok = javaTok(s); // To parse quoted things |
86 | fixSpaces(tok); |
87 | new L<S> list; |
88 | int i = 0; |
89 | while (true) { |
90 | int j = i; |
91 | do { |
92 | j = indexOf(tok, ".", j+1); |
93 | if (j < 0) return list; |
94 | } while (j+1 < tok.size()-1 && tok.get(j+1).equals("")); // matches stuff like "9.5" |
95 | |
96 | S sentence = join(tok.subList(i, j+1)).trim(); |
97 | if (sentence.length() > 1) |
98 | list.add(sentence); |
99 | i = j+1; |
100 | } |
101 | } |
102 | |
103 | // So everything's on one line. |
104 | static void fixSpaces(L<S> tok) { |
105 | for (int i = 0; i < l(tok); i += 2) |
106 | tok.set(i, tok.get(i).equals("") ? "" : " "); |
107 | } |
108 | |
109 | static L<S> removeScripts(L<S> tok) { |
110 | tok = new ArrayList<S>(tok); |
111 | for (int i = 1; i < tok.size(); ) |
112 | if (tagIs(tok.get(i), "script")) { |
113 | int j = i; |
114 | while (j < tok.size() && !tagIs(tok.get(j), "/script")) |
115 | j += 2; |
116 | while (j+1 > i-1) { // Yeah it's inefficient... |
117 | tok.remove(i-1); --j; |
118 | } |
119 | } else |
120 | i += 2; |
121 | return tok; |
122 | } |
123 | |
124 | static L<S> dropTags(L<S> tok, S tag) { |
125 | new L<S> list; |
126 | for (int i = 0; i < tok.size(); i++) { |
127 | S t = tok.get(i); |
128 | if (tagIs(t, tag) || tagIs(t, "/" + tag)) { |
129 | list.set(list.size()-1, list.get(list.size()-1) + tok.get(i+1)); |
130 | ++i; |
131 | } else |
132 | list.add(tok.get(i)); |
133 | } |
134 | return list; |
135 | } |
136 | |
137 | static L<S> dropAllTags(L<S> tok) { |
138 | new L<S> list; |
139 | for (int i = 0; i < tok.size(); i++) { |
140 | S t = tok.get(i); |
141 | if (t.startsWith("<")) { |
142 | list.set(list.size()-1, list.get(list.size()-1) + tok.get(i+1)); |
143 | ++i; |
144 | } else |
145 | list.add(tok.get(i)); |
146 | } |
147 | return list; |
148 | } |
download show line numbers debug dex old transpilations
Travelled to 17 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, iveijnkanddl, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1000858 |
Snippet name: | Extract some text from web (e.g. Wikipedia) and upload as pure text snippet |
Eternal ID of this version: | #1000858/2 |
Text MD5: | f451d5f9f40a1037c23c4974902de1a0 |
Transpilation MD5: | 423af11850f36ee35d83385a75833ed7 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2017-03-30 19:42:40 |
Source code size: | 4014 bytes / 148 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 934 / 941 |
Version history: | 1 change(s) |
Referenced in: | [show references] |