Libraryless. Click here for Pure Java version (2351L/17K/55K).
!752

// Extracts sentences from an HTML document and hands them, as numbered
// lines, to ntUpload. The document comes from one of:
//   - a snippet ID given as first argument,
//   - an http/https URL given as first argument,
//   - the Wikipedia article named by all arguments joined with spaces,
//   - snippet #3000030 when no arguments are given.
// When a keyword (Wikipedia mode) was given, the sentences that start with
// that keyword (optionally preceded by "the", "a" or "an") are printed too.

static S html;                 // raw HTML of the loaded document
static S keyword;              // article name; only set in Wikipedia mode
static new List<S> sentences;  // all sentences collected by interpretLine
static S source;               // where the HTML came from (used in the upload title)

p {
  if (args.length != 0) {
    if (isSnippetID(args[0]))
      // Record the source here too, so the upload title does not read
      // "Sentences from null" (the default branch below does the same).
      html = loadSnippet(source = args[0]);
    else if (args[0].startsWith("http:") || args[0].startsWith("https://"))
      html = loadPage(source = args[0]);
    else {
      keyword = join(" ", args);
      // Wikipedia article URLs use underscores in place of spaces; a raw
      // space inside the URL would not form a valid request.
      html = loadPage(source = "https://en.wikipedia.org/wiki/" + keyword.replace(' ', '_'));
    }
  } else
    html = loadSnippet(source = "#3000030"); // A text about Java
  print();

  // tokenize and clean up html
  // NOTE(review): all index arithmetic below assumes htmlcoarsetok returns
  // tokens alternating text (even index) / tag (odd index) - confirm.
  L<S> tok = htmlcoarsetok(html);
  tok = removeScripts(tok);
  tok = dropTags(tok, "span");
  tok = dropTags(tok, "a");
  tok = dropTags(tok, "b");
  tok = dropTags(tok, "small");
  // tok = dropAllTags(tok); // Too much!

  //print(structure(tok));

  // Only the even-indexed (text) tokens are scanned for sentences.
  for (int i = 0; i < tok.size(); i += 2) {
    S line = tok.get(i).trim();
    if (line.startsWith("<!--")) continue; // HTML comment
    if (line.length() == 0) continue;
    //print("Line: " + quote(line));
    interpretLine(line);
  }

  print();
  print(sentences.size() + " sentences found.");
  print();

  ntUpload(getProgramID(), "Sentences from " + source, makeText());

  if (keyword != null) {
    // Strip a trailing disambiguation like " (programming language)".
    S shortenedKeyword = keyword.replaceAll(" \\(.*$", "").trim().toLowerCase();
    print("Here are those about " + quote(keyword) + ":");
    print();
    for (S s : sentences) {
      // The comparison is done on the lower-cased sentence, so the article
      // prefixes must be lower case as well (capitalised ones never match).
      S x = s.toLowerCase();
      if (x.startsWith(shortenedKeyword)
        || x.startsWith("the " + shortenedKeyword)
        || x.startsWith("a " + shortenedKeyword)
        || x.startsWith("an " + shortenedKeyword))
        print(s);
    }
  }
}

// Returns all collected sentences as one text with numbered lines.
static S makeText() {
  ret numberLines(fromLines(sentences));
}

// Returns true iff s contains at least one letter.
static boolean hasCharacters(S s) {
  for (int i = 0; i < s.length(); i++)
    if (Character.isLetter(s.charAt(i)))
      ret true;
  ret false;
}

// Splits a text fragment into sentences and appends the plausible ones to
// the global sentences list. Candidates that start with a lower-case letter
// or one of ",;:=", or that contain no letters at all, are skipped.
static void interpretLine(S s) {
  for (S sentence : getSentences(s)) {
    // Safe: getSentences only returns strings longer than one character.
    char first = sentence.charAt(0);
    if (Character.isLowerCase(first) || ",;:=".indexOf(first) >= 0) continue;
    if (!hasCharacters(sentence)) continue;
    //L<S> tok = javaTok(sentence);
    //print(structure(tok));
    sentence = htmldecode(sentence);
    if (!isAndroid())
      print("Sentence found: " + quote(sentence));
    sentences.add(sentence);
  }
}

// Cuts s into pieces that each end with a "." token. A "." directly
// followed by an empty whitespace token does not end a sentence, unless it
// sits at the very end of the token list. Text after the last "." is
// discarded. Only pieces longer than one character are returned.
static L<S> getSentences(S s) {
  L<S> tok = javaTok(s); // To parse quoted things
  fixSpaces(tok);
  new L<S> list;
  int i = 0;
  while (true) {
    int j = i;
    do {
      j = indexOf(tok, ".", j+1);
      if (j < 0) return list;
    } while (j+1 < tok.size()-1 && tok.get(j+1).equals("")); // matches stuff like "9.5"

    S sentence = join(tok.subList(i, j+1)).trim();
    if (sentence.length() > 1)
      list.add(sentence);
    i = j+1;
  }
}

// So everything's on one line.
// Replaces every non-empty even-indexed token with a single space, in place.
static void fixSpaces(L<S> tok) {
  for (int i = 0; i < l(tok); i += 2)
    tok.set(i, tok.get(i).equals("") ? "" : " ");
}

// Returns a copy of tok with every <script> ... </script> range removed.
// The text before the opening tag is kept and joined with the text after
// the closing tag, so the list keeps alternating text / tag tokens.
// A <script> without a closing tag swallows everything up to the end of
// the list instead of running past it.
static L<S> removeScripts(L<S> tok) {
  tok = new ArrayList<S>(tok);
  for (int i = 1; i < tok.size(); ) {
    if (!tagIs(tok.get(i), "script")) {
      i += 2;
      continue;
    }

    // Find the closing tag; j ends up >= tok.size() when there is none.
    int j = i;
    while (j < tok.size() && !tagIs(tok.get(j), "/script")) j += 2;

    // Merge the surrounding text tokens, then cut out everything from the
    // opening tag through the text that followed the closing tag.
    S trailing = j+1 < tok.size() ? tok.get(j+1) : "";
    tok.set(i-1, tok.get(i-1) + trailing);
    tok.subList(i, Math.min(j+2, tok.size())).clear();
    // i is deliberately not advanced: it now points at the next tag.
  }
  return tok;
}

// Returns a new list without the opening and closing tags named by tag.
// The text following a dropped tag is appended to the text preceding it.
// Assumes a tag is never the first or last token of tok.
static L<S> dropTags(L<S> tok, S tag) {
  new L<S> list;
  for (int i = 0; i < tok.size(); i++) {
    S t = tok.get(i);
    if (tagIs(t, tag) || tagIs(t, "/" + tag)) {
      list.set(list.size()-1, list.get(list.size()-1) + tok.get(i+1));
      ++i; // the following text token has been consumed
    } else
      list.add(tok.get(i));
  }
  return list;
}

// Like dropTags, but drops every token that starts with "<".
// Currently unused (see the commented-out call in the main program).
static L<S> dropAllTags(L<S> tok) {
  new L<S> list;
  for (int i = 0; i < tok.size(); i++) {
    S t = tok.get(i);
    if (t.startsWith("<")) {
      list.set(list.size()-1, list.get(list.size()-1) + tok.get(i+1));
      ++i; // the following text token has been consumed
    } else
      list.add(tok.get(i));
  }
  return list;
}
download show line numbers debug dex old transpilations
Travelled to 17 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, iveijnkanddl, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1000858 |
Snippet name: | Extract some text from web (e.g. Wikipedia) and upload as pure text snippet |
Eternal ID of this version: | #1000858/2 |
Text MD5: | f451d5f9f40a1037c23c4974902de1a0 |
Transpilation MD5: | 423af11850f36ee35d83385a75833ed7 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2017-03-30 19:42:40 |
Source code size: | 4014 bytes / 148 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 933 / 939 |
Version history: | 1 change(s) |
Referenced in: | #1005238 - sentencesFromHTML #3000382 - Answer for ferdie (>> t = 1, f = 0) #3000383 - Answer for funkoverflow (>> t=1, f=0 okay) |