Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

148
LINES

< > BotCompany Repo | #1000858 // Extract some text from web (e.g. Wikipedia) and upload as pure text snippet

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (2351L/17K/55K).

1  
!752
2  
3  
// HTML text to process; assigned by the main program from loadSnippet or loadPage.
static S html;
4  
// Command-line arguments joined with spaces; only set when the Wikipedia lookup branch is taken, otherwise null.
static S keyword;
5  
// Sentences accepted by interpretLine, in the order they were found.
static new List<S> sentences;
6  
// Origin of the HTML (page URL or snippet ID) as recorded by the main program; used in the title passed to ntUpload.
static S source;
7  
8  
// Main program.
//
// Input selection (first command-line argument):
//   - a snippet ID    -> that snippet is loaded
//   - an http(s) URL  -> that page is loaded
//   - anything else   -> all arguments are joined into a keyword and the
//                        English Wikipedia article of that name is loaded
//   - no arguments    -> the default snippet #3000030 is loaded
//
// The HTML is tokenized, scripts and a few inline tags are removed, every
// remaining text token is scanned for sentences, and the collected sentences
// are uploaded through ntUpload. For keyword lookups, the sentences that begin
// with the keyword (optionally preceded by an article) are printed as well.
p {
  if (args.length != 0) {
    S arg = args[0];
    if (isSnippetID(arg))
      // Record the snippet ID as the source so the upload title does not read "Sentences from null".
      html = loadSnippet(source = arg);
    else if (arg.startsWith("http:") || arg.startsWith("https://"))
      html = loadPage(source = arg);
    else {
      keyword = join(" ", args);
      // Wikipedia treats underscores and spaces in titles alike; a raw space is not valid inside a URL.
      html = loadPage(source = "https://en.wikipedia.org/wiki/" + keyword.replace(' ', '_'));
    }
  } else
    html = loadSnippet(source = "#3000030"); // A text about Java
  print();

  // tokenize and clean up html
  L<S> tok = htmlcoarsetok(html);
  tok = removeScripts(tok);
  tok = dropTags(tok, "span");
  tok = dropTags(tok, "a");
  tok = dropTags(tok, "b");
  tok = dropTags(tok, "small");
  // Note: dropAllTags(tok) was tried here and removes too much.

  // Only the even positions are visited; these are treated as the text between tags.
  for (int i = 0; i < tok.size(); i += 2) {
    S line = tok.get(i).trim();
    if (line.startsWith("<!--")) continue; // HTML comment
    if (line.length() == 0) continue;
    interpretLine(line);
  }

  print();
  print(sentences.size() + " sentences found.");
  print();

  ntUpload(getProgramID(), "Sentences from " + source, makeText());

  if (keyword != null) {
    // Strip a trailing parenthesized part such as " (programming language)".
    S shortenedKeyword = keyword.replaceAll(" \\(.*$", "").trim().toLowerCase();
    print("Here are those about " + quote(keyword) + ":");
    print();
    for (S s : sentences) {
      S x = s.toLowerCase();
      // x is lower-cased, so the article prefixes have to be lower-case too;
      // capitalized prefixes could never match.
      if (x.startsWith(shortenedKeyword)
        || x.startsWith("the " + shortenedKeyword)
        || x.startsWith("a " + shortenedKeyword)
        || x.startsWith("an " + shortenedKeyword))
        print(s);
    }
  }
}
58  
59  
// Produces the text that gets uploaded: the collected sentences passed
// through fromLines and then numberLines (presumably one numbered sentence
// per line - confirm against those helpers).
static S makeText() {
  S joined = fromLines(sentences);
  ret numberLines(joined);
}
62  
63  
// Reports whether s contains at least one letter, as judged by Character.isLetter.
static boolean hasCharacters(S s) {
  for (char c : s.toCharArray()) {
    if (Character.isLetter(c))
      ret true;
  }
  ret false;
}
69  
70  
// Splits s into sentences and appends the acceptable ones to the global
// sentences list. A sentence is skipped when it starts with a lower-case
// letter or one of , ; : = or when it contains no letter at all. Accepted
// sentences are HTML-decoded before being stored, and echoed to the console
// unless running on Android.
static void interpretLine(S s) {
  for (S candidate : getSentences(s)) {
    char lead = candidate.charAt(0);
    boolean badStart = Character.isLowerCase(lead) || ",;:=".indexOf(lead) >= 0;
    if (badStart || !hasCharacters(candidate)) continue;

    S decoded = htmldecode(candidate);
    if (!isAndroid())
      print("Sentence found: " + quote(decoded));
    sentences.add(decoded);
  }
}
83  
84  
// Cuts s into sentences at "." tokens and returns them in order.
// A "." ends a sentence unless the token right after it is empty and is not
// the final token of the list (this keeps things like "9.5" together).
// Pieces of at most one character are dropped, and any text after the last
// sentence-ending "." is not returned.
static L<S> getSentences(S s) {
  L<S> tokens = javaTok(s); // javaTok is used so quoted things are parsed as units
  fixSpaces(tokens);
  new L<S> result;
  int start = 0;
  for (;;) {
    int dot = start;
    for (;;) {
      dot = indexOf(tokens, ".", dot + 1);
      if (dot < 0) return result;
      boolean gluedToNext = dot + 1 < tokens.size() - 1 && tokens.get(dot + 1).equals("");
      if (!gluedToNext) break; // a real sentence end
    }

    S piece = join(tokens.subList(start, dot + 1)).trim();
    if (piece.length() > 1)
      result.add(piece);
    start = dot + 1;
  }
}
102  
103  
// Rewrites every even-indexed token in place so that everything ends up on
// one line: an empty token stays empty, any other token becomes a single space.
// (The even slots are presumably the whitespace tokens of javaTok output.)
static void fixSpaces(L<S> tok) {
  int n = l(tok);
  for (int idx = 0; idx < n; idx += 2) {
    S replacement;
    if (tok.get(idx).isEmpty())
      replacement = "";
    else
      replacement = " ";
    tok.set(idx, replacement);
  }
}
108  
109  
// Returns a copy of tok with every <script> ... </script> stretch removed.
// The input list is not modified.
//
// tok is expected to alternate text (even index) and tag (odd index), starting
// with text. To keep that alternation, the text in front of the opening tag
// and the text behind the closing tag are merged into one text token.
//
// Fixes over the previous version:
//  - the text token preceding <script> is kept instead of being deleted;
//  - a <script> without a closing tag no longer runs past the end of the list
//    (everything from the opening tag to the end is dropped);
//  - one pass over the list instead of repeated remove() calls.
static L<S> removeScripts(L<S> tok) {
  int n = tok.size();
  L<S> out = new ArrayList<S>(n);
  if (n == 0) return out;

  out.add(tok.get(0));
  int i = 1;
  while (i < n) {
    if (tagIs(tok.get(i), "script")) {
      // Look for the closing tag on tag positions only.
      int j = i;
      while (j < n && !tagIs(tok.get(j), "/script"))
        j += 2;
      // Append the text following the closing tag (if any) to the last kept text.
      if (j + 1 < n) {
        int last = out.size() - 1;
        out.set(last, out.get(last) + tok.get(j + 1));
      }
      i = j + 2;
    } else {
      out.add(tok.get(i));
      if (i + 1 < n)
        out.add(tok.get(i + 1));
      i += 2;
    }
  }
  return out;
}
123  
124  
// Builds a new token list without the opening and closing forms of the given
// tag. Whenever such a tag is met, the token following it is appended to the
// most recently kept token, so the text on both sides of the tag is fused.
static L<S> dropTags(L<S> tok, S tag) {
  S closing = "/" + tag;
  new L<S> kept;
  int pos = 0;
  while (pos < tok.size()) {
    S current = tok.get(pos);
    boolean matches = tagIs(current, tag) || tagIs(current, closing);
    if (matches) {
      int last = kept.size() - 1;
      kept.set(last, kept.get(last) + tok.get(pos + 1));
      pos += 2; // the following token has been consumed as well
    } else {
      kept.add(current);
      ++pos;
    }
  }
  return kept;
}
136  
137  
// Builds a new token list without any token that starts with "<". The token
// following each dropped one is appended to the most recently kept token.
static L<S> dropAllTags(L<S> tok) {
  new L<S> kept;
  int pos = 0;
  while (pos < tok.size()) {
    S current = tok.get(pos);
    if (!current.startsWith("<")) {
      kept.add(current);
      ++pos;
      continue;
    }
    int last = kept.size() - 1;
    kept.set(last, kept.get(last) + tok.get(pos + 1));
    pos += 2; // the following token has been consumed as well
  }
  return kept;
}

download  show line numbers  debug dex  old transpilations   

Travelled to 17 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, iveijnkanddl, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1000858
Snippet name: Extract some text from web (e.g. Wikipedia) and upload as pure text snippet
Eternal ID of this version: #1000858/2
Text MD5: f451d5f9f40a1037c23c4974902de1a0
Transpilation MD5: 423af11850f36ee35d83385a75833ed7
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2017-03-30 19:42:40
Source code size: 4014 bytes / 148 lines
Pitched / IR pitched: No / No
Views / Downloads: 810 / 837
Version history: 1 change(s)
Referenced in: [show references]