Extract some text from web (e.g. Wikipedia) and upload as pure text snippet [1000858]

!752

// Raw page/snippet content to be processed; assigned in the main program.
static S html;
// Search keyword joined from the command-line arguments; stays null unless
// the arguments are neither a snippet ID nor an http(s) URL.
static S keyword;
// Sentences collected by interpretLine, in the order they were found.
static new List<S> sentences;
// Origin of the text as assigned in the main program; used in the title of
// the uploaded text.
static S source;

// Main program.
//
// Input selection (from args):
//   - no arguments        -> default snippet #3000030
//   - a snippet ID        -> that snippet
//   - an http(s) URL      -> that page
//   - anything else       -> all arguments joined into a keyword and looked
//                            up on the English Wikipedia
//
// The HTML is tokenized, scripts and a few inline tags are stripped, every
// remaining text chunk is split into sentences (see interpretLine), and the
// numbered sentence list is uploaded. If a keyword was given, the sentences
// that start with the keyword (optionally preceded by an article) are
// printed as well.
p {
  if (args.length != 0) {
    if (isSnippetID(args[0]))
      // Record the snippet ID as the source too, so the upload title
      // does not read "Sentences from null".
      html = loadSnippet(source = args[0]);
    else if (args[0].startsWith("http:") || args[0].startsWith("https://"))
      html = loadPage(source = args[0]);
    else {
      keyword = join(" ", args);
      html = loadPage(source = "https://en.wikipedia.org/wiki/" + keyword);
    }
  } else
    html = loadSnippet(source = "#3000030"); // A text about Java
  print();
    
  // tokenize and clean up html
  L<S> tok = htmlcoarsetok(html);
  tok = removeScripts(tok);
  tok = dropTags(tok, "span");
  tok = dropTags(tok, "a");
  tok = dropTags(tok, "b");
  tok = dropTags(tok, "small");
  
  // tok = dropAllTags(tok); // Too much!
  //print(structure(tok));

  // Even positions hold the text between tags; odd positions hold the tags.
  for (int i = 0; i < tok.size(); i += 2) {
    S line = tok.get(i).trim();
    if (line.startsWith("<!--")) continue; // HTML comment
    if (line.length() == 0) continue;
    //print("Line: " + quote(line));
    interpretLine(line);
  }
  
  print();
  print(sentences.size() + " sentences found.");
  print();
  
  ntUpload(getProgramID(), "Sentences from " + source, makeText());
  
  if (keyword != null) {
    // Strip a trailing disambiguation suffix such as " (programming language)".
    S shortenedKeyword = keyword.replaceAll(" \\(.*$", "").trim().toLowerCase();
    print("Here are those about " + quote(keyword) + ":");
    print();
    for (S s : sentences) {
      // The comparison is done entirely in lower case, so the article
      // prefixes must be lower case as well.
      S x = s.toLowerCase();
      if (x.startsWith(shortenedKeyword)
        || x.startsWith("the " + shortenedKeyword)
        || x.startsWith("a " + shortenedKeyword)
        || x.startsWith("an " + shortenedKeyword))
        print(s);
    }
  }
}

static S makeText() {
  ret numberLines(fromLines(sentences));
}

// True iff s contains at least one letter (per Character.isLetter).
static boolean hasCharacters(S s) {
  for (char c : s.toCharArray()) {
    if (Character.isLetter(c))
      ret true;
  }
  ret false;
}

// Splits one text chunk into sentences and appends the acceptable ones to
// the global sentence list.
// A candidate is rejected when it starts with a lower-case letter or one of
// , ; : = (it is then most likely a fragment), or when it contains no
// letter at all. Accepted candidates are HTML-decoded before being stored.
static void interpretLine(S s) {
  for (S candidate : getSentences(s)) {
    char lead = candidate.charAt(0);
    boolean looksLikeFragment = Character.isLowerCase(lead) || ",;:=".indexOf(lead) >= 0;
    if (looksLikeFragment || !hasCharacters(candidate))
      continue;

    S decoded = htmldecode(candidate);
    if (!isAndroid())
      print("Sentence found: " + quote(decoded));
    sentences.add(decoded);
  }
}

// Cuts s into sentences, each ending at a "." token.
// The text is tokenized with javaTok (so quoted things are parsed as units)
// and the gaps between tokens are normalized by fixSpaces. A "." that is
// directly followed by another token (empty gap, and not at the very end)
// does not end a sentence - this keeps things like "9.5" together.
// Text after the last sentence-ending "." is not returned, and neither are
// sentences of length <= 1 after trimming.
static L<S> getSentences(S s) {
  L<S> tokens = javaTok(s);
  fixSpaces(tokens);
  new L<S> found;
  int start = 0;
  for (;;) {
    // Look for the next "." that really ends a sentence.
    int dot = start;
    for (;;) {
      dot = indexOf(tokens, ".", dot+1);
      if (dot < 0) return found;
      boolean gluedToNext = dot+1 < tokens.size()-1 && tokens.get(dot+1).equals("");
      if (!gluedToNext) break;
    }

    S text = join(tokens.subList(start, dot+1)).trim();
    if (text.length() > 1)
      found.add(text);
    start = dot+1;
  }
}

// Normalizes every even-indexed entry of tok in place: an empty entry stays
// empty, anything else becomes a single space - so everything ends up on
// one line.
static void fixSpaces(L<S> tok) {
  int n = l(tok);
  for (int pos = 0; pos < n; pos += 2) {
    S gap;
    if (tok.get(pos).equals(""))
      gap = "";
    else
      gap = " ";
    tok.set(pos, gap);
  }
}

// Returns a copy of tok with every <script> ... </script> range removed.
// tok is expected to alternate text (even index) and tag (odd index), as the
// callers in this file treat it; the input list is not modified.
//
// Changes compared to the previous in-place removal:
//  - An unterminated <script> no longer throws IndexOutOfBoundsException;
//    everything from the opening tag to the end of the list is dropped.
//  - The text in front of a script is kept and joined with the text behind
//    the closing tag (same merging scheme as dropTags), instead of being
//    discarded together with the script.
//  - The result is built in a single pass instead of repeatedly removing
//    elements from the middle of the list.
static L<S> removeScripts(L<S> tok) {
  L<S> out = new ArrayList<S>();
  int n = tok.size();
  if (n == 0) return out;

  out.add(tok.get(0)); // leading text
  int i = 1;
  while (i < n) {
    if (tagIs(tok.get(i), "script")) {
      // Find the closing tag, looking at tag positions only.
      int j = i;
      while (j < n && !tagIs(tok.get(j), "/script"))
        j += 2;
      // Merge the text following the closing tag (if any) into the text
      // preceding the script so the text/tag alternation is preserved.
      if (j+1 < n) {
        int last = out.size()-1;
        out.set(last, out.get(last) + tok.get(j+1));
      }
      i = j+2;
    } else {
      out.add(tok.get(i));
      if (i+1 < n) out.add(tok.get(i+1));
      i += 2;
    }
  }
  return out;
}

// Returns a new list in which every opening or closing occurrence of the
// given tag is removed. The token following a removed tag is appended to the
// last kept token, so the surrounding text becomes one piece.
// tok itself is not modified.
//
// Boundary handling: a matching tag with nothing kept before it or with no
// token after it used to throw IndexOutOfBoundsException; now the missing
// neighbour is simply treated as empty text.
static L<S> dropTags(L<S> tok, S tag) {
  new L<S> list;
  for (int i = 0; i < tok.size(); i++) {
    S t = tok.get(i);
    if (tagIs(t, tag) || tagIs(t, "/" + tag)) {
      S following = i+1 < tok.size() ? tok.get(i+1) : "";
      if (list.isEmpty())
        list.add(following); // nothing to merge into yet
      else
        list.set(list.size()-1, list.get(list.size()-1) + following);
      ++i; // the following token has been consumed
    } else
      list.add(t);
  }
  return list;
}

// Returns a new list in which every tag (a token in a tag slot that starts
// with "<") is removed, with the text after each removed tag appended to
// the text before it. tok itself is not modified.
//
// Only odd positions are treated as tag slots, matching how the rest of this
// file reads the token list (text at even indexes, tags at odd indexes).
// Text slots can themselves start with "<" - the main program skips text
// starting with "<!--" - and testing those as tags glued the real tag that
// follows into the text and could throw IndexOutOfBoundsException at index 0.
// A tag in the last position with no text after it is handled as well.
static L<S> dropAllTags(L<S> tok) {
  new L<S> list;
  for (int i = 0; i < tok.size(); i++) {
    S t = tok.get(i);
    boolean tagSlot = (i & 1) == 1;
    if (tagSlot && t.startsWith("<")) {
      // i >= 1 here, so list already holds at least the leading text.
      S following = i+1 < tok.size() ? tok.get(i+1) : "";
      list.set(list.size()-1, list.get(list.size()-1) + following);
      ++i; // the following text has been consumed
    } else
      list.add(t);
  }
  return list;
}

Travelled to 17 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, iveijnkanddl, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

Snippet ID:	#1000858
Snippet name:	Extract some text from web (e.g. Wikipedia) and upload as pure text snippet
Eternal ID of this version:	#1000858/2
Text MD5:	f451d5f9f40a1037c23c4974902de1a0
Transpilation MD5:	423af11850f36ee35d83385a75833ed7
Author:	stefan
Category:	javax
Type:	JavaX source code
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2017-03-30 19:42:40
Source code size:	4014 bytes / 148 lines
Pitched / IR pitched:	No / No
Views / Downloads:	932 / 937
Version history:	1 change(s)
Referenced in:	[show references]

< > BotCompany Repo | #1000858 // Extract some text from web (e.g. Wikipedia) and upload as pure text snippet

JavaX source code [tags: use-pretranspiled] - run with: x30.jar