Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

93
LINES

< > BotCompany Repo | #1008015 // Simple Wikipedia Bot [WORKS, downloads 127 MB, 700 MB on disk, 1 GB in memory]

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Uses 4738K of libraries. Click here for Pure Java version (9198L/67K/219K).

!7

static new TreeMap<U, U> wiki; // title to text (stored as compressed strings)

// Normalizes a page title into the form used for keys of `wiki`
// (currently: upper-cased via toUpper).
static S processKey(S s) {
  ret toUpper(s);
}

// Entry point. Inside the timed section it sets up the console, reads the
// unpacked Simple Wikipedia dump line by line and fills `wiki`. Afterwards it
// puts a clickable title list next to the console, shows one random entry and
// calls botSleep().
p { time {
  restartWith1GBHeap();
  substance();
  quietGC(); // don't litter console with GC messages
  veryBigConsole();
  setConsoleWidth(900);
  centerConsole();
  consoleWordWrap();
  File f = unpackSimpleWikipedia();
  BufferedReader reader = utf8bufferedReader(f);
  try {
    S line;
    int pages = 0;
    StringBuilder pageBuf = null; // non-null only between <page> and </page>
    while ((line = reader.readLine()) != null) {
      line = trim(line);
      if (eq(line, "<page>"))
        pageBuf = new StringBuilder;
      if (pageBuf != null)
        pageBuf.append(line).append("\n");
      if (eq(line, "</page>")) {
        //print("Page done. " + l(pageBuf) + " chars");
        L<S> tok = htmlTok(str(pageBuf));
        S title = htmldecode(join(contentsOfContainerTag(tok, "title")));
        S text = trim(htmldecode(join(contentsOfContainerTag(tok, "text"))));
        if (!shouldSkip(title) && !empty(text)) {
          U key = new U(processKey(title));
          U old = wiki.get(key);
          S red = wikipedia_getRedirect(text);
          // ignore pages that redirect to their own title and texts equal to
          // what is already stored for this key
          if (!eqic(red, title) && !eqic(text, str(old))) {
            /*if (old != null) {
              print("Double entry: " + title);
              print("  " + quote(str(old)));
              print("  " + quote(text));
            }*/
            // an existing entry is only replaced when it is itself a redirect
            if (old == null || wikipedia_getRedirect(str(old)) != null)
              wiki.put(key, new U(text));
          }
        }
        if ((++pages % 1000) == 0) {
          // 228400.0 is a hard-coded denominator for the progress display
          // (presumably the approximate page count of the dump - TODO confirm)
          fractionDone(pages/228400.0);
          print("Pages: " + pages + " (" + title + ")");
          sleep(1);
        }
        pageBuf = null;
      }
    }
  } finally {
    // release the dump file handle; the program keeps running in botSleep()
    reader.close();
  }
  }
  fractionDone(1);
  
  swing {
    JList list = jlist(allToString(keys(wiki)));
    addToWindowSplitRight(consoleFrame(), list);
    onDoubleClick(list, func(S item) {
      answer(item)
    });
  }
  
  // print a random entry
  answer(str(random(keys(wiki))));
  botSleep();
}

// Bot entry: looks up the article whose key is closest to the input `s`,
// follows redirects, and prints title and text to a cleared console.
// Returns " " when an article was shown.
answer {
  U wanted = new U(processKey(s));
  U closest = nicestClosestKey(wiki, wanted);
  U page = followRedirect(closest);
  if (page != null) {
    clearConsole();
    S heading = toUpper(str(page));
    consoleStatus(heading);
    print(heading);
    print();
    S body = str(wiki.get(page));
    print(dropContainerTags(body)); // drop <ref>
    scrollConsoleUpIn(0.1);
    ret " ";
  }
}

static U followRedirect(U key) {
  U next;
  int count = 0;
  while ((next = toU(processKey(wikipedia_getRedirect(str(wiki.get(key)))))) != null && ++count < 10)
    key = next;
  ret key;
}

// True for titles that start with "Category:" or "Template:" (checked via
// swic); such pages are not put into `wiki`.
static boolean shouldSkip(S title) {
  if (swic(title, "Category:")) ret true;
  ret swic(title, "Template:");
}

Author comment

// stuff to evaluate in "assist" (speed test for full-text searches)
// !j twice { time { for (O u : values((Map) get(mmc(), "wiki"))) words2(str(u)); } }
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (cic(str(u), "hello")) ++n; }} ret n;
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (contains(str(u), "hello")) ++n; }} ret n;
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (cicFast(str(u), "hello")) ++n; }} ret n;

download  show line numbers  debug dex  old transpilations   

Travelled to 17 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, iveijnkanddl, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt, whxojlpjdney, wtqryiryparv

No comments. add comment

Snippet ID: #1008015
Snippet name: Simple Wikipedia Bot [WORKS, downloads 127 MB, 700 MB on disk, 1 GB in memory]
Eternal ID of this version: #1008015/101
Text MD5: d24eacd9554cafeba409f3ebbd430297
Transpilation MD5: 32c68d3535898229580a2e3708246ff8
Author: stefan
Category: javax / a.i. / networking
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2018-01-17 10:29:24
Source code size: 2625 bytes / 93 lines
Pitched / IR pitched: No / No
Views / Downloads: 669 / 1739
Version history: 100 change(s)
Referenced in: [show references]