Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

93
LINES

< > BotCompany Repo | #1008015 // Simple Wikipedia Bot [WORKS, downloads 127 MB, 700 MB on disk, 1 GB in memory]

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Uses 4738K of libraries. Click here for Pure Java version (9198L/67K/219K).

1  
!7
2  
3  
// Article store: maps an article title to its wiki text.
// Keys are built as new U(processKey(title)), values as new U(text);
// U holds the data as compressed strings.
static new TreeMap<U, U> wiki;
4  
5  
// Normalizes a title or query into the form used for keys of `wiki`
// (currently just the upper-cased string).
sS processKey(S s) {
  ret toUpper(s);
}
6  
7  
// Entry point. Sets up the console, reads the unpacked Simple Wikipedia dump
// line by line, fills `wiki` with one entry per usable page, then shows the
// list of keys next to the console, prints a random article and calls
// botSleep(). The loading phase is wrapped in time { }.
p { time {
  restartWith1GBHeap();
  substance();
  quietGC(); // don't litter console with GC messages
  veryBigConsole();
  setConsoleWidth(900);
  centerConsole();
  consoleWordWrap();
  File f = unpackSimpleWikipedia();
  BufferedReader reader = utf8bufferedReader(f);
  try {
    S line;
    int pages = 0;
    StringBuilder pageBuf = null; // non-null only between <page> and </page>
    while ((line = reader.readLine()) != null) {
      line = trim(line);
      if (eq(line, "<page>"))
        pageBuf = new StringBuilder;
      if (pageBuf != null)
        pageBuf.append(line).append("\n");
      if (eq(line, "</page>")) {
        //print("Page done. " + l(pageBuf) + " chars");
        L<S> tok = htmlTok(str(pageBuf));
        S title = htmldecode(join(contentsOfContainerTag(tok, "title")));
        S text = trim(htmldecode(join(contentsOfContainerTag(tok, "text"))));
        if (!shouldSkip(title) && !empty(text)) {
          U key = new U(processKey(title));
          U old = wiki.get(key);
          S red = wikipedia_getRedirect(text);
          // Ignore pages that redirect to their own title and pages whose
          // text equals what is already stored under this key.
          if (!eqic(red, title) && !eqic(text, str(old))) {
            /*if (old != null) {
              print("Double entry: " + title);
              print("  " + quote(str(old)));
              print("  " + quote(text));
            }*/
            // Store only if the key is new or the stored entry is itself a
            // redirect; an existing non-redirect entry is never overwritten.
            if (old == null || wikipedia_getRedirect(str(old)) != null)
              wiki.put(key, new U(text));
          }
        }
        if ((++pages % 1000) == 0) {
          // NOTE(review): 228400.0 is hard-coded; presumably the expected
          // number of pages in the dump - confirm.
          fractionDone(pages/228400.0);
          print("Pages: " + pages + " (" + title + ")");
          sleep(1);
        }
        pageBuf = null;
      }
    }
  } finally {
    // Release the dump file handle whether parsing finished or failed.
    reader.close();
  }
  }
  fractionDone(1);

  swing {
    // Show all keys in a list docked to the right of the console;
    // double-clicking an item displays that article.
    JList list = jlist(allToString(keys(wiki)));
    addToWindowSplitRight(consoleFrame(), list);
    onDoubleClick(list, func(S item) {
      answer(item)
    });
  }

  // print a random entry
  answer(str(random(keys(wiki))));
  botSleep();
}
68  
69  
// Answer handler for the input s: picks the key nicestClosestKey selects
// for the normalized input, follows redirects from it, and, if a key was
// found, prints that article to a freshly cleared console and returns " ".
answer {
  U hit = followRedirect(nicestClosestKey(wiki, new U(processKey(s))));
  if (hit != null) {
    clearConsole();
    S heading = toUpper(str(hit));
    consoleStatus(heading);
    print(heading);
    print();
    S body = dropContainerTags(str(wiki.get(hit))); // drop <ref>
    print(body);
    scrollConsoleUpIn(0.1);
    ret " ";
  }
}
82  
83  
// Resolves redirects starting at `key` and returns the key of the page the
// chain ends on.
//  - Returns null for a null key (wiki is a TreeMap, so looking up null
//    would throw instead of letting the caller's null check handle it).
//  - Follows at most 9 hops, which also bounds redirect cycles.
//  - Stops on the last key present in `wiki` when a redirect points to a
//    page that is not stored, so callers never get a key with no text.
static U followRedirect(U key) {
  if (key == null) ret null;
  for (int hops = 1; hops < 10; hops++) {
    S target = wikipedia_getRedirect(str(wiki.get(key)));
    if (target == null) break; // current page is not a redirect
    U next = toU(processKey(target));
    if (next == null || !wiki.containsKey(next)) break; // dangling redirect
    key = next;
  }
  ret key;
}
90  
91  
// True for pages that are not loaded into `wiki`: titles starting with
// "Category:" or "Template:" (as matched by swic).
sbool shouldSkip(S title) {
  if (swic(title, "Category:")) ret true;
  ret swic(title, "Template:");
}

Author comment

// stuff to evaluate in "assist" (speed test for full-text searches)
// !j twice { time { for (O u : values((Map) get(mmc(), "wiki"))) words2(str(u)); } }
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (cic(str(u), "hello")) ++n; }} ret n;
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (contains(str(u), "hello")) ++n; }} ret n;
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (cicFast(str(u), "hello")) ++n; }} ret n;

download  show line numbers  debug dex  old transpilations   

Travelled to 17 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, iveijnkanddl, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt, whxojlpjdney, wtqryiryparv

No comments. add comment

Snippet ID: #1008015
Snippet name: Simple Wikipedia Bot [WORKS, downloads 127 MB, 700 MB on disk, 1 GB in memory]
Eternal ID of this version: #1008015/101
Text MD5: d24eacd9554cafeba409f3ebbd430297
Transpilation MD5: 32c68d3535898229580a2e3708246ff8
Author: stefan
Category: javax / a.i. / networking
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2018-01-17 10:29:24
Source code size: 2625 bytes / 93 lines
Pitched / IR pitched: No / No
Views / Downloads: 742 / 1826
Version history: 100 change(s)
Referenced in: [show references]