Uses 4738K of libraries. Click here for Pure Java version (9198L/67K/219K).
1 | !7 |
2 | |
3 | static new TreeMap<U, U> wiki; // key: page title upper-cased by processKey; value: page text (both stored as compressed strings) |
4 | /* Normalizes a page title or a query into the form used for wiki map keys by upper-casing it. */ |
5 | static String processKey(String s) { return toUpper(s); } |
6 | /* Main program: reads the unpacked Simple Wikipedia dump line by line, stores every non-skipped, non-empty page in wiki, then shows all keys in a list beside the console and prints one random entry. */ |
7 | p { time { |
8 | restartWith1GBHeap(); // the whole wiki map is kept in memory |
9 | substance(); |
10 | quietGC(); // don't litter console with GC messages |
11 | veryBigConsole(); |
12 | setConsoleWidth(900); |
13 | centerConsole(); |
14 | consoleWordWrap(); |
15 | BufferedReader reader = utf8bufferedReader(unpackSimpleWikipedia()); |
16 | try { // make sure the dump file is closed again, even if parsing fails |
17 | S line; |
18 | int pages = 0; |
19 | StringBuilder pageBuf = null; // non-null only between a <page> line and its </page> line |
20 | while ((line = reader.readLine()) != null) { |
21 | line = trim(line); |
22 | if (eq(line, "<page>")) |
23 | pageBuf = new StringBuilder; |
24 | if (pageBuf != null) |
25 | pageBuf.append(line).append("\n"); |
26 | if (eq(line, "</page>")) { |
27 | L<S> tok = htmlTok(str(pageBuf)); |
28 | S title = htmldecode(join(contentsOfContainerTag(tok, "title"))); |
29 | S text = trim(htmldecode(join(contentsOfContainerTag(tok, "text")))); |
30 | if (!shouldSkip(title) && !empty(text)) { |
31 | U key = new U(processKey(title)); |
32 | U old = wiki.get(key); |
33 | S red = wikipedia_getRedirect(text); |
34 | if (!eqic(red, title) && !eqic(text, str(old))) { // ignore self-redirects and repeats of the stored text |
35 | /*if (old != null) { |
36 | print("Double entry: " + title); |
37 | print(" " + quote(str(old))); |
38 | print(" " + quote(text)); |
39 | }*/ |
40 | if (old == null || wikipedia_getRedirect(str(old)) != null) // first entry wins, but a stored redirect may be replaced |
41 | wiki.put(key, new U(text)); |
42 | } |
43 | } |
44 | if ((++pages % 1000) == 0) { |
45 | fractionDone(pages/228400.0); // 228400: presumably the expected number of pages in the dump - TODO confirm |
46 | print("Pages: " + pages + " (" + title + ")"); |
47 | sleep(1); |
48 | } |
49 | pageBuf = null; |
50 | } |
51 | } |
52 | } finally { reader.close(); } |
53 | } |
54 | fractionDone(1); |
55 | |
56 | swing { |
57 | JList list = jlist(allToString(keys(wiki))); |
58 | addToWindowSplitRight(consoleFrame(), list); |
59 | onDoubleClick(list, func(S item) { |
60 | answer(item) |
61 | }); |
62 | } |
63 | |
64 | // print a random entry |
65 | answer(str(random(keys(wiki)))); |
66 | botSleep(); |
67 | } |
68 | /* Answer handler (JavaX answer block; the string passed to answer(...) is presumably what the implicit variable s holds). Resolves s to a wiki key via nicestClosestKey and followRedirect; if a key results, shows that entry on a cleared console. */ |
69 | answer { |
70 | U u = followRedirect(nicestClosestKey(wiki, new U(processKey(s)))); |
71 | if (u != null) { /* without a key nothing is printed */ |
72 | clearConsole(); |
73 | S title = toUpper(str(u)); |
74 | consoleStatus(title); |
75 | print(title); |
76 | print(); |
77 | print(dropContainerTags(str(wiki.get(u)))); // drop <ref> |
78 | scrollConsoleUpIn(0.1); |
79 | ret " "; /* NOTE(review): non-empty return value - presumably marks the input as handled; confirm against the answer macro */ |
80 | } |
81 | } |
82 | /* Follows redirect pages starting at key and returns the key finally reached. Returns null for a null key (TreeMap.get(null) would throw). Stops at a redirect whose target is not in wiki, so the result always has an entry if the start key had one. At most 9 hops are followed, which also ends redirect cycles. */ |
83 | static U followRedirect(U key) { |
84 | U next; |
85 | int count = 0; |
86 | while (key != null && (next = toU(processKey(wikipedia_getRedirect(str(wiki.get(key)))))) != null && wiki.containsKey(next) && ++count < 10) |
87 | key = next; |
88 | ret key; |
89 | } |
90 | /* Tells whether a page must not be loaded: true when the title starts with "Category:" or "Template:" (tested with swic, presumably a case-insensitive starts-with). */ |
91 | static boolean shouldSkip(String title) { |
92 | return swic(title, "Category:") ? true : swic(title, "Template:"); |
93 | } |
// stuff to evaluate in "assist" (speed test for full-text searches)
// !j twice { time { for (O u : values((Map) get(mmc(), "wiki"))) words2(str(u)); } }
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (cic(str(u), "hello")) ++n; }} ret n;
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (contains(str(u), "hello")) ++n; }} ret n;
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (cicFast(str(u), "hello")) ++n; }} ret n;
download show line numbers debug dex old transpilations
Travelled to 17 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, iveijnkanddl, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt, whxojlpjdney, wtqryiryparv
No comments. add comment
Snippet ID: | #1008015 |
Snippet name: | Simple Wikipedia Bot [WORKS, downloads 127 MB, 700 MB on disk, 1 GB in memory] |
Eternal ID of this version: | #1008015/101 |
Text MD5: | d24eacd9554cafeba409f3ebbd430297 |
Transpilation MD5: | 32c68d3535898229580a2e3708246ff8 |
Author: | stefan |
Category: | javax / a.i. / networking |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2018-01-17 10:29:24 |
Source code size: | 2625 bytes / 93 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 742 / 1826 |
Version history: | 100 change(s) |
Referenced in: | [show references] |