// Index entry for one <page> element of the wiki dump, as built by
// indexSimpleWikipedia():
//   title - trimmed, HTML-decoded contents of the page's <title> tag
//   start - reader.byteCount() taken just before the "<page>" line was read
//   len   - reader.byteCount() after the "</page>" line, minus start
// NOTE(review): start/len look like a byte range into the unpacked dump file,
// which depends on how ByteCountingLineReader counts bytes - confirm.
1 | srecord IndexedWikiPage(S title, long start, int len) {}
|
2 | |
3 | please include function iteratorFromFunction. |
4 | |
/**
 * Lazily indexes the unpacked Simple Wikipedia dump.
 *
 * The returned iterator reads the dump line by line. Each time a line that
 * trims to "</page>" closes a page opened by a line that trims to "<page>",
 * it yields one IndexedWikiPage holding the page title, the reader's byte
 * count from just before the "<page>" line, and the number of bytes counted
 * up to and including the "</page>" line.
 *
 * Every 1000th page, progress is reported through fractionDone() and print(),
 * followed by sleep(1). When the input ends (or licensed() turns false), the
 * progress is set to 1, the reader is closed and the function returns null.
 *
 * NOTE(review): the reader is only closed when the input is read to the end;
 * a consumer that abandons the iterator early leaves the file open.
 *
 * @return iterator over the index entries, in file order
 */
static IterableIterator<IndexedWikiPage> indexSimpleWikipedia() {
  File f = unpackSimpleWikipedia();
  final ByteCountingLineReader reader = new(bufferedFileInputStream(f, 1024*1024));

  ret main.<IndexedWikiPage> iteratorFromFunction(new O {
    int pages = 0;        // pages emitted so far (drives progress reporting)
    boolean done = false; // set once the reader has been closed

    IndexedWikiPage get() ctex {
      // The reader is closed after the first end-of-input; never touch it again.
      if (done) ret null;

      long pageStart = 0;
      StringBuilder pageBuf = null; // non-null only while inside a <page> element

      while licensed {
        // Byte count *before* reading the line, i.e. where this line begins.
        long offset = reader.byteCount();
        S line = reader.readLine();
        if (line == null) break;
        line = trim(line);

        if (eq(line, "<page>")) {
          pageStart = offset;
          pageBuf = new StringBuilder;
        }

        // Skip everything outside a page. This also ignores a stray "</page>"
        // with no opening tag, which would otherwise yield a bogus entry
        // starting at offset 0.
        if (pageBuf == null) continue;

        pageBuf.append(line).append("\n");

        if (eq(line, "</page>")) {
          L<S> tok = htmlTok(str(pageBuf));
          S title = trim(htmldecode(join(contentsOfContainerTag(tok, "title"))));
          if ((++pages % 1000) == 0) {
            // 228400 is a hard-coded page-count estimate; clamp so that a
            // larger dump never reports more than 100%.
            fractionDone(Math.min(1.0, pages/228400.0));
            print("Pages: " + pages + " (" + title + ")");
            sleep(1);
          }
          ret new IndexedWikiPage(title, pageStart, toInt(reader.byteCount()-pageStart));
        }
      }

      // End of input (or no longer licensed): finish up exactly once.
      done = true;
      fractionDone(1);
      reader.close();
      ret null;
    }
  });
}
Began life as a copy of #1008067
download show line numbers debug dex old transpilations
Travelled to 13 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1014153 |
| Snippet name: | indexSimpleWikipedia |
| Eternal ID of this version: | #1014153/9 |
| Text MD5: | d3460f1311734490734180efba5af218 |
| Author: | stefan |
| Category: | javax / a.i. / networking |
| Type: | JavaX fragment (include) |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2018-04-15 14:11:58 |
| Source code size: | 1390 bytes / 43 lines |
| Pitched / IR pitched: | No / No |
| Views / Downloads: | 687 / 697 |
| Version history: | 8 change(s) |
| Referenced in: | [show references] |