!7
p {
  // Main program: streams the bzip2-compressed Simple Wikipedia dump line by
  // line, collects the lines of each <page>...</page> element into a buffer,
  // extracts the page's <title>, and reports progress every 1000 pages.
  File f = downloadSimpleWikipedia();
  BufferedReader reader = utf8bufferedReader(bunzip2stream(f));
  try {
    S line;
    int pages = 0;

    // Non-null only while we are inside a <page> element.
    StringBuilder pageBuf = null;

    while ((line = reader.readLine()) != null) {
      // Trim so the marker comparisons below work regardless of how the
      // line is indented in the dump.
      line = trim(line);

      // Opening marker: start collecting a new page.
      if (eq(line, "<page>"))
        pageBuf = new StringBuilder;

      // Lines outside of a page element are skipped.
      if (pageBuf != null)
        pageBuf.append(line).append("\n");

      // Closing marker: the buffer now holds one complete page.
      // The null check guards against a closing tag with no matching opener.
      if (eq(line, "</page>") && pageBuf != null) {
        L tok = htmlTok(str(pageBuf));
        S title = join(contentsOfContainerTag(tok, "title"));

        if ((++pages % 1000) == 0) {
          // NOTE(review): 228400 is presumably the approximate total number
          // of pages in the dump - confirm and update when the dump grows.
          fractionDone(pages/228400.0);
          print("Pages: " + pages + " (" + title + ")");
          // NOTE(review): presumably a short pause to yield to other
          // threads - confirm the intent and the unit of the argument.
          sleep(1);
        }

        pageBuf = null;
      }
    }
  } finally {
    // Release the decompression stream and the underlying file handle even
    // when reading or tokenizing fails.
    reader.close();
  }
}