!7 static File mainLog, fTriples, file, file2; p { //if (isMain()) restartWith2GBHeap(); phase1(); print("PHASE 2 in 5 SECONDS"); sleepSeconds(5); phase2(); } svoid phase1 { // First, rotate. mainLog = programFile(#1010745, "webs-made.txt"); fTriples = programFile(#1010745, "triples.gz"); rotateLogFile(mainLog); File tripelizedDir = programDir(#1010745, "tripelized"); // Load webs new Map fiMap; L webs = websMadeByProgram(#1010745, fiMap); // Get all references to global IDs new HashSet allReferences; for (Web web : webs) for (WebNode n : web_nodesAndRelations(web)) for (S s : web_texts(n)) allReferences.addAll(aggressivelyCollectPossibleGlobalIDs(s)); // Classify by content & find unreferenced duplicates // key = triple + verified HashMap, Bool>, GlobalID> websByContent = new HashMap; new CompactHashSet unreferencedDuplicates; for (Web web : webs) { Pair, Bool> key = pair(webToTriple(web), web.verified()); if (websByContent.containsKey(key)) { if (!allReferences.contains(web.globalID())) unreferencedDuplicates.add(web.globalIDObj()); } else websByContent.put(key, web.globalIDObj()); } print("Have " + n(unreferencedDuplicates, "unreferenced duplicate web") + ", dropping."); // Drop unreferenced duplicates webs = webs_dropWebsContainedInIDSet(webs, unreferencedDuplicates); print("Now have " + nWeb(webs) + "."); // Drop invalid webs new HashMap invalidatedBy; new Matches m; for (Web web : webs) if (web.verified()) for (WebNode n : web_search_dollarX(webFromTriple("$X", "is", "invalid"), web)) if (web_match("Web *", n, m) && isGlobalID($1)) invalidatedBy.put($1, web.globalID()); for (S id : cloneKeys(invalidatedBy)) { S invalidator = invalidatedBy.get(id); if (invalidatedBy.containsKey(invalidator)) invalidatedBy.remove(id); } print("Removing " + n(invalidatedBy, "invalid web")); saveTextFile(countTillNewFile(newFile(tripelizedDir, "invalidated"), ".gz"), structure(invalidatedBy)); webs = webs_dropIDs(webs, keys(invalidatedBy)); int dropped = 0, dropped2 = 0; new L websOut; for (Web web : webs) { if (web_nodesTooLong(web)) dropped++; else if (!web_tripelizable(web)) dropped2++; else websOut.add(web); } webs = websOut; if (dropped != 0) print("Dropped " + nWeb(dropped) + " with too long nodes"); if (dropped2 != 0) print("Dropped " + n(dropped2, "non-triple")); Pair> pair = webs_tripelizable_nonTripelizable(webs); // Save triples file = new File(tripelizedDir, "triples.new.gz"); PrintWriter out = newPrintWriter(newGZIPOutputStream(file)); webs_toTripleFile(pair.a, out); out.close(); // Save non-triples file2 = new File(tripelizedDir, "webs-made.new.gz"); out = newPrintWriter(newGZIPOutputStream(file2)); for (Web web : pair.b) out.println(quote(struct(fiMap.get(web)))); out.close(); print("Stored " + nWeb(pair.a) + " in " + f2s(file) + ", " + nWeb(pair.b) + " in " + f2s(file2)); } svoid phase2 { print("PHASE 2."); // Move data to backups File fBackups = programFile(#1010745, "backups"); for (File f : earlierPartsOfLogFile(mainLog)) { int n = 1; File p; while ((p = newFile(fBackups, dropSuffix(".gz", f.getName()) + ".part" + n + ".gz")).exists()) ++n; renameFile_assertTrue(f, p); } // Backup triples.gz int n = 1; File p; while ((p = newFile(fBackups, dropSuffix(".gz", fTriples.getName()) + ".part" + n + ".gz")).exists()) ++n; renameFile_assertTrue(fTriples, p); // Rename main files renameFile_assertTrue(file, fTriples); renameFile_assertTrue(file2, programFile(#1010745, "webs-made.txt.part1.gz")); print("DONE."); }