!7

static File mainLog, fTriples, file, file2;

p {
  //if (isMain()) restartWith2GBHeap();
  phase1();
  print("PHASE 2 in 5 SECONDS");
  sleepSeconds(5);
  phase2();
}

svoid phase1 {
  // First, rotate.
  mainLog = programFile(#1010745, "webs-made.txt");
  fTriples = programFile(#1010745, "triples.gz");
  rotateLogFile(mainLog);

  File tripelizedDir = programDir(#1010745, "tripelized");

  // Load webs
  new Map<Web, SoftwareMadeWeb> fiMap;
  L<Web> webs = websMadeByProgram(#1010745, fiMap);

  // Get all references to global IDs
  new HashSet<S> allReferences;
  for (Web web : webs)
    for (WebNode n : web_nodesAndRelations(web))
      for (S s : web_texts(n))
        allReferences.addAll(aggressivelyCollectPossibleGlobalIDs(s));

  // Classify by content & find unreferenced duplicates
  // key = triple + verified
  HashMap<Pair<T3<S>, Bool>, GlobalID> websByContent = new HashMap;
  new CompactHashSet<GlobalID> unreferencedDuplicates;
  for (Web web : webs) {
    Pair<T3<S>, Bool> key = pair(webToTriple(web), web.verified());
    if (websByContent.containsKey(key)) {
      if (!allReferences.contains(web.globalID()))
        unreferencedDuplicates.add(web.globalIDObj());
    } else
      websByContent.put(key, web.globalIDObj());
  }

  print("Have " + n(unreferencedDuplicates, "unreferenced duplicate web") + ", dropping.");

  // Drop unreferenced duplicates
  webs = webs_dropWebsContainedInIDSet(webs, unreferencedDuplicates);
  print("Now have " + nWeb(webs) + ".");

  // Drop invalid webs
  new HashMap<S> invalidatedBy;
  new Matches m;
  for (Web web : webs)
    if (web.verified())
      for (WebNode n : web_search_dollarX(webFromTriple("$X", "is", "invalid"), web))
        if (web_match("Web *", n, m) && isGlobalID($1))
          invalidatedBy.put($1, web.globalID());

  // Ignore invalidations issued by webs that were themselves invalidated
  for (S id : cloneKeys(invalidatedBy)) {
    S invalidator = invalidatedBy.get(id);
    if (invalidatedBy.containsKey(invalidator))
      invalidatedBy.remove(id);
  }

  print("Removing " + n(invalidatedBy, "invalid web"));
  saveTextFile(countTillNewFile(newFile(tripelizedDir, "invalidated"), ".gz"), structure(invalidatedBy));
  webs = webs_dropIDs(webs, keys(invalidatedBy));

  // Drop webs with over-long nodes and webs that can't be expressed as triples
  int dropped = 0, dropped2 = 0;
  new L<Web> websOut;
  for (Web web : webs) {
    if (web_nodesTooLong(web)) dropped++;
    else if (!web_tripelizable(web)) dropped2++;
    else websOut.add(web);
  }
  webs = websOut;
  if (dropped != 0) print("Dropped " + nWeb(dropped) + " with too long nodes");
  if (dropped2 != 0) print("Dropped " + n(dropped2, "non-triple"));

  Pair<L<Web>> pair = webs_tripelizable_nonTripelizable(webs);

  // Save triples
  file = new File(tripelizedDir, "triples.new.gz");
  PrintWriter out = newPrintWriter(newGZIPOutputStream(file));
  webs_toTripleFile(pair.a, out);
  out.close();

  // Save non-triples
  file2 = new File(tripelizedDir, "webs-made.new.gz");
  out = newPrintWriter(newGZIPOutputStream(file2));
  for (Web web : pair.b)
    out.println(quote(struct(fiMap.get(web))));
  out.close();

  print("Stored " + nWeb(pair.a) + " in " + f2s(file) + ", " + nWeb(pair.b) + " in " + f2s(file2));
}

svoid phase2 {
  print("PHASE 2.");

  // Move data to backups
  File fBackups = programFile(#1010745, "backups");
  for (File f : earlierPartsOfLogFile(mainLog)) {
    int n = 1;
    File p;
    while ((p = newFile(fBackups, dropSuffix(".gz", f.getName()) + ".part" + n + ".gz")).exists()) ++n;
    renameFile_assertTrue(f, p);
  }

  // Backup triples.gz
  int n = 1;
  File p;
  while ((p = newFile(fBackups, dropSuffix(".gz", fTriples.getName()) + ".part" + n + ".gz")).exists()) ++n;
  renameFile_assertTrue(fTriples, p);

  // Rename main files
  renameFile_assertTrue(file, fTriples);
  renameFile_assertTrue(file2, programFile(#1010745, "webs-made.txt.part1.gz"));

  print("DONE.");
}
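The deduplication step in phase1 keys each web by its (triple content, verified flag) pair and only discards a duplicate whose global ID is not referenced anywhere else. As a rough illustration of that idea, here is a minimal plain-Java sketch (Java 16+ for records); the Rec type, the dropUnreferencedDuplicates helper, and the string-based content key are hypothetical stand-ins for the Web/GlobalID machinery of #1010745, not the actual JavaX utilities.

import java.util.*;

// Sketch only: deduplicate records by a content key, but keep a duplicate
// whose ID is referenced elsewhere so existing references stay resolvable.
public class DedupSketch {
  // Hypothetical stand-in for a "web": an ID, some content, a verified flag.
  record Rec(String id, String content, boolean verified) {}

  static List<Rec> dropUnreferencedDuplicates(List<Rec> recs, Set<String> referencedIds) {
    Map<List<Object>, String> byContent = new HashMap<>(); // content key -> first ID seen
    Set<String> toDrop = new HashSet<>();
    for (Rec r : recs) {
      List<Object> key = List.of(r.content(), r.verified());
      if (byContent.containsKey(key)) {
        // Identical content already seen; drop this record only if
        // no other record refers to its ID.
        if (!referencedIds.contains(r.id()))
          toDrop.add(r.id());
      } else
        byContent.put(key, r.id());
    }
    List<Rec> out = new ArrayList<>();
    for (Rec r : recs)
      if (!toDrop.contains(r.id()))
        out.add(r);
    return out;
  }

  public static void main(String[] args) {
    List<Rec> recs = List.of(
      new Rec("a", "sun|is|bright", true),
      new Rec("b", "sun|is|bright", true),  // duplicate content, unreferenced -> dropped
      new Rec("c", "sky|is|blue", false));
    Set<String> referenced = Set.of("c");   // IDs mentioned in other records' texts
    System.out.println(dropUnreferencedDuplicates(recs, referenced)); // keeps a and c
  }
}

Keeping referenced duplicates presumably matters because other webs mention global IDs in their texts (collected above via aggressivelyCollectPossibleGlobalIDs), including the "Web <id> is invalid" invalidation statements, so dropping a referenced ID would break those links.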
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, ppjhyzlbdabe, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
Snippet ID: #1011778
Snippet name: Tripelize Smart Bot's DB (works, but needs too much memory)
Eternal ID of this version: #1011778/38
Text MD5: dd7ba2e95730e1bba0797bebca159c80
Transpilation MD5: efd6fe9942dfc9ebefa9b69e5bab534a
Author: stefan
Category: javax / a.i.
Type: JavaX source code (desktop)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2017-12-26 18:08:21
Source code size: 4032 bytes / 132 lines
Pitched / IR pitched: No / No
Views / Downloads: 554 / 1408
Version history: 37 change(s)
Referenced in:
  #1012378 - Phase 1 Of Tripelization
  #1013307 - ai_compactLiveDB - just store all triples from memory, no attempts at cleaning up the DB