Download Jar. Libraryless. Click here for Pure Java version (7925L/54K).
1 | !7 |
2 | |
// State shared between the two phases.
// mainLog and fTriples are the live files (assigned at the start of phase1);
// file and file2 are the freshly written ".new.gz" outputs of phase1 that
// phase2 later renames into place.
static File mainLog, fTriples;
static File file, file2;
4 | |
// Main program: run phase1, announce and wait five seconds, then run phase2.
p {
  // Disabled: restart with a larger heap when running as the main program.
  //if (isMain()) restartWith2GBHeap();

  phase1();

  // Announce phase 2 and pause for five seconds before starting it.
  print("PHASE 2 in 5 SECONDS");
  sleepSeconds(5);

  phase2();
}
14 | |
15 | svoid phase1 { |
16 | // First, rotate. |
17 | |
18 | mainLog = programFile(#1010745, "webs-made.txt"); |
19 | fTriples = programFile(#1010745, "triples.gz"); |
20 | rotateLogFile(mainLog); |
21 | |
22 | File tripelizedDir = programDir(#1010745, "tripelized"); |
23 | |
24 | // Load webs |
25 | |
26 | new Map<Web, SoftwareMadeWeb> fiMap; |
27 | L<Web> webs = websMadeByProgram(#1010745, fiMap); |
28 | |
29 | // Get all references to global IDs |
30 | new HashSet<S> allReferences; |
31 | for (Web web : webs) |
32 | for (WebNode n : web_nodesAndRelations(web)) |
33 | for (S s : web_texts(n)) |
34 | allReferences.addAll(aggressivelyCollectPossibleGlobalIDs(s)); |
35 | |
36 | // Classify by content & find unreferenced duplicates |
37 | // key = triple + verified |
38 | HashMap<Pair<T3<S>, Bool>, GlobalID> websByContent = new HashMap; |
39 | new CompactHashSet<GlobalID> unreferencedDuplicates; |
40 | for (Web web : webs) { |
41 | Pair<T3<S>, Bool> key = pair(webToTriple(web), web.verified()); |
42 | if (websByContent.containsKey(key)) { |
43 | if (!allReferences.contains(web.globalID())) |
44 | unreferencedDuplicates.add(web.globalIDObj()); |
45 | } else |
46 | websByContent.put(key, web.globalIDObj()); |
47 | } |
48 | print("Have " + n(unreferencedDuplicates, "unreferenced duplicate web") + ", dropping."); |
49 | |
50 | // Drop unreferenced duplicates |
51 | webs = webs_dropWebsContainedInIDSet(webs, unreferencedDuplicates); |
52 | print("Now have " + nWeb(webs) + "."); |
53 | |
54 | // Drop invalid webs |
55 | new HashMap<S> invalidatedBy; |
56 | new Matches m; |
57 | for (Web web : webs) |
58 | if (web.verified()) |
59 | for (WebNode n : web_search_dollarX(webFromTriple("$X", "is", "invalid"), web)) |
60 | if (web_match("Web *", n, m) && isGlobalID($1)) |
61 | invalidatedBy.put($1, web.globalID()); |
62 | for (S id : cloneKeys(invalidatedBy)) { |
63 | S invalidator = invalidatedBy.get(id); |
64 | if (invalidatedBy.containsKey(invalidator)) |
65 | invalidatedBy.remove(id); |
66 | } |
67 | print("Removing " + n(invalidatedBy, "invalid web")); |
68 | saveTextFile(countTillNewFile(newFile(tripelizedDir, "invalidated"), ".gz"), structure(invalidatedBy)); |
69 | webs = webs_dropIDs(webs, keys(invalidatedBy)); |
70 | |
71 | int dropped = 0, dropped2 = 0; |
72 | new L<Web> websOut; |
73 | for (Web web : webs) { |
74 | if (web_nodesTooLong(web)) |
75 | dropped++; |
76 | else if (!web_tripelizable(web)) |
77 | dropped2++; |
78 | else |
79 | websOut.add(web); |
80 | } |
81 | webs = websOut; |
82 | if (dropped != 0) |
83 | print("Dropped " + nWeb(dropped) + " with too long nodes"); |
84 | if (dropped2 != 0) |
85 | print("Dropped " + n(dropped2, "non-triple")); |
86 | |
87 | Pair<L<Web>> pair = webs_tripelizable_nonTripelizable(webs); |
88 | |
89 | // Save triples |
90 | |
91 | file = new File(tripelizedDir, "triples.new.gz"); |
92 | PrintWriter out = newPrintWriter(newGZIPOutputStream(file)); |
93 | webs_toTripleFile(pair.a, out); |
94 | out.close(); |
95 | |
96 | // Save non-triples |
97 | |
98 | file2 = new File(tripelizedDir, "webs-made.new.gz"); |
99 | out = newPrintWriter(newGZIPOutputStream(file2)); |
100 | for (Web web : pair.b) |
101 | out.println(quote(struct(fiMap.get(web)))); |
102 | out.close(); |
103 | |
104 | print("Stored " + nWeb(pair.a) + " in " + f2s(file) + ", " + nWeb(pair.b) + " in " + f2s(file2)); |
105 | } |
106 | |
107 | svoid phase2 { |
108 | print("PHASE 2."); |
109 | |
110 | // Move data to backups |
111 | |
112 | File fBackups = programFile(#1010745, "backups"); |
113 | for (File f : earlierPartsOfLogFile(mainLog)) { |
114 | int n = 1; |
115 | File p; |
116 | while ((p = newFile(fBackups, dropSuffix(".gz", f.getName()) + ".part" + n + ".gz")).exists()) ++n; |
117 | renameFile_assertTrue(f, p); |
118 | } |
119 | |
120 | // Backup triples.gz |
121 | |
122 | int n = 1; |
123 | File p; |
124 | while ((p = newFile(fBackups, dropSuffix(".gz", fTriples.getName()) + ".part" + n + ".gz")).exists()) ++n; |
125 | renameFile_assertTrue(fTriples, p); |
126 | |
127 | // Rename main files |
128 | renameFile_assertTrue(file, fTriples); |
129 | renameFile_assertTrue(file2, programFile(#1010745, "webs-made.txt.part1.gz")); |
130 | |
131 | print("DONE."); |
132 | } |
download show line numbers debug dex old transpilations
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, ppjhyzlbdabe, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1011778 |
Snippet name: | Tripelize Smart Bot's DB (works, but needs too much memory) |
Eternal ID of this version: | #1011778/38 |
Text MD5: | dd7ba2e95730e1bba0797bebca159c80 |
Transpilation MD5: | efd6fe9942dfc9ebefa9b69e5bab534a |
Author: | stefan |
Category: | javax / a.i. |
Type: | JavaX source code (desktop) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2017-12-26 18:08:21 |
Source code size: | 4032 bytes / 132 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 553 / 1408 |
Version history: | 37 change(s) |
Referenced in: | [show references] |