Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

132
LINES

< > BotCompany Repo | #1011778 // Tripelize Smart Bot's DB (works, but needs too much memory)

JavaX source code (desktop) [tags: use-pretranspiled] - run with: x30.jar

Download Jar. Libraryless. Click here for Pure Java version (7925L/54K).

1  
!7
2  
3  
// Files shared between the two phases. All four are assigned in phase1.
// mainLog  - "webs-made.txt" (rotated in phase1; its earlier parts are moved to backups in phase2)
// fTriples - "triples.gz" (backed up in phase2, then replaced by `file`)
// file     - freshly written "triples.new.gz"
// file2    - freshly written "webs-made.new.gz"
static File mainLog, fTriples, file, file2;
4  
5  
// Entry point: runs the two phases back to back with a short pause in between.
p {
  // Heap restart is currently switched off:
  //if (isMain()) restartWith2GBHeap();

  // Phase 1 only writes new files next to the existing data.
  phase1();

  // Announce and wait before phase 2 starts renaming the existing files.
  print("PHASE 2 in 5 SECONDS");
  sleepSeconds(5);

  phase2();
}
14  
15  
svoid phase1 {
16  
  // First, rotate.
17  
  
18  
  mainLog = programFile(#1010745, "webs-made.txt");
19  
  fTriples = programFile(#1010745, "triples.gz");
20  
  rotateLogFile(mainLog);
21  
  
22  
  File tripelizedDir = programDir(#1010745, "tripelized");
23  
  
24  
  // Load webs
25  
26  
  new Map<Web, SoftwareMadeWeb> fiMap;
27  
  L<Web> webs = websMadeByProgram(#1010745, fiMap);
28  
  
29  
  // Get all references to global IDs
30  
  new HashSet<S> allReferences;
31  
  for (Web web : webs)
32  
    for (WebNode n : web_nodesAndRelations(web))
33  
      for (S s : web_texts(n))
34  
        allReferences.addAll(aggressivelyCollectPossibleGlobalIDs(s));
35  
        
36  
  // Classify by content & find unreferenced duplicates
37  
  // key = triple + verified
38  
  HashMap<Pair<T3<S>, Bool>, GlobalID> websByContent = new HashMap;
39  
  new CompactHashSet<GlobalID> unreferencedDuplicates;
40  
  for (Web web : webs) {
41  
    Pair<T3<S>, Bool> key = pair(webToTriple(web), web.verified());
42  
    if (websByContent.containsKey(key)) {
43  
      if (!allReferences.contains(web.globalID()))
44  
        unreferencedDuplicates.add(web.globalIDObj());
45  
    } else
46  
      websByContent.put(key, web.globalIDObj());
47  
  }
48  
  print("Have " + n(unreferencedDuplicates, "unreferenced duplicate web") + ", dropping.");
49  
  
50  
  // Drop unreferenced duplicates
51  
  webs = webs_dropWebsContainedInIDSet(webs, unreferencedDuplicates);
52  
  print("Now have " + nWeb(webs) + ".");
53  
    
54  
  // Drop invalid webs
55  
  new HashMap<S> invalidatedBy;
56  
  new Matches m;
57  
  for (Web web : webs)
58  
    if (web.verified())
59  
      for (WebNode n : web_search_dollarX(webFromTriple("$X", "is", "invalid"), web))
60  
        if (web_match("Web *", n, m) && isGlobalID($1))
61  
          invalidatedBy.put($1, web.globalID());
62  
  for (S id : cloneKeys(invalidatedBy)) {
63  
    S invalidator = invalidatedBy.get(id);
64  
    if (invalidatedBy.containsKey(invalidator))
65  
      invalidatedBy.remove(id);
66  
  }
67  
  print("Removing " + n(invalidatedBy, "invalid web"));
68  
  saveTextFile(countTillNewFile(newFile(tripelizedDir, "invalidated"), ".gz"), structure(invalidatedBy));
69  
  webs = webs_dropIDs(webs, keys(invalidatedBy));
70  
  
71  
  int dropped = 0, dropped2 = 0;
72  
  new L<Web> websOut;
73  
  for (Web web : webs) {
74  
  if (web_nodesTooLong(web))
75  
      dropped++;
76  
    else if (!web_tripelizable(web))
77  
      dropped2++;
78  
    else
79  
      websOut.add(web);
80  
  }
81  
  webs = websOut;
82  
  if (dropped != 0)
83  
    print("Dropped " + nWeb(dropped) + " with too long nodes");
84  
  if (dropped2 != 0)
85  
    print("Dropped " + n(dropped2, "non-triple"));
86  
  
87  
  Pair<L<Web>> pair = webs_tripelizable_nonTripelizable(webs);
88  
89  
  // Save triples
90  
  
91  
  file = new File(tripelizedDir, "triples.new.gz");
92  
  PrintWriter out = newPrintWriter(newGZIPOutputStream(file));
93  
  webs_toTripleFile(pair.a, out);
94  
  out.close();
95  
96  
  // Save non-triples
97  
  
98  
  file2 = new File(tripelizedDir, "webs-made.new.gz");
99  
  out = newPrintWriter(newGZIPOutputStream(file2));
100  
  for (Web web : pair.b)
101  
    out.println(quote(struct(fiMap.get(web))));
102  
  out.close();
103  
  
104  
  print("Stored " + nWeb(pair.a) + " in " + f2s(file) + ", " + nWeb(pair.b) + " in " + f2s(file2));
105  
}
106  
 
107  
svoid phase2 {
108  
  print("PHASE 2.");
109  
  
110  
  // Move data to backups
111  
  
112  
  File fBackups = programFile(#1010745, "backups");
113  
  for (File f : earlierPartsOfLogFile(mainLog)) {
114  
    int n = 1;
115  
    File p;
116  
    while ((p = newFile(fBackups, dropSuffix(".gz", f.getName()) + ".part" + n + ".gz")).exists()) ++n;
117  
    renameFile_assertTrue(f, p);
118  
  }
119  
  
120  
  // Backup triples.gz
121  
  
122  
  int n = 1;
123  
  File p;
124  
  while ((p = newFile(fBackups, dropSuffix(".gz", fTriples.getName()) + ".part" + n + ".gz")).exists()) ++n;
125  
  renameFile_assertTrue(fTriples, p);
126  
  
127  
  // Rename main files
128  
  renameFile_assertTrue(file, fTriples);
129  
  renameFile_assertTrue(file2, programFile(#1010745, "webs-made.txt.part1.gz"));
130  
  
131  
  print("DONE.");
132  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, ppjhyzlbdabe, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1011778
Snippet name: Tripelize Smart Bot's DB (works, but needs too much memory)
Eternal ID of this version: #1011778/38
Text MD5: dd7ba2e95730e1bba0797bebca159c80
Transpilation MD5: efd6fe9942dfc9ebefa9b69e5bab534a
Author: stefan
Category: javax / a.i.
Type: JavaX source code (desktop)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2017-12-26 18:08:21
Source code size: 4032 bytes / 132 lines
Pitched / IR pitched: No / No
Views / Downloads: 432 / 1132
Version history: 37 change(s)
Referenced in: [show references]