Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

132
LINES

< > BotCompany Repo | #1011778 // Tripelize Smart Bot's DB (works, but needs too much memory)

JavaX source code (desktop) [tags: use-pretranspiled] - run with: x30.jar

Download Jar. Libraryless. Click here for Pure Java version (7925L/54K).

!7

// Files shared between the two phases; all four are assigned in phase1()
// and read in phase2():
//   mainLog  - "webs-made.txt" of program #1010745
//   fTriples - "triples.gz" of program #1010745
//   file     - "triples.new.gz" written by phase1 in the "tripelized" dir
//   file2    - "webs-made.new.gz" written by phase1 in the "tripelized" dir
static File mainLog, fTriples, file, file2;

// Program entry point: runs phase 1, announces phase 2, waits five
// seconds, then runs phase 2.
p {
  // Heap restart is left disabled.
  //if (isMain()) restartWith2GBHeap();

  // Build the new files first; nothing existing is renamed until phase 2.
  phase1();

  // Announce and pause before the renaming phase starts.
  print("PHASE 2 in 5 SECONDS");
  sleepSeconds(5);

  phase2();
}

// Phase 1: loads the webs made by program #1010745, filters them
// (unreferenced duplicates, invalidated webs, webs with too long nodes,
// non-tripelizable webs) and writes the result to two new .gz files in the
// "tripelized" directory. Assigns the static fields mainLog, fTriples, file
// and file2, which phase2() reads afterwards.
//
// Throws a RuntimeException if writing either output file reported an I/O
// error, so that phase2() never promotes a truncated file.
svoid phase1 {
  // First, rotate.
  
  mainLog = programFile(#1010745, "webs-made.txt");
  fTriples = programFile(#1010745, "triples.gz");
  rotateLogFile(mainLog);
  
  File tripelizedDir = programDir(#1010745, "tripelized");
  
  // Load webs (fiMap is filled by websMadeByProgram and used further down
  // to look up the SoftwareMadeWeb belonging to each web)

  new Map<Web, SoftwareMadeWeb> fiMap;
  L<Web> webs = websMadeByProgram(#1010745, fiMap);
  
  // Get all references to global IDs
  new HashSet<S> allReferences;
  for (Web web : webs)
    for (WebNode n : web_nodesAndRelations(web))
      for (S s : web_texts(n))
        allReferences.addAll(aggressivelyCollectPossibleGlobalIDs(s));
        
  // Classify by content & find unreferenced duplicates
  // key = triple + verified
  // The first web seen for a key is kept; any later web with the same key
  // is marked as a duplicate unless its global ID is in allReferences.
  HashMap<Pair<T3<S>, Bool>, GlobalID> websByContent = new HashMap;
  new CompactHashSet<GlobalID> unreferencedDuplicates;
  for (Web web : webs) {
    Pair<T3<S>, Bool> key = pair(webToTriple(web), web.verified());
    if (websByContent.containsKey(key)) {
      if (!allReferences.contains(web.globalID()))
        unreferencedDuplicates.add(web.globalIDObj());
    } else
      websByContent.put(key, web.globalIDObj());
  }
  print("Have " + n(unreferencedDuplicates, "unreferenced duplicate web") + ", dropping.");
  
  // Drop unreferenced duplicates
  webs = webs_dropWebsContainedInIDSet(webs, unreferencedDuplicates);
  print("Now have " + nWeb(webs) + ".");
    
  // Drop invalid webs
  // invalidatedBy: ID of the invalidated web -> ID of the verified web
  // that contains the "$X is invalid" statement about it.
  new HashMap<S> invalidatedBy;
  new Matches m;
  for (Web web : webs)
    if (web.verified())
      for (WebNode n : web_search_dollarX(webFromTriple("$X", "is", "invalid"), web))
        if (web_match("Web *", n, m) && isGlobalID($1))
          invalidatedBy.put($1, web.globalID());
  // An invalidation is discarded when its invalidating web is itself
  // still listed as invalidated at the moment the entry is visited.
  // NOTE(review): for chains of invalidations the outcome depends on the
  // iteration order of cloneKeys(invalidatedBy) - confirm this is intended.
  for (S id : cloneKeys(invalidatedBy)) {
    S invalidator = invalidatedBy.get(id);
    if (invalidatedBy.containsKey(invalidator))
      invalidatedBy.remove(id);
  }
  print("Removing " + n(invalidatedBy, "invalid web"));
  saveTextFile(countTillNewFile(newFile(tripelizedDir, "invalidated"), ".gz"), structure(invalidatedBy));
  webs = webs_dropIDs(webs, keys(invalidatedBy));
  
  // Drop webs with too long nodes and webs that are not tripelizable,
  // counting each category separately for the log output.
  int dropped = 0, dropped2 = 0;
  new L<Web> websOut;
  for (Web web : webs) {
    if (web_nodesTooLong(web))
      dropped++;
    else if (!web_tripelizable(web))
      dropped2++;
    else
      websOut.add(web);
  }
  webs = websOut;
  if (dropped != 0)
    print("Dropped " + nWeb(dropped) + " with too long nodes");
  if (dropped2 != 0)
    print("Dropped " + n(dropped2, "non-triple"));
  
  // NOTE(review): webs failing web_tripelizable were already dropped above,
  // so pair.b is presumably always empty at this point - confirm.
  Pair<L<Web>> pair = webs_tripelizable_nonTripelizable(webs);

  // Save triples
  
  file = new File(tripelizedDir, "triples.new.gz");
  PrintWriter triplesWriter = newPrintWriter(newGZIPOutputStream(file));
  try {
    webs_toTripleFile(pair.a, triplesWriter);
  } finally {
    // Always release the stream, even if writing threw.
    triplesWriter.close();
  }
  // PrintWriter swallows IOExceptions; checkError() reports them (it still
  // returns the error flag after close()).
  if (triplesWriter.checkError())
    throw new RuntimeException("Error writing " + f2s(file));

  // Save non-triples
  
  file2 = new File(tripelizedDir, "webs-made.new.gz");
  PrintWriter nonTriplesWriter = newPrintWriter(newGZIPOutputStream(file2));
  try {
    for (Web web : pair.b)
      nonTriplesWriter.println(quote(struct(fiMap.get(web))));
  } finally {
    nonTriplesWriter.close();
  }
  if (nonTriplesWriter.checkError())
    throw new RuntimeException("Error writing " + f2s(file2));
  
  print("Stored " + nWeb(pair.a) + " in " + f2s(file) + ", " + nWeb(pair.b) + " in " + f2s(file2));
}
 
// Phase 2: moves the earlier log parts and the current triples.gz into the
// "backups" location of program #1010745, then renames the files written
// by phase1() (static fields file, file2) to their final names.
// Relies on mainLog, fTriples, file and file2 having been set by phase1().
svoid phase2 {
  print("PHASE 2.");

  File backupDir = programFile(#1010745, "backups");

  // Move data to backups
  for (File logPart : earlierPartsOfLogFile(mainLog))
    renameFile_assertTrue(logPart, phase2_freeBackupSlot(backupDir, logPart));

  // Backup triples.gz
  renameFile_assertTrue(fTriples, phase2_freeBackupSlot(backupDir, fTriples));

  // Rename main files
  renameFile_assertTrue(file, fTriples);
  renameFile_assertTrue(file2, programFile(#1010745, "webs-made.txt.part1.gz"));

  print("DONE.");
}

// Returns the first file "<name without .gz>.part<N>.gz" (N = 1, 2, ...)
// inside backupDir that does not exist yet, where <name> is the file name
// of original.
static File phase2_freeBackupSlot(File backupDir, File original) {
  int idx = 1;
  File candidate = newFile(backupDir, dropSuffix(".gz", original.getName()) + ".part" + idx + ".gz");
  while (candidate.exists()) {
    ++idx;
    candidate = newFile(backupDir, dropSuffix(".gz", original.getName()) + ".part" + idx + ".gz");
  }
  return candidate;
}

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, ppjhyzlbdabe, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1011778
Snippet name: Tripelize Smart Bot's DB (works, but needs too much memory)
Eternal ID of this version: #1011778/38
Text MD5: dd7ba2e95730e1bba0797bebca159c80
Transpilation MD5: efd6fe9942dfc9ebefa9b69e5bab534a
Author: stefan
Category: javax / a.i.
Type: JavaX source code (desktop)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2017-12-26 18:08:21
Source code size: 4032 bytes / 132 lines
Pitched / IR pitched: No / No
Views / Downloads: 554 / 1408
Version history: 37 change(s)
Referenced in: #1012378 - Phase 1 Of Tripelization
#1013307 - ai_compactLiveDB - just store all triples from memory, no attempts at cleaning up the DB