LineCompCompressor [backup before eclipse collections] [1028503]

// This is using the implementation that seems to work fastest
// in reality, although it probably is O(n log n) due to the use
// of TreeMaps.
//
// The theoretically best implementation is O(n) due to replacing the
// TreeMaps with a linked list.
// I think it works, see LineComp_PairIndex_v2, but #1028231 is fuxx0red.

sclass LineCompCompressor {
  int safety = 0;
  
  replace Encodings with Map<S, L<Int>>.

  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
  }
  
  srecord CPair(int i1, int i2) > Chunk {
    CPair(IntPair p) { i1 = p.a; i2 = p.b; }
    
    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
  }
  
  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }

  bool verbose = false, verboseCompressionSteps = false;
  bool sortLines = true;
  bool verify = true;

  Map<S, LS> textIDToLines;
  LS allUniqueLines;
  new L<Chunk> chunks;
  int primChunks;
  Map<S, Int> lineIndex;
  Encodings finalEncodings;
  new Map<IntPair, Int> existingPairsIndex;
  
  // key = version ID, values = text
  *(SS texts) {
    textIDToLines = mapValuesToLinkedHashMap myToLines(texts);
  }
  
  LS myToLines(S s) { ret toLines_nOnly_reversible(s); }
  S myFromLines(LS l) { ret fromLines_rtrim(l); }
  
  run {
    LS allLines = concatLists(values(textIDToLines));
    if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
    allUniqueLines = uniquify(allLines);
    if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
    allLines = null; // allow me to forget
    if (sortLines) sortInPlace(allUniqueLines);
    if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));
    for (S line : allUniqueLines)
      chunks.add(new CPrim(line));
    primChunks = l(chunks);
    lineIndex = listIndex(collect s(chunks));
    
    // simple encoding (only direct line references)
    Encodings simpleEncodings = mapValues(textIDToLines,
      (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
    //printAndCheckEncodings(simpleEncodings);
    if (verboseCompressionSteps) print("Have simple encodings");
    
    finalEncodings = compressPairs_v3(simpleEncodings);
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
  }
  
  void saveAsTextFile(File f) {
    S out = exportEncoding(finalEncodings);
    saveTextFile(f, out);
    
    if (verify) checkDecompression(f, textIDToLines);
  }
  
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
    temp BufferedReader reader = bufferedUtf8Reader(file);
    LineCompReader lcr = new(reader);
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
    for (S version : keys(textIDToLines))
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
  }
  
  S asText() { ret exportEncoding(finalEncodings); }
  
  S exportEncoding(Encodings encodings) {
    new LS buf;
    buf.add("LINECOMP " + primChunks); // magic signature
    for (Chunk c : chunks) {
      if (c cast CPair)
        buf.add(c.i1 + " " + c.i2);
      else
        buf.add(((CPrim) c).s);
    }
    for (S id, L<Int> l : encodings)
      buf.add(id + "=" + joinWithSpace(l));
    ret lines_rtrim(buf);
  }
  
  Encodings compressPairs_v3(Encodings encodings) {
    new LineComp_PairIndex pairIndex;
    pairIndex.verbose = verboseCompressionSteps;
    pairIndex.init();
    for (S version, L<Int> l : encodings)
      pairIndex.newChain(version, l);

    IntPair toCompress;
    // Compress only most popular pair in one step
    while ping ((toCompress = pairIndex.mostPopularDuplicate()) != null) {
      if (safety > 0 && --safety <= 0) fail("safety");
      int count = pairIndex.getCount(toCompress);
      Int idx = makeCPair(toCompress);
      int dups = pairIndex.numberOfDuplicates();
      if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
      pairIndex.replacePair(toCompress.a, toCompress.b, idx);
    }
    
    // reconvert to normal list
    ret mapValues(ch -> ch.toList(), pairIndex.chains);
  }
  
  // TODO: check if pair is already there
  Int makeCPair(IntPair p) {
    Int idx = existingPairsIndex.get(p);
    if (idx == null)
      existingPairsIndex.put(p, idx = addAndReturnIndex(chunks, new CPair(p)));
    ret idx;
  }
  
  void printAndCheckEncodings(Encodings encodings) {
    for (S id, L<Int> encoded : encodings) {
      if (verbose) print(id + ": " + joinWithSpace(encoded));
      assertEquals(myFromLines(textIDToLines.get(id)), decode(encoded));
    }
  }
  
  S decode(L<Int> encoded) {
    ret myFromLines(lambdaMap chunkText(encoded));
  }
  
  S chunkText(int idx) {
    ret chunks.get(idx).text(chunks);
  }
}

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

Snippet ID:	#1028503
Snippet name:	LineCompCompressor [backup before eclipse collections]
Eternal ID of this version:	#1028503/1
Text MD5:	b08de58bb27524ad814e02f37aed6908
Transpilation MD5:	7668b40e654b1fc3ec04a78ba30a0396
Author:	stefan
Category:	javax
Type:	JavaX fragment (include)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2020-06-23 11:07:45
Source code size:	5036 bytes / 149 lines
Pitched / IR pitched:	No / No
Views / Downloads:	243 / 333
Referenced in:

< > BotCompany Repo | #1028503 // LineCompCompressor [backup before eclipse collections]

JavaX fragment (include) [tags: use-pretranspiled]

Author comment