BotCompany Repo | #1028186 // LineCompCompressor, faster version [LIVE]

JavaX fragment (include) [tags: use-pretranspiled]

Uses 11335K of libraries. Pure Java version: 9174L / 60K.

// This uses the implementation that seems to work fastest
// in practice, although it is probably O(n log n) due to the use
// of TreeMaps.
//
// The theoretically best implementation is O(n) (replacing the
// TreeMap with a linked list).
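//
// The scheme is essentially Re-Pair / byte-pair encoding applied to whole
// lines (or to single characters in byteMode): the most frequent adjacent
// pair of symbols across all version chains is repeatedly replaced by a new
// symbol (a CPair chunk) until no pair occurs more than once - or, with
// fullCompression, until every version is a single symbol.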

!include once #1027304 // Eclipse Collections

import org.eclipse.collections.impl.map.mutable.primitive.*;

final sclass LineCompCompressor {
  int safety = 0;
  
  replace Encodings with Map<S, L<Int>>.

  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
    IntPair getPair() { null; }
  }
  
  srecord CPair(int i1, int i2) > Chunk {
    CPair(IntPair p) { i1 = p.a; i2 = p.b; }
    
    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
    
    IntPair getPair() {
      ret IntPair(i1, i2);
    }
  }
  
  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }
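
  // Chunks form a small grammar: a CPrim holds one literal line (or one
  // hex-encoded character in byteMode), while a CPair references two earlier
  // chunks; expanding a chunk via text() reproduces the original lines.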

  bool verbose = false, verboseCompressionSteps = false;
  bool verboseStats;
  bool sortLines = true;
  bool verify = true;
  bool byteMode; // compress bytes instead of lines
  bool caseInsensitive; // for line mode
  bool toUpper; // for byte mode: convert all characters to upper case
  bool recordPairLengths; // for "balanced" compression
  bool balancing; // try to lower tree heights by compressing "shorter" pairs first
  bool fullCompression; // keep compressing until every file is a single symbol

  new LineComp_PairIndex pairIndex;
  Map<S, LS> textIDToLines;
  LS allUniqueLines;
  new L<Chunk> chunks;
  int primChunks;
  Map<S, Int> lineIndex;
  Encodings simpleEncodings; // temporary
  Encodings finalEncodings;
  new LongIntHashMap existingPairsIndex; // formerly Map<IntPair, Int>
  int round, printStatsEvery = /*10000*/0, printStatsInFirstRounds = 10;
  int printStatsEveryNSeconds = /*10*/60; // 60 for slower computers
  
  // Initializing fullIndex (= new Map) will make things slow and use a lot of memory.
  // I thought there might be a case where it improves the compression
  // - but maybe there isn't?
  Map<S, Int> fullIndex; 
  SimpleIntList pairLengths;

  *() {}
  *(SS texts) { loadTexts(texts); }
  
  // keys = version IDs, values = texts
  void loadTexts(SS texts) {
    textIDToLines = mapValuesToLinkedHashMap myToLines(texts);
  }
  
  LS myToLines(S s) {
    LS l = byteMode
      ? lambdaMap charToHex(characters(toUpper ? upper(s) : s))
      : toLines_nOnly_reversible(s);
    ret l;
  }
      
  S myFromLines(LS l) {
    ret byteMode
      ? join(mapI hexToChar(l)) // TODO: optimize even more
      : fromLines_rtrim(l);
  }
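
  // In byteMode, every input character is stored as its hex code and treated
  // as one "line" (presumably 'A' -> "41"), so the same line-based machinery
  // can compress arbitrary character/byte streams; myToLines and myFromLines
  // convert between the two representations.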
  
  run {
    if (balancing) {
      set recordPairLengths;
      pairIndex.pairComparatorForBalancing = (p1, p2) -> {
        int l1 = pairLength(p1), l2 = pairLength(p2);
        if (l1 != l2) ret l1-l2;
        // return arbitrary deterministic order, but not 0
        if (p1.a != p2.a) ret p1.a-p2.a;
        ret p1.b-p2.b;
      };
    }
      
    if (recordPairLengths) pairLengths = new SimpleIntList;
    LS allLines = concatLists(values(textIDToLines));
    if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
    allUniqueLines = uniquify(allLines);
    if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
    allLines = null; // allow me to forget
    if (sortLines) sortInPlace(allUniqueLines);
    if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));
    for (S line : allUniqueLines)
      chunks.add(new CPrim(line));
    primChunks = l(chunks);
    LS _lines = collect s(chunks);
    lineIndex = caseInsensitive ? ciListIndex(_lines) : listIndex(_lines);
    _lines = null;
    
    // simple encoding (only direct line references)
    simpleEncodings = mapValues(textIDToLines,
      (IF1<LS, L<Int>>) (lines -> wrapIntArrayAsImmutableList(mapToIntArray(lines, line -> lineIndex.get(line)))));
    //printAndCheckEncodings(simpleEncodings);
    if (verboseCompressionSteps) print("Have simple encodings");
    if (!verify) textIDToLines = null;
    
    finalEncodings = compressPairs();
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
  }
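
  // run() pipeline: collect and uniquify all lines across all versions,
  // encode each version as a list of line indices (simpleEncodings), then
  // repeatedly merge the most frequent pairs into new chunks (compressPairs).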
  
  void saveAsTextFile(File f) {
    S out = exportEncoding(finalEncodings);
    saveTextFile(f, out);
    
    if (verify) checkDecompression(f, textIDToLines);
  }
  
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
    temp BufferedReader reader = bufferedUtf8Reader(file);
    LineCompReader lcr = new(reader);
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
    for (S version : keys(textIDToLines))
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
  }
  
  S asText() { ret exportEncoding(finalEncodings); }
  
  S exportEncoding(Encodings encodings) {
    new LS buf;
    buf.add((byteMode ? "BYTECOMP " : "LINECOMP ") + primChunks); // magic signature
    for (Chunk c : chunks) {
      if (c cast CPair)
        buf.add(c.i1 + " " + c.i2);
      else
        buf.add(((CPrim) c).s);
    }
    for (S id, L<Int> l : encodings)
      buf.add(id + "=" + joinWithSpace(l));
    ret lines_rtrim(buf);
  }
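
  // exportEncoding produces a plain text file like this (hypothetical
  // example with 2 literal lines, one pair and one version):
  //
  //   LINECOMP 2
  //   foo
  //   bar
  //   0 1
  //   v1=2
  //
  // i.e. the magic word plus the number of literal chunks, then one line per
  // chunk (literals verbatim, pairs as "i1 i2"), then one "id=indices" line
  // per version.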
  
  Encodings compressPairs() {
    pairIndex.verbose = verboseCompressionSteps || verbose;
    pairIndex.init();
    for ping (S version, L<Int> l : simpleEncodings)
      pairIndex.newChain(version, l);
    pairIndex.haveChains();
    simpleEncodings = null; // forget

    ret compressChains(keys(pairIndex.chains));
  }
  
  Encodings compressChains(Cl<S> files) {
    NotTooOften nto = new(printStatsEveryNSeconds*1000);
    IntPair toCompress;
    // Compress only the most popular pair in each step
    while ping ((toCompress = fullCompression
      ? pairIndex.mostPopularPair()
      : pairIndex.mostPopularDuplicate()) != null) {
      if (verbose) {
        print("Chains: " + mapValuesToList(pairIndex.chains, c -> c.toList()));
        //print("Highest bucket (" + pairIndex.highestBucketNr() + "): " + pairIndex.mostPopularDuplicates());
        print("Buckets: " + pairIndex.byCount);
      }

      if (safety > 0 && --safety <= 0) fail("safety");
      ++round;
      if (verboseStats)
        if (printStatsEvery > 0 && (round % printStatsEvery) == 0
          || round <= printStatsInFirstRounds
          || nto!)
          printStats();
      int count = pairIndex.getCount(toCompress);
      Int idx = makeCPair(toCompress);
      int dups = pairIndex.numberOfDuplicates();
      if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
      pairIndex.replacePair(toCompress.a, toCompress.b, idx);
    }
    if (verboseStats) printStats();
    
    // convert the chains back to normal lists
    ret mapToValues(f -> pairIndex.chains.get(f).toList(), files);
  }
  
  // TODO: check if pair is already there
  Int makeCPair(IntPair p) {
    long l = intPairToLong(p);
    S text = null;
    if (fullIndex != null) {
      text = itemToString(p.a) + itemToString(p.b);
      Int idx = fullIndex.get(text);
      if (idx != null) ret idx;
    }
    int idx = existingPairsIndex.getIfAbsent(l, -1);
    if (idx < 0) {
      idx = addAndReturnIndex(chunks, new CPair(p));
      if (recordPairLengths) listPut(pairLengths, idx, itemLength(p.a) + itemLength(p.b), 1);
      existingPairsIndex.put(l, idx);
      if (fullIndex != null)
        fullIndex.put(text, idx);
    }
    ret idx;
  }
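
  // makeCPair interns pairs: intPairToLong packs (i1, i2) into a single long
  // so existingPairsIndex (an Eclipse Collections LongIntHashMap) can map
  // each pair to its chunk index without boxing; the optional fullIndex
  // additionally dedupes pairs by their fully expanded text.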
  
  void printAndCheckEncodings(Encodings encodings) {
    for (S id, L<Int> encoded : encodings) {
      if (verbose) print(id + ": " + joinWithSpace(encoded));
      S x = myFromLines(textIDToLines.get(id)), y = decode(encoded);
      if (caseInsensitive)
        assertEqualsIC(x, y);
      else
        assertEquals_quote(x, y);
    }
  }
  
  S decode(L<Int> encoded) {
    LS l = lambdaMap chunkText(encoded);
    if (byteMode) l = lines(lines(l));
    ret myFromLines(l);
  }
  
  // for non-byte mode
  S chunkText(int idx) {
    ret chunks.get(idx).text(chunks);
  }
  
  void printStats() {
    print("Round: " + round);
    pairIndex.printStats();
  }
  
  // byteMode only
  S itemToString(int idx) {
    new StringBuilder buf;
    itemToString_step(buf, idx);
    ret str(buf);
  }
  
  void itemToString_step(StringBuilder buf, int idx) {
    Chunk c = chunks.get(idx);
    if (c cast CPair) {
      itemToString_step(buf, c.i1);
      itemToString_step(buf, c.i2);
    } else {
      S s = ((CPrim) c).s;
      buf.append(byteMode ? charFromHex(s) : s + "\n");
    }
  }
  
  int nLiterals() { ret primChunks; }
  int nItems() { ret l(chunks); }
  
  IntPair getPair(int idx) {
    ret chunks.get(idx).getPair();
  }
  
  int itemLength(int idx) {
    ret idx >= l(pairLengths) ? 1 : pairLengths.get(idx);
  }
  
  int pairLength(IntPair p) {
    ret p == null ? 0 : itemLength(p.a) + itemLength(p.b);
  }
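
  // itemLength/pairLength give the number of literal chunks an item expands
  // to (literals count as 1); when balancing is on, this is the sort key
  // that makes shorter pairs get compressed first.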
  
  LS literals() { ret lmap itemToString(countIterator(primChunks)); }
}
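
A minimal usage sketch (hypothetical, not part of the snippet; assumes the
standard JavaX helpers litmap and print):

  SS texts = litmap(
    "v1", "line A\nline B\nline C",
    "v2", "line A\nline B\nline B\nline C");
  LineCompCompressor c = new LineCompCompressor(texts);
  c.run();
  print(c.asText()); // compressed multi-version text ("LINECOMP ...")
  c.saveAsTextFile(new File("versions.linecomp")); // round-trip is checked since verify is on by default

Decompression of the saved file goes through LineCompReader, as in
checkDecompression above.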

Author comment

Began life as a copy of #1028167


Travelled to 8 computer(s): bhatertpkbcr, ekrmjmnbrukm, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv


Snippet ID: #1028186
Snippet name: LineCompCompressor, faster version [LIVE]
Eternal ID of this version: #1028186/134
Text MD5: e971d8644645f8667e3bfb0b5f47911d
Transpilation MD5: 101663b9e3006d0041f4bfa5103fde92
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-12-13 19:22:27
Source code size: 9242 bytes / 282 lines
Pitched / IR pitched: No / No
Views / Downloads: 650 / 1594
Version history: 133 change(s)
Referenced in: #1028192 - LineCompCompressor v1 (with slow pair making, old)
#1028217 - Line-Based Multi-Version Compressor Spike, faster version [dev.]
#1028234 - LineCompCompressor, faster version backup before IntPair
#1028252 - LineCompCompressor 0.3 [dev.]
#1028503 - LineCompCompressor [backup before eclipse collections]
#1029107 - LCStreamingCompressor [seems to work]
#1034167 - Standard Classes + Interfaces (LIVE, continuation of #1003674)