Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

127
LINES

< > BotCompany Repo | #1028204 // LineCompCompressor v1.0 (working, with slow pair making, before IntPair)

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (6651L/42K).

// Line-based compressor for a set of texts ("versions") keyed by ID.
// Every unique line becomes a primitive chunk (CPrim). Each text is first
// encoded as a list of chunk indices; then compressPairs() is applied
// repeatedly (via repeatUntilSame), each step turning the pair of adjacent
// indices picked by msMostPopularDuplicate into a new CPair chunk.
// Typical use: construct with the texts, call run(), then saveAsTextFile().
sclass LineCompCompressor {
  // Shorthand used below: text ID -> list of chunk indices
  replace Encodings with Map<S, L<Int>>.
  
  // An entry of the chunk table. text() renders the entry; the whole
  // table is passed in because pair chunks refer to other entries by index.
  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
  }
  
  // Chunk composed of two other chunks, given as indices into the table
  srecord CPair(int i1, int i2) > Chunk {
    CPair(Pair<Int> p) { i1 = p.a; i2 = p.b; }
    
    // Renders both referenced chunks and combines them with linesLL_rtrim
    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
  }
  
  // Leaf chunk holding one literal line
  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }

  bool verbose = false;   // print encodings and progress messages
  bool sortLines = true;  // sort the unique lines before indexing them
  bool verify = true;     // check encodings after run() and the file after saving
  
  Map<S, LS> textIDToLines;  // input: text ID -> its lines (set by constructor)
  LS allUniqueLines;         // every distinct line over all texts (set by run)
  new L<Chunk> chunks;       // chunk table: primitives first, pairs appended after
  int primChunks;            // number of primitive chunks at the start of the table
  Map<S, Int> lineIndex;     // line -> index of its primitive chunk
  new Map<PairS, Int> linePairIndex; // not referenced anywhere in this class
  Encodings finalEncodings;  // result of run(): text ID -> chunk indices
  
  // key = version ID, values = text
  // Each text is converted with lines(); the result is kept in a linked hash map.
  *(SS texts) {
    textIDToLines = mapValuesToLinkedHashMap lines(texts);
  }
  
  // Builds the chunk table and computes finalEncodings.
  run {
    // Start from an empty table so that calling run() again does not stack
    // a second set of chunks on top of the first one (primChunks and
    // lineIndex below are computed from the whole table).
    chunks.clear();
    
    allUniqueLines = uniquify(concatLists(values(textIDToLines)));
    if (sortLines) sortInPlace(allUniqueLines);
    for (S line : allUniqueLines)
      chunks.add(new CPrim(line));
    primChunks = l(chunks);
    lineIndex = listIndex(collect s(chunks));
    
    // simple encoding (only direct line references)
    Encodings simpleEncodings = mapValues(textIDToLines,
      (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
    //printAndCheckEncodings(simpleEncodings);
    
    // compressPairs returns its argument unchanged once it finds nothing to
    // compress; repeatUntilSame presumably stops at that point
    finalEncodings = repeatUntilSame compressPairs(simpleEncodings);
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
  }
  
  // Writes the exported form of finalEncodings to f and, if verify is set,
  // reads the file back to check it against the original lines.
  void saveAsTextFile(File f) {
    S out = exportEncoding(finalEncodings);
    saveTextFile(f, out);
    
    if (verify) checkDecompression(f, textIDToLines);
  }
  
  // Reads file through LineCompReader and asserts that it lists the same
  // versions in the same order and that each version's text equals
  // lines_rtrim of the expected lines.
  // Note: the parameter shadows the field of the same name.
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
    temp BufferedReader reader = bufferedUtf8Reader(file);
    LineCompReader lcr = new(reader);
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
    for (S version : keys(textIDToLines))
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
  }
  
  // Produces the text form of the chunk table plus the given encodings:
  //   line 1:        "LINECOMP " + number of primitive chunks
  //   per chunk:     the literal line (CPrim) or "i1 i2" (CPair), in table order
  //   per encoding:  id + "=" + space-separated chunk indices
  S exportEncoding(Encodings encodings) {
    new LS buf;
    buf.add("LINECOMP " + primChunks); // magic signature
    for (Chunk c : chunks) {
      if (c cast CPair)
        buf.add(c.i1 + " " + c.i2);
      else
        buf.add(((CPrim) c).s);
    }
    for (S id, L<Int> l : encodings)
      buf.add(id + "=" + joinWithSpace(l));
    ret lines_rtrim(buf);
  }
  
  // One compression step: counts adjacent index pairs over all encodings,
  // asks msMostPopularDuplicate for a pair, adds a CPair chunk for it and
  // replaces that pair's occurrences in every encoding with the new index.
  // Returns the encodings object unchanged when no pair was selected.
  Encodings compressPairs(Encodings encodings) {
    new MultiSet<Pair<Int>> pairCounts;
    
    for (L<Int> l : values(encodings)) {
      Pair<Int> lastPair = null;
      for (Pair<Int> pair : overlappingPairs(l)) {
        // A pair equal to the one directly before it is not counted again,
        // so a run like "x x x" contributes the pair (x, x) only once.
        if (neq(pair, lastPair)) {
          lastPair = pair;
          pairCounts.add(pair);
        }
      }
    }
    
    //print("Pair counts: " + pairCounts);
    Pair<Int> toCompress = msMostPopularDuplicate(pairCounts);

    // Compress only most popular pair
    if (toCompress == null) ret encodings; // Nothing to do
    int idx = makeCPair(toCompress);
    
    // Progress output only in verbose mode, like the other prints in this
    // class; this also skips the msNumberOfDuplicates call when not needed.
    if (verbose)
      print("Made pair: " + toCompress + " -> " + idx + ", " + (msNumberOfDuplicates(pairCounts)-1) + " remaining");
    
    ret mapValues(encodings, (IF1<L<Int>>) encoded ->
      replaceSublist(encoded, pairToList(toCompress), ll(idx)));
  }
  
  // Appends a CPair chunk for p to the table and returns its index
  int makeCPair(Pair<Int> p) {
    ret addAndReturnIndex(chunks, new CPair(p));
  }
  
  // Asserts that every encoding decodes back to the original lines of its
  // text; in verbose mode also prints each encoding.
  void printAndCheckEncodings(Encodings encodings) {
    for (S id, L<Int> encoded : encodings) {
      if (verbose) print(id + ": " + joinWithSpace(encoded));
      assertEquals(lines(textIDToLines.get(id)), decode(encoded));
    }
  }
  
  // Renders a list of chunk indices as text (chunk texts joined via lines())
  S decode(L<Int> encoded) {
    ret lines(lambdaMap chunkText(encoded));
  }
  
  // Text of the chunk at index idx
  S chunkText(int idx) {
    ret chunks.get(idx).text(chunks);
  }
}

Author comment

Began life as a copy of #1028192

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028204
Snippet name: LineCompCompressor v1.0 (working, with slow pair making, before IntPair)
Eternal ID of this version: #1028204/1
Text MD5: 58ff7657b5ae50ec8a53ae5a030dc36d
Transpilation MD5: 0455ff992ae44ec739360d5a43d43d84
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-05-26 13:15:01
Source code size: 3894 bytes / 127 lines
Pitched / IR pitched: No / No
Views / Downloads: 189 / 274
Referenced in: [show references]