// This is using the implementation that seems to work fastest // in reality, although it probably is O(n log n) due to the use // of TreeMaps. // // The theoretically best implementation is O(n) (replacing the // TreeMap with a linked list). !include once #1027304 // Eclipse Collections import org.eclipse.collections.impl.map.mutable.primitive.*; final sclass LineCompCompressor { int safety = 0; replace Encodings with Map>. abstract sclass Chunk { abstract S text(L chunks); IntPair getPair() { null; } } srecord CPair(int i1, int i2) > Chunk { CPair(IntPair p) { i1 = p.a; i2 = p.b; } S text(L chunks) { ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks)); } IntPair getPair() { ret IntPair(i1, i2); } } srecord CPrim(S s) > Chunk { S text(L chunks) { ret s; } } bool verbose = false, verboseCompressionSteps = false; bool verboseStats; bool sortLines = true; bool verify = true; bool byteMode; // compress bytes instead of lines bool caseInsensitive; // for line mode bool toUpper; // for byte mode: convert all characters to upper case bool recordPairLengths; // for "balanced" compression bool balancing; // try to lower tree heights by compressing "shorter" pairs first bool fullCompression; // keep compressing until every file is a single symbol new LineComp_PairIndex pairIndex; Map textIDToLines; LS allUniqueLines; new L chunks; int primChunks; Map lineIndex; Encodings simpleEncodings; // temporary Encodings finalEncodings; new LongIntHashMap existingPairsIndex; // formerly Map int round, printStatsEvery = /*10000*/0, printStatsInFirstRounds = 10; int printStatsEveryNSeconds = /*10*/60; // 60 for slower computers // initializing fullIndex = new Map will make things slow and use a lot of memory // and I thought there might be a case where it improves the compression // - but maybe there isn't? Map fullIndex; SimpleIntList pairLengths; *() {} *(SS texts) { loadTexts(texts); } // key = version ID, values = text void loadTexts(SS texts) { textIDToLines = mapValuesToLinkedHashMap myToLines(texts); } LS myToLines(S s) { LS l = byteMode ? lambdaMap charToHex(characters(toUpper ? upper(s) : s)) : toLines_nOnly_reversible(s); ret l; } S myFromLines(LS l) { ret byteMode ? join(mapI hexToChar(l)) // TODO: optimize even more : fromLines_rtrim(l); } run { if (balancing) { set recordPairLengths; pairIndex.pairComparatorForBalancing = (p1, p2) -> { int l1 = pairLength(p1), l2 = pairLength(p2); if (l1 != l2) ret l1-l2; // return arbitrary deterministic order, but not 0 if (p1.a != p2.a) ret p1.a-p2.a; ret p1.b-p2.b; }; } if (recordPairLengths) pairLengths = new SimpleIntList; LS allLines = concatLists(values(textIDToLines)); if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines)); allUniqueLines = uniquify(allLines); if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line")); allLines = null; // allow me to forget if (sortLines) sortInPlace(allUniqueLines); if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines)); for (S line : allUniqueLines) chunks.add(new CPrim(line)); primChunks = l(chunks); LS _lines = collect s(chunks); lineIndex = caseInsensitive ? ciListIndex(_lines) : listIndex(_lines); _lines = null; // simple encoding (only direct line references) simpleEncodings = mapValues(textIDToLines, (IF1>) (lines -> wrapIntArrayAsImmutableList(mapToIntArray(lines, line -> lineIndex.get(line))))); //printAndCheckEncodings(simpleEncodings); if (verboseCompressionSteps) print("Have simple encodings"); if (!verify) textIDToLines = null; finalEncodings = compressPairs(); if (verbose || verify) printAndCheckEncodings(finalEncodings); } void saveAsTextFile(File f) { S out = exportEncoding(finalEncodings); saveTextFile(f, out); if (verify) checkDecompression(f, textIDToLines); } void checkDecompression(File file, Map textIDToLines) { temp BufferedReader reader = bufferedUtf8Reader(file); LineCompReader lcr = new(reader); assertEquals(keysList(textIDToLines), asList(lcr.versions())); for (S version : keys(textIDToLines)) assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version))); if (verbose) print("Decompression OK for " + nVersions(textIDToLines)); } S asText() { ret exportEncoding(finalEncodings); } S exportEncoding(Encodings encodings) { new LS buf; buf.add((byteMode ? "BYTECOMP " : "LINECOMP ") + primChunks); // magic signature for (Chunk c : chunks) { if (c cast CPair) buf.add(c.i1 + " " + c.i2); else buf.add(((CPrim) c).s); } for (S id, L l : encodings) buf.add(id + "=" + joinWithSpace(l)); ret lines_rtrim(buf); } Encodings compressPairs() { pairIndex.verbose = verboseCompressionSteps || verbose; pairIndex.init(); for ping (S version, L l : simpleEncodings) pairIndex.newChain(version, l); pairIndex.haveChains(); simpleEncodings = null; // forget ret compressChains(keys(pairIndex.chains)); } Encodings compressChains(Cl files) { NotTooOften nto = new(printStatsEveryNSeconds*1000); IntPair toCompress; // Compress only most popular pair in one step while ping ((toCompress = fullCompression ? pairIndex.mostPopularPair() : pairIndex.mostPopularDuplicate()) != null) { if (verbose) { print("Chains: " + mapValuesToList(pairIndex.chains, c -> c.toList())); //print("Highest bucket (" + pairIndex.highestBucketNr() + "): " + pairIndex.mostPopularDuplicates()); print("Buckets: " + pairIndex.byCount); } if (safety > 0 && --safety <= 0) fail("safety"); ++round; if (verboseStats) if (printStatsEvery > 0 && (round % printStatsEvery) == 0 || round <= printStatsInFirstRounds || nto!) printStats(); int count = pairIndex.getCount(toCompress); Int idx = makeCPair(toCompress); int dups = pairIndex.numberOfDuplicates(); if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining"); pairIndex.replacePair(toCompress.a, toCompress.b, idx); } if (verboseStats) printStats(); // reconvert to normal list ret mapToValues(f -> pairIndex.chains.get(f).toList(), files); } // TODO: check if pair is already there Int makeCPair(IntPair p) { long l = intPairToLong(p); S text = null; if (fullIndex != null) { text = itemToString(p.a) + itemToString(p.b); Int idx = fullIndex.get(text); if (idx != null) ret idx; } int idx = existingPairsIndex.getIfAbsent(l, -1); if (idx < 0) { idx = addAndReturnIndex(chunks, new CPair(p)); if (recordPairLengths) listPut(pairLengths, idx, itemLength(p.a) + itemLength(p.b), 1); existingPairsIndex.put(l, idx); if (fullIndex != null) fullIndex.put(text, idx); } ret idx; } void printAndCheckEncodings(Encodings encodings) { for (S id, L encoded : encodings) { if (verbose) print(id + ": " + joinWithSpace(encoded)); S x = myFromLines(textIDToLines.get(id)), y = decode(encoded); if (caseInsensitive) assertEqualsIC(x, y); else assertEquals_quote(x, y); } } S decode(L encoded) { LS l = lambdaMap chunkText(encoded); if (byteMode) l = lines(lines(l)); ret myFromLines(l); } // for non-byte mode S chunkText(int idx) { ret chunks.get(idx).text(chunks); } void printStats() { print("Round: " + round); pairIndex.printStats(); } // byteMode only S itemToString(int idx) { new StringBuilder buf; itemToString_step(buf, idx); ret str(buf); } void itemToString_step(StringBuilder buf, int idx) { Chunk c = chunks.get(idx); if (c cast CPair) { itemToString_step(buf, c.i1); itemToString_step(buf, c.i2); } else { S s = ((CPrim) c).s; buf.append(byteMode ? charFromHex(s) : s + "\n"); } } int nLiterals() { ret primChunks; } int nItems() { ret l(chunks); } IntPair getPair(int idx) { ret chunks.get(idx).getPair(); } int itemLength(int idx) { ret idx >= l(pairLengths) ? 1 : pairLengths.get(idx); } int pairLength(IntPair p) { ret p == null ? 0 : itemLength(p.a) + itemLength(p.b); } LS literals() { ret lmap itemToString(countIterator(primChunks)); } }