Libraryless. A Pure Java version is available (7033L/45K).
// This uses the implementation that seems to work fastest in
// practice, although it is probably O(n log n) due to the use
// of TreeMaps.
//
// The theoretically best implementation is O(n), achieved by
// replacing the TreeMaps with a linked list.
// I think it works (see LineComp_PairIndex_v2), but #1028231 is broken.

sclass LineCompCompressor {
  int safety = 0;

  replace Encodings with Map<S, L<Int>>.

  // A chunk is either a literal line (CPrim) or a pair of other chunks (CPair),
  // referenced by their indices in the chunk list.
  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
  }

  srecord CPair(int i1, int i2) > Chunk {
    CPair(IntPair p) { i1 = p.a; i2 = p.b; }

    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
  }

  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }

  bool verbose = false, verboseCompressionSteps = false;
  bool sortLines = true;
  bool verify = true;
  Map<S, LS> textIDToLines;
  LS allUniqueLines;
  new L<Chunk> chunks;
  int primChunks; // number of primitive (single-line) chunks, written into the file header
  Map<S, Int> lineIndex;
  Encodings finalEncodings;
  new Map<IntPair, Int> existingPairsIndex;

  // key = version ID, values = text
  *(SS texts) {
    textIDToLines = mapValuesToLinkedHashMap myToLines(texts);
  }

  LS myToLines(S s) { ret toLines_nOnly_reversible(s); }
  S myFromLines(LS l) { ret fromLines_rtrim(l); }

  run {
    LS allLines = concatLists(values(textIDToLines));
    if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
    allUniqueLines = uniquify(allLines);
    if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
    allLines = null; // allow me to forget
    if (sortLines) sortInPlace(allUniqueLines);
    if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));

    // one primitive chunk per unique line
    for (S line : allUniqueLines)
      chunks.add(new CPrim(line));
    primChunks = l(chunks);
    lineIndex = listIndex(collect s(chunks));

    // simple encoding (only direct line references)
    Encodings simpleEncodings = mapValues(textIDToLines,
      (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
    //printAndCheckEncodings(simpleEncodings);
    if (verboseCompressionSteps) print("Have simple encodings");
    finalEncodings = compressPairs_v3(simpleEncodings);
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
  }

  void saveAsTextFile(File f) {
    S out = exportEncoding(finalEncodings);
    saveTextFile(f, out);
    if (verify) checkDecompression(f, textIDToLines);
  }

  void checkDecompression(File file, Map<S, LS> textIDToLines) {
    temp BufferedReader reader = bufferedUtf8Reader(file);
    LineCompReader lcr = new(reader);
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
    for (S version : keys(textIDToLines))
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
  }

  S asText() { ret exportEncoding(finalEncodings); }

  S exportEncoding(Encodings encodings) {
    new LS buf;
    buf.add("LINECOMP " + primChunks); // magic signature
    for (Chunk c : chunks) {
      if (c cast CPair)
        buf.add(c.i1 + " " + c.i2);
      else
        buf.add(((CPrim) c).s);
    }
    for (S id, L<Int> l : encodings)
      buf.add(id + "=" + joinWithSpace(l));
    ret lines_rtrim(buf);
  }

  Encodings compressPairs_v3(Encodings encodings) {
    new LineComp_PairIndex pairIndex;
    pairIndex.verbose = verboseCompressionSteps;
    pairIndex.init();
    for (S version, L<Int> l : encodings)
      pairIndex.newChain(version, l);

    IntPair toCompress;

    // Compress only the most popular pair in each step
    while ping ((toCompress = pairIndex.mostPopularDuplicate()) != null) {
      if (safety > 0 && --safety <= 0) fail("safety");
      int count = pairIndex.getCount(toCompress);
      Int idx = makeCPair(toCompress);
      int dups = pairIndex.numberOfDuplicates();
      if (verboseCompressionSteps)
        print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
      pairIndex.replacePair(toCompress.a, toCompress.b, idx);
    }

    // reconvert to normal list
    ret mapValues(ch -> ch.toList(), pairIndex.chains);
  }

  // Returns the chunk index for a pair, reusing an existing CPair if we made one before
  Int makeCPair(IntPair p) {
    Int idx = existingPairsIndex.get(p);
    if (idx == null)
      existingPairsIndex.put(p, idx = addAndReturnIndex(chunks, new CPair(p)));
    ret idx;
  }

  void printAndCheckEncodings(Encodings encodings) {
    for (S id, L<Int> encoded : encodings) {
      if (verbose) print(id + ": " + joinWithSpace(encoded));
      assertEquals(myFromLines(textIDToLines.get(id)), decode(encoded));
    }
  }

  S decode(L<Int> encoded) { ret myFromLines(lambdaMap chunkText(encoded)); }
  S chunkText(int idx) { ret chunks.get(idx).text(chunks); }
}
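The core of compressPairs_v3 is greedy pair replacement in the spirit of byte-pair encoding, applied to chunk indices instead of bytes: each text version is a sequence of chunk indices, and the most frequent adjacent pair is repeatedly replaced by a new pair chunk until no pair occurs twice. Below is a minimal, self-contained plain-Java sketch of that idea for a single sequence. It is an illustration only; the class and method names (PairCompressionSketch, countPairs) are made up for the example, and it rescans the whole sequence on every step, which is exactly the cost LineComp_PairIndex exists to avoid.

import java.util.*;

public class PairCompressionSketch {

  // Count how often each adjacent pair of indices occurs in the sequence.
  static Map<List<Integer>, Integer> countPairs(List<Integer> seq) {
    Map<List<Integer>, Integer> counts = new HashMap<>();
    for (int i = 0; i + 1 < seq.size(); i++)
      counts.merge(Arrays.asList(seq.get(i), seq.get(i + 1)), 1, Integer::sum);
    return counts;
  }

  public static void main(String[] args) {
    // Indices into a chunk table; 0..2 stand for primitive line chunks.
    List<Integer> seq = new ArrayList<>(Arrays.asList(0, 1, 2, 0, 1, 2, 0, 1));
    List<int[]> pairChunks = new ArrayList<>(); // newly created pair chunks
    int nextIndex = 3; // first free chunk index after the primitive chunks

    while (true) {
      Map<List<Integer>, Integer> counts = countPairs(seq);

      // Find the most frequent pair that occurs at least twice.
      List<Integer> best = null;
      int bestCount = 1;
      for (Map.Entry<List<Integer>, Integer> e : counts.entrySet())
        if (e.getValue() > bestCount) { best = e.getKey(); bestCount = e.getValue(); }
      if (best == null) break; // no duplicate pair left

      // Create a new chunk for the pair and replace all its occurrences.
      pairChunks.add(new int[] { best.get(0), best.get(1) });
      int newIdx = nextIndex++;
      List<Integer> out = new ArrayList<>();
      for (int i = 0; i < seq.size(); ) {
        if (i + 1 < seq.size() && seq.get(i).equals(best.get(0)) && seq.get(i + 1).equals(best.get(1))) {
          out.add(newIdx);
          i += 2;
        } else {
          out.add(seq.get(i));
          i++;
        }
      }
      seq = out;
    }

    System.out.println("Compressed sequence: " + seq);
    System.out.println("Pair chunks created: " + pairChunks.size());
  }
}

The snippet amortizes this by keeping pair counts in an index (TreeMaps in this version, hence the O(n log n) estimate in the header comment) and by sharing one chunk table across all versions, so identical lines and identical pairs are stored only once and referenced by index from every version's encoding.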
Began life as a copy of #1028186
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
Snippet ID: #1028503
Snippet name: LineCompCompressor [backup before eclipse collections]
Eternal ID of this version: #1028503/1
Text MD5: b08de58bb27524ad814e02f37aed6908
Transpilation MD5: 7668b40e654b1fc3ec04a78ba30a0396
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-06-23 11:07:45
Source code size: 5036 bytes / 149 lines
Pitched / IR pitched: No / No
Views / Downloads: 181 / 255