Uses 1489K of libraries. Click here for Pure Java version (9930L/52K).
!7

// Spike: line-based compression of all recorded versions of one snippet.
//
// Every version is encoded as a list of chunk indices. Initially each index
// refers to a primitive chunk (one unique line). The compressor then
// repeatedly picks the most popular duplicate pair of adjacent indices,
// turns it into a new pair chunk and substitutes it in all encodings, until
// no duplicate pair is left. The result is written as a ".linecomp" file,
// compared against .zip/.tgz archives of the same texts, and verified by
// decompressing it again.
cprint CompressorSpike {
  // Snippet whose version history is compressed.
  switchable S snippetID = #1020763;

  // Version ID -> text of that version, in insertion order; the text
  // returned by loadSnippet is stored last, under the key "latest".
  LinkedHashMap<S> versions;

  start-thread {
    print("Snippet ID: " + snippetID);

    // When snippetID changes, forget the cached versions and reload.
    dm_onFieldChange snippetID(r { setField(versions := null); dm_reload() });

    if (versions == null) {
      versions = mapToLinkedHashMap(reversed(textChangesOfSnippet(snippetID)),
        sv -> pair(str(sv.versionID), sv.previousValue));
      versions.put("latest", loadSnippet(snippetID));
      versions = mapValuesToLinkedHashMap toLinesAndBack(versions); // canonicalize line breaks
      change();
    }

    print("Have " + nVersions(versions) + " with a total of " + nLines(totalLineCount(values(versions))));
    new Compressor().run();
  }

  // A node of the compression dictionary. text() reconstructs the chunk's
  // text, given the full chunk list (needed to resolve pair references).
  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
  }

  // Pair chunk: refers to two other chunks by their index in the chunk list.
  srecord CPair(int i1, int i2) > Chunk {
    CPair(Pair<Int> p) { i1 = p.a; i2 = p.b; }

    // Text of the first referenced chunk followed by text of the second,
    // combined through linesLL_rtrim.
    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
  }

  // Primitive chunk: holds one literal line.
  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }

  class Compressor {
    // Encodings = version ID -> list of chunk indices
    replace Encodings with Map<S, L<Int>>.

    // If true, the unique lines are sorted before being turned into chunks.
    bool sortLines = true;

    // Version ID -> the version's text passed through lines().
    Map<S, LS> textIDToLines = mapValuesToLinkedHashMap lines(versions);

    LS allUniqueLines;

    // Chunk dictionary: the primitive chunks come first (added in run()),
    // pair chunks are added afterwards by makeCPair().
    new L<Chunk> chunks;

    // Number of primitive chunks, i.e. l(chunks) before any pair was made.
    int primChunks;

    // Built with listIndex over the primitive chunks' lines; used to look up
    // the chunk index of a line.
    Map<S, Int> lineIndex;

    // NOTE(review): not used anywhere in the visible code.
    new Map<PairS, Int> linePairIndex;

    Encodings finalEncodings;

    // Runs the whole pipeline: build dictionary, compress, export, verify.
    run {
      allUniqueLines = uniquify(concatLists(values(textIDToLines)));
      if (sortLines) sortInPlace(allUniqueLines);
      for (S line : allUniqueLines)
        chunks.add(new CPrim(line));
      primChunks = l(chunks);
      lineIndex = listIndex(collect s(chunks));

      // simple encoding (only direct line references)
      Encodings simpleEncodings = mapValues(textIDToLines,
        (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
      //printAndCheckEncodings(simpleEncodings);

      // Keep compressing until compressPairs hands back the very same
      // object, which is its signal that nothing was left to compress.
      Encodings advancedEncodings = simpleEncodings;
      while licensed {
        Encodings e = compressPairs(advancedEncodings);
        if (e == advancedEncodings) break;
        advancedEncodings = e;
      }
      finalEncodings = advancedEncodings;
      printAndCheckEncodings(finalEncodings);

      S out = exportEncoding(finalEncodings);
      printWithPrecedingNL(out);
      File file = saveTextFile_infoBox(javaxDataDir("Compressed Snippet Version History/versions-of-" + psI(snippetID) + ".linecomp"), out);
      print(renderFileInfo(gzipFile(file)));

      // Make .zip and .tgz for comparison
      File zipFile = replaceFileExtension(file, ".zip");
      {
        temp ZipOutputStream zipOut = zipOutputStream(zipFile);
        for (S id, text : versions)
          zip_addTextFile(zipOut, id, text);
      }
      printFileInfo(zipFile);

      File tgzFile = replaceFileExtension(file, ".tgz");
      zip2tgz(zipFile, tgzFile);
      printFileInfo(tgzFile);

      checkDecompression(file, textIDToLines);
    }

    // Reads the exported file back through LineCompReader and asserts that
    // it lists the same version IDs in the same order and yields, for each
    // version, the same text as lines_rtrim of the original lines.
    void checkDecompression(File file, Map<S, LS> textIDToLines) {
      temp BufferedReader reader = bufferedUtf8Reader(file);
      LineCompReader lcr = new(reader);
      assertEquals(keysList(textIDToLines), asList(lcr.versions()));
      for (S version : keys(textIDToLines))
        assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
      print("Decompression OK for " + nVersions(textIDToLines));
    }

    // Renders dictionary and encodings as text:
    //   line 1:        "LINECOMP " + number of primitive chunks
    //   chunk lines:   the literal line (CPrim) or "i1 i2" (CPair), in chunk order
    //   version lines: id + "=" + space-separated chunk indices
    S exportEncoding(Encodings encodings) {
      new LS buf;
      buf.add("LINECOMP " + primChunks); // magic signature
      for (Chunk c : chunks) {
        if (c cast CPair)
          buf.add(c.i1 + " " + c.i2);
        else
          buf.add(((CPrim) c).s);
      }
      for (S id, L<Int> l : encodings)
        buf.add(id + "=" + joinWithSpace(l));
      ret lines_rtrim(buf);
    }

    // One compression step: counts adjacent index pairs over all encodings,
    // makes a pair chunk for the most popular duplicate and substitutes it
    // everywhere. Returns the SAME encodings object when no duplicate pair
    // exists (run() relies on that identity to stop).
    Encodings compressPairs(Encodings encodings) {
      new MultiSet<Pair<Int>> pairCounts;
      for (L<Int> l : values(encodings)) {
        Pair<Int> lastPair = null;
        for (Pair<Int> pair : overlappingPairs(l)) {
          if (neq(pair, lastPair)) {
            lastPair = pair;
            pairCounts.add(pair);
          } else
            // This pair overlaps the one just counted (run of one repeated
            // index, e.g. "a a a"), so skip it - but forget it, so that the
            // next pair in a longer run ("a a a a") is counted again as a
            // separate, non-overlapping occurrence.
            lastPair = null;
        }
      }
      //print("Pair counts: " + pairCounts);

      // Compress only most popular pair
      Pair<Int> toCompress = msMostPopularDuplicate(pairCounts);
      if (toCompress == null) ret encodings; // Nothing to do

      int idx = makeCPair(toCompress);
      print("Made pair: " + toCompress + " -> " + idx + ", " + (msNumberOfDuplicates(pairCounts)-1) + " remaining");
      ret mapValues(encodings, (IF1<L<Int>>) encoded ->
        replaceSublist(encoded, pairToList(toCompress), ll(idx)));
    }

    // Adds a pair chunk for p to the dictionary and returns its index.
    int makeCPair(Pair<Int> p) {
      int idx = addAndReturnIndex(chunks, new CPair(p));
      ret idx;
    }

    // Prints every encoding and asserts that decoding it reproduces the
    // original text of that version.
    void printAndCheckEncodings(Encodings encodings) {
      for (S id, L<Int> encoded : encodings) {
        print(id + ": " + joinWithSpace(encoded));
        assertEquals(lines(textIDToLines.get(id)), decode(encoded));
      }
    }

    // Expands a list of chunk indices back into text.
    S decode(L<Int> encoded) {
      ret lines(lambdaMap chunkText(encoded));
    }

    // Text of the chunk at index idx (pairs are resolved recursively).
    S chunkText(int idx) {
      ret chunks.get(idx).text(chunks);
    }
  }
}
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028167 |
Snippet name: | Line-Based Multi-Version Compressor Spike [OK] |
Eternal ID of this version: | #1028167/39 |
Text MD5: | cb73aae1e1696b65f13d020985285c13 |
Transpilation MD5: | c5b78a07d19cfc1bca13b0d7a45506c2 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code (Dynamic Module) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-05-26 22:11:16 |
Source code size: | 5416 bytes / 160 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 264 / 841 |
Version history: | 38 change(s) |
Referenced in: | #1028186 - LineCompCompressor, faster version [LIVE] #1028217 - Line-Based Multi-Version Compressor Spike, faster version [dev.] |