!7 cprint CompressorSpike { switchable S snippetID = #1020763; LinkedHashMap versions; start-thread { print("Snippet ID: " + snippetID); dm_onFieldChange snippetID(r { setField(versions := null); dm_reload() }); if (versions == null) { versions = mapToLinkedHashMap(textChangesOfSnippet(snippetID), sv -> pair(str(sv.versionID), sv.previousValue)); versions.put("latest", loadSnippet(snippetID)); versions = mapValuesToLinkedHashMap toLinesAndBack(versions); // canonicalize line breaks change(); } print("Have " + nVersions(versions) + " with a total of " + nLines(totalLineCount(values(versions)))); new Compressor().run(); } abstract sclass Chunk { abstract S text(L chunks); } srecord CPair(int i1, int i2) > Chunk { CPair(Pair p) { i1 = p.a; i2 = p.b; } S text(L chunks) { ret linesLL(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks)); } } srecord CPrim(S s) > Chunk { S text(L chunks) { ret s; } } class Compressor { Map textIDToLines = mapValuesToLinkedHashMap lines(versions); LS allUniqueLines; new L chunks; Map lineIndex; new Map linePairIndex; replace Encodings with Map>. run { allUniqueLines = uniquify(concatLists(values(textIDToLines))); for (S line : allUniqueLines) chunks.add(new CPrim(line)); lineIndex = listIndex(collect s(chunks)); // simple encoding (only direct line references) Encodings simpleEncodings = mapValues(textIDToLines, (IF1>) (lines -> map(lines, line -> lineIndex.get(line)))); printAndCheckEncodings(simpleEncodings); Encodings advancedEncodings = compressPairs(simpleEncodings); printAndCheckEncodings(advancedEncodings); } Encodings compressPairs(Encodings encodings) { new MultiSet> pairCounts; for (L l : values(encodings)) { Pair lastPair = null; for (Pair pair : overlappingPairs(l)) { if (neq(pair, lastPair)) { lastPair = pair; pairCounts.add(pair); } } } //print("Pair counts: " + pairCounts); L> duplicates = multiSetDuplicatesByPopularity(pairCounts); print("Pairs: " + duplicates); // Compress only most popular pair if (empty(duplicates)) ret encodings; // Nothing to do Pair toCompress = first(duplicates); int idx = makeCPair(toCompress); ret mapValues(encodings, encoded -> replaceSublist(encoded, pairToList(toCompress), ll(idx))); } int makeCPair(Pair p) { int idx = addAndReturnIndex(chunks, new CPair(p)); print("Made pair: " + p + " -> " + idx); ret idx; } void printAndCheckEncodings(Encodings encodings) { for (S id, L encoded : encodings) { print(id + ": " + joinWithSpace(encoded)); assertEquals(lines(textIDToLines.get(id)), decode(encoded)); } } S decode(L encoded) { ret lines(lambdaMap chunkText(encoded)); } S chunkText(int idx) { ret chunks.get(idx).text(chunks); } } }