sclass LineCompCompressor {
  replace Encodings with Map<S, L<Int>>.

  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
  }

  srecord CPair(int i1, int i2) > Chunk {
    CPair(Pair<Int> p) { i1 = p.a; i2 = p.b; }
    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
  }

  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }

  bool verbose = false;
  bool sortLines = true;
  bool verify = true;
  Map<S, LS> textIDToLines;
  LS allUniqueLines;
  new L<Chunk> chunks;
  int primChunks;
  Map<S, Int> lineIndex;
  new Map linePairIndex;
  Encodings finalEncodings;

  // key = version ID, values = text
  *(SS texts) {
    textIDToLines = mapValuesToLinkedHashMap lines(texts);
  }

  run {
    allUniqueLines = uniquify(concatLists(values(textIDToLines)));
    if (sortLines) sortInPlace(allUniqueLines);
    for (S line : allUniqueLines)
      chunks.add(new CPrim(line));
    primChunks = l(chunks);
    lineIndex = listIndex(collect s(chunks));

    // simple encoding (only direct line references)
    Encodings simpleEncodings = mapValues(textIDToLines,
      (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
    //printAndCheckEncodings(simpleEncodings);

    finalEncodings = compressPairs(simpleEncodings);
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
  }

  void saveAsTextFile(File f) {
    S out = exportEncoding(finalEncodings);
    saveTextFile(f, out);
    if (verify) checkDecompression(f, textIDToLines);
  }

  void checkDecompression(File file, Map<S, LS> textIDToLines) {
    temp BufferedReader reader = bufferedUtf8Reader(file);
    LineCompReader lcr = new(reader);
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
    for (S version : keys(textIDToLines))
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
  }

  S exportEncoding(Encodings encodings) {
    new LS buf;
    buf.add("LINECOMP " + primChunks); // magic signature
    for (Chunk c : chunks) {
      if (c cast CPair)
        buf.add(c.i1 + " " + c.i2);
      else
        buf.add(((CPrim) c).s);
    }
    for (S id, L<Int> l : encodings)
      buf.add(id + "=" + joinWithSpace(l));
    ret lines_rtrim(buf);
  }
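  // Illustrative sketch of the exported format (hypothetical input, not taken from an actual run):
  // for two versions v1 = "foo\nbar\nbaz" and v2 = "foo\nbar", the output would look roughly like
  //
  //   LINECOMP 3     <- magic word + number of primitive (line) chunks
  //   bar            <- chunks 0-2: the unique lines (sorted)
  //   baz
  //   foo
  //   2 0            <- chunk 3: a CPair joining chunk 2 ("foo") and chunk 0 ("bar")
  //   v1=3 1         <- per-version encodings: v1 decodes to "foo", "bar", "baz"
  //   v2=3           <- v2 decodes to "foo", "bar"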
  // new fast version of magic compression function

  sclass PairCounts {
    new MultiSet<Pair<Int>> counts;
    MultiSetMap<Int, Pair<Int>> byCount = multiSetMap_innerTreeSet_outerRevTreeMap();

    // increment pair's count
    void add(Pair<Int> p) {
      int count = counts.get(p);
      byCount.remove(count, p);
      counts.add(p);
      byCount.put(count+1, p);
    }

    // decrement pair's count
    void remove(Pair<Int> p) {
      int count = counts.get(p);
      if (count == 0) fail("Can't remove pair " + p);
      byCount.remove(count, p);
      counts.remove(p);
      if (--count > 0) byCount.put(count, p);
    }

    Pair<Int> mostPopularDuplicate() {
      ret toInt(firstKey(byCount)) < 2 ? null : firstValue(byCount);
    }

    int numberOfDuplicates() {
      ret counts.size()-l(byCount.get(1));
    }

    int getCount(Pair<Int> p) { ret counts.get(p); }
  }

  Encodings compressPairs(Encodings encodings) {
    // get initial pair counts
    new PairCounts pairCounts;
    for (L<Int> l : values(encodings)) {
      Pair<Int> lastPair = null;
      for (Pair<Int> pair : overlappingPairs(l)) {
        /*if (neq(pair, lastPair))*/ { // XXX - imprecise
          lastPair = pair;
          pairCounts.add(pair);
        }
      }
    }

    // Convert to LinkedList for more efficient modification
    Map<S, L<Int>> encodings2 = mapValues toLinkedList(encodings);

    Pair<Int> toCompress;

    // Compress only the most popular pair in each step
    int lastDups = Int.MAX_VALUE;
    while ping ((toCompress = pairCounts.mostPopularDuplicate()) != null) {
      int count = pairCounts.getCount(toCompress), idx = makeCPair(toCompress);
      int dups = pairCounts.numberOfDuplicates();
      if (lastDups == dups) fail("Number of duplicates not decreasing");
      lastDups = dups;
      print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
      for (L<Int> l : values(encodings2))
        compressPair(pairCounts, l, toCompress, idx);
    }

    // reconvert to normal list
    ret mapValues toArrayList(encodings2);
  }

  // replace occurrences of the pair `replacing` with replaceWith in l
  void compressPair(PairCounts pairCounts, L<Int> l, Pair<Int> replacing, int replaceWith) {
    ListIterator<Int> it = listIterator(l);
    if (!it.hasNext()) ret;
    int last = it.next();
    while (it.hasNext()) {
      int cur = it.next();
      // let's call the positions 0, 1, 2 and 3. currently we're between 2 and 3
      if (eq(last, replacing.a) && eq(cur, replacing.b)) {
        Int i1 = last, i2 = cur, i3 = peekNext(it); // grab positions 1, 2, 3. leaves iterator pointing at 2 for remove()
        if (i1 != null) pairCounts.remove(pair(i1, i2)); // remove pair (1, 2) from index
        if (i3 != null) pairCounts.remove(pair(i2, i3)); // remove pair (2, 3) from index
        it.remove(); // remove position 2 from list, now layout is (1, 3)
        it.previous(); // grab position 1
        it.set(replaceWith); // replace, so now we have (replaceWith, 3)
        if (i3 != null) pairCounts.add(pair(replaceWith, i3));
        // We should now be pointing at the right element for next iteration
      }
      last = cur;
    }
  }

  int makeCPair(Pair<Int> p) {
    int idx = addAndReturnIndex(chunks, new CPair(p));
    ret idx;
  }

  void printAndCheckEncodings(Encodings encodings) {
    for (S id, L<Int> encoded : encodings) {
      if (verbose) print(id + ": " + joinWithSpace(encoded));
      assertEquals(lines(textIDToLines.get(id)), decode(encoded));
    }
  }

  S decode(L<Int> encoded) { ret lines(lambdaMap chunkText(encoded)); }

  S chunkText(int idx) { ret chunks.get(idx).text(chunks); }
}
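// Hypothetical usage sketch (version IDs, texts and file name are made up; litmap and
// bufferedUtf8Reader are only assumed to work as used inside the class above):
//
//   LineCompCompressor compressor = new LineCompCompressor(litmap(
//     "v1", "foo\nbar\nbaz",
//     "v2", "foo\nbar"));
//   compressor.run();                                          // build chunks + compressed encodings
//   compressor.saveAsTextFile(new File("versions.linecomp"));  // write LINECOMP file (round-trip checked if verify is set)
//   LineCompReader lcr = new(bufferedUtf8Reader(new File("versions.linecomp")));
//   print(lcr.textForVersion("v1"));                           // -> "foo\nbar\nbaz" (right-trimmed)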