BotCompany Repo | #1028186 // LineCompCompressor, faster version [LIVE]

JavaX fragment (include) [tags: use-pretranspiled]

Uses 11335K of libraries. A Pure Java version is available (9174L/60K).

// This is using the implementation that seems to work fastest
// in reality, although it probably is O(n log n) due to the use
// of TreeMaps.
//
// The theoretically best implementation is O(n) (replacing the
// TreeMap with a linked list).

!include once #1027304 // Eclipse Collections

import org.eclipse.collections.impl.map.mutable.primitive.*;

final sclass LineCompCompressor {
  int safety = 0;
  
  replace Encodings with Map<S, L<Int>>.

  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
    IntPair getPair() { null; }
  }
  
  srecord CPair(int i1, int i2) > Chunk {
    CPair(IntPair p) { i1 = p.a; i2 = p.b; }
    
    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
    
    IntPair getPair() {
      ret IntPair(i1, i2);
    }
  }
  
  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }
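  // Chunk layout: the first primChunks entries in 'chunks' are CPrim literals;
  // every CPair added later references two earlier chunk indices, so text()
  // expands a chunk recursively back into its full line sequence.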

  bool verbose = false, verboseCompressionSteps = false;
  bool verboseStats;
  bool sortLines = true;
  bool verify = true;
  bool byteMode; // compress bytes instead of lines
  bool caseInsensitive; // for line mode
  bool toUpper; // for byte mode: convert all characters to upper case
  bool recordPairLengths; // for "balanced" compression
  bool balancing; // try to lower tree heights by compressing "shorter" pairs first
  bool fullCompression; // keep compressing until every file is a single symbol

  new LineComp_PairIndex pairIndex;
  Map<S, LS> textIDToLines;
  LS allUniqueLines;
  new L<Chunk> chunks;
  int primChunks;
  Map<S, Int> lineIndex;
  Encodings simpleEncodings; // temporary
  Encodings finalEncodings;
  new LongIntHashMap existingPairsIndex; // formerly Map<IntPair, Int>
  int round, printStatsEvery = /*10000*/0, printStatsInFirstRounds = 10;
  int printStatsEveryNSeconds = /*10*/60; // 60 for slower computers
  
  // initializing fullIndex = new Map will make things slow and use a lot of memory
  // and I thought there might be a case where it improves the compression
  // - but maybe there isn't?
  Map<S, Int> fullIndex; 
  SimpleIntList pairLengths;

  *() {}
  *(SS texts) { loadTexts(texts); }
  
  // key = version ID, values = text
  void loadTexts(SS texts) {
    textIDToLines = mapValuesToLinkedHashMap myToLines(texts);
  }
  
  LS myToLines(S s) {
    LS l = byteMode
      ? lambdaMap charToHex(characters(toUpper ? upper(s) : s))
      : toLines_nOnly_reversible(s);
    ret l;
  }
      
  S myFromLines(LS l) {
    ret byteMode
      ? join(mapI hexToChar(l)) // TODO: optimize even more
      : fromLines_rtrim(l);
  }
  
  run {
    if (balancing) {
      set recordPairLengths;
      pairIndex.pairComparatorForBalancing = (p1, p2) -> {
        int l1 = pairLength(p1), l2 = pairLength(p2);
        if (l1 != l2) ret l1-l2;
        // return arbitrary deterministic order, but not 0
        if (p1.a != p2.a) ret p1.a-p2.a;
        ret p1.b-p2.b;
      };
    }
      
    if (recordPairLengths) pairLengths = new SimpleIntList;
    LS allLines = concatLists(values(textIDToLines));
    if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
    allUniqueLines = uniquify(allLines);
    if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
    allLines = null; // allow me to forget
    if (sortLines) sortInPlace(allUniqueLines);
    if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));
    for (S line : allUniqueLines)
      chunks.add(new CPrim(line));
    primChunks = l(chunks);
    LS _lines = collect s(chunks);
    lineIndex = caseInsensitive ? ciListIndex(_lines) : listIndex(_lines);
    _lines = null;
    
    // simple encoding (only direct line references)
    simpleEncodings = mapValues(textIDToLines,
      (IF1<LS, L<Int>>) (lines -> wrapIntArrayAsImmutableList(mapToIntArray(lines, line -> lineIndex.get(line)))));
    //printAndCheckEncodings(simpleEncodings);
    if (verboseCompressionSteps) print("Have simple encodings");
    if (!verify) textIDToLines = null;
    
    finalEncodings = compressPairs();
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
  }
  
  void saveAsTextFile(File f) {
    S out = exportEncoding(finalEncodings);
    saveTextFile(f, out);
    
    if (verify) checkDecompression(f, textIDToLines);
  }
  
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
    temp BufferedReader reader = bufferedUtf8Reader(file);
    LineCompReader lcr = new(reader);
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
    for (S version : keys(textIDToLines))
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
  }
  
  S asText() { ret exportEncoding(finalEncodings); }
  
  S exportEncoding(Encodings encodings) {
    new LS buf;
    buf.add((byteMode ? "BYTECOMP " : "LINECOMP ") + primChunks); // magic signature
    for (Chunk c : chunks) {
      if (c cast CPair)
        buf.add(c.i1 + " " + c.i2);
      else
        buf.add(((CPrim) c).s);
    }
    for (S id, L<Int> l : encodings)
      buf.add(id + "=" + joinWithSpace(l));
    ret lines_rtrim(buf);
  }
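  // Illustrative example of the exported format (hypothetical input, not from
  // a real run): with the two unique lines "a" and "b" and one version "v1"
  // whose text is "a\nb\na\nb", the pair (0 1) becomes chunk index 2 and the
  // export would look like:
  //
  //   LINECOMP 2
  //   a
  //   b
  //   0 1
  //   v1=2 2
  //
  // i.e. the magic line with the literal-chunk count, the literal lines, one
  // "i1 i2" line per pair chunk, then one "id=indices" line per version.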
  
  Encodings compressPairs() {
    pairIndex.verbose = verboseCompressionSteps || verbose;
    pairIndex.init();
    for ping (S version, L<Int> l : simpleEncodings)
      pairIndex.newChain(version, l);
    pairIndex.haveChains();
    simpleEncodings = null; // forget

    ret compressChains(keys(pairIndex.chains));
  }
  
  Encodings compressChains(Cl<S> files) {
    NotTooOften nto = new(printStatsEveryNSeconds*1000);
    IntPair toCompress;
    // Compress only most popular pair in one step
    while ping ((toCompress = fullCompression
      ? pairIndex.mostPopularPair()
      : pairIndex.mostPopularDuplicate()) != null) {
      if (verbose) {
        print("Chains: " + mapValuesToList(pairIndex.chains, c -> c.toList()));
        //print("Highest bucket (" + pairIndex.highestBucketNr() + "): " + pairIndex.mostPopularDuplicates());
        print("Buckets: " + pairIndex.byCount);
      }

      if (safety > 0 && --safety <= 0) fail("safety");
      ++round;
      if (verboseStats)
        if (printStatsEvery > 0 && (round % printStatsEvery) == 0
          || round <= printStatsInFirstRounds
          || nto!)
          printStats();
      int count = pairIndex.getCount(toCompress);
      Int idx = makeCPair(toCompress);
      int dups = pairIndex.numberOfDuplicates();
      if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
      pairIndex.replacePair(toCompress.a, toCompress.b, idx);
    }
    if (verboseStats) printStats();
    
    // reconvert to normal list
    ret mapToValues(f -> pairIndex.chains.get(f).toList(), files);
  }
  
  // TODO: check if pair is already there
  Int makeCPair(IntPair p) {
    long l = intPairToLong(p);
    S text = null;
    if (fullIndex != null) {
      text = itemToString(p.a) + itemToString(p.b);
      Int idx = fullIndex.get(text);
      if (idx != null) ret idx;
    }
    int idx = existingPairsIndex.getIfAbsent(l, -1);
    if (idx < 0) {
      idx = addAndReturnIndex(chunks, new CPair(p));
      if (recordPairLengths) listPut(pairLengths, idx, itemLength(p.a) + itemLength(p.b), 1);
      existingPairsIndex.put(l, idx);
      if (fullIndex != null)
        fullIndex.put(text, idx);
    }
    ret idx;
  }
  
  void printAndCheckEncodings(Encodings encodings) {
    for (S id, L<Int> encoded : encodings) {
      if (verbose) print(id + ": " + joinWithSpace(encoded));
      S x = myFromLines(textIDToLines.get(id)), y = decode(encoded);
      if (caseInsensitive)
        assertEqualsIC(x, y);
      else
        assertEquals_quote(x, y);
    }
  }
  
  S decode(L<Int> encoded) {
    LS l = lambdaMap chunkText(encoded);
    if (byteMode) l = lines(lines(l));
    ret myFromLines(l);
  }
  
  // for non-byte mode
  S chunkText(int idx) {
    ret chunks.get(idx).text(chunks);
  }
  
  void printStats() {
    print("Round: " + round);
    pairIndex.printStats();
  }
  
  // byteMode only
  S itemToString(int idx) {
    new StringBuilder buf;
    itemToString_step(buf, idx);
    ret str(buf);
  }
  
  void itemToString_step(StringBuilder buf, int idx) {
    Chunk c = chunks.get(idx);
    if (c cast CPair) {
      itemToString_step(buf, c.i1);
      itemToString_step(buf, c.i2);
    } else {
      S s = ((CPrim) c).s;
      buf.append(byteMode ? charFromHex(s) : s + "\n");
    }
  }
  
  int nLiterals() { ret primChunks; }
  int nItems() { ret l(chunks); }
  
  IntPair getPair(int idx) {
    ret chunks.get(idx).getPair();
  }
  
  int itemLength(int idx) {
    ret idx >= l(pairLengths) ? 1 : pairLengths.get(idx);
  }
  
  int pairLength(IntPair p) {
    ret p == null ? 0 : itemLength(p.a) + itemLength(p.b);
  }
  
  LS literals() { ret lmap itemToString(countIterator(primChunks)); }
}
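For readers who want the gist without decoding the JavaX shorthand: the compressor builds a table of unique lines, encodes every version as a list of line indices, and then (driven by LineComp_PairIndex) repeatedly replaces the most popular adjacent pair of indices with a new chunk index until no pair occurs twice (or, with fullCompression, until each version is a single symbol). This is essentially the Re-Pair / byte-pair-encoding idea. Below is a minimal plain-Java sketch of that greedy loop on a single symbol sequence; the class PairReplacementSketch and its naive per-round recount are illustrative assumptions only, whereas the snippet keeps pair counts incrementally (the TreeMap-based index mentioned in the header comment) and maintains one chain per version.

import java.util.*;

class PairReplacementSketch {
  final List<int[]> pairs = new ArrayList<>(); // pair table: entry k defines symbol literalCount + k
  int nextSymbol;

  // symbols: a sequence of literal ids (e.g. indices into the unique-line table).
  // Greedily replaces the most frequent adjacent pair with a fresh symbol until
  // no pair occurs more than once (the mostPopularDuplicate() stopping rule).
  List<Integer> compress(List<Integer> symbols, int literalCount) {
    nextSymbol = literalCount;
    List<Integer> seq = new ArrayList<>(symbols);
    while (true) {
      // Naive recount every round; the snippet instead updates counts
      // incrementally in LineComp_PairIndex (hence the O(n log n) remark).
      Map<Long, Integer> counts = new HashMap<>();
      for (int i = 0; i + 1 < seq.size(); i++)
        counts.merge(key(seq.get(i), seq.get(i + 1)), 1, Integer::sum);
      long best = -1; int bestCount = 1;
      for (Map.Entry<Long, Integer> e : counts.entrySet())
        if (e.getValue() > bestCount) { best = e.getKey(); bestCount = e.getValue(); }
      if (best == -1) break; // no duplicate pair left
      int a = (int) (best >>> 32), b = (int) best;
      pairs.add(new int[] { a, b });
      int sym = nextSymbol++;
      List<Integer> out = new ArrayList<>();
      for (int i = 0; i < seq.size(); i++) {
        if (i + 1 < seq.size() && seq.get(i) == a && seq.get(i + 1) == b) {
          out.add(sym);
          i++; // consume both halves of the replaced pair
        } else
          out.add(seq.get(i));
      }
      seq = out;
    }
    return seq;
  }

  static long key(int a, int b) { return ((long) a << 32) | (b & 0xFFFFFFFFL); }
}

Decompression is the reverse expansion of the pair table, which is exactly what Chunk.text() and itemToString_step() do recursively in the snippet above.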

Author comment

Began life as a copy of #1028167


Travelled to 8 computer(s): bhatertpkbcr, ekrmjmnbrukm, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments.

Snippet ID: #1028186
Snippet name: LineCompCompressor, faster version [LIVE]
Eternal ID of this version: #1028186/134
Text MD5: e971d8644645f8667e3bfb0b5f47911d
Transpilation MD5: 101663b9e3006d0041f4bfa5103fde92
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-12-13 19:22:27
Source code size: 9242 bytes / 282 lines
Pitched / IR pitched: No / No
Views / Downloads: 586 / 1506
Version history: 133 change(s)