Uses 11335K of libraries. Click here for Pure Java version (9174L/60K).
1 | // This is using the implementation that seems to work fastest |
2 | // in reality, although it probably is O(n log n) due to the use |
3 | // of TreeMaps. |
4 | // |
5 | // The theoretically best implementation is O(n) (replacing the |
6 | // TreeMap with a linked list). |
7 | |
8 | !include once #1027304 // Eclipse Collections |
9 | |
10 | import org.eclipse.collections.impl.map.mutable.primitive.*; |
11 | |
// Grammar-based compressor in the Re-Pair / byte-pair-encoding family:
// repeatedly replaces the currently most frequent adjacent pair of
// symbols with a new pair symbol — until no pair occurs twice, or
// (with fullCompression) until every input is a single symbol.
// Symbols are whole text lines, or hex-coded characters in byteMode.
// NOTE(review): this is JavaX (a Java dialect), not plain Java; it will
// not compile with javac as-is.
12 | final sclass LineCompCompressor { |
  // Debug brake: if > 0, fail() after this many compression rounds.
13 |   int safety = 0; |
14 |  |
  // JavaX text macro: every "Encodings" below reads as Map<S, L<Int>>
  // (version ID -> list of symbol indices into 'chunks').
15 |   replace Encodings with Map<S, L<Int>>. |
16 |  |
  // A grammar symbol, referenced by its index into 'chunks'.
17 |   abstract sclass Chunk { |
18 |     abstract S text(L<Chunk> chunks); |
    // bare "null;" presumably expands to "ret null;" (JavaX shorthand);
    // only CPair overrides this.
19 |     IntPair getPair() { null; } |
20 |   } |
21 |  |
  // Concatenation of two symbols (indices i1, i2 into 'chunks').
22 |   srecord CPair(int i1, int i2) > Chunk { |
23 |     CPair(IntPair p) { i1 = p.a; i2 = p.b; } |
24 |  |
    // Expansion = expansion of i1 followed by expansion of i2,
    // joined as lines (right-trimmed; see linesLL_rtrim).
25 |     S text(L<Chunk> chunks) { |
26 |       ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks)); |
27 |     } |
28 |  |
29 |     IntPair getPair() { |
30 |       ret IntPair(i1, i2); |
31 |     } |
32 |   } |
33 |  |
  // A literal symbol: one line of text (or one hex-coded character in byteMode).
34 |   srecord CPrim(S s) > Chunk { |
35 |     S text(L<Chunk> chunks) { ret s; } |
36 |   } |
37 |  |
38 |   bool verbose = false, verboseCompressionSteps = false; |
39 |   bool verboseStats; |
40 |   bool sortLines = true; |
41 |   bool verify = true; |
42 |   bool byteMode; // compress bytes instead of lines |
43 |   bool caseInsensitive; // for line mode |
44 |   bool toUpper; // for byte mode: convert all characters to upper case |
45 |   bool recordPairLengths; // for "balanced" compression |
46 |   bool balancing; // try to lower tree heights by compressing "shorter" pairs first |
47 |   bool fullCompression; // keep compressing until every file is a single symbol |
48 |  |
  // Maintains the per-version symbol chains and pair counts used by the
  // compression loop below (external class; see compressChains()).
49 |   new LineComp_PairIndex pairIndex; |
  // version ID -> lines of that version; kept around for verification
  // (nulled in run() when verify is off).
50 |   Map<S, LS> textIDToLines; |
51 |   LS allUniqueLines; |
  // The grammar: literal chunks first (primChunks of them), pair chunks after.
52 |   new L<Chunk> chunks; |
53 |   int primChunks; |
  // line text -> index of its CPrim in 'chunks'
54 |   Map<S, Int> lineIndex; |
55 |   Encodings simpleEncodings; // temporary |
56 |   Encodings finalEncodings; |
  // packed (i1,i2) long -> existing CPair index, so pairs are deduplicated
57 |   new LongIntHashMap existingPairsIndex; // formerly Map<IntPair, Int> |
58 |   int round, printStatsEvery = /*10000*/0, printStatsInFirstRounds = 10; |
59 |   int printStatsEveryNSeconds = /*10*/60; // 60 for slower computers |
60 |  |
61 |   // initializing fullIndex = new Map will make things slow and use a lot of memory |
62 |   // and I thought there might be a case where it improves the compression |
63 |   // - but maybe there isn't? |
  // (expanded text -> symbol index; optional extra dedup, normally left null)
64 |   Map<S, Int> fullIndex; |
  // expanded length per chunk index (recorded in makeCPair); indices not
  // recorded count as 1 (literals) — used by the balancing comparator
65 |   SimpleIntList pairLengths; |
66 |  |
  // JavaX constructors
67 |   *() {} |
68 |   *(SS texts) { loadTexts(texts); } |
69 |  |
70 |   // key = version ID, values = text |
71 |   void loadTexts(SS texts) { |
72 |     textIDToLines = mapValuesToLinkedHashMap myToLines(texts); |
73 |   } |
74 |  |
  // text -> symbol lines: one hex code per character in byteMode
  // (upper-cased first if toUpper), otherwise the raw lines.
75 |   LS myToLines(S s) { |
76 |     LS l = byteMode |
77 |       ? lambdaMap charToHex(characters(toUpper ? upper(s) : s)) |
78 |       : toLines_nOnly_reversible(s); |
79 |     ret l; |
80 |   } |
81 |  |
  // inverse of myToLines
82 |   S myFromLines(LS l) { |
83 |     ret byteMode |
84 |       ? join(mapI hexToChar(l)) // TODO: optimize even more |
85 |       : fromLines_rtrim(l); |
86 |   } |
87 |  |
  // Main entry point: index the unique lines, build the simple
  // (literal-only) encodings, then compress pairs into finalEncodings.
88 |   run { |
89 |     if (balancing) { |
90 |       set recordPairLengths; |
        // prefer pairs with smaller expanded length, to keep trees flat
91 |       pairIndex.pairComparatorForBalancing = (p1, p2) -> { |
92 |         int l1 = pairLength(p1), l2 = pairLength(p2); |
93 |         if (l1 != l2) ret l1-l2; |
94 |         // return arbitrary deterministic order, but not 0 |
95 |         if (p1.a != p2.a) ret p1.a-p2.a; |
96 |         ret p1.b-p2.b; |
97 |       }; |
98 |     } |
99 |  |
100 |     if (recordPairLengths) pairLengths = new SimpleIntList; |
101 |     LS allLines = concatLists(values(textIDToLines)); |
102 |     if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines)); |
103 |     allUniqueLines = uniquify(allLines); |
104 |     if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line")); |
105 |     allLines = null; // allow me to forget |
106 |     if (sortLines) sortInPlace(allUniqueLines); |
107 |     if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines)); |
    // literals occupy chunk indices [0, primChunks)
108 |     for (S line : allUniqueLines) |
109 |       chunks.add(new CPrim(line)); |
110 |     primChunks = l(chunks); |
111 |     LS _lines = collect s(chunks); |
112 |     lineIndex = caseInsensitive ? ciListIndex(_lines) : listIndex(_lines); |
113 |     _lines = null; |
114 |  |
115 |     // simple encoding (only direct line references) |
116 |     simpleEncodings = mapValues(textIDToLines, |
117 |       (IF1<LS, L<Int>>) (lines -> wrapIntArrayAsImmutableList(mapToIntArray(lines, line -> lineIndex.get(line))))); |
118 |     //printAndCheckEncodings(simpleEncodings); |
119 |     if (verboseCompressionSteps) print("Have simple encodings"); |
120 |     if (!verify) textIDToLines = null; |
121 |  |
122 |     finalEncodings = compressPairs(); |
123 |     if (verbose || verify) printAndCheckEncodings(finalEncodings); |
124 |   } |
125 |  |
  // Writes the compressed form to f; with verify on, round-trips the
  // file through LineCompReader to prove it decompresses correctly.
126 |   void saveAsTextFile(File f) { |
127 |     S out = exportEncoding(finalEncodings); |
128 |     saveTextFile(f, out); |
129 |  |
130 |     if (verify) checkDecompression(f, textIDToLines); |
131 |   } |
132 |  |
  // Re-reads 'file' and asserts every version decompresses to its original text.
133 |   void checkDecompression(File file, Map<S, LS> textIDToLines) { |
134 |     temp BufferedReader reader = bufferedUtf8Reader(file); |
135 |     LineCompReader lcr = new(reader); |
136 |     assertEquals(keysList(textIDToLines), asList(lcr.versions())); |
137 |     for (S version : keys(textIDToLines)) |
138 |       assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version))); |
139 |     if (verbose) print("Decompression OK for " + nVersions(textIDToLines)); |
140 |   } |
141 |  |
142 |   S asText() { ret exportEncoding(finalEncodings); } |
143 |  |
  // Output format: header "LINECOMP <n>" (or "BYTECOMP <n>") where n is
  // the number of literal chunks; then one line per chunk, in index order
  // (pairs as "i1 i2", literals verbatim); then one "<versionID>=<indices>"
  // line per version.
144 |   S exportEncoding(Encodings encodings) { |
145 |     new LS buf; |
146 |     buf.add((byteMode ? "BYTECOMP " : "LINECOMP ") + primChunks); // magic signature |
147 |     for (Chunk c : chunks) { |
148 |       if (c cast CPair) |
149 |         buf.add(c.i1 + " " + c.i2); |
150 |       else |
151 |         buf.add(((CPrim) c).s); |
152 |     } |
153 |     for (S id, L<Int> l : encodings) |
154 |       buf.add(id + "=" + joinWithSpace(l)); |
155 |     ret lines_rtrim(buf); |
156 |   } |
157 |  |
  // Feeds all simple encodings into the pair index as chains, then
  // runs the compression loop over them.
158 |   Encodings compressPairs() { |
159 |     pairIndex.verbose = verboseCompressionSteps || verbose; |
160 |     pairIndex.init(); |
161 |     for ping (S version, L<Int> l : simpleEncodings) |
162 |       pairIndex.newChain(version, l); |
163 |     pairIndex.haveChains(); |
164 |     simpleEncodings = null; // forget |
165 |  |
166 |     ret compressChains(keys(pairIndex.chains)); |
167 |   } |
168 |  |
  // Core loop: pick the most popular pair (any pair in fullCompression
  // mode, otherwise only pairs occurring more than once), mint a symbol
  // for it, and substitute it everywhere; repeat until none is left.
169 |   Encodings compressChains(Cl<S> files) { |
170 |     NotTooOften nto = new(printStatsEveryNSeconds*1000); |
171 |     IntPair toCompress; |
172 |     // Compress only most popular pair in one step |
173 |     while ping ((toCompress = fullCompression |
174 |       ? pairIndex.mostPopularPair() |
175 |       : pairIndex.mostPopularDuplicate()) != null) { |
176 |       if (verbose) { |
177 |         print("Chains: " + mapValuesToList(pairIndex.chains, c -> c.toList())); |
178 |         //print("Highest bucket (" + pairIndex.highestBucketNr() + "): " + pairIndex.mostPopularDuplicates()); |
179 |         print("Buckets: " + pairIndex.byCount); |
180 |       } |
181 |  |
182 |       if (safety > 0 && --safety <= 0) fail("safety"); |
183 |       ++round; |
      // stats are rate-limited: every printStatsEvery rounds, the first
      // few rounds, or at most once per printStatsEveryNSeconds (nto!)
184 |       if (verboseStats) |
185 |         if (printStatsEvery > 0 && (round % printStatsEvery) == 0 |
186 |           || round <= printStatsInFirstRounds |
187 |           || nto!) |
188 |           printStats(); |
189 |       int count = pairIndex.getCount(toCompress); |
190 |       Int idx = makeCPair(toCompress); |
191 |       int dups = pairIndex.numberOfDuplicates(); |
192 |       if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining"); |
193 |       pairIndex.replacePair(toCompress.a, toCompress.b, idx); |
194 |     } |
195 |     if (verboseStats) printStats(); |
196 |  |
197 |     // reconvert to normal list |
198 |     ret mapToValues(f -> pairIndex.chains.get(f).toList(), files); |
199 |   } |
200 |  |
  // Returns the chunk index for pair p, reusing an existing CPair when
  // possible: first via fullIndex (expanded-text lookup, if enabled),
  // then via existingPairsIndex (the two indices packed into one long).
  // Records the new pair's expanded length when recordPairLengths is set.
201 |   // TODO: check if pair is already there |
202 |   Int makeCPair(IntPair p) { |
203 |     long l = intPairToLong(p); |
204 |     S text = null; |
205 |     if (fullIndex != null) { |
206 |       text = itemToString(p.a) + itemToString(p.b); |
207 |       Int idx = fullIndex.get(text); |
208 |       if (idx != null) ret idx; |
209 |     } |
210 |     int idx = existingPairsIndex.getIfAbsent(l, -1); |
211 |     if (idx < 0) { |
212 |       idx = addAndReturnIndex(chunks, new CPair(p)); |
213 |       if (recordPairLengths) listPut(pairLengths, idx, itemLength(p.a) + itemLength(p.b), 1); |
214 |       existingPairsIndex.put(l, idx); |
215 |       if (fullIndex != null) |
216 |         fullIndex.put(text, idx); |
217 |     } |
218 |     ret idx; |
219 |   } |
220 |  |
  // Asserts that decoding each encoding reproduces the original text
  // (case-insensitively when caseInsensitive is set).
221 |   void printAndCheckEncodings(Encodings encodings) { |
222 |     for (S id, L<Int> encoded : encodings) { |
223 |       if (verbose) print(id + ": " + joinWithSpace(encoded)); |
224 |       S x = myFromLines(textIDToLines.get(id)), y = decode(encoded); |
225 |       if (caseInsensitive) |
226 |         assertEqualsIC(x, y); |
227 |       else |
228 |         assertEquals_quote(x, y); |
229 |     } |
230 |   } |
231 |  |
  // Expands an encoding back to full text.
232 |   S decode(L<Int> encoded) { |
233 |     LS l = lambdaMap chunkText(encoded); |
    // byteMode: expansions are multi-line; re-split so each hex code is
    // one list element again — presumably what lines(lines(l)) does; TODO confirm
234 |     if (byteMode) l = lines(lines(l)); |
235 |     ret myFromLines(l); |
236 |   } |
237 |  |
238 |   // for non-byte mode |
239 |   S chunkText(int idx) { |
240 |     ret chunks.get(idx).text(chunks); |
241 |   } |
242 |  |
243 |   void printStats() { |
244 |     print("Round: " + round); |
245 |     pairIndex.printStats(); |
246 |   } |
247 |  |
  // Expands symbol idx to its full text via itemToString_step.
  // (Despite the note above, the step function also handles line mode
  // by appending "\n" after each literal.)
248 |   // byteMode only |
249 |   S itemToString(int idx) { |
250 |     new StringBuilder buf; |
251 |     itemToString_step(buf, idx); |
252 |     ret str(buf); |
253 |   } |
254 |  |
  // Recursive expansion: pairs expand left then right; literals append
  // their character (byteMode) or their line plus "\n".
255 |   void itemToString_step(StringBuilder buf, int idx) { |
256 |     Chunk c = chunks.get(idx); |
257 |     if (c cast CPair) { |
258 |       itemToString_step(buf, c.i1); |
259 |       itemToString_step(buf, c.i2); |
260 |     } else { |
261 |       S s = ((CPrim) c).s; |
262 |       buf.append(byteMode ? charFromHex(s) : s + "\n"); |
263 |     } |
264 |   } |
265 |  |
266 |   int nLiterals() { ret primChunks; } |
267 |   int nItems() { ret l(chunks); } |
268 |  |
  // null for literal chunks (see Chunk.getPair)
269 |   IntPair getPair(int idx) { |
270 |     ret chunks.get(idx).getPair(); |
271 |   } |
272 |  |
  // Expanded length of a symbol; indices never recorded in pairLengths
  // (i.e. literals) count as 1.
273 |   int itemLength(int idx) { |
274 |     ret idx >= l(pairLengths) ? 1 : pairLengths.get(idx); |
275 |   } |
276 |  |
  // Combined expanded length of a candidate pair (0 for null).
277 |   int pairLength(IntPair p) { |
278 |     ret p == null ? 0 : itemLength(p.a) + itemLength(p.b); |
279 |   } |
280 |  |
  // All literal texts, in chunk-index order.
281 |   LS literals() { ret lmap itemToString(countIterator(primChunks)); } |
282 | } |
Began life as a copy of #1028167
download show line numbers debug dex old transpilations
Travelled to 8 computer(s): bhatertpkbcr, ekrmjmnbrukm, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028186 |
Snippet name: | LineCompCompressor, faster version [LIVE] |
Eternal ID of this version: | #1028186/134 |
Text MD5: | e971d8644645f8667e3bfb0b5f47911d |
Transpilation MD5: | 101663b9e3006d0041f4bfa5103fde92 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-12-13 19:22:27 |
Source code size: | 9242 bytes / 282 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 649 / 1594 |
Version history: | 133 change(s) |
Referenced in: | [show references] |