Libraryless. Click here for Pure Java version (7033L/45K).
1 | // This is using the implementation that seems to work fastest |
2 | // in reality, although it probably is O(n log n) due to the use |
3 | // of TreeMaps. |
4 | // |
5 | // The theoretically best implementation is O(n) due to replacing the |
6 | // TreeMaps with a linked list. |
7 | // I think it works, see LineComp_PairIndex_v2, but #1028231 is fuxx0red. |
8 | |
9 | sclass LineCompCompressor { |
10 | int safety = 0; |
11 | |
12 | replace Encodings with Map<S, L<Int>>. |
13 | |
14 | abstract sclass Chunk { |
15 | abstract S text(L<Chunk> chunks); |
16 | } |
17 | |
18 | srecord CPair(int i1, int i2) > Chunk { |
19 | CPair(IntPair p) { i1 = p.a; i2 = p.b; } |
20 | |
21 | S text(L<Chunk> chunks) { |
22 | ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks)); |
23 | } |
24 | } |
25 | |
26 | srecord CPrim(S s) > Chunk { |
27 | S text(L<Chunk> chunks) { ret s; } |
28 | } |
29 | |
30 | bool verbose = false, verboseCompressionSteps = false; |
31 | bool sortLines = true; |
32 | bool verify = true; |
33 | |
34 | Map<S, LS> textIDToLines; |
35 | LS allUniqueLines; |
36 | new L<Chunk> chunks; |
37 | int primChunks; |
38 | Map<S, Int> lineIndex; |
39 | Encodings finalEncodings; |
40 | new Map<IntPair, Int> existingPairsIndex; |
41 | |
42 | // key = version ID, values = text |
43 | *(SS texts) { |
44 | textIDToLines = mapValuesToLinkedHashMap myToLines(texts); |
45 | } |
46 | |
47 | LS myToLines(S s) { ret toLines_nOnly_reversible(s); } |
48 | S myFromLines(LS l) { ret fromLines_rtrim(l); } |
49 | |
50 | run { |
51 | LS allLines = concatLists(values(textIDToLines)); |
52 | if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines)); |
53 | allUniqueLines = uniquify(allLines); |
54 | if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line")); |
55 | allLines = null; // allow me to forget |
56 | if (sortLines) sortInPlace(allUniqueLines); |
57 | if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines)); |
58 | for (S line : allUniqueLines) |
59 | chunks.add(new CPrim(line)); |
60 | primChunks = l(chunks); |
61 | lineIndex = listIndex(collect s(chunks)); |
62 | |
63 | // simple encoding (only direct line references) |
64 | Encodings simpleEncodings = mapValues(textIDToLines, |
65 | (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line)))); |
66 | //printAndCheckEncodings(simpleEncodings); |
67 | if (verboseCompressionSteps) print("Have simple encodings"); |
68 | |
69 | finalEncodings = compressPairs_v3(simpleEncodings); |
70 | if (verbose || verify) printAndCheckEncodings(finalEncodings); |
71 | } |
72 | |
73 | void saveAsTextFile(File f) { |
74 | S out = exportEncoding(finalEncodings); |
75 | saveTextFile(f, out); |
76 | |
77 | if (verify) checkDecompression(f, textIDToLines); |
78 | } |
79 | |
80 | void checkDecompression(File file, Map<S, LS> textIDToLines) { |
81 | temp BufferedReader reader = bufferedUtf8Reader(file); |
82 | LineCompReader lcr = new(reader); |
83 | assertEquals(keysList(textIDToLines), asList(lcr.versions())); |
84 | for (S version : keys(textIDToLines)) |
85 | assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version))); |
86 | if (verbose) print("Decompression OK for " + nVersions(textIDToLines)); |
87 | } |
88 | |
89 | S asText() { ret exportEncoding(finalEncodings); } |
90 | |
91 | S exportEncoding(Encodings encodings) { |
92 | new LS buf; |
93 | buf.add("LINECOMP " + primChunks); // magic signature |
94 | for (Chunk c : chunks) { |
95 | if (c cast CPair) |
96 | buf.add(c.i1 + " " + c.i2); |
97 | else |
98 | buf.add(((CPrim) c).s); |
99 | } |
100 | for (S id, L<Int> l : encodings) |
101 | buf.add(id + "=" + joinWithSpace(l)); |
102 | ret lines_rtrim(buf); |
103 | } |
104 | |
105 | Encodings compressPairs_v3(Encodings encodings) { |
106 | new LineComp_PairIndex pairIndex; |
107 | pairIndex.verbose = verboseCompressionSteps; |
108 | pairIndex.init(); |
109 | for (S version, L<Int> l : encodings) |
110 | pairIndex.newChain(version, l); |
111 | |
112 | IntPair toCompress; |
113 | // Compress only most popular pair in one step |
114 | while ping ((toCompress = pairIndex.mostPopularDuplicate()) != null) { |
115 | if (safety > 0 && --safety <= 0) fail("safety"); |
116 | int count = pairIndex.getCount(toCompress); |
117 | Int idx = makeCPair(toCompress); |
118 | int dups = pairIndex.numberOfDuplicates(); |
119 | if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining"); |
120 | pairIndex.replacePair(toCompress.a, toCompress.b, idx); |
121 | } |
122 | |
123 | // reconvert to normal list |
124 | ret mapValues(ch -> ch.toList(), pairIndex.chains); |
125 | } |
126 | |
127 | // TODO: check if pair is already there |
128 | Int makeCPair(IntPair p) { |
129 | Int idx = existingPairsIndex.get(p); |
130 | if (idx == null) |
131 | existingPairsIndex.put(p, idx = addAndReturnIndex(chunks, new CPair(p))); |
132 | ret idx; |
133 | } |
134 | |
135 | void printAndCheckEncodings(Encodings encodings) { |
136 | for (S id, L<Int> encoded : encodings) { |
137 | if (verbose) print(id + ": " + joinWithSpace(encoded)); |
138 | assertEquals(myFromLines(textIDToLines.get(id)), decode(encoded)); |
139 | } |
140 | } |
141 | |
142 | S decode(L<Int> encoded) { |
143 | ret myFromLines(lambdaMap chunkText(encoded)); |
144 | } |
145 | |
146 | S chunkText(int idx) { |
147 | ret chunks.get(idx).text(chunks); |
148 | } |
149 | } |
Began life as a copy of #1028186
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028503 |
Snippet name: | LineCompCompressor [backup before eclipse collections] |
Eternal ID of this version: | #1028503/1 |
Text MD5: | b08de58bb27524ad814e02f37aed6908 |
Transpilation MD5: | 7668b40e654b1fc3ec04a78ba30a0396 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-06-23 11:07:45 |
Source code size: | 5036 bytes / 149 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 242 / 333 |
Referenced in: | [show references] |