1 | !include once #1028249 // LineComp_PairIndex_v2 [dev.] |
2 | |
3 | sclass LineCompCompressor {
|
4 | int safety = 0; |
5 | |
6 | replace Encodings with Map<S, L<Int>>. |
7 | replace LineComp_PairIndex with LineComp_PairIndex_v2. |
8 | |
9 | abstract sclass Chunk {
|
10 | abstract S text(L<Chunk> chunks); |
11 | } |
12 | |
13 | srecord CPair(int i1, int i2) > Chunk {
|
14 | CPair(IntPair p) { i1 = p.a; i2 = p.b; }
|
15 | |
16 | S text(L<Chunk> chunks) {
|
17 | ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks)); |
18 | } |
19 | } |
20 | |
21 | srecord CPrim(S s) > Chunk {
|
22 | S text(L<Chunk> chunks) { ret s; }
|
23 | } |
24 | |
25 | bool verbose = false, verboseCompressionSteps = false; |
26 | bool sortLines = true; |
27 | bool verify = true; |
28 | |
29 | Map<S, LS> textIDToLines; |
30 | LS allUniqueLines; |
31 | new L<Chunk> chunks; |
32 | int primChunks; |
33 | Map<S, Int> lineIndex; |
34 | Encodings finalEncodings; |
35 | new Map<IntPair, Int> existingPairsIndex; |
36 | |
37 | // key = version ID, values = text |
38 | *(SS texts) {
|
39 | textIDToLines = mapValuesToLinkedHashMap myToLines(texts); |
40 | } |
41 | |
42 | LS myToLines(S s) { ret toLines_nOnly_reversible(s); }
|
43 | S myFromLines(LS l) { ret fromLines_rtrim(l); }
|
44 | |
45 | run {
|
46 | LS allLines = concatLists(values(textIDToLines)); |
47 | if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
|
48 | allUniqueLines = uniquify(allLines); |
49 | if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
|
50 | allLines = null; // allow me to forget |
51 | if (sortLines) sortInPlace(allUniqueLines); |
52 | if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));
|
53 | for (S line : allUniqueLines) |
54 | chunks.add(new CPrim(line)); |
55 | primChunks = l(chunks); |
56 | lineIndex = listIndex(collect s(chunks)); |
57 | |
58 | // simple encoding (only direct line references) |
59 | Encodings simpleEncodings = mapValues(textIDToLines, |
60 | (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line)))); |
61 | //printAndCheckEncodings(simpleEncodings); |
62 | if (verboseCompressionSteps) print("Have simple encodings");
|
63 | |
64 | finalEncodings = compressPairs_v3(simpleEncodings); |
65 | if (verbose || verify) printAndCheckEncodings(finalEncodings); |
66 | } |
67 | |
68 | void saveAsTextFile(File f) {
|
69 | S out = exportEncoding(finalEncodings); |
70 | saveTextFile(f, out); |
71 | |
72 | if (verify) checkDecompression(f, textIDToLines); |
73 | } |
74 | |
75 | void checkDecompression(File file, Map<S, LS> textIDToLines) {
|
76 | print("Verifying archive");
|
77 | temp BufferedReader reader = bufferedUtf8Reader(file); |
78 | LineCompReader lcr = new(reader); |
79 | assertEquals(keysList(textIDToLines), asList(lcr.versions())); |
80 | for (S version : keys(textIDToLines)) |
81 | assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version))); |
82 | if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
|
83 | } |
84 | |
85 | S asText() { ret exportEncoding(finalEncodings); }
|
86 | |
87 | S exportEncoding(Encodings encodings) {
|
88 | new LS buf; |
89 | buf.add("LINECOMP " + primChunks); // magic signature
|
90 | for (Chunk c : chunks) {
|
91 | if (c cast CPair) |
92 | buf.add(c.i1 + " " + c.i2); |
93 | else |
94 | buf.add(((CPrim) c).s); |
95 | } |
96 | for (S id, L<Int> l : encodings) |
97 | buf.add(id + "=" + joinWithSpace(l)); |
98 | ret lines_rtrim(buf); |
99 | } |
100 | |
101 | // new fastest version of magic compression function |
102 | // (O(n log n) or even better?) |
103 | |
104 | Encodings compressPairs_v3(Encodings encodings) {
|
105 | new LineComp_PairIndex pairIndex; |
106 | pairIndex.verbose = verboseCompressionSteps; |
107 | for (S version, L<Int> l : encodings) |
108 | pairIndex.newChain(version, l); |
109 | |
110 | IntPair toCompress; |
111 | // Compress only most popular pair in one step |
112 | while ping ((toCompress = pairIndex.mostPopularDuplicate()) != null) {
|
113 | if (safety > 0 && --safety <= 0) fail("safety");
|
114 | //pairIndex.fullCheck(); |
115 | int count = pairIndex.getCount(toCompress); |
116 | Int idx = makeCPair(toCompress); |
117 | int dups = pairIndex.numberOfDuplicates(); |
118 | if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
|
119 | pairIndex.replacePair(toCompress.a, toCompress.b, idx); |
120 | } |
121 | |
122 | // reconvert to normal list |
123 | ret mapValues(ch -> ch.toList(), pairIndex.chains); |
124 | } |
125 | |
126 | // TODO: check if pair is already there |
127 | Int makeCPair(IntPair p) {
|
128 | Int idx = existingPairsIndex.get(p); |
129 | if (idx == null) |
130 | existingPairsIndex.put(p, idx = addAndReturnIndex(chunks, new CPair(p))); |
131 | ret idx; |
132 | } |
133 | |
134 | void printAndCheckEncodings(Encodings encodings) {
|
135 | for (S id, L<Int> encoded : encodings) {
|
136 | if (verbose) print(id + ": " + joinWithSpace(encoded)); |
137 | assertEquals(myFromLines(textIDToLines.get(id)), decode(encoded)); |
138 | } |
139 | } |
140 | |
141 | S decode(L<Int> encoded) {
|
142 | ret myFromLines(lambdaMap chunkText(encoded)); |
143 | } |
144 | |
145 | S chunkText(int idx) {
|
146 | ret chunks.get(idx).text(chunks); |
147 | } |
148 | } |
Began life as a copy of #1028186
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
| Snippet ID: | #1028252 |
| Snippet name: | LineCompCompressor 0.3 [dev.] |
| Eternal ID of this version: | #1028252/4 |
| Text MD5: | 2905f33b9a7e80f9c5e344bed46ca174 |
| Author: | stefan |
| Category: | javax |
| Type: | JavaX fragment (include) |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2020-05-28 14:42:17 |
| Source code size: | 4959 bytes / 148 lines |
| Pitched / IR pitched: | No / No |
| Views / Downloads: | 431 / 506 |
| Version history: | 3 change(s) |
| Referenced in: | [show references] |