Libraryless. Click here for Pure Java version (6705L/42K).
1 | sclass LineCompCompressor { |
2 | replace Encodings with Map<S, L<Int>>. |
3 | |
4 | abstract sclass Chunk { |
5 | abstract S text(L<Chunk> chunks); |
6 | } |
7 | |
8 | srecord CPair(int i1, int i2) > Chunk { |
9 | CPair(IntPair p) { i1 = p.a; i2 = p.b; } |
10 | |
11 | S text(L<Chunk> chunks) { |
12 | ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks)); |
13 | } |
14 | } |
15 | |
16 | srecord CPrim(S s) > Chunk { |
17 | S text(L<Chunk> chunks) { ret s; } |
18 | } |
19 | |
20 | bool verbose = false; |
21 | bool sortLines = true; |
22 | bool verify = true; |
23 | |
24 | Map<S, LS> textIDToLines; |
25 | LS allUniqueLines; |
26 | new L<Chunk> chunks; |
27 | int primChunks; |
28 | Map<S, Int> lineIndex; |
29 | new Map<PairS, Int> linePairIndex; |
30 | Encodings finalEncodings; |
31 | |
32 | // key = version ID, values = text |
33 | *(SS texts) { |
34 | textIDToLines = mapValuesToLinkedHashMap lines(texts); |
35 | } |
36 | |
37 | run { |
38 | allUniqueLines = uniquify(concatLists(values(textIDToLines))); |
39 | if (sortLines) sortInPlace(allUniqueLines); |
40 | for (S line : allUniqueLines) |
41 | chunks.add(new CPrim(line)); |
42 | primChunks = l(chunks); |
43 | lineIndex = listIndex(collect s(chunks)); |
44 | |
45 | // simple encoding (only direct line references) |
46 | Encodings simpleEncodings = mapValues(textIDToLines, |
47 | (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line)))); |
48 | //printAndCheckEncodings(simpleEncodings); |
49 | |
50 | finalEncodings = repeatUntilSame compressPairs(simpleEncodings); |
51 | if (verbose || verify) printAndCheckEncodings(finalEncodings); |
52 | } |
53 | |
54 | void saveAsTextFile(File f) { |
55 | S out = exportEncoding(finalEncodings); |
56 | saveTextFile(f, out); |
57 | |
58 | if (verify) checkDecompression(f, textIDToLines); |
59 | } |
60 | |
61 | void checkDecompression(File file, Map<S, LS> textIDToLines) { |
62 | temp BufferedReader reader = bufferedUtf8Reader(file); |
63 | LineCompReader lcr = new(reader); |
64 | assertEquals(keysList(textIDToLines), asList(lcr.versions())); |
65 | for (S version : keys(textIDToLines)) |
66 | assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version))); |
67 | if (verbose) print("Decompression OK for " + nVersions(textIDToLines)); |
68 | } |
69 | |
70 | S asText() { ret exportEncoding(finalEncodings); } |
71 | |
72 | S exportEncoding(Encodings encodings) { |
73 | new LS buf; |
74 | buf.add("LINECOMP " + primChunks); // magic signature |
75 | for (Chunk c : chunks) { |
76 | if (c cast CPair) |
77 | buf.add(c.i1 + " " + c.i2); |
78 | else |
79 | buf.add(((CPrim) c).s); |
80 | } |
81 | for (S id, L<Int> l : encodings) |
82 | buf.add(id + "=" + joinWithSpace(l)); |
83 | ret lines_rtrim(buf); |
84 | } |
85 | |
86 | Encodings compressPairs(Encodings encodings) { |
87 | new MultiSet<IntPair> pairCounts; |
88 | |
89 | for (L<Int> l : values(encodings)) { |
90 | IntPair lastPair = null; |
91 | for (IntPair pair : overlappingIntPairs(l)) { |
92 | if (neq(pair, lastPair)) { |
93 | lastPair = pair; |
94 | pairCounts.add(pair); |
95 | } |
96 | } |
97 | } |
98 | |
99 | //print("Pair counts: " + pairCounts); |
100 | IntPair toCompress = msMostPopularDuplicate(pairCounts); |
101 | |
102 | // Compress only most popular pair |
103 | if (toCompress == null) ret encodings; // Nothing to do |
104 | int idx = makeCPair(toCompress); |
105 | print("Made pair: " + toCompress + " -> " + idx + ", " + (msNumberOfDuplicates(pairCounts)-1) + " remaining"); |
106 | ret mapValues(encodings, (IF1<L<Int>>) encoded -> |
107 | replaceSublist(encoded, ll(toCompress.a, toCompress.b), ll(idx))); |
108 | } |
109 | |
110 | int makeCPair(IntPair p) { |
111 | int idx = addAndReturnIndex(chunks, new CPair(p)); |
112 | ret idx; |
113 | } |
114 | |
115 | void printAndCheckEncodings(Encodings encodings) { |
116 | for (S id, L<Int> encoded : encodings) { |
117 | if (verbose) print(id + ": " + joinWithSpace(encoded)); |
118 | assertEquals(lines(textIDToLines.get(id)), decode(encoded)); |
119 | } |
120 | } |
121 | |
122 | S decode(L<Int> encoded) { |
123 | ret lines(lambdaMap chunkText(encoded)); |
124 | } |
125 | |
126 | S chunkText(int idx) { |
127 | ret chunks.get(idx).text(chunks); |
128 | } |
129 | } |
Began life as a copy of #1028186
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028192 |
Snippet name: | LineCompCompressor v1 (with slow pair making, old) |
Eternal ID of this version: | #1028192/7 |
Text MD5: | f8a03e59e758a9ec8f0dd0e399660ec4 |
Transpilation MD5: | 4e19cec1e47c585bb409647f3b0afd78 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-05-27 13:36:25 |
Source code size: | 3951 bytes / 129 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 264 / 388 |
Version history: | 6 change(s) |
Referenced in: | [show references] |