Uses 1489K of libraries. Click here for Pure Java version (9930L/52K).
1 | !7 |
2 | |
3 | cprint CompressorSpike { |
4 | switchable S snippetID = #1020763; /* ID of the snippet whose version history is loaded and compressed (passed to textChangesOfSnippet and loadSnippet in start-thread) */ |
5 | LinkedHashMap<S> versions; /* version ID -> text of that version, plus an entry under the key "latest"; built in start-thread whenever it is null */ |
6 | |
7 | start-thread { /* startup block; presumably run in its own thread by the module framework (TODO confirm) */ |
8 | print("Snippet ID: " + snippetID); |
9 | dm_onFieldChange snippetID(r { setField(versions := null); dm_reload() }); /* when snippetID changes: drop the loaded versions and reload the module */ |
10 | if (versions == null) { /* only build the version map when none is present */ |
11 | versions = mapToLinkedHashMap(reversed(textChangesOfSnippet(snippetID)), sv -> pair(str(sv.versionID), sv.previousValue)); /* key: version ID as a string, value: that change's previousValue; the change list is walked in reversed order */ |
12 | versions.put("latest", loadSnippet(snippetID)); /* the current snippet text is stored under the key "latest" */ |
13 | versions = mapValuesToLinkedHashMap toLinesAndBack(versions); // canonicalize line breaks |
14 | change(); /* presumably flags the module state as changed so it gets persisted (TODO confirm) */ |
15 | } |
16 | print("Have " + nVersions(versions) + " with a total of " + nLines(totalLineCount(values(versions)))); |
17 | |
18 | new Compressor().run(); /* runs the compression, export and verification steps defined in Compressor.run */ |
19 | } |
20 | |
21 | abstract sclass Chunk { /* base type for the entries of the chunk list */ |
22 | abstract S text(L<Chunk> chunks); /* returns the text this chunk stands for; the full chunk list is passed in so a chunk can resolve references to other chunks by index */ |
23 | } |
24 | |
25 | srecord CPair(int i1, int i2) > Chunk { /* chunk that refers to two other chunks by their indices (i1, i2) in the chunk list */ |
26 | CPair(Pair<Int> p) { i1 = p.a; i2 = p.b; } /* convenience constructor taking both indices as one pair */ |
27 | |
28 | S text(L<Chunk> chunks) { /* resolves both referenced chunks through their own text() (which may recurse into further pairs) and combines the two results with linesLL_rtrim */ |
29 | ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks)); |
30 | } |
31 | } |
32 | |
33 | srecord CPrim(S s) > Chunk { /* primitive chunk holding a literal string; Compressor.run creates one per unique line */ |
34 | S text(L<Chunk> chunks) { ret s; } /* returns the stored string; the chunk list is not consulted */ |
35 | } |
36 | |
37 | class Compressor { |
38 | replace Encodings with Map<S, L<Int>>. |
39 | |
40 | bool sortLines = true; /* when set, run() sorts the unique lines in place before turning them into chunks */ |
41 | Map<S, LS> textIDToLines = mapValuesToLinkedHashMap lines(versions); /* the entries of versions with each text passed through lines(); presumably the list of that text's lines (TODO confirm) */ |
42 | LS allUniqueLines; /* set in run(): uniquify() over the concatenated line lists of all versions */ |
43 | new L<Chunk> chunks; /* chunk table: run() adds the CPrim entries first, makeCPair() appends CPair entries later */ |
44 | int primChunks; /* size of chunks right after the CPrim entries were added; written into the export header line */ |
45 | Map<S, Int> lineIndex; /* built in run() by listIndex over the CPrim strings; used there to look up the chunk index of a line */ |
46 | new Map<PairS, Int> linePairIndex; /* NOTE(review): not referenced anywhere else in the visible code */ |
47 | Encodings finalEncodings; /* outcome of the compression loop in run(); Encodings is the shorthand declared above for Map<S, L<Int>>, used here as text ID -> list of chunk indices */ |
48 | |
49 | run { /* whole pipeline: build the chunk table, encode every version, compress pairs until nothing changes, export to a file, write .zip/.tgz for comparison, verify by reading the file back */ |
50 | allUniqueLines = uniquify(concatLists(values(textIDToLines))); |
51 | if (sortLines) sortInPlace(allUniqueLines); |
52 | for (S line : allUniqueLines) |
53 | chunks.add(new CPrim(line)); |
54 | primChunks = l(chunks); /* at this point chunks holds only CPrim entries */ |
55 | lineIndex = listIndex(collect s(chunks)); /* presumably maps each line string to its position in chunks (TODO confirm listIndex semantics) */ |
56 | |
57 | // simple encoding (only direct line references) |
58 | Encodings simpleEncodings = mapValues(textIDToLines, |
59 | (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line)))); |
60 | //printAndCheckEncodings(simpleEncodings); |
61 | |
62 | Encodings advancedEncodings = simpleEncodings; |
63 | while licensed { /* one compressPairs step per iteration */ |
64 | Encodings e = compressPairs(advancedEncodings); |
65 | if (e == advancedEncodings) break; /* compressPairs returns its argument itself when it finds nothing to compress, so reference equality signals the end */ |
66 | advancedEncodings = e; |
67 | } |
68 | |
69 | finalEncodings = advancedEncodings; |
70 | printAndCheckEncodings(finalEncodings); /* prints every encoding and asserts that it decodes back to the original text */ |
71 | |
72 | S out = exportEncoding(finalEncodings); |
73 | printWithPrecedingNL(out); |
74 | File file = saveTextFile_infoBox(javaxDataDir("Compressed Snippet Version History/versions-of-" + psI(snippetID) + ".linecomp"), out); |
75 | print(renderFileInfo(gzipFile(file))); /* prints file info for whatever gzipFile(file) returns; presumably a gzipped copy of the export (TODO confirm) */ |
76 | |
77 | // Make .zip and .tgz for comparison |
78 | File zipFile = replaceFileExtension(file, ".zip"); |
79 | { |
80 | temp ZipOutputStream zipOut = zipOutputStream(zipFile); /* presumably closed automatically when this inner block ends (TODO confirm "temp" semantics) */ |
81 | for (S id, text : versions) |
82 | zip_addTextFile(zipOut, id, text); |
83 | } |
84 | printFileInfo(zipFile); |
85 | |
86 | File tgzFile = replaceFileExtension(file, ".tgz"); |
87 | zip2tgz(zipFile, tgzFile); |
88 | printFileInfo(tgzFile); |
89 | |
90 | checkDecompression(file, textIDToLines); /* reads the .linecomp file back and compares it against the source lines */ |
91 | } |
92 | |
93 | void checkDecompression(File file, Map<S, LS> textIDToLines) { /* round-trip check: reads file through LineCompReader and asserts that it reproduces the given versions; the parameter shadows the field of the same name */ |
94 | temp BufferedReader reader = bufferedUtf8Reader(file); |
95 | LineCompReader lcr = new(reader); |
96 | assertEquals(keysList(textIDToLines), asList(lcr.versions())); /* the reader must report the same version IDs as the map's key list */ |
97 | for (S version : keys(textIDToLines)) |
98 | assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version))); /* each version's text must equal its line list joined by lines_rtrim */ |
99 | print("Decompression OK for " + nVersions(textIDToLines)); |
100 | } |
101 | |
102 | S exportEncoding(Encodings encodings) { /* serializes a header line, the chunk table and then the given encodings into one string (joined via lines_rtrim) */ |
103 | new LS buf; |
104 | buf.add("LINECOMP " + primChunks); // magic signature |
105 | for (Chunk c : chunks) { /* one output line per chunk, in chunk-list order */ |
106 | if (c cast CPair) |
107 | buf.add(c.i1 + " " + c.i2); /* pair chunk: its two indices separated by a space */ |
108 | else |
109 | buf.add(((CPrim) c).s); /* primitive chunk: the stored string itself */ |
110 | } |
111 | for (S id, L<Int> l : encodings) |
112 | buf.add(id + "=" + joinWithSpace(l)); /* one line per text ID: "id=" followed by its chunk indices joined by joinWithSpace */ |
113 | ret lines_rtrim(buf); |
114 | } |
115 | |
116 | Encodings compressPairs(Encodings encodings) { /* one compression step: counts adjacent index pairs over all encodings, turns the pair chosen by msMostPopularDuplicate into a new CPair chunk and substitutes it everywhere; returns the argument itself when no pair is chosen */ |
117 | new MultiSet<Pair<Int>> pairCounts; |
118 | |
119 | for (L<Int> l : values(encodings)) { |
120 | Pair<Int> lastPair = null; /* last pair that was counted within this encoding */ |
121 | for (Pair<Int> pair : overlappingPairs(l)) { |
122 | if (neq(pair, lastPair)) { /* a pair equal to the last counted one is not counted again; lastPair only changes when a pair is counted */ |
123 | lastPair = pair; |
124 | pairCounts.add(pair); |
125 | } |
126 | } |
127 | } |
128 | |
129 | //print("Pair counts: " + pairCounts); |
130 | Pair<Int> toCompress = msMostPopularDuplicate(pairCounts); /* presumably the most frequent pair among those counted more than once, or null if there is none (TODO confirm) */ |
131 | |
132 | // Compress only most popular pair |
133 | if (toCompress == null) ret encodings; // Nothing to do |
134 | int idx = makeCPair(toCompress); /* index of the newly appended CPair chunk */ |
135 | print("Made pair: " + toCompress + " -> " + idx + ", " + (msNumberOfDuplicates(pairCounts)-1) + " remaining"); |
136 | ret mapValues(encodings, (IF1<L<Int>>) encoded -> |
137 | replaceSublist(encoded, pairToList(toCompress), ll(idx))); /* in every encoding, the two-element sublist for the pair is replaced by the single new index */ |
138 | } |
139 | |
140 | int makeCPair(Pair<Int> p) { |
141 | /* Wraps p in a new CPair, appends it to the chunk table and hands back the index reported by addAndReturnIndex. */ |
142 | ret addAndReturnIndex(chunks, new CPair(p)); |
143 | } |
144 | |
145 | void printAndCheckEncodings(Encodings encodings) { /* prints each encoding as "id: indices" and asserts that decoding it yields the original text of that ID */ |
146 | for (S id, L<Int> encoded : encodings) { |
147 | print(id + ": " + joinWithSpace(encoded)); |
148 | assertEquals(lines(textIDToLines.get(id)), decode(encoded)); /* expected value: the stored line list of this ID passed through lines() */ |
149 | } |
150 | } |
151 | |
152 | S decode(L<Int> encoded) { /* turns a list of chunk indices back into text: maps every index through chunkText and passes the resulting list to lines() */ |
153 | ret lines(lambdaMap chunkText(encoded)); |
154 | } |
155 | |
156 | S chunkText(int idx) { /* text of the chunk stored at position idx of the chunk table */ |
157 | ret chunks.get(idx).text(chunks); |
158 | } |
159 | } |
160 | } |
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028167 |
Snippet name: | Line-Based Multi-Version Compressor Spike [OK] |
Eternal ID of this version: | #1028167/39 |
Text MD5: | cb73aae1e1696b65f13d020985285c13 |
Transpilation MD5: | c5b78a07d19cfc1bca13b0d7a45506c2 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code (Dynamic Module) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-05-26 22:11:16 |
Source code size: | 5416 bytes / 160 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 262 / 839 |
Version history: | 38 change(s) |
Referenced in: | [show references] |