!include once #1027304 // Eclipse Collections
import org.eclipse.collections.impl.map.mutable.primitive.*;
import org.eclipse.collections.api.tuple.primitive.*;

set flag NoModCount. // saves some space. Make sure this flag doesn't break other collections/maps you compile with this.

final sclass LineComp_PairIndex {
  bool verbose;

  // true = use LinkedHashSet for nodes - uses more space but ensures
  // nodes are compacted in order instead of pseudo-randomly (CompactHashSet).
  // When setting this to false, compression is still deterministic
  // since the introduction of the Node.counter field.
  // inOrder = false saves hugely on memory (e.g. 38M instead of 78M
  // for the nodes field for 1M symbols).

  bool inOrder = false;

  //MultiSetMap<IntPair, Node> nodes;
  new LongObjectHashMap<Set<Node>> nodes; // key: pair, value: set of nodes in order (LinkedHashSet)
  MultiSetMap<Int, IntPair> byCount // values are pairs
    = null; // init later
    //= multiSetMap_innerHashSet_outerRevTreeMap(); // makes a difference in selecting a pair if there are multiple with the same count
    //= multiSetMap_innerCompactHashSet_outerRevTreeMap(); // same but more compact
    //= multiSetMap_innerTreeSet_outerRevTreeMap(); // initial version that kind of pointlessly sorts the same-count nodes lexicographically
  new LinkedHashMap<S, Ch> chains;
  Ch firstChain;
  int totalPairCount;
  int initialNodesSetCapacity = 1;
  L<IVF1<Node>> onAddingPair, onRemovingPair;
  LongIntHashMap existingPairsIndex;
  int nodeCounter;

  // All newly made pairs go into this queue so we can check
  // whether they already have a symbol.
  new LinkedList<Node> compressionQueue;

  // TODO: use LongComparator
  Comparator<IntPair> pairComparatorForBalancing;

  void init {}

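  /* Typical driver loop (sketch only - the real LineComp compressor may differ;
     makeSymbolFor() is a hypothetical callback that allocates a fresh symbol):

       LineComp_PairIndex pi = new LineComp_PairIndex;
       pi.newChain("file1", encodedSymbols); // one chain per input
       pi.haveChains();                      // builds the byCount index
       IntPair p;
       while ((p = pi.mostPopularDuplicate()) != null) {
         int sym = makeSymbolFor(p);         // assign a new symbol to the pair
         pi.replacePair(p.a, p.b, sym);      // rewrite all occurrences
       }
       L<Int> compressed = pi.chains.get("file1").toList();
  */
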
  // Used when pairComparatorForBalancing is set: orders the same-count
  // pairs with the supplied comparator instead of hashing them.
  class BalancedByCountMultiSetMap extends MultiSetMap<Int, IntPair> {
    Set<IntPair> _makeEmptySet() { ret new BalancedIntPairSet; }
    *() { data = revTreeMap(); }
  }

  class BalancedIntPairSet extends IntPairTreeSet {
    public int compare(long a, long b) {
      ret pairComparatorForBalancing.compare(longToIntPair(a), longToIntPair(b));
    }
  }

  // Indicates the end of the chain-making phase.
  // We still accept more chains after this, though (for late input).
  void haveChains {
    if (byCount == null) {
      //time "Making byCount" {
      if (pairComparatorForBalancing != null) {
        //byCount = multiSetMap_innerCustomIntPairTreeSet_outerRevTreeMap(pairComparatorForBalancing);
        byCount = new BalancedByCountMultiSetMap;
      } else
        byCount = multiSetMap_innerCompactHashSet_outerRevTreeMap();
      for (Set<Node> set : nodes.values()) {
        IntPair p = first(set).pair();
        int n = set.size();
        if (verbose) print("Adding node " + p + " (" + n + ")");
        byCount.put(n, p);
      }
      //}
      ifdef LineComp_PairIndex_printSize
        print("byCount size: " + n2(guessObjectSizeWithout(ll(this), byCount)) + " for " + nNodes(nodes.size()));
        print("nodes size: " + n2(guessObjectSize(nodes)));
        print("PairIndex size: " + n2(guessObjectSize(this)));
        printRecursiveSizeOfFields(this);
        printClassLayout(Node.class);
      endifdef
    }
  }

  // a "chain" of nodes (one input file)
  sclass Ch {
    Node tail;

    *(LineComp_PairIndex pi, L<Int> l) {
      int count = 0;
      fOr (int i : l) {
        add(pi, i);
        if (((++count) % oneMillion()) == 0)
          print("Added " + n2(count) + " entries to chain");
      }
    }

    L<Int> toList() {
      new L<Int> l;
      Node node = tail;
      while (node != null) {
        l.add(node.value);
        node = node.prev;
      }
      ret reverseInPlace(l);
    }

    void add(LineComp_PairIndex pi, int i) {
      new Node n;
      n.counter = pi.nodeCounter++;
      ifndef LineComp_SingleChain
        n.ch = this;
      endifndef
      n.value = i;
      n.prev = tail;
      if (tail != null) tail.next = n;
      tail = n;
      pi.addToIndex(n.prev);
      pi.processQueue();
    }
  }

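  // a node in a chain: one symbol; together with its successor it forms a pair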
  sclass Node {
    ifndef LineComp_SingleChain
      Ch ch;
    endifndef
    int counter; // for deterministic ordering in hash set
    int value;
    Node prev, next;

    int a() { ret value; }
    int b() { ret next == null ? -1 : next.value; }
    IntPair pair() { ret next == null ? null : IntPair(a(), b()); }

    toString {
      ret stringIf(prev != null, "<") + value +
        (next == null ? "" : next.value
          + stringIf(next.next != null, ">"));
    }

    public int hashCode() { ret counter; }
  }

  IntPair nodeToPair(Node n) {
    ret n?.pair();
  }

  // add node to pair index
  void addToIndex(Node n) {
    IntPair p = nodeToPair(n);
    if (p == null) ret;
    callFAll(onAddingPair, n);
    int count = nodes_getSize(p);
    nodes_add(p, n);
    if (byCount != null) { // not in init phase
      if (count != 0) byCount.remove(count, p);
      byCount.put(count+1, p);
    }
    if (existingPairsIndex != null)
      compressionQueue.add(n);
  }

  // remove node from pair index
  void removeFromIndex(Node n) {
    IntPair p = nodeToPair(n);
    if (p == null) ret;
    callFAll(onRemovingPair, n);
    int count = nodes_getSize(p);
    if (count == 0) fail("Can't remove pair " + p);
    nodes_remove(p, n);
    byCount.remove(count, p);
    if (--count > 0) byCount.put(count, p);
  }

  // return all pairs in "most popular" bucket
  Cl<IntPair> mostPopularDuplicates() {
    Int key = firstKey(byCount);
    ret toInt(key) < 2 ? null : byCount.get(key);
  }

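  // the most popular pair, or null if no pair occurs at least twice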
  IntPair mostPopularDuplicate() {
    ret toInt(firstKey(byCount)) < 2 ? null : firstValue(byCount);
  }

  IntPair mostPopularPair() {
    ret firstValue(byCount);
  }

  int highestBucketNr() {
    ret or0(firstKey(byCount));
  }

  // counts buckets actually (lower value than totalPairCount)
  int numberOfDuplicates() {
    ret byCount.size()-l(byCount.get(1));
  }

  int getCount(IntPair p) {
    ret nodes_getSize(p);
  }

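  // replace every occurrence of the pair (pa, pb) with the symbol replaceWith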
  void replacePair(int pa, int pb, int replaceWith) {
    IntPair p = IntPair(pa, pb);
    Set<Node> set = nodes.get(intPairToLong(p));
    if (set != null)
      for (Node n : cloneList(set)) {
        continue if n.a() != pa || n.b() != pb; // nodes might have been deleted or changed
        replacePair(n, replaceWith);
      }
  }

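  // replace the pair starting at node: the node takes the new symbol,
  // its successor is unlinked, and the surrounding pairs are re-indexed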
  void replacePair(Node node, int replaceWith) {
    removeFromIndex(node.prev);
    removeFromIndex(node);
    removeFromIndex(node.next);
    node.value = replaceWith;
    deleteNode(node.next);
    addToIndex(node.prev);
    addToIndex(node);
    processQueue();
  }

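  // drain the compression queue, replacing queued pairs that already have a symbol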
  void processQueue {
    if (existingPairsIndex != null) {
      Node n;
      while not null (n = popFirst(compressionQueue))
        checkIfPairExists(n);
    }
  }

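  // if the pair starting at this node already has a symbol in existingPairsIndex, replace it right away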
  void checkIfPairExists(Node node) {
    if (existingPairsIndex == null) ret;
    if (node.value < 0) ret;
    IntPair pair = nodeToPair(node);
    if (pair == null) ret;
    //print("Checking pair " + pair);
    int i = existingPairsIndex.getIfAbsent(intPairToLong(pair), -1);
    if (i >= 0) {
      //print(" Replacing with: " + i);
      replacePair(node, i);
    }
  }

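  // unlink a node from its chain and mark it as invalid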
  void deleteNode(Node node) {
    if (node.next != null) node.next.prev = node.prev; else getChain(node).tail = node.prev;
    if (node.prev != null) node.prev.next = node.next;
    node.value = -1; // mark invalid
  }

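  // build a chain from an encoded input and register it under its version key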
  Ch newChain(S version, L<Int> encoding) {
    Ch ch = new(this, encoding);
    chains.put(version, ch);
    ifdef LineComp_SingleChain
      assertNull(firstChain);
    endifdef
    if (firstChain == null) firstChain = ch;
    ret ch;
  }

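  // number of occurrences of a pair currently in the index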
  int nodes_getSize(IntPair p) {
    ret l(nodes.get(intPairToLong(p)));
  }

  void nodes_add(IntPair p, Node n) {
    ++totalPairCount;
    long l = intPairToLong(p);
    Set<Node> set = nodes.get(l);
    if (set == null)
      nodes.put(l, set = newNodesSet());
    set.add(n);
  }

  Set<Node> newNodesSet() {
    ret inOrder
      ? new LinkedHashSet(initialNodesSetCapacity)
      : new CompactHashSet(initialNodesSetCapacity);
  }

  void nodes_remove(IntPair p, Node n) {
    --totalPairCount;
    long l = intPairToLong(p);
    Set<Node> set = nodes.get(l);
    if (set == null || !set.remove(n)) fail("Consistency error");
    if (set.isEmpty())
      nodes.remove(l);
  }

  // also shrinks sets...
  void printStats {
    print("Different pairs: " + l(nodes));
    print("Different counts: " + byCount.keysSize() + " (highest count: " + firstKey(byCount) + ")");
    long count = 0, waste = 0, newWaste = 0;
    long count1 = 0, waste1 = 0;

    for (Set set : nodes.values())
      if (set cast HashSet) {
        count1 += set.size();
        int capacity = hashSetCapacity(set);
        waste1 += capacity-set.size();
        //if (capacity > set.size()*3) set.shrink(); // TODO
      }

    if (pairComparatorForBalancing == null)
      for (CompactHashSet set : (Cl<CompactHashSet>) (Cl) values(byCount.data)) {
        count += set.size();
        waste += set.capacity()-set.size();
        set.shrinkToFactor(0.5);
        newWaste += set.capacity()-set.size();
      }

    print("Nodes set count: " + count1 + ". Waste: " + waste1);
    //print("Set count: " + count + ". Waste: " + waste + (waste == newWaste ? "" : " => " + newWaste));
    print("Duplicates: " + n2(totalPairCount-byCount.count(1)));
  }

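  // the chain a node belongs to (with LineComp_SingleChain there is only one chain)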
  Ch getChain(Node node) {
    ifdef LineComp_SingleChain
      ret firstChain;
    endifdef
    ifndef LineComp_SingleChain
      ret node.ch;
    endifndef
  }

  /*public LineComp_PairIndex clone() {

  }*/
}