!include once #1027304 // Eclipse Collections
import org.eclipse.collections.impl.map.mutable.primitive.*;
import org.eclipse.collections.api.tuple.primitive.*;

set flag NoModCount. // save some space. Make sure this flag doesn't screw up other collections/maps you compile with this.

final sclass LineComp_PairIndex {
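  // Rough usage sketch (an assumption, not taken from this snippet): the
  // LineComp compressor presumably drives this index roughly like this --
  //   pi.newChain(version, symbolCodes); // one chain per input file
  //   pi.haveChains();                   // done adding chains; builds byCount
  //   IntPair p;
  //   while ((p = pi.mostPopularDuplicate()) != null)
  //     pi.replacePair(p.a, p.b, nextFreeSymbol()); // classic pair-replacement step
  // "symbolCodes" and "nextFreeSymbol()" are placeholders, not part of this API.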
  bool verbose;
  bool inOrder = true;
  //MultiSetMap<IntPair, Node> nodes;
  new LongObjectHashMap<LinkedHashSet<Node>> nodes; // key: pair, value: set of nodes in order
  MultiSetMap<Int, IntPair> byCount // values are pairs
    = null; // init later
    //= multiSetMap_innerHashSet_outerRevTreeMap(); // makes a difference in selecting a pair if there are multiple with the same count
    //= multiSetMap_innerCompactHashSet_outerRevTreeMap(); // same but more compact
    //= multiSetMap_innerTreeSet_outerRevTreeMap(); // initial version that kind of pointlessly sorts the same-count nodes lexicographically
  new LinkedHashMap<S, Ch> chains;
  Ch firstChain;
  int totalPairCount;
  int initialNodesSetCapacity = 1;
  L<IVF1<Node>> onAddingPair, onRemovingPair;
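
  // Maps intPairToLong(pair) -> code of a symbol that already stands for that
  // pair. When set, newly formed pairs are queued and collapsed to the existing
  // symbol (see compressionQueue / checkIfPairExists).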
  LongIntHashMap existingPairsIndex;

  // This queue checks all newly made pairs to see if they already
  // have a symbol.
  new LinkedList<Node> compressionQueue;

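  // Optional tiebreaker: orders pairs that share the same count inside a
  // byCount bucket (see haveChains).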
  Comparator<IntPair> pairComparatorForBalancing;

  void init {
    /*nodes = inOrder
      ? multiSetMap_innerLinkedHashSet_outerHashMap()
      : multiSetMap_innerHashSet_outerHashMap();*/
  }

  // indicate end of chain-making phase.
  // we still accept more chains after this though, I think (for late input)
  void haveChains {
    if (byCount == null) {
      //time "Making byCount" {
      if (pairComparatorForBalancing != null)
        byCount = multiSetMap_innerCustomTreeSet_outerRevTreeMap(pairComparatorForBalancing);
      else
        byCount = multiSetMap_innerCompactHashSet_outerRevTreeMap();
      for (Set<Node> set : nodes.values()) {
        IntPair p = first(set).pair();
        int n = set.size();
        if (verbose) print("Adding node " + p + " (" + n + ")");
        byCount.put(n, p);
      }
      //}
      ifdef LineComp_PairIndex_printSize
        print("byCount size: " + n2(guessObjectSizeWithout(ll(this, pairComparatorForBalancing), byCount)) + " for " + nNodes(nodes.size()));
      endifdef
    }
  }
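
  // After haveChains, byCount maps occurrence count -> pairs occurring that
  // often; the outer map is reverse-sorted, so firstKey(byCount) is the highest
  // count (this is what mostPopularPair/mostPopularDuplicate rely on below).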

  // a "chain" of nodes (one input file)
  sclass Ch {
    Node tail;

    *(LineComp_PairIndex pi, L<Int> l) {
      int count = 0;
      fOr (int i : l) {
        add(pi, i);
        if (((++count) % oneMillion()) == 0)
          print("Added " + n2(count) + " entries to chain");
      }
    }

    L<Int> toList() {
      new L<Int> l;
      Node node = tail;
      while (node != null) {
        l.add(node.value);
        node = node.prev;
      }
      ret reverseInPlace(l);
    }

    void add(LineComp_PairIndex pi, int i) {
      new Node n;
      ifndef LineComp_SingleChain
        n.ch = this;
      endifndef
      n.value = i;
      n.prev = tail;
      if (tail != null) tail.next = n;
      tail = n;
      pi.addToIndex(n.prev);
      pi.processQueue();
    }
  }

  sclass Node {
    ifndef LineComp_SingleChain
      Ch ch;
    endifndef
    int value;
    Node prev, next;

    int a() { ret value; }
    int b() { ret next == null ? -1 : next.value; }
    IntPair pair() { ret next == null ? null : IntPair(a(), b()); }

    toString {
      ret stringIf(prev != null, "<") + value +
        (next == null ? "" : " " + next.value
          + stringIf(next.next != null, ">"));
    }
  }

  IntPair nodeToPair(Node n) {
    ret n?.pair();
  }
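
  // Each node forms a pair with its successor: (value, next.value). Pairs are
  // packed into a single long via intPairToLong so they can key the primitive
  // LongObjectHashMap / LongIntHashMap above.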

  // add node to pair index
  void addToIndex(Node n) {
    IntPair p = nodeToPair(n);
    if (p == null) ret;
    callFAll(onAddingPair, n);
    int count = nodes_getSize(p);
    nodes_add(p, n);
    if (byCount != null) { // not in init phase
      if (count != 0) byCount.remove(count, p);
      byCount.put(count+1, p);
    }
    if (existingPairsIndex != null)
      compressionQueue.add(n);
  }

  // remove node from pair index
  void removeFromIndex(Node n) {
    IntPair p = nodeToPair(n);
    if (p == null) ret;
    callFAll(onRemovingPair, n);
    int count = nodes_getSize(p);
    if (count == 0) fail("Can't remove pair " + p);
    nodes_remove(p, n);
    byCount.remove(count, p);
    if (--count > 0) byCount.put(count, p);
  }

  // return all pairs in "most popular" bucket
  Cl<IntPair> mostPopularDuplicates() {
    Int key = firstKey(byCount);
    ret toInt(key) < 2 ? null : byCount.get(key);
  }

  IntPair mostPopularDuplicate() {
    ret toInt(firstKey(byCount)) < 2 ? null : firstValue(byCount);
  }

  IntPair mostPopularPair() {
    ret firstValue(byCount);
  }

  int highestBucketNr() {
    ret or0(firstKey(byCount));
  }

  // counts buckets actually (lower value than totalPairCount)
  int numberOfDuplicates() {
    ret byCount.size()-l(byCount.get(1));
  }

  int getCount(IntPair p) {
    ret nodes_getSize(p);
  }

  void replacePair(int pa, int pb, int replaceWith) {
    IntPair p = IntPair(pa, pb);
    Set<Node> set = nodes.get(intPairToLong(p));
    if (set != null)
      for (Node n : cloneList(set)) {
        continue if n.a() != pa || n.b() != pb; // nodes might have been deleted or changed
        replacePair(n, replaceWith);
      }
  }

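  // Replace the pair starting at node with the single symbol replaceWith:
  // the pairs (prev,a), (a,b) and (b,afterNext) are unregistered, node takes
  // the new value, its successor is unlinked, and the two freshly formed pairs
  // around the new symbol are registered again.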
  void replacePair(Node node, int replaceWith) {
    removeFromIndex(node.prev);
    removeFromIndex(node);
    removeFromIndex(node.next);
    node.value = replaceWith;
    deleteNode(node.next);
    addToIndex(node.prev);
    addToIndex(node);
    processQueue();
  }

  void processQueue {
    if (existingPairsIndex != null) {
      Node n;
      while not null (n = popFirst(compressionQueue))
        checkIfPairExists(n);
    }
  }

  void checkIfPairExists(Node node) {
    if (existingPairsIndex == null) ret;
    if (node.value < 0) ret;
    IntPair pair = nodeToPair(node);
    if (pair == null) ret;
    //print("Checking pair " + pair);
    int i = existingPairsIndex.getIfAbsent(intPairToLong(pair), -1);
    if (i >= 0) {
      //print(" Replacing with: " + i);
      replacePair(node, i);
    }
  }

  void deleteNode(Node node) {
    if (node.next != null) node.next.prev = node.prev; else getChain(node).tail = node.prev;
    if (node.prev != null) node.prev.next = node.next;
    node.value = -1; // mark invalid
  }

  Ch newChain(S version, L<Int> encoding) {
    Ch ch = new(this, encoding);
    chains.put(version, ch);
    ifdef LineComp_SingleChain
      assertNull(firstChain);
    endifdef
    if (firstChain == null) firstChain = ch;
    ret ch;
  }

  int nodes_getSize(IntPair p) {
    ret l(nodes.get(intPairToLong(p)));
  }

  void nodes_add(IntPair p, Node n) {
    ++totalPairCount;
    long l = intPairToLong(p);
    LinkedHashSet<Node> set = nodes.get(l);
    if (set == null)
      nodes.put(l, set = new LinkedHashSet(initialNodesSetCapacity));
    set.add(n);
  }

  void nodes_remove(IntPair p, Node n) {
    --totalPairCount;
    long l = intPairToLong(p);
    LinkedHashSet<Node> set = nodes.get(l);
    if (set == null || !set.remove(n)) fail("Consistency error");
    if (set.isEmpty())
      nodes.remove(l);
  }

  // also shrinks sets...
  void printStats {
    print("Different pairs: " + l(nodes));
    print("Different counts: " + byCount.keysSize() + " (highest count: " + firstKey(byCount) + ")");
    long count = 0, waste = 0, newWaste = 0;
    long count1 = 0, waste1 = 0;

    for (HashSet set : nodes.values()) {
      count1 += set.size();
      int capacity = hashSetCapacity(set);
      waste1 += capacity-set.size();
      //if (capacity > set.size()*3) set.shrink(); // TODO
    }

    if (pairComparatorForBalancing == null) for (CompactHashSet set : (Cl<CompactHashSet>) (Cl) values(byCount.data)) {
      count += set.size();
      waste += set.capacity()-set.size();
      set.shrinkToFactor(0.5);
      newWaste += set.capacity()-set.size();
    }

    print("Nodes set count: " + count1 + ". Waste: " + waste1);
    //print("Set count: " + count + ". Waste: " + waste + (waste == newWaste ? "" : " => " + newWaste));
    print("Duplicates: " + n2(totalPairCount-byCount.count(1)));
  }

  Ch getChain(Node node) {
    ifdef LineComp_SingleChain
      ret firstChain;
    endifdef
    ifndef LineComp_SingleChain
      ret node.ch;
    endifndef
  }

  /*public LineComp_PairIndex clone() {

  }*/
}