Transpiled version (4708L) is out of date.
!include once #1027304 // Eclipse Collections import org.eclipse.collections.impl.map.mutable.primitive.*; import org.eclipse.collections.api.tuple.primitive.*; set flag NoModCount. // save some space. Make sure that this flag doesn't screw other collections/map you compile with this. final sclass LineComp_PairIndex { bool verbose; // true = use LinkedHashSet for nodes - uses more space but ensures // nodes are compacted in order instead of pseudo-randomly // (CompactHashSet). // When setting this to false, compression is still // deterministic since the introduction of the Node.counter // field. // inOrder = false saves hugely on memory (e.g. 38M instead // of 78M for the nodes field for 1M symbols) bool inOrder = false; //MultiSetMap<IntPair, Node> nodes; new LongObjectHashMap<Set<Node>> nodes; // key: pair, value: set of nodes in order (LinkedHashSet) MultiSetMap<Int, IntPair> byCount // values are pairs = null; // init later //= multiSetMap_innerHashSet_outerRevTreeMap(); // makes a difference in selecting a pair if there are multiple with the same count // = multiSetMap_innerCompactHashSet_outerRevTreeMap(); // same but more compact //= multiSetMap_innerTreeSet_outerRevTreeMap(); // initial version that kind of pointlessly sorts the same-count nodes lexicographically new LinkedHashMap<S, Ch> chains; Ch firstChain; int totalPairCount; int initialNodesSetCapacity = 1; L<IVF1<Node>> onAddingPair, onRemovingPair; LongIntHashMap existingPairsIndex; int nodeCounter; // This queue checks all newly made pairs to see if they already // have a symbol. new LinkedList<Node> compressionQueue; // TODO: use LongComparator Comparator<IntPair> pairComparatorForBalancing; void init {} class BalancedByCountMultiSetMap extends MultiSetMap<Int, IntPair> { Set<IntPair> _makeEmptySet() { ret new BalancedIntPairSet; }; *() { data = revTreeMap(); } } class BalancedIntPairSet extends IntPairTreeSet { public int compare(long a, long b) { ret pairComparatorForBalancing.compare(longToIntPair(a), longToIntPair(b); } } // indicate end of chain-making phase. // we still accept more chains after this though, I think (for late input) void haveChains { if (byCount == null) { //time "Making byCount" { if (pairComparatorForBalancing != null) { //byCount = multiSetMap_innerCustomIntPairTreeSet_outerRevTreeMap(pairComparatorForBalancing); byCount = new BalancedByCountMultiSetMap; } else byCount = multiSetMap_innerCompactHashSet_outerRevTreeMap(); for (Set<Node> set : nodes.values()) { IntPair p = first(set).pair(); int n = set.size(); if (verbose) print("Adding node " + p + " (" + n + ")"); byCount.put(n, p); } //} ifdef LineComp_PairIndex_printSize print("byCount size: " + n2(guessObjectSizeWithout(ll(this), byCount)) + " for " + nNodes(nodes.size())); print("nodes size: " + n2(guessObjectSize(nodes))); print("PairIndex size: " + n2(guessObjectSize(this))); printRecursiveSizeOfFields(this); printClassLayout(Node.class); endifdef } } // a "chain" of nodes (one input file) sclass Ch { Node tail; *(LineComp_PairIndex pi, L<Int> l) { int count = 0; fOr (int i : l) { add(pi, i); if (((++count) % oneMillion()) == 0) print("Added " + n2(count) + " entries to chain"); } } L<Int> toList() { new L<Int> l; Node node = tail; while (node != null) { l.add(node.value); node = node.prev; } ret reverseInPlace(l); } void add(LineComp_PairIndex pi, int i) { new Node n; n.counter = pi.nodeCounter++; ifndef LineComp_SingleChain n.ch = this; endifndef n.value = i; n.prev = tail; if (tail != null) tail.next = n; tail = n; pi.addToIndex(n.prev); pi.processQueue(); } } sclass Node { ifndef LineComp_SingleChain Ch ch; endifndef int counter; // for deterministic ordering in hash set int value; Node prev, next; int a() { ret value; } int b() { ret next == null ? -1 : next.value; } IntPair pair() { ret next == null ? null : IntPair(a(), b()); } toString { ret stringIf(prev != null, "<") + value + (next == null ? "" : next.value + stringIf(next.next != null, ">")); } public int hashCode() { ret counter; } } IntPair nodeToPair(Node n) { ret n?.pair(); } // add node to pair index void addToIndex(Node n) { IntPair p = nodeToPair(n); if (p == null) ret; callFAll(onAddingPair, n); int count = nodes_getSize(p); nodes_add(p, n); if (byCount != null) { // not in init phase if (count != 0) byCount.remove(count, p); byCount.put(count+1, p); } if (existingPairsIndex != null) compressionQueue.add(n); } // remove node from pair index void removeFromIndex(Node n) { IntPair p = nodeToPair(n); if (p == null) ret; callFAll(onRemovingPair, n); int count = nodes_getSize(p); if (count == 0) fail("Can't remove pair " + p); nodes_remove(p, n); byCount.remove(count, p); if (--count > 0) byCount.put(count, p); } // return all pairs in "most popular" bucket Cl<IntPair> mostPopularDuplicates() { Int key = firstKey(byCount); ret toInt(key) < 2 ? null : byCount.get(key); } IntPair mostPopularDuplicate() { ret toInt(firstKey(byCount)) < 2 ? null : firstValue(byCount); } IntPair mostPopularPair() { ret firstValue(byCount); } int highestBucketNr() { ret or0(firstKey(byCount)); } // counts buckets actually (lower value than totalPairCount) int numberOfDuplicates() { ret byCount.size()-l(byCount.get(1)); } int getCount(IntPair p) { ret nodes_getSize(p); } void replacePair(int pa, int pb, int replaceWith) { IntPair p = IntPair(pa, pb); Set<Node> set = nodes.get(intPairToLong(p)); if (set != null) for (Node n : cloneList(set)) { continue if n.a() != pa || n.b() != pb; // nodes might have been deleted or changed replacePair(n, replaceWith); } } void replacePair(Node node, int replaceWith) { removeFromIndex(node.prev); removeFromIndex(node); removeFromIndex(node.next); node.value = replaceWith; deleteNode(node.next); addToIndex(node.prev); addToIndex(node); processQueue(); } void processQueue { if (existingPairsIndex != null) { Node n; while not null (n = popFirst(compressionQueue)) checkIfPairExists(n); } } void checkIfPairExists(Node node) { if (existingPairsIndex == null) ret; if (node.value < 0) ret; IntPair pair = nodeToPair(node); if (pair == null) ret; //print("Checking pair " + pair); int i = existingPairsIndex.getIfAbsent(intPairToLong(pair), -1); if (i >= 0) { //print(" Replacing with: " + i); replacePair(node, i); } } void deleteNode(Node node) { if (node.next != null) node.next.prev = node.prev; else getChain(node).tail = node.prev; if (node.prev != null) node.prev.next = node.next; node.value = -1; // mark invalid } Ch newChain(S version, L<Int> encoding) { Ch ch = new(this, encoding); chains.put(version, ch); ifdef LineComp_SingleChain assertNull(firstChain); endifdef if (firstChain == null) firstChain = ch; ret ch; } int nodes_getSize(IntPair p) { ret l(nodes.get(intPairToLong(p))); } void nodes_add(IntPair p, Node n) { ++totalPairCount; long l = intPairToLong(p); Set<Node> set = nodes.get(l); if (set == null) nodes.put(l, set = newNodesSet()); set.add(n); } Set<Node> newNodesSet() { ret inOrder ? new LinkedHashSet(initialNodesSetCapacity) : new CompactHashSet(initialNodesSetCapacity); } void nodes_remove(IntPair p, Node n) { --totalPairCount; long l = intPairToLong(p); Set<Node> set = nodes.get(l); if (set == null || !set.remove(n)) fail("Consistency error"); if (set.isEmpty()) nodes.remove(l); } // also shrinks sets... void printStats { print("Different pairs: " + l(nodes)); print("Different counts: " + byCount.keysSize() + " (highest count: " + firstKey(byCount) + ")"); long count = 0, waste = 0, newWaste = 0; long count1 = 0, waste1 = 0; for (Set set : nodes.values()) if (set cast HashSet) { count1 += set.size(); int capacity = hashSetCapacity(set); waste1 += capacity-set.size(); //if (capacity > set.size()*3) set.shrink(); // TODO } if (pairComparatorForBalancing == null) for (CompactHashSet set : (Cl<CompactHashSet>) (Cl) values(byCount.data)) { count += set.size(); waste += set.capacity()-set.size(); set.shrinkToFactor(0.5); newWaste += set.capacity()-set.size(); } print("Nodes set count: " + count1 + ". Waste: " + waste1); //print("Set count: " + count + ". Waste: " + waste + (waste == newWaste ? "" : " => " + newWaste)); print("Duplicates: " + n2(totalPairCount-byCount.count(1))); } Ch getChain(Node node) { ifdef LineComp_SingleChain ret firstChain; endifdef ifndef LineComp_SingleChain ret node.ch; endifndef } /*public LineComp_PairIndex clone() { }*/ }
Began life as a copy of #1028215
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028247 |
Snippet name: | LineComp_PairIndex - helper for LineComp compressor [LIVE] |
Eternal ID of this version: | #1028247/101 |
Text MD5: | c7456c6094db716f77cd3b448e1c34ca |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-12-13 19:19:54 |
Source code size: | 9983 bytes / 328 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 576 / 1251 |
Version history: | 100 change(s) |
Referenced in: | [show references] |