// Gave nodes + cousin sets a counter to make their hashing deterministic. !include once #1027304 // Eclipse Collections import org.eclipse.collections.impl.map.mutable.primitive.*; import org.eclipse.collections.api.tuple.primitive.*; set flag NoModCount. // save some space. Make sure that this flag doesn't screw other collections/map you compile with this. final sclass LineComp_PairIndex { bool verbose; bool inOrder = true; //MultiSetMap nodes; new LongObjectHashMap nodes; MultiSetMap byCount; new LinkedHashMap chains; Ch firstChain; int objectCounter; // for making deterministic hash codes int totalPairCount; static int initialNodesSetCapacity = 1; void init {} // not needed anymore // Cousins is a replacement for LinkedHashSet // - a set of all nodes forming a certain pair with their successor // Nodes are linked through prevCousin/nextCousin final sclass Cousins extends CompactHashSet { Node head, tail; int hashCode; *(LineComp_PairIndex pi) { super(initialNodesSetCapacity); hashCode = ++pi.objectCounter; } @Override public int hashCode() { ret hashCode; } public bool add(Node n) { n.prevCousin = tail; if (tail != null) // there already are entries tail.nextCousin = n; else head = n; tail = n; ret super.add(n); } public bool remove(Node node) { if (node.nextCousin != null) node.nextCousin.prevCousin = node.prevCousin; else tail = node.prevCousin; if (node.prevCousin != null) node.prevCousin.nextCousin = node.nextCousin; else head = node.nextCousin; node.prevCousin = node.nextCousin = null; ret super.remove(node); } L asList() { L l = emptyList(size()); Node n = head; while (n != null) { l.add(n); n = n.nextCousin; } ret l; } } // indicate end of chain-making phase void haveChains { if (byCount == null) { print("n=" + nodes.size()); time "Making byCount" { byCount = multiSetMap_innerCompactHashSet_outerRevTreeMap(); for (Cousins set : nodes.values()) byCount.put(set.size(), set); } } } // a "chain" of nodes (one input file) sclass Ch { Node tail; *(LineComp_PairIndex pi, L l) { int count = 0; fOr (int i : l) { add(pi, i); if (((++count) % oneMillion()) == 0) print("Added " + count + " entries to chain"); } } L toList() { new L l; Node node = tail; while (node != null) { l.add(node.value); node = node.prev; } ret reverseInPlace(l); } void add(LineComp_PairIndex pi, int i) { Node n = new(pi); ifndef LineComp_SingleChain n.ch = this; endifndef n.value = i; n.prev = tail; if (tail != null) tail.next = n; tail = n; pi.addToIndex(n.prev); } } sclass Node { ifndef LineComp_SingleChain Ch ch; endifndef int value; Node prev, next; Node prevCousin, nextCousin; // prev & next with same pair value int hashCode; *(LineComp_PairIndex pi) { hashCode = ++pi.objectCounter; } @Override public int hashCode() { ret hashCode; } int a() { ret value; } int b() { ret next == null ? -1 : next.value; } IntPair pair() { ret next == null ? null : IntPair(a(), b()); } } IntPair nodeToPair(Node n) { ret n?.pair(); } // add node to pair index void addToIndex(Node n) { IntPair p = nodeToPair(n); if (p == null) ret; int count = nodes_getSize(p); Cousins set = nodes_add(p, n); if (byCount != null) { // not in init phase if (count != 0) byCount.remove(count, set); byCount.put(count+1, set); } } // remove node from pair index void removeFromIndex(Node n) { IntPair p = nodeToPair(n); if (p == null) ret; Cousins set = nodes.get(intPairToLong(p)); int count = l(set); if (count == 0) fail("Can't remove pair " + p); nodes_remove(p, n); byCount.remove(count, set); if (--count > 0) byCount.put(count, set); } IntPair mostPopularDuplicate() { ret toInt(firstKey(byCount)) < 2 ? null : firstValue(byCount).head.pair(); } int numberOfDuplicates() { ret byCount.size()-l(byCount.get(1)); } int getCount(IntPair p) { ret nodes_getSize(p); } void replacePair(int pa, int pb, int replaceWith) { IntPair p = IntPair(pa, pb); Cousins set = nodes.get(intPairToLong(p)); if (set != null) { // TODO: might not even have to make this cloned list L l = set.asList(); //for (Node n = cousins.head; n != null; n = n.nextCousin) { for (Node n : l) { continue if n.a() != pa || n.b() != pb; // nodes might have been deleted or changed replacePair(n, replaceWith); } } } void replacePair(Node node, int replaceWith) { removeFromIndex(node.prev); removeFromIndex(node); removeFromIndex(node.next); node.value = replaceWith; deleteNode(node.next); addToIndex(node.prev); addToIndex(node); } void deleteNode(Node node) { if (node.next != null) node.next.prev = node.prev; else getChain(node).tail = node.prev; if (node.prev != null) node.prev.next = node.next; node.value = -1; // mark invalid } void newChain(S version, L encoding) { Ch ch = new(this, encoding); chains.put(version, ch); ifdef LineComp_SingleChain assertNull(firstChain); firstChain = ch; endifdef } int nodes_getSize(IntPair p) { ret l(nodes.get(intPairToLong(p))); } Cousins nodes_add(IntPair p, Node n) { ++totalPairCount; long l = intPairToLong(p); Cousins set = nodes.get(l); if (set == null) nodes.put(l, set = new Cousins(this)); set.add(n); ret set; } void nodes_remove(IntPair p, Node n) { --totalPairCount; long l = intPairToLong(p); Cousins set = nodes.get(l); if (set == null || !set.remove(n)) fail("Consistency error"); if (set.isEmpty()) nodes.remove(l); } // also shrinks sets... void printStats { //print("Different pairs: " + l(nodes)); print("Different counts: " + byCount.keysSize() + " (highest count: " + firstKey(byCount) + ")"); long count = 0, waste = 0, newWaste = 0; long count1 = 0, waste1 = 0, newWaste1 = 0; for (Cousins set : nodes.values()) { count1 += set.size(); int capacity = set.capacity(); waste1 += capacity-set.size(); set.shrinkToFactor(0.5); newWaste1 += capacity-set.size(); } for (CompactHashSet set : (Cl) (Cl) values(byCount.data)) { count += set.size(); waste += set.capacity()-set.size(); set.shrinkToFactor(0.5); newWaste += set.capacity()-set.size(); } print("Chain length : " + count1 + ". Waste: " + waste1 + (waste1 == newWaste1 ? "" : " => " + newWaste1)); print("Different pairs: " + count + ". Waste: " + waste + (waste == newWaste ? "" : " => " + newWaste)); } Ch getChain(Node node) { ifdef LineComp_SingleChain ret firstChain; endifdef ifndef LineComp_SingleChain ret node.ch; endifndef } }