// assumes LineComp_SingleChain! // For multi-chain, use v2 of this class (#1028521) // // nodeHashCode is not used yet // // Some bug with the freeing of nodes... !include once #1027304 // Eclipse Collections import org.eclipse.collections.impl.map.mutable.primitive.*; import org.eclipse.collections.api.tuple.primitive.*; set flag NoModCount. // save some space. Make sure that this flag doesn't screw other collections/map you compile with this. final sclass LineComp_PairIndex { replace Node with int. replace _Node with Int. bool verbose; bool garbageCollect = false; // still buggy new LongObjectHashMap nodes; MultiSetMap byCount; new LinkedHashMap chains; Ch firstChain; int objectCounter; // for making deterministic hash codes int nodeCounter; static int initialNodesSetCapacity = 1; static final int nodeSize = 6; // 5 ints (hash code + value + 4 pointers) static new ChunkedIntList nodeData; int nodeHashCode(int node) { ret nodeData.get(node); } int value(int node) { ret nodeData.get(node+1); } int prev(int node) { ret nodeData.get(node+2); } int next(int node) { ret nodeData.get(node+3); } int prevCousin(int node) { ret nodeData.get(node+4); } int nextCousin(int node) { ret nodeData.get(node+5); } void setNodeHashCode(int node, int x) { nodeDataSet(node, x); } void setValue(int node, int x) { nodeDataSet(node+1, x); } void setPrev(int node, int x) { nodeDataSet(node+2, x); } void setNext(int node, int x) { nodeDataSet(node+3, x); } void setPrevCousin(int node, int x) { nodeDataSet(node+4, x); } void setNextCousin(int node, int x) { nodeDataSet(node+5, x); } void nodeDataSet(int idx, int value) { nodeData.set(idx, value); } void init { for i to nodeSize: nodeData.add(0); } // Cousins is a replacement for LinkedHashSet // - a set of all nodes forming a certain pair with their successor // Nodes are linked through prevCousin/nextCousin final class Cousins extends CompactHashSet<_Node> { Node head, tail; int hashCode; *(LineComp_PairIndex pi) { super(initialNodesSetCapacity); hashCode = ++pi.objectCounter; } @Override public int hashCode() { ret hashCode; } public bool add(Node n) { setPrevCousin(n, tail); if (tail != 0) // there already are entries setNextCousin(tail, n); else head = n; tail = n; ret super.add(n); } public bool remove(Node node) { if (nextCousin(node) != 0) setPrevCousin(nextCousin(node), prevCousin(node)); else tail = prevCousin(node); if (prevCousin(node) != 0) setNextCousin(prevCousin(node), nextCousin(node)); else head = nextCousin(node); setPrevCousin(node, 0); setNextCousin(node, 0); ret super.remove(node); } // node has changed internal location. don't change the linked list stuff! void moveNode(Node from, Node to) { super.remove(from); super.add(to); } L<_Node> asList() { L<_Node> l = emptyList(size()); Node n = head; while (n != 0) { l.add(n); n = nextCousin(n); } ret l; } } // indicate end of chain-making phase void haveChains { if (byCount == null) { print("n=" + nodes.size()); time "Making byCount" { byCount = multiSetMap_innerCompactHashSet_outerRevTreeMap(); for (Cousins set : nodes.values()) byCount.put(set.size(), set); } } } // a "chain" of nodes (one input file) class Ch { Node tail; *(LineComp_PairIndex pi, L l) { int count = 0; fOr (int i : l) { add(pi, i); if (((++count) % oneMillion()) == 0) print("Added " + count + " entries to chain"); } } L toList() { new L l; Node node = tail; while (node != 0) { l.add(value(node)); node = prev(node); } ret reverseInPlace(l); } void add(LineComp_PairIndex pi, int i) { Node n = pi.newNode(); setValue(n, i); setPrev(n, tail); if (tail != 0) setNext(tail, n); tail = n; pi.addToIndex(prev(n)); } } Node newNode() { for i to nodeSize: nodeData.add(0); ret nodeCounter += nodeSize; } int node_a(int node) { ret value(node); } int node_b(int node) { int next = next(node); ret next == 0 ? -1 : value(next); } IntPair nodeToPair(int node) { if (node == 0) null; int next = next(node); ret next == 0 ? null : IntPair(node_a(node), node_b(node)); } // add node to pair index void addToIndex(Node n) { IntPair p = nodeToPair(n); if (p == null) ret; int count = nodes_getSize(p); Cousins set = nodes_add(p, n); if (byCount != null) { // not in init phase if (count != 0) byCount.remove(count, set); byCount.put(count+1, set); } } // remove node from pair index void removeFromIndex(Node n) { IntPair p = nodeToPair(n); if (p == null) ret; Cousins set = nodes.get(intPairToLong(p)); int count = l(set); if (count == 0) fail("Can't remove pair " + p); nodes_remove(p, n); byCount.remove(count, set); if (--count > 0) byCount.put(count, set); } IntPair mostPopularDuplicate() { ret toInt(firstKey(byCount)) < 2 ? null : nodeToPair(firstValue(byCount).head); } int numberOfDuplicates() { ret byCount.size()-l(byCount.get(1)); } int getCount(IntPair p) { ret nodes_getSize(p); } L<_Node> replacing; Map<_Node, Int> replacing_index; void replacePair(int pa, int pb, int replaceWith) { IntPair p = IntPair(pa, pb); Cousins set = nodes.get(intPairToLong(p)); if (set != null) { replacing = set.asList(); replacing_index = listIndex(replacing); for i over replacing: { Node n = replacing.get(i); continue if node_a(n) != pa || node_b(n) != pb; // nodes might have been deleted or changed replacePair(n, replaceWith); } replacing = null; replacing_index = null; } } void replacePair(Node node, int replaceWith) { removeFromIndex(prev(node)); removeFromIndex(node); removeFromIndex(next(node)); setValue(node, replaceWith); deleteNode(next(node)); addToIndex(prev(node)); addToIndex(node); } void deleteNode(Node node) { if (next(node) != 0) setPrev(next(node), prev(node)); else firstChain.tail = prev(node); if (prev(node) != 0) setNext(prev(node), next(node)); //setValue(node, -1); // mark invalid physicallyDisposeOfNode(node); } void physicallyDisposeOfNode(Node node) { if (!garbageCollect) ret; // move last node in list to place that is now free if (node != nodeCounter) { //print("Moving node " + nodeCounter + " to " + node); moveNode(nodeCounter, node); } //print("Disposing of node " + nodeCounter); nodeData.truncateAt(nodeCounter); nodeCounter -= nodeSize; } void moveNode(int from, int to) { // Copy hash code & value. setNodeHashCode(to, nodeHashCode(from)); setValue(to, value(from)); // Be sure to update all the pointers there may be to this node! // First, find Cousins object. IntPair pair = nodeToPair(from); if (pair != null) { Cousins cousins = nodes.get(intPairToLong(pair)); if (cousins.head == from) cousins.head = to; if (cousins.tail == from) cousins.tail = to; cousins.moveNode(from, to); } // Next, check the chain. if (firstChain.tail == from) firstChain.tail = to; // Now the 4 prev and next pointers int i; if ((i = prev(from)) != 0) setNext(i, to); if ((i = next(from)) != 0) setPrev(i, to); if ((i = prevCousin(from)) != 0) setNextCousin(i, to); if ((i = nextCousin(from)) != 0) setPrevCousin(i, to); // Finally the list we are currently replacing if (replacing_index != null) { _Node ref = replacing_index.get(from); if (ref != null) replacing.set(ref, to); } // That should be it...!? } void newChain(S version, L encoding) { Ch ch = new(this, encoding); chains.put(version, ch); assertNull(firstChain); firstChain = ch; } int nodes_getSize(IntPair p) { ret l(nodes.get(intPairToLong(p))); } Cousins nodes_add(IntPair p, Node n) { long l = intPairToLong(p); Cousins set = nodes.get(l); if (set == null) nodes.put(l, set = new Cousins(this)); set.add(n); ret set; } void nodes_remove(IntPair p, Node n) { long l = intPairToLong(p); Cousins set = nodes.get(l); if (set == null) ret; set.remove(n); if (set.isEmpty()) nodes.remove(l); } // also shrinks sets... void printStats { //print("Different pairs: " + l(nodes)); print("Different counts: " + byCount.keysSize() + " (highest count: " + firstKey(byCount) + ")"); long count = 0, waste = 0, newWaste = 0; long count1 = 0, waste1 = 0, newWaste1 = 0; for (Cousins set : nodes.values()) { count1 += set.size(); int capacity = set.capacity(); waste1 += capacity-set.size(); set.shrinkToFactor(0.5); newWaste1 += capacity-set.size(); } for (CompactHashSet set : (Cl) (Cl) values(byCount.data)) { count += set.size(); waste += set.capacity()-set.size(); set.shrinkToFactor(0.5); newWaste += set.capacity()-set.size(); } print("Chain length : " + count1 + ". Waste: " + waste1 + (waste1 == newWaste1 ? "" : " => " + newWaste1)); print("Different pairs: " + count + ". Waste: " + waste + (waste == newWaste ? "" : " => " + newWaste)); } Ch getChain(Node node) { ret firstChain; } }