LineComp_PairIndex - helper for LineComp compressor [LIVE] [1028247]

!include once #1027304 // Eclipse Collections
import org.eclipse.collections.impl.map.mutable.primitive.*;
import org.eclipse.collections.api.tuple.primitive.*;

set flag NoModCount. // save some space. Make sure that this flag doesn't screw other collections/map you compile with this.

final sclass LineComp_PairIndex {
  bool verbose;
  
  // true = use LinkedHashSet for nodes - uses more space but ensures
  // nodes are compacted in order instead of pseudo-randomly
  // (CompactHashSet).
  // When setting this to false, compression is still
  // deterministic since the introduction of the Node.counter
  // field.
  // inOrder = false saves hugely on memory (e.g. 38M instead
  // of 78M for the nodes field for 1M symbols)
  
  bool inOrder = false;
  
  //MultiSetMap<IntPair, Node> nodes;
  new LongObjectHashMap<Set<Node>> nodes; // key: pair, value: set of nodes in order (LinkedHashSet)
  MultiSetMap<Int, IntPair> byCount // values are pairs
    = null; // init later
    //= multiSetMap_innerHashSet_outerRevTreeMap(); // makes a difference in selecting a pair if there are multiple with the same count
    // = multiSetMap_innerCompactHashSet_outerRevTreeMap(); // same but more compact
    //= multiSetMap_innerTreeSet_outerRevTreeMap(); // initial version that kind of pointlessly sorts the same-count nodes lexicographically
  new LinkedHashMap<S, Ch> chains;
  Ch firstChain;
  int totalPairCount;
  int initialNodesSetCapacity = 1;
  L<IVF1<Node>> onAddingPair, onRemovingPair;
  LongIntHashMap existingPairsIndex;
  int nodeCounter;
  
  // This queue checks all newly made pairs to see if they already
  // have a symbol.
  new LinkedList<Node> compressionQueue;
  
  // TODO: use LongComparator
  Comparator<IntPair> pairComparatorForBalancing;

  void init {}
  
  class BalancedByCountMultiSetMap extends MultiSetMap<Int, IntPair> {
    Set<IntPair> _makeEmptySet() { ret new BalancedIntPairSet;
  };
    *() {
      data = revTreeMap();
    }
  }
  
  class BalancedIntPairSet extends IntPairTreeSet {
    public int compare(long a, long b) {
      ret pairComparatorForBalancing.compare(longToIntPair(a), longToIntPair(b);
    }
  }
  
  // indicate end of chain-making phase.
  // we still accept more chains after this though, I think (for late input)
  void haveChains {
    if (byCount == null) {
      //time "Making byCount" {
        if (pairComparatorForBalancing != null) {
          //byCount = multiSetMap_innerCustomIntPairTreeSet_outerRevTreeMap(pairComparatorForBalancing);
          byCount = new BalancedByCountMultiSetMap;
        } else
          byCount = multiSetMap_innerCompactHashSet_outerRevTreeMap();
        for (Set<Node> set : nodes.values()) {
          IntPair p = first(set).pair();
          int n = set.size();
          if (verbose) print("Adding node " + p + " (" + n + ")");
          byCount.put(n, p);
        }
      //}
      ifdef LineComp_PairIndex_printSize
        print("byCount size: " + n2(guessObjectSizeWithout(ll(this), byCount)) + " for " + nNodes(nodes.size()));
        print("nodes size: " + n2(guessObjectSize(nodes)));
        print("PairIndex size: " + n2(guessObjectSize(this)));
        printRecursiveSizeOfFields(this);
        printClassLayout(Node.class);
      endifdef
    }
  }
  
  // a "chain" of nodes (one input file)
  sclass Ch {
    Node tail;
    
    *(LineComp_PairIndex pi, L<Int> l) {
      int count = 0;
      fOr (int i : l) {
        add(pi, i);
        if (((++count) % oneMillion()) == 0)
          print("Added " + n2(count) + " entries to chain");
      }
    }
    
    L<Int> toList() {
      new L<Int> l;
      Node node = tail;
      while (node != null) {
        l.add(node.value);
        node = node.prev;
      }
      ret reverseInPlace(l);
    }
    
    void add(LineComp_PairIndex pi, int i) {
      new Node n;
      n.counter = pi.nodeCounter++;
      ifndef LineComp_SingleChain
        n.ch = this;
      endifndef
      n.value = i;
      n.prev = tail;
      if (tail != null) tail.next = n;
      tail = n;
      pi.addToIndex(n.prev);
      pi.processQueue();
    }
  }
  
  sclass Node {
    ifndef LineComp_SingleChain
      Ch ch;
    endifndef
    int counter; // for deterministic ordering in hash set
    int value;
    Node prev, next;
    
    int a() { ret value; }
    int b() { ret next == null ? -1 : next.value; }
    IntPair pair() { ret next == null ? null : IntPair(a(), b()); }
    
    toString {
      ret stringIf(prev != null, "<") + value +
        (next == null ? "" : next.value
         + stringIf(next.next != null, ">")); 
    }
    
    public int hashCode() { ret counter; }
  }
  
  IntPair nodeToPair(Node n) {
    ret n?.pair();
  }
  
  // add node to pair index
  void addToIndex(Node n) {
    IntPair p = nodeToPair(n);
    if (p == null) ret;
    callFAll(onAddingPair, n);
    int count = nodes_getSize(p);
    nodes_add(p, n);
    if (byCount != null) { // not in init phase
      if (count != 0) byCount.remove(count, p);
      byCount.put(count+1, p);
    }
    if (existingPairsIndex != null)
      compressionQueue.add(n);
  }
  
  // remove node from pair index
  void removeFromIndex(Node n) {
    IntPair p = nodeToPair(n);
    if (p == null) ret;
    callFAll(onRemovingPair, n);
    int count = nodes_getSize(p);
    if (count == 0) fail("Can't remove pair " + p);
    nodes_remove(p, n);
    byCount.remove(count, p);
    if (--count > 0) byCount.put(count, p);
  }
  
  // return all pairs in "most popular" bucket
  Cl<IntPair> mostPopularDuplicates() {
    Int key = firstKey(byCount);
    ret toInt(key) < 2 ? null : byCount.get(key);
  }
  
  IntPair mostPopularDuplicate() {
    ret toInt(firstKey(byCount)) < 2 ? null : firstValue(byCount);
  }
  
  IntPair mostPopularPair() {
    ret firstValue(byCount);
  }
  
  int highestBucketNr() {
    ret or0(firstKey(byCount));
  }
  
  // counts buckets actually (lower value than totalPairCount)
  int numberOfDuplicates() {
    ret byCount.size()-l(byCount.get(1));
  }
  
  int getCount(IntPair p) {
    ret nodes_getSize(p);
  }
  
  void replacePair(int pa, int pb, int replaceWith) {
    IntPair p = IntPair(pa, pb);
    Set<Node> set = nodes.get(intPairToLong(p));
    if (set != null)
      for (Node n : cloneList(set)) {
        continue if n.a() != pa || n.b() != pb; // nodes might have been deleted or changed
        replacePair(n, replaceWith);
      }
  }
  
  void replacePair(Node node, int replaceWith) {
    removeFromIndex(node.prev);
    removeFromIndex(node);
    removeFromIndex(node.next);
    node.value = replaceWith;
    deleteNode(node.next);
    addToIndex(node.prev);
    addToIndex(node);
    processQueue();
  }
  
  void processQueue {
    if (existingPairsIndex != null) {
      Node n;
      while not null (n = popFirst(compressionQueue))
        checkIfPairExists(n);
    }
  }
  
  void checkIfPairExists(Node node) {
    if (existingPairsIndex == null) ret;
    if (node.value < 0) ret;
    IntPair pair = nodeToPair(node);
    if (pair == null) ret;
    //print("Checking pair " + pair);
    int i = existingPairsIndex.getIfAbsent(intPairToLong(pair), -1);
    if (i >= 0) {
      //print("  Replacing with: " + i);
      replacePair(node, i);
    }
  }
  
  void deleteNode(Node node) {
    if (node.next != null) node.next.prev = node.prev; else getChain(node).tail = node.prev;
    if (node.prev != null) node.prev.next = node.next;
    node.value = -1; // mark invalid
  }
  
  Ch newChain(S version, L<Int> encoding) {
    Ch ch = new(this, encoding);
    chains.put(version, ch);
    ifdef LineComp_SingleChain
      assertNull(firstChain);
    endifdef
    if (firstChain == null) firstChain = ch;
    ret ch;
  }
  
  int nodes_getSize(IntPair p) {
    ret l(nodes.get(intPairToLong(p)));
  }
  
  void nodes_add(IntPair p, Node n) {
    ++totalPairCount;
    long l = intPairToLong(p);
    Set<Node> set = nodes.get(l);
    if (set == null)
      nodes.put(l, set = newNodesSet());
    set.add(n);
  }
  
  Set<Node> newNodesSet() {
    ret inOrder
      ? new LinkedHashSet(initialNodesSetCapacity)
      : new CompactHashSet(initialNodesSetCapacity);
  }
  
  void nodes_remove(IntPair p, Node n) {
    --totalPairCount;
    long l = intPairToLong(p);
    Set<Node> set = nodes.get(l);
    if (set == null || !set.remove(n)) fail("Consistency error");
    if (set.isEmpty())
      nodes.remove(l);
  }
  
  // also shrinks sets...
  void printStats {
    print("Different pairs: " + l(nodes));
    print("Different counts: " + byCount.keysSize() + " (highest count: " + firstKey(byCount) + ")");
    long count = 0, waste = 0, newWaste = 0;
    long count1 = 0, waste1 = 0;
    
    for (Set set : nodes.values())
      if (set cast HashSet) {
        count1 += set.size();
        int capacity = hashSetCapacity(set);
        waste1 += capacity-set.size();
        //if (capacity > set.size()*3) set.shrink(); // TODO
      }
     
    if (pairComparatorForBalancing == null) for (CompactHashSet set : (Cl<CompactHashSet>) (Cl) values(byCount.data)) {
      count += set.size();
      waste += set.capacity()-set.size();
      set.shrinkToFactor(0.5);
      newWaste += set.capacity()-set.size();
    }
    
    print("Nodes set count: " + count1 + ". Waste: " + waste1);
    //print("Set count: " + count + ". Waste: " + waste + (waste == newWaste ? "" : " => " + newWaste));
    print("Duplicates: " + n2(totalPairCount-byCount.count(1)));
  }
  
  Ch getChain(Node node) {
    ifdef LineComp_SingleChain
      ret firstChain;
    endifdef
    ifndef LineComp_SingleChain
      ret node.ch;
    endifndef
  }
  
  /*public LineComp_PairIndex clone() {
    
  }*/
}
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
Snippet ID:	#1028247
Snippet name:	LineComp_PairIndex - helper for LineComp compressor [LIVE]
Eternal ID of this version:	#1028247/101
Text MD5:	c7456c6094db716f77cd3b448e1c34ca
Author:	stefan
Category:	javax
Type:	JavaX fragment (include)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2020-12-13 19:19:54
Source code size:	9983 bytes / 328 lines
Pitched / IR pitched:	No / No
Views / Downloads:	645 / 1339
Version history:	100 change(s)
Referenced in:	[show references]
< > BotCompany Repo | #1028247 // LineComp_PairIndex - helper for LineComp compressor [LIVE]

JavaX fragment (include) [tags: use-pretranspiled]

Author comment