Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

328
LINES

< > BotCompany Repo | #1028247 // LineComp_PairIndex - helper for LineComp compressor [LIVE]

JavaX fragment (include) [tags: use-pretranspiled]

Transpiled version (4708L) is out of date.

1  
!include once #1027304 // Eclipse Collections
2  
import org.eclipse.collections.impl.map.mutable.primitive.*;
3  
import org.eclipse.collections.api.tuple.primitive.*;
4  
5  
set flag NoModCount. // save some space. Make sure that this flag doesn't screw other collections/map you compile with this.
6  
7  
final sclass LineComp_PairIndex {
8  
  bool verbose;
9  
  
10  
  // true = use LinkedHashSet for nodes - uses more space but ensures
11  
  // nodes are compacted in order instead of pseudo-randomly
12  
  // (CompactHashSet).
13  
  // When setting this to false, compression is still
14  
  // deterministic since the introduction of the Node.counter
15  
  // field.
16  
  // inOrder = false saves hugely on memory (e.g. 38M instead
17  
  // of 78M for the nodes field for 1M symbols)
18  
  
19  
  bool inOrder = false;
20  
  
21  
  //MultiSetMap<IntPair, Node> nodes;
22  
  new LongObjectHashMap<Set<Node>> nodes; // key: pair, value: set of nodes in order (LinkedHashSet)
23  
  MultiSetMap<Int, IntPair> byCount // values are pairs
24  
    = null; // init later
25  
    //= multiSetMap_innerHashSet_outerRevTreeMap(); // makes a difference in selecting a pair if there are multiple with the same count
26  
    // = multiSetMap_innerCompactHashSet_outerRevTreeMap(); // same but more compact
27  
    //= multiSetMap_innerTreeSet_outerRevTreeMap(); // initial version that kind of pointlessly sorts the same-count nodes lexicographically
28  
  new LinkedHashMap<S, Ch> chains;
29  
  Ch firstChain;
30  
  int totalPairCount;
31  
  int initialNodesSetCapacity = 1;
32  
  L<IVF1<Node>> onAddingPair, onRemovingPair;
33  
  LongIntHashMap existingPairsIndex;
34  
  int nodeCounter;
35  
  
36  
  // This queue checks all newly made pairs to see if they already
37  
  // have a symbol.
38  
  new LinkedList<Node> compressionQueue;
39  
  
40  
  // TODO: use LongComparator
41  
  Comparator<IntPair> pairComparatorForBalancing;
42  
43  
  void init {}
44  
  
45  
  class BalancedByCountMultiSetMap extends MultiSetMap<Int, IntPair> {
46  
    Set<IntPair> _makeEmptySet() { ret new BalancedIntPairSet;
47  
  };
48  
    *() {
49  
      data = revTreeMap();
50  
    }
51  
  }
52  
  
53  
  class BalancedIntPairSet extends IntPairTreeSet {
54  
    public int compare(long a, long b) {
55  
      ret pairComparatorForBalancing.compare(longToIntPair(a), longToIntPair(b);
56  
    }
57  
  }
58  
  
59  
  // indicate end of chain-making phase.
60  
  // we still accept more chains after this though, I think (for late input)
61  
  void haveChains {
62  
    if (byCount == null) {
63  
      //time "Making byCount" {
64  
        if (pairComparatorForBalancing != null) {
65  
          //byCount = multiSetMap_innerCustomIntPairTreeSet_outerRevTreeMap(pairComparatorForBalancing);
66  
          byCount = new BalancedByCountMultiSetMap;
67  
        } else
68  
          byCount = multiSetMap_innerCompactHashSet_outerRevTreeMap();
69  
        for (Set<Node> set : nodes.values()) {
70  
          IntPair p = first(set).pair();
71  
          int n = set.size();
72  
          if (verbose) print("Adding node " + p + " (" + n + ")");
73  
          byCount.put(n, p);
74  
        }
75  
      //}
76  
      ifdef LineComp_PairIndex_printSize
77  
        print("byCount size: " + n2(guessObjectSizeWithout(ll(this), byCount)) + " for " + nNodes(nodes.size()));
78  
        print("nodes size: " + n2(guessObjectSize(nodes)));
79  
        print("PairIndex size: " + n2(guessObjectSize(this)));
80  
        printRecursiveSizeOfFields(this);
81  
        printClassLayout(Node.class);
82  
      endifdef
83  
    }
84  
  }
85  
  
86  
  // a "chain" of nodes (one input file)
87  
  sclass Ch {
88  
    Node tail;
89  
    
90  
    *(LineComp_PairIndex pi, L<Int> l) {
91  
      int count = 0;
92  
      fOr (int i : l) {
93  
        add(pi, i);
94  
        if (((++count) % oneMillion()) == 0)
95  
          print("Added " + n2(count) + " entries to chain");
96  
      }
97  
    }
98  
    
99  
    L<Int> toList() {
100  
      new L<Int> l;
101  
      Node node = tail;
102  
      while (node != null) {
103  
        l.add(node.value);
104  
        node = node.prev;
105  
      }
106  
      ret reverseInPlace(l);
107  
    }
108  
    
109  
    void add(LineComp_PairIndex pi, int i) {
110  
      new Node n;
111  
      n.counter = pi.nodeCounter++;
112  
      ifndef LineComp_SingleChain
113  
        n.ch = this;
114  
      endifndef
115  
      n.value = i;
116  
      n.prev = tail;
117  
      if (tail != null) tail.next = n;
118  
      tail = n;
119  
      pi.addToIndex(n.prev);
120  
      pi.processQueue();
121  
    }
122  
  }
123  
  
124  
  sclass Node {
125  
    ifndef LineComp_SingleChain
126  
      Ch ch;
127  
    endifndef
128  
    int counter; // for deterministic ordering in hash set
129  
    int value;
130  
    Node prev, next;
131  
    
132  
    int a() { ret value; }
133  
    int b() { ret next == null ? -1 : next.value; }
134  
    IntPair pair() { ret next == null ? null : IntPair(a(), b()); }
135  
    
136  
    toString {
137  
      ret stringIf(prev != null, "<") + value +
138  
        (next == null ? "" : next.value
139  
         + stringIf(next.next != null, ">")); 
140  
    }
141  
    
142  
    public int hashCode() { ret counter; }
143  
  }
144  
  
145  
  IntPair nodeToPair(Node n) {
146  
    ret n?.pair();
147  
  }
148  
  
149  
  // add node to pair index
150  
  void addToIndex(Node n) {
151  
    IntPair p = nodeToPair(n);
152  
    if (p == null) ret;
153  
    callFAll(onAddingPair, n);
154  
    int count = nodes_getSize(p);
155  
    nodes_add(p, n);
156  
    if (byCount != null) { // not in init phase
157  
      if (count != 0) byCount.remove(count, p);
158  
      byCount.put(count+1, p);
159  
    }
160  
    if (existingPairsIndex != null)
161  
      compressionQueue.add(n);
162  
  }
163  
  
164  
  // remove node from pair index
165  
  void removeFromIndex(Node n) {
166  
    IntPair p = nodeToPair(n);
167  
    if (p == null) ret;
168  
    callFAll(onRemovingPair, n);
169  
    int count = nodes_getSize(p);
170  
    if (count == 0) fail("Can't remove pair " + p);
171  
    nodes_remove(p, n);
172  
    byCount.remove(count, p);
173  
    if (--count > 0) byCount.put(count, p);
174  
  }
175  
  
176  
  // return all pairs in "most popular" bucket
177  
  Cl<IntPair> mostPopularDuplicates() {
178  
    Int key = firstKey(byCount);
179  
    ret toInt(key) < 2 ? null : byCount.get(key);
180  
  }
181  
  
182  
  IntPair mostPopularDuplicate() {
183  
    ret toInt(firstKey(byCount)) < 2 ? null : firstValue(byCount);
184  
  }
185  
  
186  
  IntPair mostPopularPair() {
187  
    ret firstValue(byCount);
188  
  }
189  
  
190  
  int highestBucketNr() {
191  
    ret or0(firstKey(byCount));
192  
  }
193  
  
194  
  // counts buckets actually (lower value than totalPairCount)
195  
  int numberOfDuplicates() {
196  
    ret byCount.size()-l(byCount.get(1));
197  
  }
198  
  
199  
  int getCount(IntPair p) {
200  
    ret nodes_getSize(p);
201  
  }
202  
  
203  
  void replacePair(int pa, int pb, int replaceWith) {
204  
    IntPair p = IntPair(pa, pb);
205  
    Set<Node> set = nodes.get(intPairToLong(p));
206  
    if (set != null)
207  
      for (Node n : cloneList(set)) {
208  
        continue if n.a() != pa || n.b() != pb; // nodes might have been deleted or changed
209  
        replacePair(n, replaceWith);
210  
      }
211  
  }
212  
  
213  
  void replacePair(Node node, int replaceWith) {
214  
    removeFromIndex(node.prev);
215  
    removeFromIndex(node);
216  
    removeFromIndex(node.next);
217  
    node.value = replaceWith;
218  
    deleteNode(node.next);
219  
    addToIndex(node.prev);
220  
    addToIndex(node);
221  
    processQueue();
222  
  }
223  
  
224  
  void processQueue {
225  
    if (existingPairsIndex != null) {
226  
      Node n;
227  
      while not null (n = popFirst(compressionQueue))
228  
        checkIfPairExists(n);
229  
    }
230  
  }
231  
  
232  
  void checkIfPairExists(Node node) {
233  
    if (existingPairsIndex == null) ret;
234  
    if (node.value < 0) ret;
235  
    IntPair pair = nodeToPair(node);
236  
    if (pair == null) ret;
237  
    //print("Checking pair " + pair);
238  
    int i = existingPairsIndex.getIfAbsent(intPairToLong(pair), -1);
239  
    if (i >= 0) {
240  
      //print("  Replacing with: " + i);
241  
      replacePair(node, i);
242  
    }
243  
  }
244  
  
245  
  void deleteNode(Node node) {
246  
    if (node.next != null) node.next.prev = node.prev; else getChain(node).tail = node.prev;
247  
    if (node.prev != null) node.prev.next = node.next;
248  
    node.value = -1; // mark invalid
249  
  }
250  
  
251  
  Ch newChain(S version, L<Int> encoding) {
252  
    Ch ch = new(this, encoding);
253  
    chains.put(version, ch);
254  
    ifdef LineComp_SingleChain
255  
      assertNull(firstChain);
256  
    endifdef
257  
    if (firstChain == null) firstChain = ch;
258  
    ret ch;
259  
  }
260  
  
261  
  int nodes_getSize(IntPair p) {
262  
    ret l(nodes.get(intPairToLong(p)));
263  
  }
264  
  
265  
  void nodes_add(IntPair p, Node n) {
266  
    ++totalPairCount;
267  
    long l = intPairToLong(p);
268  
    Set<Node> set = nodes.get(l);
269  
    if (set == null)
270  
      nodes.put(l, set = newNodesSet());
271  
    set.add(n);
272  
  }
273  
  
274  
  Set<Node> newNodesSet() {
275  
    ret inOrder
276  
      ? new LinkedHashSet(initialNodesSetCapacity)
277  
      : new CompactHashSet(initialNodesSetCapacity);
278  
  }
279  
  
280  
  void nodes_remove(IntPair p, Node n) {
281  
    --totalPairCount;
282  
    long l = intPairToLong(p);
283  
    Set<Node> set = nodes.get(l);
284  
    if (set == null || !set.remove(n)) fail("Consistency error");
285  
    if (set.isEmpty())
286  
      nodes.remove(l);
287  
  }
288  
  
289  
  // also shrinks sets...
290  
  void printStats {
291  
    print("Different pairs: " + l(nodes));
292  
    print("Different counts: " + byCount.keysSize() + " (highest count: " + firstKey(byCount) + ")");
293  
    long count = 0, waste = 0, newWaste = 0;
294  
    long count1 = 0, waste1 = 0;
295  
    
296  
    for (Set set : nodes.values())
297  
      if (set cast HashSet) {
298  
        count1 += set.size();
299  
        int capacity = hashSetCapacity(set);
300  
        waste1 += capacity-set.size();
301  
        //if (capacity > set.size()*3) set.shrink(); // TODO
302  
      }
303  
     
304  
    if (pairComparatorForBalancing == null) for (CompactHashSet set : (Cl<CompactHashSet>) (Cl) values(byCount.data)) {
305  
      count += set.size();
306  
      waste += set.capacity()-set.size();
307  
      set.shrinkToFactor(0.5);
308  
      newWaste += set.capacity()-set.size();
309  
    }
310  
    
311  
    print("Nodes set count: " + count1 + ". Waste: " + waste1);
312  
    //print("Set count: " + count + ". Waste: " + waste + (waste == newWaste ? "" : " => " + newWaste));
313  
    print("Duplicates: " + n2(totalPairCount-byCount.count(1)));
314  
  }
315  
  
316  
  Ch getChain(Node node) {
317  
    ifdef LineComp_SingleChain
318  
      ret firstChain;
319  
    endifdef
320  
    ifndef LineComp_SingleChain
321  
      ret node.ch;
322  
    endifndef
323  
  }
324  
  
325  
  /*public LineComp_PairIndex clone() {
326  
    
327  
  }*/
328  
}

Author comment

Began life as a copy of #1028215

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028247
Snippet name: LineComp_PairIndex - helper for LineComp compressor [LIVE]
Eternal ID of this version: #1028247/101
Text MD5: c7456c6094db716f77cd3b448e1c34ca
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-12-13 19:19:54
Source code size: 9983 bytes / 328 lines
Pitched / IR pitched: No / No
Views / Downloads: 578 / 1255
Version history: 100 change(s)
Referenced in: [show references]