BotCompany Repo | #1028186 // LineCompCompressor, faster version [LIVE]

JavaX fragment (include) [tags: use-pretranspiled]

Uses 11335K of libraries. A Pure Java version is available (9174L/60K).

// This is using the implementation that seems to work fastest
// in reality, although it probably is O(n log n) due to the use
// of TreeMaps.
//
// The theoretically best implementation is O(n) (replacing the
// TreeMap with a linked list).

!include once #1027304 // Eclipse Collections

import org.eclipse.collections.impl.map.mutable.primitive.*;

final sclass LineCompCompressor {
  int safety = 0;
  
  replace Encodings with Map<S, L<Int>>.

  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
    IntPair getPair() { null; }
  }
  
  srecord CPair(int i1, int i2) > Chunk {
    CPair(IntPair p) { i1 = p.a; i2 = p.b; }
    
    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
    
    IntPair getPair() {
      ret IntPair(i1, i2);
    }
  }
  
  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }
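  // Chunk layout: the first primChunks entries in 'chunks' are CPrim literals;
  // every CPair added later references two earlier chunk indices, so text()
  // expands a chunk recursively back into its full line sequence.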

  bool verbose = false, verboseCompressionSteps = false;
  bool verboseStats;
  bool sortLines = true;
  bool verify = true;
  bool byteMode; // compress bytes instead of lines
  bool caseInsensitive; // for line mode
  bool toUpper; // for byte mode: convert all characters to upper case
  bool recordPairLengths; // for "balanced" compression
  bool balancing; // try to lower tree heights by compressing "shorter" pairs first
  bool fullCompression; // keep compressing until every file is a single symbol

  new LineComp_PairIndex pairIndex;
  Map<S, LS> textIDToLines;
  LS allUniqueLines;
  new L<Chunk> chunks;
  int primChunks;
  Map<S, Int> lineIndex;
  Encodings simpleEncodings; // temporary
  Encodings finalEncodings;
  new LongIntHashMap existingPairsIndex; // formerly Map<IntPair, Int>
  int round, printStatsEvery = /*10000*/0, printStatsInFirstRounds = 10;
  int printStatsEveryNSeconds = /*10*/60; // 60 for slower computers
  
  // initializing fullIndex = new Map will make things slow and use a lot of memory
  // and I thought there might be a case where it improves the compression
  // - but maybe there isn't?
  Map<S, Int> fullIndex; 
  SimpleIntList pairLengths;

  *() {}
  *(SS texts) { loadTexts(texts); }
  
  // key = version ID, values = text
  void loadTexts(SS texts) {
    textIDToLines = mapValuesToLinkedHashMap myToLines(texts);
  }
  
  LS myToLines(S s) {
    LS l = byteMode
      ? lambdaMap charToHex(characters(toUpper ? upper(s) : s))
      : toLines_nOnly_reversible(s);
    ret l;
  }
      
  S myFromLines(LS l) {
    ret byteMode
      ? join(mapI hexToChar(l)) // TODO: optimize even more
      : fromLines_rtrim(l);
  }
  
  run {
    if (balancing) {
      set recordPairLengths;
      pairIndex.pairComparatorForBalancing = (p1, p2) -> {
        int l1 = pairLength(p1), l2 = pairLength(p2);
        if (l1 != l2) ret l1-l2;
        // return arbitrary deterministic order, but not 0
        if (p1.a != p2.a) ret p1.a-p2.a;
        ret p1.b-p2.b;
      };
    }
      
    if (recordPairLengths) pairLengths = new SimpleIntList;
    LS allLines = concatLists(values(textIDToLines));
    if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
    allUniqueLines = uniquify(allLines);
    if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
    allLines = null; // allow me to forget
    if (sortLines) sortInPlace(allUniqueLines);
    if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));
    for (S line : allUniqueLines)
      chunks.add(new CPrim(line));
    primChunks = l(chunks);
    LS _lines = collect s(chunks);
    lineIndex = caseInsensitive ? ciListIndex(_lines) : listIndex(_lines);
    _lines = null;
    
    // simple encoding (only direct line references)
    simpleEncodings = mapValues(textIDToLines,
      (IF1<LS, L<Int>>) (lines -> wrapIntArrayAsImmutableList(mapToIntArray(lines, line -> lineIndex.get(line)))));
    //printAndCheckEncodings(simpleEncodings);
    if (verboseCompressionSteps) print("Have simple encodings");
    if (!verify) textIDToLines = null;
    
    finalEncodings = compressPairs();
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
  }
  
  void saveAsTextFile(File f) {
    S out = exportEncoding(finalEncodings);
    saveTextFile(f, out);
    
    if (verify) checkDecompression(f, textIDToLines);
  }
  
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
    temp BufferedReader reader = bufferedUtf8Reader(file);
    LineCompReader lcr = new(reader);
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
    for (S version : keys(textIDToLines))
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
  }
  
  S asText() { ret exportEncoding(finalEncodings); }
  
  S exportEncoding(Encodings encodings) {
    new LS buf;
    buf.add((byteMode ? "BYTECOMP " : "LINECOMP ") + primChunks); // magic signature
    for (Chunk c : chunks) {
      if (c cast CPair)
        buf.add(c.i1 + " " + c.i2);
      else
        buf.add(((CPrim) c).s);
    }
    for (S id, L<Int> l : encodings)
      buf.add(id + "=" + joinWithSpace(l));
    ret lines_rtrim(buf);
  }
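  // Illustrative example of the exported format (hypothetical input, not from
  // a real run): with the two unique lines "a" and "b" and one version "v1"
  // whose text is "a\nb\na\nb", the pair (0 1) becomes chunk index 2 and the
  // export would look like:
  //
  //   LINECOMP 2
  //   a
  //   b
  //   0 1
  //   v1=2 2
  //
  // i.e. the magic line with the literal-chunk count, the literal lines, one
  // "i1 i2" line per pair chunk, then one "id=indices" line per version.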
  
  Encodings compressPairs() {
    pairIndex.verbose = verboseCompressionSteps || verbose;
    pairIndex.init();
    for ping (S version, L<Int> l : simpleEncodings)
      pairIndex.newChain(version, l);
    pairIndex.haveChains();
    simpleEncodings = null; // forget

    ret compressChains(keys(pairIndex.chains));
  }
  
  Encodings compressChains(Cl<S> files) {
    NotTooOften nto = new(printStatsEveryNSeconds*1000);
    IntPair toCompress;
    // Compress only most popular pair in one step
    while ping ((toCompress = fullCompression
      ? pairIndex.mostPopularPair()
      : pairIndex.mostPopularDuplicate()) != null) {
      if (verbose) {
        print("Chains: " + mapValuesToList(pairIndex.chains, c -> c.toList()));
        //print("Highest bucket (" + pairIndex.highestBucketNr() + "): " + pairIndex.mostPopularDuplicates());
        print("Buckets: " + pairIndex.byCount);
      }

      if (safety > 0 && --safety <= 0) fail("safety");
      ++round;
      if (verboseStats)
        if (printStatsEvery > 0 && (round % printStatsEvery) == 0
          || round <= printStatsInFirstRounds
          || nto!)
          printStats();
      int count = pairIndex.getCount(toCompress);
      Int idx = makeCPair(toCompress);
      int dups = pairIndex.numberOfDuplicates();
      if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
      pairIndex.replacePair(toCompress.a, toCompress.b, idx);
    }
    if (verboseStats) printStats();
    
    // reconvert to normal list
    ret mapToValues(f -> pairIndex.chains.get(f).toList(), files);
  }
  
  // TODO: check if pair is already there
  Int makeCPair(IntPair p) {
    long l = intPairToLong(p);
    S text = null;
    if (fullIndex != null) {
      text = itemToString(p.a) + itemToString(p.b);
      Int idx = fullIndex.get(text);
      if (idx != null) ret idx;
    }
    int idx = existingPairsIndex.getIfAbsent(l, -1);
    if (idx < 0) {
      idx = addAndReturnIndex(chunks, new CPair(p));
      if (recordPairLengths) listPut(pairLengths, idx, itemLength(p.a) + itemLength(p.b), 1);
      existingPairsIndex.put(l, idx);
      if (fullIndex != null)
        fullIndex.put(text, idx);
    }
    ret idx;
  }
  
  void printAndCheckEncodings(Encodings encodings) {
    for (S id, L<Int> encoded : encodings) {
      if (verbose) print(id + ": " + joinWithSpace(encoded));
      S x = myFromLines(textIDToLines.get(id)), y = decode(encoded);
      if (caseInsensitive)
        assertEqualsIC(x, y);
      else
        assertEquals_quote(x, y);
    }
  }
  
  S decode(L<Int> encoded) {
    LS l = lambdaMap chunkText(encoded);
    if (byteMode) l = lines(lines(l));
    ret myFromLines(l);
  }
  
  // for non-byte mode
  S chunkText(int idx) {
    ret chunks.get(idx).text(chunks);
  }
  
  void printStats() {
    print("Round: " + round);
    pairIndex.printStats();
  }
  
  // byteMode only
  S itemToString(int idx) {
    new StringBuilder buf;
    itemToString_step(buf, idx);
    ret str(buf);
  }
  
  void itemToString_step(StringBuilder buf, int idx) {
    Chunk c = chunks.get(idx);
    if (c cast CPair) {
      itemToString_step(buf, c.i1);
      itemToString_step(buf, c.i2);
    } else {
      S s = ((CPrim) c).s;
      buf.append(byteMode ? charFromHex(s) : s + "\n");
    }
  }
  
  int nLiterals() { ret primChunks; }
  int nItems() { ret l(chunks); }
  
  IntPair getPair(int idx) {
    ret chunks.get(idx).getPair();
  }
  
  int itemLength(int idx) {
    ret idx >= l(pairLengths) ? 1 : pairLengths.get(idx);
  }
  
  int pairLength(IntPair p) {
    ret p == null ? 0 : itemLength(p.a) + itemLength(p.b);
  }
  
  LS literals() { ret lmap itemToString(countIterator(primChunks)); }
}
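For readers who want the gist without decoding the JavaX shorthand: the compressor builds a table of unique lines, encodes every version as a list of line indices, and then (driven by LineComp_PairIndex) repeatedly replaces the most popular adjacent pair of indices with a new chunk index until no pair occurs twice (or, with fullCompression, until each version is a single symbol). This is essentially the Re-Pair / byte-pair-encoding idea. Below is a minimal plain-Java sketch of that greedy loop on a single symbol sequence; the class PairReplacementSketch and its naive per-round recount are illustrative assumptions only, whereas the snippet keeps pair counts incrementally (the TreeMap-based index mentioned in the header comment) and maintains one chain per version.

import java.util.*;

class PairReplacementSketch {
  final List<int[]> pairs = new ArrayList<>(); // pair table: entry k defines symbol literalCount + k
  int nextSymbol;

  // symbols: a sequence of literal ids (e.g. indices into the unique-line table).
  // Greedily replaces the most frequent adjacent pair with a fresh symbol until
  // no pair occurs more than once (the mostPopularDuplicate() stopping rule).
  List<Integer> compress(List<Integer> symbols, int literalCount) {
    nextSymbol = literalCount;
    List<Integer> seq = new ArrayList<>(symbols);
    while (true) {
      // Naive recount every round; the snippet instead updates counts
      // incrementally in LineComp_PairIndex (hence the O(n log n) remark).
      Map<Long, Integer> counts = new HashMap<>();
      for (int i = 0; i + 1 < seq.size(); i++)
        counts.merge(key(seq.get(i), seq.get(i + 1)), 1, Integer::sum);
      long best = -1; int bestCount = 1;
      for (Map.Entry<Long, Integer> e : counts.entrySet())
        if (e.getValue() > bestCount) { best = e.getKey(); bestCount = e.getValue(); }
      if (best == -1) break; // no duplicate pair left
      int a = (int) (best >>> 32), b = (int) best;
      pairs.add(new int[] { a, b });
      int sym = nextSymbol++;
      List<Integer> out = new ArrayList<>();
      for (int i = 0; i < seq.size(); i++) {
        if (i + 1 < seq.size() && seq.get(i) == a && seq.get(i + 1) == b) {
          out.add(sym);
          i++; // consume both halves of the replaced pair
        } else
          out.add(seq.get(i));
      }
      seq = out;
    }
    return seq;
  }

  static long key(int a, int b) { return ((long) a << 32) | (b & 0xFFFFFFFFL); }
}

Decompression is the reverse expansion of the pair table, which is exactly what Chunk.text() and itemToString_step() do recursively in the snippet above.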

Author comment

Began life as a copy of #1028167


Travelled to 8 computer(s): bhatertpkbcr, ekrmjmnbrukm, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments.

Snippet ID: #1028186
Snippet name: LineCompCompressor, faster version [LIVE]
Eternal ID of this version: #1028186/134
Text MD5: e971d8644645f8667e3bfb0b5f47911d
Transpilation MD5: 101663b9e3006d0041f4bfa5103fde92
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-12-13 19:22:27
Source code size: 9242 bytes / 282 lines
Pitched / IR pitched: No / No
Views / Downloads: 586 / 1506
Version history: 133 change(s)