Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

148
LINES

< > BotCompany Repo | #1028252 // LineCompCompressor 0.3 [dev.]

JavaX fragment (include)

1  
!include once #1028249 // LineComp_PairIndex_v2 [dev.]
2  
3  
sclass LineCompCompressor {
4  
  int safety = 0;
5  
  
6  
  replace Encodings with Map<S, L<Int>>.
7  
  replace LineComp_PairIndex with LineComp_PairIndex_v2.
8  
  
9  
  abstract sclass Chunk {
10  
    abstract S text(L<Chunk> chunks);
11  
  }
12  
  
13  
  srecord CPair(int i1, int i2) > Chunk {
14  
    CPair(IntPair p) { i1 = p.a; i2 = p.b; }
15  
    
16  
    S text(L<Chunk> chunks) {
17  
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
18  
    }
19  
  }
20  
  
21  
  srecord CPrim(S s) > Chunk {
22  
    S text(L<Chunk> chunks) { ret s; }
23  
  }
24  
25  
  bool verbose = false, verboseCompressionSteps = false;
26  
  bool sortLines = true;
27  
  bool verify = true;
28  
29  
  Map<S, LS> textIDToLines;
30  
  LS allUniqueLines;
31  
  new L<Chunk> chunks;
32  
  int primChunks;
33  
  Map<S, Int> lineIndex;
34  
  Encodings finalEncodings;
35  
  new Map<IntPair, Int> existingPairsIndex;
36  
  
37  
  // key = version ID, values = text
38  
  *(SS texts) {
39  
    textIDToLines = mapValuesToLinkedHashMap myToLines(texts);
40  
  }
41  
  
42  
  LS myToLines(S s) { ret toLines_nOnly_reversible(s); }
43  
  S myFromLines(LS l) { ret fromLines_rtrim(l); }
44  
  
45  
  run {
46  
    LS allLines = concatLists(values(textIDToLines));
47  
    if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
48  
    allUniqueLines = uniquify(allLines);
49  
    if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
50  
    allLines = null; // allow me to forget
51  
    if (sortLines) sortInPlace(allUniqueLines);
52  
    if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));
53  
    for (S line : allUniqueLines)
54  
      chunks.add(new CPrim(line));
55  
    primChunks = l(chunks);
56  
    lineIndex = listIndex(collect s(chunks));
57  
    
58  
    // simple encoding (only direct line references)
59  
    Encodings simpleEncodings = mapValues(textIDToLines,
60  
      (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
61  
    //printAndCheckEncodings(simpleEncodings);
62  
    if (verboseCompressionSteps) print("Have simple encodings");
63  
    
64  
    finalEncodings = compressPairs_v3(simpleEncodings);
65  
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
66  
  }
67  
  
68  
  void saveAsTextFile(File f) {
69  
    S out = exportEncoding(finalEncodings);
70  
    saveTextFile(f, out);
71  
    
72  
    if (verify) checkDecompression(f, textIDToLines);
73  
  }
74  
  
75  
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
76  
    print("Verifying archive");
77  
    temp BufferedReader reader = bufferedUtf8Reader(file);
78  
    LineCompReader lcr = new(reader);
79  
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
80  
    for (S version : keys(textIDToLines))
81  
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
82  
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
83  
  }
84  
  
85  
  S asText() { ret exportEncoding(finalEncodings); }
86  
  
87  
  S exportEncoding(Encodings encodings) {
88  
    new LS buf;
89  
    buf.add("LINECOMP " + primChunks); // magic signature
90  
    for (Chunk c : chunks) {
91  
      if (c cast CPair)
92  
        buf.add(c.i1 + " " + c.i2);
93  
      else
94  
        buf.add(((CPrim) c).s);
95  
    }
96  
    for (S id, L<Int> l : encodings)
97  
      buf.add(id + "=" + joinWithSpace(l));
98  
    ret lines_rtrim(buf);
99  
  }
100  
  
101  
  // new fastest version of magic compression function
102  
  // (O(n log n) or even better?)
103  
  
104  
  Encodings compressPairs_v3(Encodings encodings) {
105  
    new LineComp_PairIndex pairIndex;
106  
    pairIndex.verbose = verboseCompressionSteps;
107  
    for (S version, L<Int> l : encodings)
108  
      pairIndex.newChain(version, l);
109  
110  
    IntPair toCompress;
111  
    // Compress only most popular pair in one step
112  
    while ping ((toCompress = pairIndex.mostPopularDuplicate()) != null) {
113  
      if (safety > 0 && --safety <= 0) fail("safety");
114  
      //pairIndex.fullCheck();
115  
      int count = pairIndex.getCount(toCompress);
116  
      Int idx = makeCPair(toCompress);
117  
      int dups = pairIndex.numberOfDuplicates();
118  
      if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
119  
      pairIndex.replacePair(toCompress.a, toCompress.b, idx);
120  
    }
121  
    
122  
    // reconvert to normal list
123  
    ret mapValues(ch -> ch.toList(), pairIndex.chains);
124  
  }
125  
  
126  
  // TODO: check if pair is already there
127  
  Int makeCPair(IntPair p) {
128  
    Int idx = existingPairsIndex.get(p);
129  
    if (idx == null)
130  
      existingPairsIndex.put(p, idx = addAndReturnIndex(chunks, new CPair(p)));
131  
    ret idx;
132  
  }
133  
  
134  
  void printAndCheckEncodings(Encodings encodings) {
135  
    for (S id, L<Int> encoded : encodings) {
136  
      if (verbose) print(id + ": " + joinWithSpace(encoded));
137  
      assertEquals(myFromLines(textIDToLines.get(id)), decode(encoded));
138  
    }
139  
  }
140  
  
141  
  S decode(L<Int> encoded) {
142  
    ret myFromLines(lambdaMap chunkText(encoded));
143  
  }
144  
  
145  
  S chunkText(int idx) {
146  
    ret chunks.get(idx).text(chunks);
147  
  }
148  
}

Author comment

Began life as a copy of #1028186

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028252
Snippet name: LineCompCompressor 0.3 [dev.]
Eternal ID of this version: #1028252/4
Text MD5: 2905f33b9a7e80f9c5e344bed46ca174
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-05-28 14:42:17
Source code size: 4959 bytes / 148 lines
Pitched / IR pitched: No / No
Views / Downloads: 197 / 267
Version history: 3 change(s)
Referenced in: [show references]