Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

149
LINES

< > BotCompany Repo | #1028503 // LineCompCompressor [backup before eclipse collections]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (7033L/45K).

1  
// This is using the implementation that seems to work fastest
2  
// in reality, although it probably is O(n log n) due to the use
3  
// of TreeMaps.
4  
//
5  
// The theoretically best implementation is O(n) due to replacing the
6  
// TreeMaps with a linked list.
7  
// I think it works, see LineComp_PairIndex_v2, but #1028231 is fuxx0red.
8  
9  
sclass LineCompCompressor {
10  
  int safety = 0;
11  
  
12  
  replace Encodings with Map<S, L<Int>>.
13  
14  
  abstract sclass Chunk {
15  
    abstract S text(L<Chunk> chunks);
16  
  }
17  
  
18  
  srecord CPair(int i1, int i2) > Chunk {
19  
    CPair(IntPair p) { i1 = p.a; i2 = p.b; }
20  
    
21  
    S text(L<Chunk> chunks) {
22  
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
23  
    }
24  
  }
25  
  
26  
  srecord CPrim(S s) > Chunk {
27  
    S text(L<Chunk> chunks) { ret s; }
28  
  }
29  
30  
  bool verbose = false, verboseCompressionSteps = false;
31  
  bool sortLines = true;
32  
  bool verify = true;
33  
34  
  Map<S, LS> textIDToLines;
35  
  LS allUniqueLines;
36  
  new L<Chunk> chunks;
37  
  int primChunks;
38  
  Map<S, Int> lineIndex;
39  
  Encodings finalEncodings;
40  
  new Map<IntPair, Int> existingPairsIndex;
41  
  
42  
  // key = version ID, values = text
43  
  *(SS texts) {
44  
    textIDToLines = mapValuesToLinkedHashMap myToLines(texts);
45  
  }
46  
  
47  
  LS myToLines(S s) { ret toLines_nOnly_reversible(s); }
48  
  S myFromLines(LS l) { ret fromLines_rtrim(l); }
49  
  
50  
  run {
51  
    LS allLines = concatLists(values(textIDToLines));
52  
    if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
53  
    allUniqueLines = uniquify(allLines);
54  
    if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
55  
    allLines = null; // allow me to forget
56  
    if (sortLines) sortInPlace(allUniqueLines);
57  
    if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));
58  
    for (S line : allUniqueLines)
59  
      chunks.add(new CPrim(line));
60  
    primChunks = l(chunks);
61  
    lineIndex = listIndex(collect s(chunks));
62  
    
63  
    // simple encoding (only direct line references)
64  
    Encodings simpleEncodings = mapValues(textIDToLines,
65  
      (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
66  
    //printAndCheckEncodings(simpleEncodings);
67  
    if (verboseCompressionSteps) print("Have simple encodings");
68  
    
69  
    finalEncodings = compressPairs_v3(simpleEncodings);
70  
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
71  
  }
72  
  
73  
  void saveAsTextFile(File f) {
74  
    S out = exportEncoding(finalEncodings);
75  
    saveTextFile(f, out);
76  
    
77  
    if (verify) checkDecompression(f, textIDToLines);
78  
  }
79  
  
80  
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
81  
    temp BufferedReader reader = bufferedUtf8Reader(file);
82  
    LineCompReader lcr = new(reader);
83  
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
84  
    for (S version : keys(textIDToLines))
85  
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
86  
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
87  
  }
88  
  
89  
  S asText() { ret exportEncoding(finalEncodings); }
90  
  
91  
  S exportEncoding(Encodings encodings) {
92  
    new LS buf;
93  
    buf.add("LINECOMP " + primChunks); // magic signature
94  
    for (Chunk c : chunks) {
95  
      if (c cast CPair)
96  
        buf.add(c.i1 + " " + c.i2);
97  
      else
98  
        buf.add(((CPrim) c).s);
99  
    }
100  
    for (S id, L<Int> l : encodings)
101  
      buf.add(id + "=" + joinWithSpace(l));
102  
    ret lines_rtrim(buf);
103  
  }
104  
  
105  
  Encodings compressPairs_v3(Encodings encodings) {
106  
    new LineComp_PairIndex pairIndex;
107  
    pairIndex.verbose = verboseCompressionSteps;
108  
    pairIndex.init();
109  
    for (S version, L<Int> l : encodings)
110  
      pairIndex.newChain(version, l);
111  
112  
    IntPair toCompress;
113  
    // Compress only most popular pair in one step
114  
    while ping ((toCompress = pairIndex.mostPopularDuplicate()) != null) {
115  
      if (safety > 0 && --safety <= 0) fail("safety");
116  
      int count = pairIndex.getCount(toCompress);
117  
      Int idx = makeCPair(toCompress);
118  
      int dups = pairIndex.numberOfDuplicates();
119  
      if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
120  
      pairIndex.replacePair(toCompress.a, toCompress.b, idx);
121  
    }
122  
    
123  
    // reconvert to normal list
124  
    ret mapValues(ch -> ch.toList(), pairIndex.chains);
125  
  }
126  
  
127  
  // TODO: check if pair is already there
128  
  Int makeCPair(IntPair p) {
129  
    Int idx = existingPairsIndex.get(p);
130  
    if (idx == null)
131  
      existingPairsIndex.put(p, idx = addAndReturnIndex(chunks, new CPair(p)));
132  
    ret idx;
133  
  }
134  
  
135  
  void printAndCheckEncodings(Encodings encodings) {
136  
    for (S id, L<Int> encoded : encodings) {
137  
      if (verbose) print(id + ": " + joinWithSpace(encoded));
138  
      assertEquals(myFromLines(textIDToLines.get(id)), decode(encoded));
139  
    }
140  
  }
141  
  
142  
  S decode(L<Int> encoded) {
143  
    ret myFromLines(lambdaMap chunkText(encoded));
144  
  }
145  
  
146  
  S chunkText(int idx) {
147  
    ret chunks.get(idx).text(chunks);
148  
  }
149  
}

Author comment

Began life as a copy of #1028186

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028503
Snippet name: LineCompCompressor [backup before eclipse collections]
Eternal ID of this version: #1028503/1
Text MD5: b08de58bb27524ad814e02f37aed6908
Transpilation MD5: 7668b40e654b1fc3ec04a78ba30a0396
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-06-23 11:07:45
Source code size: 5036 bytes / 149 lines
Pitched / IR pitched: No / No
Views / Downloads: 118 / 168
Referenced in: [show references]