Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

151
LINES

< > BotCompany Repo | #1028234 // LineCompCompressor, faster version backup before IntPair

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Compilation Failed (7093L/45K).

1  
sclass LineCompCompressor {
2  
  int safety = 0;
3  
  
4  
  replace Encodings with Map<S, L<Int>>.
5  
  
6  
  abstract sclass Chunk {
7  
    abstract S text(L<Chunk> chunks);
8  
  }
9  
  
10  
  srecord CPair(int i1, int i2) > Chunk {
11  
    CPair(Pair<Int> p) { i1 = p.a; i2 = p.b; }
12  
    
13  
    S text(L<Chunk> chunks) {
14  
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
15  
    }
16  
  }
17  
  
18  
  srecord CPrim(S s) > Chunk {
19  
    S text(L<Chunk> chunks) { ret s; }
20  
  }
21  
22  
  bool verbose = false, verboseCompressionSteps = false;
23  
  bool sortLines = true;
24  
  bool verify = true;
25  
  
26  
  Map<S, LS> textIDToLines;
27  
  LS allUniqueLines;
28  
  new L<Chunk> chunks;
29  
  int primChunks;
30  
  Map<S, Int> lineIndex;
31  
  new Map<PairS, Int> linePairIndex;
32  
  Encodings finalEncodings;
33  
  
34  
  // key = version ID, values = text
35  
  *(SS texts) {
36  
    textIDToLines = mapValuesToLinkedHashMap myToLines(texts);
37  
  }
38  
  
39  
  LS myToLines(S s) { ret toLines_nOnly_reversible(s); }
40  
  S myFromLines(LS l) { ret fromLines_rtrim(l); }
41  
  
42  
  run {
43  
    LS allLines = concatLists(values(textIDToLines));
44  
    if (verboseCompressionSteps) print("Uniquifying " + nLines(allLines));
45  
    allUniqueLines = uniquify(allLines);
46  
    if (verboseCompressionSteps) print("Have " + n2(allUniqueLines, "unique line"));
47  
    allLines = null; // allow me to forget
48  
    if (sortLines) sortInPlace(allUniqueLines);
49  
    if (verboseCompressionSteps) print("Sorted " + nLines(allUniqueLines));
50  
    for (S line : allUniqueLines)
51  
      chunks.add(new CPrim(line));
52  
    primChunks = l(chunks);
53  
    lineIndex = listIndex(collect s(chunks));
54  
    
55  
    // simple encoding (only direct line references)
56  
    Encodings simpleEncodings = mapValues(textIDToLines,
57  
      (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
58  
    //printAndCheckEncodings(simpleEncodings);
59  
    
60  
    if (verboseCompressionSteps) print("Have simple encodings");
61  
    finalEncodings = compressPairs(simpleEncodings);
62  
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
63  
  }
64  
  
65  
  void saveAsTextFile(File f) {
66  
    S out = exportEncoding(finalEncodings);
67  
    saveTextFile(f, out);
68  
    
69  
    if (verify) checkDecompression(f, textIDToLines);
70  
  }
71  
  
72  
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
73  
    temp BufferedReader reader = bufferedUtf8Reader(file);
74  
    LineCompReader lcr = new(reader);
75  
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
76  
    for (S version : keys(textIDToLines))
77  
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
78  
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
79  
  }
80  
  
81  
  S asText() { ret exportEncoding(finalEncodings); }
82  
  
83  
  S exportEncoding(Encodings encodings) {
84  
    new LS buf;
85  
    buf.add("LINECOMP " + primChunks); // magic signature
86  
    for (Chunk c : chunks) {
87  
      if (c cast CPair)
88  
        buf.add(c.i1 + " " + c.i2);
89  
      else
90  
        buf.add(((CPrim) c).s);
91  
    }
92  
    for (S id, L<Int> l : encodings)
93  
      buf.add(id + "=" + joinWithSpace(l));
94  
    ret lines_rtrim(buf);
95  
  }
96  
  
97  
  // new fast version of magic compression function
98  
  
99  
  Encodings compressPairs(Encodings encodings) {
100  
    // get initial pair counts
101  
    new LineComp_PairCounts pairCounts;
102  
    for (L<Int> l : values(encodings))
103  
      pairCounts.addAll(overlappingPairs(l));
104  
105  
    // Convert to LinkedList for more efficient modification
106  
    Map<S, L<Int>> encodings2 = /*mapValues toLinkedList*/(encodings);
107  
    
108  
    Pair<Int> toCompress;
109  
    // Compress only most popular pair in one step
110  
    //int lastDups = Int.MAX_VALUE;
111  
    while ping ((toCompress = pairCounts.mostPopularDuplicate()) != null) {
112  
      if (safety > 0 && --safety <= 0) fail("safety");
113  
      int count = pairCounts.getCount(toCompress), idx = makeCPair(toCompress);
114  
      int dups = pairCounts.numberOfDuplicates();
115  
      /*if (lastDups == dups)
116  
        fail("Number of duplicates not decreasing");
117  
      lastDups = dups;*/
118  
      if (verboseCompressionSteps) print("Compressing pair " + toCompress + " (count=" + count + ") -> " + idx + ", " + (dups-1) + " remaining");
119  
      for (L<Int> l : values(encodings2))
120  
        compressPair(pairCounts, l, toCompress, idx);
121  
    }
122  
    
123  
    // reconvert to normal list
124  
    ret mapValues toArrayList(encodings2);
125  
  }
126  
  
127  
  // replace replacing (pair) with replaceWith in l
128  
  void compressPair(LineComp_PairCounts pairCounts, L<Int> l, Pair<Int> replacing, int replaceWith) {
129  
    lineComp_replaceSublistWithUpdatingPairCount(l, pairToList(replacing), ll(replaceWith), pairCounts);
130  
  }
131  
  
132  
  int makeCPair(Pair<Int> p) {
133  
    int idx = addAndReturnIndex(chunks, new CPair(p));
134  
    ret idx;
135  
  }
136  
  
137  
  void printAndCheckEncodings(Encodings encodings) {
138  
    for (S id, L<Int> encoded : encodings) {
139  
      if (verbose) print(id + ": " + joinWithSpace(encoded));
140  
      assertEquals(myFromLines(textIDToLines.get(id)), decode(encoded));
141  
    }
142  
  }
143  
  
144  
  S decode(L<Int> encoded) {
145  
    ret myFromLines(lambdaMap chunkText(encoded));
146  
  }
147  
  
148  
  S chunkText(int idx) {
149  
    ret chunks.get(idx).text(chunks);
150  
  }
151  
}

Author comment

Began life as a copy of #1028186

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028234
Snippet name: LineCompCompressor, faster version backup before IntPair
Eternal ID of this version: #1028234/4
Text MD5: b48988ec7819b1a73eaafe978ff2a049
Transpilation MD5: 13a4af9ad25d0b8ed7a02fb5235cf3e5
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-05-27 16:41:26
Source code size: 5165 bytes / 151 lines
Pitched / IR pitched: No / No
Views / Downloads: 210 / 307
Version history: 3 change(s)
Referenced in: [show references]