Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

129
LINES

< > BotCompany Repo | #1028192 // LineCompCompressor v1 (with slow pair making, old)

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (6705L/42K).

1  
// Compresses a set of line-based texts (keyed by a version/text ID).
//
// Every distinct line is stored once as a primitive chunk (CPrim). Each text
// is then encoded as a list of chunk indices, and the most frequent adjacent
// index pair is repeatedly replaced by a new pair chunk (CPair) until
// compressPairs() finds nothing more to merge.
//
// Typical use, as far as this class shows: construct with the texts, call
// run(), then asText() or saveAsTextFile().
sclass LineCompCompressor {
  // Shorthand used throughout: text ID -> list of chunk indices
  replace Encodings with Map<S, L<Int>>.

  // One entry of the chunk table. text() resolves the chunk to the text it
  // stands for; the table is passed in so pair chunks can look up their parts.
  abstract sclass Chunk {
    abstract S text(L<Chunk> chunks);
  }

  // A chunk made of two other chunks, referenced by their table indices.
  srecord CPair(int i1, int i2) > Chunk {
    CPair(IntPair p) { i1 = p.a; i2 = p.b; }

    // Text of the first part followed by text of the second part, combined
    // by linesLL_rtrim (presumably a newline join - helper not visible here).
    S text(L<Chunk> chunks) {
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
    }
  }

  // A primitive chunk: one literal line.
  srecord CPrim(S s) > Chunk {
    S text(L<Chunk> chunks) { ret s; }
  }

  bool verbose = false;  // print diagnostics (encodings, pair creation, decompression check)
  bool sortLines = true; // sort the unique lines before assigning chunk indices
  bool verify = true;    // check encodings after run() and re-read the file after saving

  Map<S, LS> textIDToLines;  // input: text ID -> lines of that text
  LS allUniqueLines;         // every distinct input line (sorted if sortLines)
  new L<Chunk> chunks;       // chunk table: CPrims first, CPairs appended afterwards
  int primChunks;            // number of CPrim entries at the start of the table
  Map<S, Int> lineIndex;     // presumably line -> index in chunks (built by listIndex)
  new Map<PairS, Int> linePairIndex; // not referenced anywhere in this class
  Encodings finalEncodings;  // result of run()

  // key = version ID, values = text
  *(SS texts) {
    textIDToLines = mapValuesToLinkedHashMap lines(texts);
  }

  // Builds the chunk table and computes finalEncodings.
  run {
    allUniqueLines = uniquify(concatLists(values(textIDToLines)));
    if (sortLines) sortInPlace(allUniqueLines);
    for (S line : allUniqueLines)
      chunks.add(new CPrim(line));
    // Only CPrims are in the table at this point, so its size is the prim count
    primChunks = l(chunks);
    lineIndex = listIndex(collect s(chunks));

    // simple encoding (only direct line references)
    Encodings simpleEncodings = mapValues(textIDToLines,
      (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
    //printAndCheckEncodings(simpleEncodings);

    // Each compressPairs call introduces at most one new pair; it returns its
    // argument unchanged when nothing is left to merge. repeatUntilSame
    // presumably re-applies it until that happens - helper not visible here.
    finalEncodings = repeatUntilSame compressPairs(simpleEncodings);
    if (verbose || verify) printAndCheckEncodings(finalEncodings);
  }

  // Writes the exported encoding to f and, if verify is set, reads the file
  // back and compares it against the input texts.
  void saveAsTextFile(File f) {
    S out = exportEncoding(finalEncodings);
    saveTextFile(f, out);

    if (verify) checkDecompression(f, textIDToLines);
  }

  // Reads file through LineCompReader and asserts that it yields the same
  // version IDs (in the same order) and the same text for every version.
  // Note: the parameter shadows the field of the same name.
  void checkDecompression(File file, Map<S, LS> textIDToLines) {
    temp BufferedReader reader = bufferedUtf8Reader(file);
    LineCompReader lcr = new(reader);
    assertEquals(keysList(textIDToLines), asList(lcr.versions()));
    for (S version : keys(textIDToLines))
      assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
    if (verbose) print("Decompression OK for " + nVersions(textIDToLines));
  }

  // The compressed representation as a string (requires run() to have set
  // finalEncodings).
  S asText() { ret exportEncoding(finalEncodings); }

  // Serializes the chunk table plus the given encodings:
  //   line 1:         "LINECOMP <number of primitive chunks>"
  //   one per chunk:  the literal line for a CPrim, "<i1> <i2>" for a CPair
  //   one per text:   "<id>=<space-separated chunk indices>"
  S exportEncoding(Encodings encodings) {
    new LS buf;
    buf.add("LINECOMP " + primChunks); // magic signature
    for (Chunk c : chunks) {
      if (c cast CPair)
        buf.add(c.i1 + " " + c.i2);
      else
        buf.add(((CPrim) c).s);
    }
    for (S id, L<Int> l : encodings)
      buf.add(id + "=" + joinWithSpace(l));
    ret lines_rtrim(buf);
  }

  // One compression step: counts adjacent index pairs over all encodings,
  // picks one via msMostPopularDuplicate (presumably the most frequent pair
  // occurring more than once; null if there is none), adds a CPair for it and
  // substitutes the new chunk index for that pair in every encoding.
  // Returns the encodings argument itself when there is nothing to compress.
  Encodings compressPairs(Encodings encodings) {
    new MultiSet<IntPair> pairCounts;

    for (L<Int> l : values(encodings)) {
      IntPair lastPair = null;
      for (IntPair pair : overlappingIntPairs(l)) {
        // A pair equal to the one directly before it is counted only once
        // (presumably so runs like "a a a" are not counted as two
        // overlapping occurrences of (a, a))
        if (neq(pair, lastPair)) {
          lastPair = pair;
          pairCounts.add(pair);
        }
      }
    }

    //print("Pair counts: " + pairCounts);
    IntPair toCompress = msMostPopularDuplicate(pairCounts);

    // Compress only most popular pair
    if (toCompress == null) ret encodings; // Nothing to do
    int idx = makeCPair(toCompress);
    // Progress output is emitted once per created pair, so it is gated on
    // verbose like the other diagnostics in this class
    if (verbose)
      print("Made pair: " + toCompress + " -> " + idx + ", " + (msNumberOfDuplicates(pairCounts)-1) + " remaining");
    ret mapValues(encodings, (IF1<L<Int>>) encoded ->
      replaceSublist(encoded, ll(toCompress.a, toCompress.b), ll(idx)));
  }

  // Appends a CPair for p to the chunk table and returns its index.
  int makeCPair(IntPair p) {
    ret addAndReturnIndex(chunks, new CPair(p));
  }

  // Asserts that every encoding decodes back to its original text;
  // additionally prints each encoding when verbose is set.
  void printAndCheckEncodings(Encodings encodings) {
    for (S id, L<Int> encoded : encodings) {
      if (verbose) print(id + ": " + joinWithSpace(encoded));
      assertEquals(lines(textIDToLines.get(id)), decode(encoded));
    }
  }

  // Expands a list of chunk indices back into text.
  S decode(L<Int> encoded) {
    ret lines(lambdaMap chunkText(encoded));
  }

  // Text represented by the chunk at index idx.
  S chunkText(int idx) {
    ret chunks.get(idx).text(chunks);
  }
}

Author comment

Began life as a copy of #1028186

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028192
Snippet name: LineCompCompressor v1 (with slow pair making, old)
Eternal ID of this version: #1028192/7
Text MD5: f8a03e59e758a9ec8f0dd0e399660ec4
Transpilation MD5: 4e19cec1e47c585bb409647f3b0afd78
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-05-27 13:36:25
Source code size: 3951 bytes / 129 lines
Pitched / IR pitched: No / No
Views / Downloads: 149 / 235
Version history: 6 change(s)
Referenced in: [show references]