Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

160
LINES

< > BotCompany Repo | #1028167 // Line-Based Multi-Version Compressor Spike [OK]

JavaX source code (Dynamic Module) [tags: use-pretranspiled] - run with: Stefan's OS

Uses 1489K of libraries. Click here for Pure Java version (9930L/52K).

1  
!7
2  
3  
cprint CompressorSpike {
4  
  // ID of the snippet whose version history is loaded and compressed by this module.
  // NOTE(review): `switchable` presumably exposes this field as a user-editable setting - confirm.
  switchable S snippetID = #1020763;
5  
  // Version key -> full text of that version, in insertion order.
  // Stays null until start-thread has loaded the history; reset to null when snippetID changes.
  LinkedHashMap<S> versions;
6  
  
7  
  // Module start-up: loads all versions of the snippet (unless already present in `versions`),
  // prints a summary and then runs a Compressor over them.
  start-thread {
8  
    print("Snippet ID: " + snippetID);
9  
    // When snippetID is changed, drop the loaded versions and reload the module.
    dm_onFieldChange snippetID(r { setField(versions := null); dm_reload() });
10  
    if (versions == null) {
11  
      // One entry per recorded text change (taken in reversed order):
      // key = version ID as string, value = the change's previousValue.
      versions = mapToLinkedHashMap(reversed(textChangesOfSnippet(snippetID)), sv -> pair(str(sv.versionID), sv.previousValue));
12  
      // The snippet's current text is added last under the key "latest".
      versions.put("latest", loadSnippet(snippetID));
13  
      versions = mapValuesToLinkedHashMap toLinesAndBack(versions); // canonicalize line breaks
14  
      // NOTE(review): change() presumably marks the module state as modified so it gets persisted - confirm.
      change();
15  
    }
16  
    print("Have " + nVersions(versions) + " with a total of " + nLines(totalLineCount(values(versions))));
17  
    
18  
    new Compressor().run();
19  
  }
20  
  
21  
  // Base type of all entries in the compressor's chunk list.
  abstract sclass Chunk {
22  
    // Returns the text this chunk stands for; `chunks` is the list that
    // index-based chunks (see CPair) look their referenced chunks up in.
    abstract S text(L<Chunk> chunks);
23  
  }
24  
  
25  
  // A chunk that refers to two other chunks by their indices (i1, i2) in the chunk list.
  srecord CPair(int i1, int i2) > Chunk {
26  
    // Convenience constructor taking both indices from a pair (a -> i1, b -> i2).
    CPair(Pair<Int> p) { i1 = p.a; i2 = p.b; }
27  
    
28  
    // Text of the pair: the texts of chunk i1 and chunk i2 combined via linesLL_rtrim.
    // Resolves recursively, since the referenced chunks may themselves be CPairs.
    S text(L<Chunk> chunks) {
29  
      ret linesLL_rtrim(chunks.get(i1).text(chunks), chunks.get(i2).text(chunks));
30  
    }
31  
  }
32  
  
33  
  // A primitive chunk that directly holds its text `s` (the Compressor creates one per unique line).
  srecord CPrim(S s) > Chunk {
34  
    // Returns the stored text; the chunk list is not needed here.
    S text(L<Chunk> chunks) { ret s; }
35  
  }
36  
  
37  
  class Compressor {
38  
    // Source-level alias: an "Encodings" value maps a version key to its list of chunk indices.
    replace Encodings with Map<S, L<Int>>.
39  
    
40  
    // When true, run() sorts the unique lines before turning them into CPrim chunks.
    bool sortLines = true;
41  
    // Version key -> that version's text split into lines.
    Map<S, LS> textIDToLines = mapValuesToLinkedHashMap lines(versions);
42  
    // All distinct lines across all versions (filled in by run()).
    LS allUniqueLines;
43  
    // Chunk list: first the CPrim chunks, then any CPair chunks appended by makeCPair.
    new L<Chunk> chunks;
44  
    // Number of CPrim chunks, i.e. the size of `chunks` before any CPair was added.
    int primChunks;
45  
    // NOTE(review): presumably maps a line's text to its index in `chunks` (built via listIndex) - confirm.
    Map<S, Int> lineIndex;
46  
    // NOTE(review): not read or written anywhere in the visible code.
    new Map<PairS, Int> linePairIndex;
47  
    // Result of the pair compression, set at the end of the compression loop in run().
    Encodings finalEncodings;
48  
    
49  
    // Main routine: builds the primitive chunks, encodes every version as a list of
    // chunk indices, repeatedly replaces a frequent index pair by a new CPair chunk,
    // then exports the result to a .linecomp file, writes .zip/.tgz files for size
    // comparison and finally verifies that the exported file decompresses correctly.
    run {
50  
      allUniqueLines = uniquify(concatLists(values(textIDToLines)));
51  
      if (sortLines) sortInPlace(allUniqueLines);
52  
      for (S line : allUniqueLines)
53  
        chunks.add(new CPrim(line));
54  
      // At this point `chunks` contains only CPrims, so its size is the prim count.
      primChunks = l(chunks);
55  
      lineIndex = listIndex(collect s(chunks));
56  
      
57  
      // simple encoding (only direct line references)
58  
      Encodings simpleEncodings = mapValues(textIDToLines,
59  
        (IF1<LS, L<Int>>) (lines -> map(lines, line -> lineIndex.get(line))));
60  
      //printAndCheckEncodings(simpleEncodings);
61  
      
62  
      Encodings advancedEncodings = simpleEncodings;
63  
      // Apply one pair compression per iteration until compressPairs hands back
      // the very same map instance, which is its signal that nothing is left to compress.
      // NOTE(review): `while licensed` presumably also stops when the module is no longer allowed to run - confirm.
      while licensed {
64  
        Encodings e = compressPairs(advancedEncodings);
65  
        if (e == advancedEncodings) break;
66  
        advancedEncodings = e;
67  
      }
68  
69  
      finalEncodings = advancedEncodings;
70  
      printAndCheckEncodings(finalEncodings);
71  
      
72  
      S out = exportEncoding(finalEncodings);
73  
      printWithPrecedingNL(out);
74  
      File file = saveTextFile_infoBox(javaxDataDir("Compressed Snippet Version History/versions-of-" + psI(snippetID) + ".linecomp"), out);
75  
      // Also gzip the exported file and print the info of the result.
      print(renderFileInfo(gzipFile(file)));
76  
      
77  
      // Make .zip and .tgz for comparison
78  
      File zipFile = replaceFileExtension(file, ".zip");
79  
      {
80  
        // NOTE(review): `temp` presumably closes the stream at the end of this inner block,
        // before the zip file is inspected below - confirm.
        temp ZipOutputStream zipOut = zipOutputStream(zipFile);
81  
        // One zip entry per version, named by its version key.
        for (S id, text : versions)
82  
          zip_addTextFile(zipOut, id, text);
83  
      }
84  
      printFileInfo(zipFile);
85  
      
86  
      File tgzFile = replaceFileExtension(file, ".tgz");
87  
      zip2tgz(zipFile, tgzFile);
88  
      printFileInfo(tgzFile);
89  
      
90  
      checkDecompression(file, textIDToLines);
91  
    }
92  
    
93  
    // Reads `file` back through LineCompReader and asserts that it contains exactly the
    // version keys of `textIDToLines` (in the same order) and that each version's text
    // equals the original lines joined via lines_rtrim. Prints a success message afterwards.
    void checkDecompression(File file, Map<S, LS> textIDToLines) {
94  
      temp BufferedReader reader = bufferedUtf8Reader(file);
95  
      LineCompReader lcr = new(reader);
96  
      assertEquals(keysList(textIDToLines), asList(lcr.versions()));
97  
      for (S version : keys(textIDToLines))
98  
        assertEquals(lcr.textForVersion(version), lines_rtrim(textIDToLines.get(version)));
99  
      print("Decompression OK for " + nVersions(textIDToLines));
100  
    }
101  
    
102  
    // Renders the chunk list plus the given encodings as text, one item per line:
    //   1. header "LINECOMP <number of prim chunks>"
    //   2. one line per chunk, in chunk-list order: "<i1> <i2>" for a CPair,
    //      the stored text for a CPrim
    //   3. one line per version: "<version key>=<space-separated chunk indices>"
    // The lines are joined with lines_rtrim.
    S exportEncoding(Encodings encodings) {
103  
      new LS buf;
104  
      buf.add("LINECOMP " + primChunks); // magic signature
105  
      for (Chunk c : chunks) {
106  
        if (c cast CPair)
107  
          buf.add(c.i1 + " " + c.i2);
108  
        else
109  
          buf.add(((CPrim) c).s);
110  
      }
111  
      for (S id, L<Int> l : encodings)
112  
        buf.add(id + "=" + joinWithSpace(l));
113  
      ret lines_rtrim(buf);
114  
    }
115  
    
116  
    // Performs one compression step: counts adjacent index pairs over all encodings,
    // picks one via msMostPopularDuplicate, appends a CPair chunk for it and returns new
    // encodings in which occurrences of that pair are replaced by the new chunk's index.
    // If no pair qualifies, the SAME `encodings` instance is returned; run() relies on
    // this identity to end its loop.
    Encodings compressPairs(Encodings encodings) {
117  
      new MultiSet<Pair<Int>> pairCounts;
118  
      
119  
      for (L<Int> l : values(encodings)) {
120  
        Pair<Int> lastPair = null;
121  
        for (Pair<Int> pair : overlappingPairs(l)) {
122  
          // A pair equal to the immediately preceding pair is not counted.
          // NOTE(review): presumably to avoid counting overlapping occurrences in runs of the same index - confirm.
          if (neq(pair, lastPair)) {
123  
            lastPair = pair;
124  
            pairCounts.add(pair);
125  
          }
126  
        }
127  
      }
128  
      
129  
      //print("Pair counts: " + pairCounts);
130  
      Pair<Int> toCompress = msMostPopularDuplicate(pairCounts);
131  
132  
      // Compress only most popular pair
133  
      if (toCompress == null) ret encodings; // Nothing to do
134  
      int idx = makeCPair(toCompress);
135  
      print("Made pair: " + toCompress + " -> " + idx + ", " + (msNumberOfDuplicates(pairCounts)-1) + " remaining");
136  
      // Rewrite every encoding, substituting the two-element sublist by the single new index.
      ret mapValues(encodings, (IF1<L<Int>>) encoded ->
137  
        replaceSublist(encoded, pairToList(toCompress), ll(idx)));
138  
    }
139  
    
140  
    // Appends a new CPair chunk for the index pair `p` to `chunks` and returns its index.
    int makeCPair(Pair<Int> p) {
141  
      int idx = addAndReturnIndex(chunks, new CPair(p));
142  
      ret idx;
143  
    }
144  
    
145  
    // Prints each version's encoding ("<version key>: <indices>") and asserts that
    // decoding it reproduces that version's original lines (joined via lines()).
    void printAndCheckEncodings(Encodings encodings) {
146  
      for (S id, L<Int> encoded : encodings) {
147  
        print(id + ": " + joinWithSpace(encoded));
148  
        assertEquals(lines(textIDToLines.get(id)), decode(encoded));
149  
      }
150  
    }
151  
    
152  
    // Turns a list of chunk indices back into text: each index is resolved
    // through chunkText and the results are joined via lines().
    S decode(L<Int> encoded) {
153  
      ret lines(lambdaMap chunkText(encoded));
154  
    }
155  
    
156  
    // Returns the text of the chunk at position `idx` in `chunks`.
    S chunkText(int idx) {
157  
      ret chunks.get(idx).text(chunks);
158  
    }
159  
  }
160  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1028167
Snippet name: Line-Based Multi-Version Compressor Spike [OK]
Eternal ID of this version: #1028167/39
Text MD5: cb73aae1e1696b65f13d020985285c13
Transpilation MD5: c5b78a07d19cfc1bca13b0d7a45506c2
Author: stefan
Category: javax
Type: JavaX source code (Dynamic Module)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-05-26 22:11:16
Source code size: 5416 bytes / 160 lines
Pitched / IR pitched: No / No
Views / Downloads: 262 / 839
Version history: 38 change(s)
Referenced in: [show references]