Transpiled version (5916L) is out of date.
// This is the class we are experimenting on, so compressed texts
// should not be stored.
// Use AdaptiveIdentifierCompression_LTS1 for that

// Adaptive compression of identifier tokens.
//
// The encoder replaces identifiers it has seen before with short codes
// drawn from codeAlphabet. No dictionary is transmitted: the decoder
// feeds every decoded identifier back through compressIdentifier(), so
// it builds the same shortenings/expansions tables as the encoder.
// Identifiers that would be ambiguous in the output (the escape word
// itself, or a literal token that equals a code) are sent as
// escapeWord + " " + token.
//
// An instance is stateful: use one instance for encoding and a separate
// fresh instance (with the same settings) for decoding.
sclass AdaptiveIdentifierCompression {
  // identifier -> code that replaces it
  new SS shortenings;

  // how often each identifier has been passed to compressIdentifier()
  new MultiSet<S> tokenCount;

  // code -> identifier it stands for (inverse of shortenings)
  new SS expansions;

  // identifier that announces "the next identifier is literal"
  settable S escapeWord = "Z";

  // characters the codes are built from
  S codeAlphabet = lowerCaseAlphabet();

  // source of candidate codes; created lazily in init()
  ItIt<S> newCodeMaker;

  // the code that will be assigned next; null when no codes are left
  S nextCode;

  // an identifier only gets a code once it has been seen this often
  settable int minCountToCompress = 1;

  // if true, no code is assigned when it is not shorter than the token
  settable bool skipSameSizeShortenings = false;

  // number of escape sequences produced so far
  int escapeWordsUsed;

  // Lazily creates the code generator and fetches the first code.
  void init {
    if (newCodeMaker == null) {
      newCodeMaker = allWordsOfAlphabet(codeAlphabet);
      // skip an empty first code
      // (presumably the generator yields the empty word first - TODO confirm)
      if (empty(newCode())) newCode();
    }
  }

  // Encodes a single token. Non-identifier tokens pass through unchanged.
  S encode(S token) {
    if (!isIdentifier(token)) ret token;

    init();
    ret compressIdentifier(token);
  }

  // Core of the algorithm, shared by encoder and decoder: registers one
  // occurrence of the identifier, updates the code tables and returns
  // the text the encoder emits for it.
  S compressIdentifier(S token) {
    tokenCount.add(token);

    // already has a code -> emit the code
    S code = shortenings.get(token);
    //printVars compressIdentifier(+token, +code);
    if (code != null)
      ret code;

    // escape word appears in the input text:
    // send it escaped and try to give it a code for next time
    if (eq(token, escapeWord)) {
      createCodeFor(token);
      //print("Made code for escape word " + escapeWord + ": " + code);
      ret escapeSequence(token);
    }

    int count = tokenCount.get(token);
    bool frequentEnough = count >= minCountToCompress;

    // Token clashes with a code we created. This must be checked before
    // the minCountToCompress shortcut below - otherwise the token would
    // be emitted raw and the decoder would expand it as a code.
    // It's not a problem - we just send the escape word
    // and (once seen often enough) the token gets a code of its own.
    if (expansions.get(token) != null) {
      //printVars(+token, +existingMeaning);
      if (frequentEnough)
        createCodeFor(token);
      ret escapeSequence(token);
    }

    // not seen often enough yet -> send as is
    if (!frequentEnough)
      ret token;

    // token equals the code we would hand out next:
    // drop that code and send the token escaped
    if (eq(token, nextCode)) {
      newCode();
      ret escapeSequence(token);
    }

    ret createCodeFor(token);
  }

  // Counts one use of the escape word and returns the escaped form of token.
  private S escapeSequence(S token) {
    ++escapeWordsUsed;
    ret escapeWord + " " + token;
  }

  // Assigns nextCode to the token (unless out of codes, or the code is
  // skipped because of skipSameSizeShortenings). Always returns the
  // token itself: on first occurrence the original text is transmitted.
  S createCodeFor(S token) {
    // if out of codes (unlikely), return token as is
    if (nextCode == null)
      ret token;

    S code = nextCode;

    if (skipSameSizeShortenings && l(code) >= l(token)) {
      //printVars("Skipped shortening", +token, +code);
      ret token;
    }

    newCode();

    //printVars("Made shortening", +token, +code);
    shortenings.put(token, code);
    expansions.put(code, token);

    // first time, so return original token
    ret token;
  }

  // Advances nextCode to the next candidate that is neither the escape
  // word nor an identifier that already has a shortening.
  // Returns the new nextCode, or null when the generator is exhausted.
  S newCode() {
    do {
      if (!newCodeMaker.hasNext()) {
        // Clear nextCode so createCodeFor() stops handing out codes
        // instead of reusing the previous one.
        nextCode = null;
        ret null;
      }
      nextCode = newCodeMaker.next();
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
    //print("Have next code", nextCode);
    ret nextCode;
  }

  /// decoder

  // true after the escape word was read, until the next identifier
  bool escape;

  // Decodes a single token of compressed text. Returns "" for tokens
  // that belong to an escape sequence (the escape word itself and the
  // non-identifier token following it).
  S decode(S token) {
    if (!isIdentifier(token)) ret escape ? "" : token;

    init();

    // identifier following the escape word is literal
    if (escape) {
      escape = false;
      ret decoded(token);
    }

    if (eq(token, escapeWord)) {
      set escape;
      ret "";
    }

    // known code -> original identifier
    S expanded = expansions.get(token);
    if (expanded != null)
      ret decoded(expanded);

    ret decoded(token);
  }

  // Replays the encoder's bookkeeping for a decoded identifier so both
  // sides keep identical tables, then returns the identifier.
  S decoded(S token) {
    compressIdentifier(token);
    ret token;
  }

  // utils

  // use some sort of default tokenizer (letterSeqOnlyTok)

  // Tokenizes text and encodes every token.
  S compress(S text) {
    LS tok = letterSeqOnlyTok(text);
    ret concatMapStrings encode(tok);
  }

  // Tokenizes compressed text and decodes every token.
  S decompress(S text) {
    LS tok = letterSeqOnlyTok(text);
    ret concatMapStrings decode(tok);
  }
}
download show line numbers debug dex old transpilations
Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj
No comments. add comment
Snippet ID: | #1034370 |
Snippet name: | AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work] |
Eternal ID of this version: | #1034370/49 |
Text MD5: | 8cee69f775dddc26a04d47405e2a2535 |
Author: | stefan |
Category: | javax / compressing text |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2022-02-11 21:15:11 |
Source code size: | 3603 bytes / 153 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 215 / 426 |
Version history: | 48 change(s) |
Referenced in: | [show references] |