Libraryless. Click here for Pure Java version (6342L/34K).
1 | sclass AdaptiveIdentifierCompression_LTS1 { |
2 | new SS shortenings; |
3 | new MultiSet<S> tokenCount; |
4 | new SS expansions; |
5 | settable S escapeWord = "Z"; |
6 | settable S codeAlphabet = lowerAndUpperCaseAlphabet(); |
7 | settable int minCodeLength = 2; |
8 | ItIt<S> codeMaker; |
9 | S nextCode; |
10 | settable int minCountToCompress = 1; |
11 | settable bool skipSameSizeShortenings = false; |
12 | int escapeWordsUsed; |
13 | |
14 | void init { |
15 | if (codeMaker == null) { |
16 | codeMaker = createCodeMaker(); |
17 | newCode(); |
18 | } |
19 | } |
20 | |
21 | S encode(S token) { |
22 | if (!isCompressibleToken(token)) ret token; |
23 | |
24 | init(); |
25 | ret compressIdentifier(token); |
26 | } |
27 | |
28 | S compressIdentifier(S token) { |
29 | tokenCount.add(token); |
30 | |
31 | S code = shortenings.get(token); |
32 | //printVars compressIdentifier(+token, +code); |
33 | if (code != null) |
34 | ret code; |
35 | |
36 | // check if escapeWord appears in input text |
37 | |
38 | if (eq(token, escapeWord)) { |
39 | code = createCodeFor(token); |
40 | //print("Made code for escape word " + escapeWord + ": " + code); |
41 | ++escapeWordsUsed; |
42 | ret escapeWord + " " + token; |
43 | } |
44 | |
45 | int count = tokenCount.get(token); |
46 | if (count < minCountToCompress) |
47 | ret token; |
48 | |
49 | if (eq(token, nextCode)) { |
50 | newCode(); |
51 | ++escapeWordsUsed; |
52 | ret escapeWord + " " + token; |
53 | } |
54 | |
55 | // check if token clashes with a code we created |
56 | |
57 | S existingMeaning = expansions.get(token); |
58 | |
59 | if (existingMeaning != null) { |
60 | //printVars(+token, +existingMeaning); |
61 | |
62 | // It's not a problem - we just send the escape word |
63 | // and the token will get a new code. |
64 | createCodeFor(token); |
65 | ++escapeWordsUsed; |
66 | ret escapeWord + " " + token; |
67 | } |
68 | |
69 | ret createCodeFor(token); |
70 | } |
71 | |
72 | S createCodeFor(S token) { |
73 | // create code for token |
74 | |
75 | // if out of codes (unlikely), return token as is |
76 | if (nextCode == null) |
77 | ret token; |
78 | |
79 | S code = nextCode; |
80 | |
81 | if (skipSameSizeShortenings && l(code) >= l(token)) { |
82 | //printVars("Skipped shortening", +token, +code); |
83 | ret token; |
84 | } |
85 | |
86 | newCode(); |
87 | |
88 | //printVars("Made shortening", +token, +code); |
89 | shortenings.put(token, code); |
90 | expansions.put(code, token); |
91 | |
92 | // first time, so return original token |
93 | ret token; |
94 | } |
95 | |
96 | S newCode() { |
97 | do { |
98 | if (!codeMaker.hasNext()) |
99 | null; |
100 | nextCode = codeMaker.next(); |
101 | } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord)); |
102 | //print("Have next code", nextCode); |
103 | ret nextCode; |
104 | } |
105 | |
106 | /// decoder |
107 | |
108 | bool escape; |
109 | |
110 | S decode(S token) { |
111 | if (!isCompressibleToken(token)) ret escape ? "" : token; |
112 | |
113 | init(); |
114 | |
115 | if (escape) { |
116 | escape = false; |
117 | ret decoded(token); |
118 | } |
119 | |
120 | if (eq(token, escapeWord)) { |
121 | set escape; |
122 | ret ""; |
123 | } |
124 | |
125 | S expanded = expansions.get(token); |
126 | if (expanded != null) |
127 | ret decoded(expanded); |
128 | |
129 | ret decoded(token); |
130 | } |
131 | |
132 | S decoded(S token) { |
133 | compressIdentifier(token); |
134 | ret token; |
135 | } |
136 | |
137 | // must also be true for encoded tokens |
138 | bool isCompressibleToken(S token) { |
139 | ret startsWithLetter(token); |
140 | } |
141 | |
142 | // utils |
143 | |
144 | // use some sort of simple tokenizer that is compatible |
145 | |
146 | LS tokenize(S text) { |
147 | ret letterDigitSeqOnlyTok(text); |
148 | } |
149 | |
150 | S compress(S text) { |
151 | ret concatMapStrings encode(tokenize(text)); |
152 | } |
153 | |
154 | S decompress(S text) { |
155 | ret concatMapStrings decode(tokenize(text)); |
156 | } |
157 | |
158 | ItIt<S> createCodeMaker() { |
159 | ret LexicographicIterator(codeAlphabet, minCodeLength); |
160 | } |
161 | } |
Began life as a copy of #1034370
download show line numbers debug dex old transpilations
Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj
No comments. add comment
Snippet ID: | #1034535 |
Snippet name: | AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.] |
Eternal ID of this version: | #1034535/6 |
Text MD5: | 997fb7b617df8f033bcaa741d6b0a3cd |
Transpilation MD5: | 9a5817d0865522186f0a76f92501e494 |
Author: | stefan |
Category: | javax / compressing text |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2022-02-11 22:11:16 |
Source code size: | 3718 bytes / 161 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 141 / 240 |
Version history: | 5 change(s) |
Referenced in: | [show references] |