Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

161
LINES

< > BotCompany Repo | #1034535 // AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (6342L/34K).

1  
sclass AdaptiveIdentifierCompression_LTS1 {
2  
  new SS shortenings;
3  
  new MultiSet<S> tokenCount;
4  
  new SS expansions;
5  
  settable S escapeWord = "Z";
6  
  settable S codeAlphabet = lowerAndUpperCaseAlphabet();
7  
  settable int minCodeLength = 2;
8  
  ItIt<S> codeMaker;
9  
  S nextCode;
10  
  settable int minCountToCompress = 1;
11  
  settable bool skipSameSizeShortenings = false;
12  
  int escapeWordsUsed;
13  
14  
  void init {
15  
    if (codeMaker == null) {
16  
      codeMaker = createCodeMaker();
17  
      newCode();
18  
    }
19  
  }
20  
  
21  
  S encode(S token) {
22  
    if (!isCompressibleToken(token)) ret token;
23  
    
24  
    init();
25  
    ret compressIdentifier(token);
26  
  }
27  
  
28  
  S compressIdentifier(S token) {
29  
    tokenCount.add(token);
30  
    
31  
    S code = shortenings.get(token);
32  
    //printVars compressIdentifier(+token, +code);
33  
    if (code != null)
34  
      ret code;
35  
      
36  
    // check if escapeWord appears in input text
37  
    
38  
    if (eq(token, escapeWord)) {
39  
      code = createCodeFor(token);
40  
      //print("Made code for escape word " + escapeWord + ": " + code);
41  
      ++escapeWordsUsed;
42  
      ret escapeWord + " " + token;
43  
    }
44  
    
45  
    int count = tokenCount.get(token);
46  
    if (count < minCountToCompress)
47  
      ret token;
48  
      
49  
    if (eq(token, nextCode)) {
50  
      newCode();
51  
      ++escapeWordsUsed;
52  
      ret escapeWord + " " + token;
53  
    }
54  
    
55  
    // check if token clashes with a code we created
56  
    
57  
    S existingMeaning = expansions.get(token);
58  
    
59  
    if (existingMeaning != null) {
60  
      //printVars(+token, +existingMeaning);
61  
      
62  
      // It's not a problem - we just send the escape word
63  
      // and the token will get a new code.
64  
      createCodeFor(token);
65  
      ++escapeWordsUsed;
66  
      ret escapeWord + " " + token;
67  
    }
68  
    
69  
    ret createCodeFor(token);
70  
  }
71  
  
72  
  S createCodeFor(S token) {
73  
    // create code for token
74  
75  
    // if out of codes (unlikely), return token as is
76  
    if (nextCode == null)
77  
      ret token;
78  
    
79  
    S code = nextCode;
80  
    
81  
    if (skipSameSizeShortenings && l(code) >= l(token)) {
82  
      //printVars("Skipped shortening", +token, +code);
83  
      ret token;
84  
    }
85  
      
86  
    newCode();
87  
    
88  
    //printVars("Made shortening", +token, +code);
89  
    shortenings.put(token, code);
90  
    expansions.put(code, token);
91  
    
92  
    // first time, so return original token
93  
    ret token;
94  
  }
95  
  
96  
  S newCode() {
97  
    do {
98  
      if (!codeMaker.hasNext())
99  
        null;
100  
      nextCode = codeMaker.next();
101  
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
102  
    //print("Have next code", nextCode);
103  
    ret nextCode;
104  
  }
105  
  
106  
  /// decoder
107  
  
108  
  bool escape;
109  
  
110  
  S decode(S token) {
111  
    if (!isCompressibleToken(token)) ret escape ? "" : token;
112  
    
113  
    init();
114  
    
115  
    if (escape) {
116  
      escape = false;
117  
      ret decoded(token);
118  
    }
119  
    
120  
    if (eq(token, escapeWord)) {
121  
      set escape;
122  
      ret "";
123  
    }
124  
    
125  
    S expanded = expansions.get(token);
126  
    if (expanded != null)
127  
      ret decoded(expanded);
128  
      
129  
    ret decoded(token);
130  
  }
131  
  
132  
  S decoded(S token) {
133  
    compressIdentifier(token);
134  
    ret token;
135  
  }
136  
  
137  
  // must also be true for encoded tokens
138  
  bool isCompressibleToken(S token) {
139  
    ret startsWithLetter(token);
140  
  }
141  
  
142  
  // utils
143  
  
144  
  // use some sort of simple tokenizer that is compatible
145  
  
146  
  LS tokenize(S text) {
147  
    ret letterDigitSeqOnlyTok(text);
148  
  }
149  
  
150  
  S compress(S text) {
151  
    ret concatMapStrings encode(tokenize(text));
152  
  }
153  
  
154  
  S decompress(S text) {
155  
    ret concatMapStrings decode(tokenize(text));
156  
  }
157  
  
158  
  ItIt<S> createCodeMaker() {
159  
    ret LexicographicIterator(codeAlphabet, minCodeLength);
160  
  }
161  
}

Author comment

Began life as a copy of #1034370

download  show line numbers  debug dex  old transpilations   

Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj

No comments. add comment

Snippet ID: #1034535
Snippet name: AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.]
Eternal ID of this version: #1034535/6
Text MD5: 997fb7b617df8f033bcaa741d6b0a3cd
Transpilation MD5: 9a5817d0865522186f0a76f92501e494
Author: stefan
Category: javax / compressing text
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2022-02-11 22:11:16
Source code size: 3718 bytes / 161 lines
Pitched / IR pitched: No / No
Views / Downloads: 75 / 152
Version history: 5 change(s)
Referenced in: [show references]