Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

153
LINES

< > BotCompany Repo | #1034370 // AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work]

JavaX fragment (include) [tags: use-pretranspiled]

Transpiled version (5916L) is out of date.

1  
// This is the class we are experimenting on, so compressed texts
2  
// should not be stored.
3  
// Use AdaptiveIdentifierCompression_LTS1 for that
4  
5  
sclass AdaptiveIdentifierCompression {
6  
  new SS shortenings;
7  
  new MultiSet<S> tokenCount;
8  
  new SS expansions;
9  
  settable S escapeWord = "Z";
10  
  S codeAlphabet = lowerCaseAlphabet();
11  
  ItIt<S> newCodeMaker;
12  
  S nextCode;
13  
  settable int minCountToCompress = 1;
14  
  settable bool skipSameSizeShortenings = false;
15  
  int escapeWordsUsed;
16  
17  
  void init {
18  
    if (newCodeMaker == null) {
19  
      newCodeMaker = allWordsOfAlphabet(codeAlphabet);
20  
      if (empty(newCode())) newCode();
21  
    }
22  
  }
23  
  
24  
  S encode(S token) {
25  
    if (!isIdentifier(token)) ret token;
26  
    
27  
    init();
28  
    ret compressIdentifier(token);
29  
  }
30  
  
31  
  S compressIdentifier(S token) {
32  
    tokenCount.add(token);
33  
    
34  
    S code = shortenings.get(token);
35  
    //printVars compressIdentifier(+token, +code);
36  
    if (code != null)
37  
      ret code;
38  
      
39  
    // check if escapeWord appears in input text
40  
    
41  
    if (eq(token, escapeWord)) {
42  
      code = createCodeFor(token);
43  
      //print("Made code for escape word " + escapeWord + ": " + code);
44  
      ++escapeWordsUsed;
45  
      ret escapeWord + " " + token;
46  
    }
47  
    
48  
    int count = tokenCount.get(token);
49  
    if (count < minCountToCompress)
50  
      ret token;
51  
      
52  
    if (eq(token, nextCode)) {
53  
      newCode();
54  
      ++escapeWordsUsed;
55  
      ret escapeWord + " " + token;
56  
    }
57  
    
58  
    // check if token clashes with a code we created
59  
    
60  
    S existingMeaning = expansions.get(token);
61  
    
62  
    if (existingMeaning != null) {
63  
      //printVars(+token, +existingMeaning);
64  
      
65  
      // It's not a problem - we just send the escape word
66  
      // and the token will get a new code.
67  
      createCodeFor(token);
68  
      ++escapeWordsUsed;
69  
      ret escapeWord + " " + token;
70  
    }
71  
    
72  
    ret createCodeFor(token);
73  
  }
74  
  
75  
  S createCodeFor(S token) {
76  
    // create code for token
77  
78  
    // if out of codes (unlikely), return token as is
79  
    if (nextCode == null)
80  
      ret token;
81  
    
82  
    S code = nextCode;
83  
    
84  
    if (skipSameSizeShortenings && l(code) >= l(token)) {
85  
      //printVars("Skipped shortening", +token, +code);
86  
      ret token;
87  
    }
88  
      
89  
    newCode();
90  
    
91  
    //printVars("Made shortening", +token, +code);
92  
    shortenings.put(token, code);
93  
    expansions.put(code, token);
94  
    
95  
    // first time, so return original token
96  
    ret token;
97  
  }
98  
  
99  
  S newCode() {
100  
    do {
101  
      if (!newCodeMaker.hasNext())
102  
        null;
103  
      nextCode = newCodeMaker.next();
104  
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
105  
    //print("Have next code", nextCode);
106  
    ret nextCode;
107  
  }
108  
  
109  
  /// decoder
110  
  
111  
  bool escape;
112  
  
113  
  S decode(S token) {
114  
    if (!isIdentifier(token)) ret escape ? "" : token;
115  
    
116  
    init();
117  
    
118  
    if (escape) {
119  
      escape = false;
120  
      ret decoded(token);
121  
    }
122  
    
123  
    if (eq(token, escapeWord)) {
124  
      set escape;
125  
      ret "";
126  
    }
127  
    
128  
    S expanded = expansions.get(token);
129  
    if (expanded != null)
130  
      ret decoded(expanded);
131  
      
132  
    ret decoded(token);
133  
  }
134  
  
135  
  S decoded(S token) {
136  
    compressIdentifier(token);
137  
    ret token;
138  
  }
139  
  
140  
  // utils
141  
  
142  
  // use some sort of default tokenizer (letterSeqOnlyTok)
143  
  
144  
  S compress(S text) {
145  
    LS tok = letterSeqOnlyTok(text);
146  
    ret concatMapStrings encode(tok);
147  
  }
148  
  
149  
  S decompress(S text) {
150  
    LS tok = letterSeqOnlyTok(text);
151  
    ret concatMapStrings decode(tok);
152  
  }
153  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj

No comments. add comment

Snippet ID: #1034370
Snippet name: AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work]
Eternal ID of this version: #1034370/49
Text MD5: 8cee69f775dddc26a04d47405e2a2535
Author: stefan
Category: javax / compressing text
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2022-02-11 21:15:11
Source code size: 3603 bytes / 153 lines
Pitched / IR pitched: No / No
Views / Downloads: 215 / 426
Version history: 48 change(s)
Referenced in: [show references]