AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work]

// This is the class we are experimenting on, so compressed texts
// should not be stored.
// Use AdaptiveIdentifierCompression_LTS1 for that

sclass AdaptiveIdentifierCompression {
  // Adaptive dictionary codec for identifiers.
  //
  // The first time an identifier is seen it is emitted unchanged and a short
  // code is reserved for it; later occurrences are emitted as that code.
  // The decoder rebuilds the very same tables by feeding every decoded
  // identifier back through compressIdentifier, so no dictionary has to be
  // transmitted.
  //
  // encode/decode both mutate the same tables, so one instance should be
  // used for one direction only (a fresh instance for compressing, another
  // fresh instance for decompressing).

  // token -> code assigned to it
  new SS shortenings;

  // number of times each token was passed to compressIdentifier
  new MultiSet<S> tokenCount;

  // code -> token it stands for (inverse of shortenings)
  new SS expansions;

  // Word announcing that the following identifier is literal.
  // Never handed out as a code (see newCode).
  settable S escapeWord = "Z";

  // Alphabet the codes are built from.
  S codeAlphabet = lowerCaseAlphabet();

  // Source of fresh codes; created lazily in init.
  ItIt<S> newCodeMaker;

  // The code the next new token will receive; null when no codes are left.
  S nextCode;

  // A token only gets a code once it has been seen this many times.
  settable int minCountToCompress = 1;

  // If true, no code is assigned when it would not be shorter than the token.
  settable bool skipSameSizeShortenings = false;

  // Number of escape sequences produced by compressIdentifier.
  // (The decoder also runs compressIdentifier, so it counts there too.)
  int escapeWordsUsed;

  // Lazily creates the code generator and fetches the first usable code.
  void init {
    if (newCodeMaker == null) {
      newCodeMaker = allWordsOfAlphabet(codeAlphabet);
      // if the first word delivered is empty, skip it
      if (empty(newCode())) newCode();
    }
  }
  
  // Encodes a single token. Non-identifiers pass through untouched.
  S encode(S token) {
    if (!isIdentifier(token)) ret token;
    
    init();
    ret compressIdentifier(token);
  }
  
  // Core state transition, shared by encoder and decoder.
  // Returns what the encoder has to emit for this identifier: its code,
  // the identifier itself, or escapeWord + " " + identifier.
  // The statement order matters: the decoder replays exactly this method
  // to stay in sync with the encoder.
  S compressIdentifier(S token) {
    tokenCount.add(token);
    
    S code = shortenings.get(token);
    //printVars compressIdentifier(+token, +code);
    if (code != null)
      ret code;
      
    // check if escapeWord appears in input text
    
    if (eq(token, escapeWord)) {
      createCodeFor(token);
      //print("Made code for escape word " + escapeWord + ": " + code);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    int count = tokenCount.get(token);
    if (count < minCountToCompress) {
      // Not eligible for a code yet. It still has to be escaped if it is
      // spelled like a code that is already in use - sent raw, the decoder
      // would expand it to that code's token instead.
      if (expansions.containsKey(token)) {
        ++escapeWordsUsed;
        ret escapeWord + " " + token;
      }
      ret token;
    }
      
    // token is spelled like the code we would hand out next:
    // escape it and move on to another code
    if (eq(token, nextCode)) {
      newCode();
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    // check if token clashes with a code we created
    
    S existingMeaning = expansions.get(token);
    
    if (existingMeaning != null) {
      //printVars(+token, +existingMeaning);
      
      // It's not a problem - we just send the escape word
      // and the token will get a new code.
      createCodeFor(token);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    ret createCodeFor(token);
  }
  
  // Registers nextCode as the code for token (unless out of codes or the
  // shortening is skipped) and advances to the following code.
  // Always returns the token itself, because the occurrence that creates
  // a code is still transmitted in full.
  S createCodeFor(S token) {
    // if out of codes (unlikely), return token as is
    if (nextCode == null)
      ret token;
    
    S code = nextCode;
    
    if (skipSameSizeShortenings && l(code) >= l(token)) {
      //printVars("Skipped shortening", +token, +code);
      ret token;
    }
      
    newCode();
    
    //printVars("Made shortening", +token, +code);
    shortenings.put(token, code);
    expansions.put(code, token);
    
    // first time, so return original token
    ret token;
  }
  
  // Advances nextCode to the next generated word that is neither the
  // escape word nor a token that already owns a code.
  // Returns the new nextCode, or null when the generator is exhausted.
  S newCode() {
    do {
      if (!newCodeMaker.hasNext()) {
        // Clear nextCode so createCodeFor's "out of codes" check really
        // triggers instead of reusing the last code for several tokens.
        nextCode = null;
        ret null;
      }
      nextCode = newCodeMaker.next();
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
    //print("Have next code", nextCode);
    ret nextCode;
  }
  
  /// decoder
  
  // true while the decoder is waiting for the identifier that follows
  // an escape word
  bool escape;
  
  // Decodes a single token of the compressed stream.
  // Returns "" for the escape word and for non-identifier tokens between
  // the escape word and the identifier it protects.
  S decode(S token) {
    if (!isIdentifier(token)) ret escape ? "" : token;
    
    init();
    
    if (escape) {
      // escaped identifier: take it literally
      escape = false;
      ret decoded(token);
    }
    
    if (eq(token, escapeWord)) {
      set escape;
      ret "";
    }
    
    S expanded = expansions.get(token);
    if (expanded != null)
      ret decoded(expanded);
      
    ret decoded(token);
  }
  
  // Replays the encoder's bookkeeping for a decoded identifier so both
  // sides keep identical tables, then returns the identifier.
  S decoded(S token) {
    compressIdentifier(token);
    ret token;
  }
  
  // utils
  
  // use some sort of default tokenizer (letterSeqOnlyTok)
  // NOTE(review): presumably letterSeqOnlyTok splits the text into letter
  // runs and the text between them - confirm against its definition.
  
  // Compresses a whole text token by token.
  S compress(S text) {
    LS tok = letterSeqOnlyTok(text);
    ret concatMapStrings encode(tok);
  }
  
  // Reverses compress; call on an instance that has not encoded anything.
  S decompress(S text) {
    LS tok = letterSeqOnlyTok(text);
    ret concatMapStrings decode(tok);
  }
}

Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj

1	// This is the class we are experimenting on, so compressed texts
2	// should not be stored.
3	// Use AdaptiveIdentifierCompression_LTS1 for that
4
5	sclass AdaptiveIdentifierCompression {
6	new SS shortenings;
7	new MultiSet<S> tokenCount;
8	new SS expansions;
9	settable S escapeWord = "Z";
10	S codeAlphabet = lowerCaseAlphabet();
11	ItIt<S> newCodeMaker;
12	S nextCode;
13	settable int minCountToCompress = 1;
14	settable bool skipSameSizeShortenings = false;
15	int escapeWordsUsed;
16
17	void init {
18	if (newCodeMaker == null) {
19	newCodeMaker = allWordsOfAlphabet(codeAlphabet);
20	if (empty(newCode())) newCode();
21	}
22	}
23
24	S encode(S token) {
25	if (!isIdentifier(token)) ret token;
26
27	init();
28	ret compressIdentifier(token);
29	}
30
31	S compressIdentifier(S token) {
32	tokenCount.add(token);
33
34	S code = shortenings.get(token);
35	//printVars compressIdentifier(+token, +code);
36	if (code != null)
37	ret code;
38
39	// check if escapeWord appears in input text
40
41	if (eq(token, escapeWord)) {
42	code = createCodeFor(token);
43	//print("Made code for escape word " + escapeWord + ": " + code);
44	++escapeWordsUsed;
45	ret escapeWord + " " + token;
46	}
47
48	int count = tokenCount.get(token);
49	if (count < minCountToCompress)
50	ret token;
51
52	if (eq(token, nextCode)) {
53	newCode();
54	++escapeWordsUsed;
55	ret escapeWord + " " + token;
56	}
57
58	// check if token clashes with a code we created
59
60	S existingMeaning = expansions.get(token);
61
62	if (existingMeaning != null) {
63	//printVars(+token, +existingMeaning);
64
65	// It's not a problem - we just send the escape word
66	// and the token will get a new code.
67	createCodeFor(token);
68	++escapeWordsUsed;
69	ret escapeWord + " " + token;
70	}
71
72	ret createCodeFor(token);
73	}
74
75	S createCodeFor(S token) {
76	// create code for token
77
78	// if out of codes (unlikely), return token as is
79	if (nextCode == null)
80	ret token;
81
82	S code = nextCode;
83
84	if (skipSameSizeShortenings && l(code) >= l(token)) {
85	//printVars("Skipped shortening", +token, +code);
86	ret token;
87	}
88
89	newCode();
90
91	//printVars("Made shortening", +token, +code);
92	shortenings.put(token, code);
93	expansions.put(code, token);
94
95	// first time, so return original token
96	ret token;
97	}
98
99	S newCode() {
100	do {
101	if (!newCodeMaker.hasNext())
102	null;
103	nextCode = newCodeMaker.next();
104	} while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
105	//print("Have next code", nextCode);
106	ret nextCode;
107	}
108
109	/// decoder
110
111	bool escape;
112
113	S decode(S token) {
114	if (!isIdentifier(token)) ret escape ? "" : token;
115
116	init();
117
118	if (escape) {
119	escape = false;
120	ret decoded(token);
121	}
122
123	if (eq(token, escapeWord)) {
124	set escape;
125	ret "";
126	}
127
128	S expanded = expansions.get(token);
129	if (expanded != null)
130	ret decoded(expanded);
131
132	ret decoded(token);
133	}
134
135	S decoded(S token) {
136	compressIdentifier(token);
137	ret token;
138	}
139
140	// utils
141
142	// use some sort of default tokenizer (letterSeqOnlyTok)
143
144	S compress(S text) {
145	LS tok = letterSeqOnlyTok(text);
146	ret concatMapStrings encode(tok);
147	}
148
149	S decompress(S text) {
150	LS tok = letterSeqOnlyTok(text);
151	ret concatMapStrings decode(tok);
152	}
153	}

Snippet ID:	#1034370
Snippet name:	AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work]
Eternal ID of this version:	#1034370/49
Text MD5:	8cee69f775dddc26a04d47405e2a2535
Author:	stefan
Category:	javax / compressing text
Type:	JavaX fragment (include)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2022-02-11 21:15:11
Source code size:	3603 bytes / 153 lines
Pitched / IR pitched:	No / No
Views / Downloads:	215 / 426
Version history:	48 change(s)
Referenced in:	[show references]

< > BotCompany Repo | #1034370 // AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work]

JavaX fragment (include) [tags: use-pretranspiled]