AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.] [1034535]

// Adaptive dictionary compressor for identifier-like tokens.
//
// The encoder replaces tokens it has seen before with short codes taken
// from codeMaker. No dictionary is transmitted: the decoder feeds every
// decoded token through compressIdentifier() as well, so it builds the
// same shortenings/expansions tables as the encoder did.
//
// NOTE(review): encoding and decoding mutate the same tables, so presumably
// separate instances are meant to be used for compress() and decompress()
// - confirm against callers.
sclass AdaptiveIdentifierCompression_LTS1 {
  // token -> code assigned to it
  new SS shortenings;
  // how many times each token has been passed to compressIdentifier()
  new MultiSet<S> tokenCount;
  // code -> token it stands for (inverse of shortenings)
  new SS expansions;
  // word emitted in front of a token that the decoder must take literally
  settable S escapeWord = "Z";
  // passed to LexicographicIterator in createCodeMaker()
  // (presumably the characters codes are built from - TODO confirm)
  settable S codeAlphabet = lowerAndUpperCaseAlphabet();
  // passed to LexicographicIterator in createCodeMaker()
  // (presumably the length of the first codes produced - TODO confirm)
  settable int minCodeLength = 2;
  // source of fresh codes; created lazily by init()
  ItIt<S> codeMaker;
  // the code that will be assigned next; null when no codes are left
  S nextCode;
  // tokens seen fewer times than this are passed through without a code
  settable int minCountToCompress = 1;
  // if true, no code is assigned when the next code is not shorter than the token
  settable bool skipSameSizeShortenings = false;
  // incremented every time compressIdentifier() produces an escape sequence
  int escapeWordsUsed;

  // Lazily creates the code maker and fetches the first code.
  void init {
    if (codeMaker == null) {
      codeMaker = createCodeMaker();
      newCode();
    }
  }
  
  // Encodes a single token. Tokens that are not compressible
  // (see isCompressibleToken) are returned unchanged.
  S encode(S token) {
    if (!isCompressibleToken(token)) ret token;
    
    init();
    ret compressIdentifier(token);
  }
  
  // Core step shared by encoder and decoder: counts the token, updates the
  // code tables and returns the text the encoder emits for it. This is
  // either the token's existing code, the plain token, or
  // escapeWord + " " + token.
  S compressIdentifier(S token) {
    tokenCount.add(token);
    
    S code = shortenings.get(token);
    //printVars compressIdentifier(+token, +code);
    if (code != null)
      ret code;
      
    // check if escapeWord appears in input text
    
    if (eq(token, escapeWord)) {
      // createCodeFor always returns the token itself; it is called here
      // for its side effect of registering a code for the escape word
      code = createCodeFor(token);
      //print("Made code for escape word " + escapeWord + ": " + code);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    int count = tokenCount.get(token);
    if (count < minCountToCompress)
      ret token;
      
    // the token is identical to the code we would hand out next:
    // skip that code and send the token escaped
    if (eq(token, nextCode)) {
      newCode();
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    // check if token clashes with a code we created
    
    S existingMeaning = expansions.get(token);
    
    if (existingMeaning != null) {
      //printVars(+token, +existingMeaning);
      
      // It's not a problem - we just send the escape word
      // and the token will get a new code.
      createCodeFor(token);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    ret createCodeFor(token);
  }
  
  // Assigns nextCode to the token (unless out of codes, or the shortening
  // is skipped because of skipSameSizeShortenings) and advances nextCode.
  // Always returns the original token, because the first occurrence is
  // sent verbatim.
  S createCodeFor(S token) {
    // create code for token

    // if out of codes (unlikely), return token as is
    if (nextCode == null)
      ret token;
    
    S code = nextCode;
    
    if (skipSameSizeShortenings && l(code) >= l(token)) {
      //printVars("Skipped shortening", +token, +code);
      ret token;
    }
      
    newCode();
    
    //printVars("Made shortening", +token, +code);
    shortenings.put(token, code);
    expansions.put(code, token);
    
    // first time, so return original token
    ret token;
  }
  
  // Advances nextCode to the next candidate from codeMaker, skipping
  // candidates that are keys of shortenings or equal to escapeWord.
  // Returns the new nextCode, or null when codeMaker is exhausted.
  S newCode() {
    do {
      // Out of codes: clear nextCode so the null check in createCodeFor
      // fires instead of the stale previous code being handed out again.
      if (!codeMaker.hasNext())
        ret nextCode = null;
      nextCode = codeMaker.next();
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
    //print("Have next code", nextCode);
    ret nextCode;
  }
  
  /// decoder
  
  // true while the decoder has read escapeWord and is waiting for the
  // literal token that follows it
  bool escape;
  
  // Decodes a single token of compressed text and returns the text to
  // output for it (possibly the empty string).
  S decode(S token) {
    // non-compressible tokens pass through, except that the separator
    // between escapeWord and the escaped token is dropped
    if (!isCompressibleToken(token)) ret escape ? "" : token;
    
    init();
    
    if (escape) {
      // token following the escape word is literal
      escape = false;
      ret decoded(token);
    }
    
    if (eq(token, escapeWord)) {
      set escape;
      ret "";
    }
    
    S expanded = expansions.get(token);
    if (expanded != null)
      ret decoded(expanded);
      
    ret decoded(token);
  }
  
  // Runs the decoded token through compressIdentifier() so the decoder's
  // tables are updated the same way the encoder's were; returns the token.
  S decoded(S token) {
    compressIdentifier(token);
    ret token;
  }
  
  // must also be true for encoded tokens
  bool isCompressibleToken(S token) {
    ret startsWithLetter(token);
  }
  
  // utils
  
  // use some sort of simple tokenizer that is compatible
  
  LS tokenize(S text) {
    ret letterDigitSeqOnlyTok(text);
  }
  
  // Tokenizes the text, encodes every token and concatenates the results.
  S compress(S text) {
    ret concatMapStrings encode(tokenize(text));
  }
  
  // Tokenizes the compressed text, decodes every token and concatenates the results.
  S decompress(S text) {
    ret concatMapStrings decode(tokenize(text));
  }
  
  // Creates the source of codes from codeAlphabet and minCodeLength.
  ItIt<S> createCodeMaker() {
    ret LexicographicIterator(codeAlphabet, minCodeLength);
  }
}

Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj

1	sclass AdaptiveIdentifierCompression_LTS1 {
2	new SS shortenings;
3	new MultiSet<S> tokenCount;
4	new SS expansions;
5	settable S escapeWord = "Z";
6	settable S codeAlphabet = lowerAndUpperCaseAlphabet();
7	settable int minCodeLength = 2;
8	ItIt<S> codeMaker;
9	S nextCode;
10	settable int minCountToCompress = 1;
11	settable bool skipSameSizeShortenings = false;
12	int escapeWordsUsed;
13
14	void init {
15	if (codeMaker == null) {
16	codeMaker = createCodeMaker();
17	newCode();
18	}
19	}
20
21	S encode(S token) {
22	if (!isCompressibleToken(token)) ret token;
23
24	init();
25	ret compressIdentifier(token);
26	}
27
28	S compressIdentifier(S token) {
29	tokenCount.add(token);
30
31	S code = shortenings.get(token);
32	//printVars compressIdentifier(+token, +code);
33	if (code != null)
34	ret code;
35
36	// check if escapeWord appears in input text
37
38	if (eq(token, escapeWord)) {
39	code = createCodeFor(token);
40	//print("Made code for escape word " + escapeWord + ": " + code);
41	++escapeWordsUsed;
42	ret escapeWord + " " + token;
43	}
44
45	int count = tokenCount.get(token);
46	if (count < minCountToCompress)
47	ret token;
48
49	if (eq(token, nextCode)) {
50	newCode();
51	++escapeWordsUsed;
52	ret escapeWord + " " + token;
53	}
54
55	// check if token clashes with a code we created
56
57	S existingMeaning = expansions.get(token);
58
59	if (existingMeaning != null) {
60	//printVars(+token, +existingMeaning);
61
62	// It's not a problem - we just send the escape word
63	// and the token will get a new code.
64	createCodeFor(token);
65	++escapeWordsUsed;
66	ret escapeWord + " " + token;
67	}
68
69	ret createCodeFor(token);
70	}
71
72	S createCodeFor(S token) {
73	// create code for token
74
75	// if out of codes (unlikely), return token as is
76	if (nextCode == null)
77	ret token;
78
79	S code = nextCode;
80
81	if (skipSameSizeShortenings && l(code) >= l(token)) {
82	//printVars("Skipped shortening", +token, +code);
83	ret token;
84	}
85
86	newCode();
87
88	//printVars("Made shortening", +token, +code);
89	shortenings.put(token, code);
90	expansions.put(code, token);
91
92	// first time, so return original token
93	ret token;
94	}
95
96	S newCode() {
97	do {
98	if (!codeMaker.hasNext())
99	null;
100	nextCode = codeMaker.next();
101	} while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
102	//print("Have next code", nextCode);
103	ret nextCode;
104	}
105
106	/// decoder
107
108	bool escape;
109
110	S decode(S token) {
111	if (!isCompressibleToken(token)) ret escape ? "" : token;
112
113	init();
114
115	if (escape) {
116	escape = false;
117	ret decoded(token);
118	}
119
120	if (eq(token, escapeWord)) {
121	set escape;
122	ret "";
123	}
124
125	S expanded = expansions.get(token);
126	if (expanded != null)
127	ret decoded(expanded);
128
129	ret decoded(token);
130	}
131
132	S decoded(S token) {
133	compressIdentifier(token);
134	ret token;
135	}
136
137	// must also be true for encoded tokens
138	bool isCompressibleToken(S token) {
139	ret startsWithLetter(token);
140	}
141
142	// utils
143
144	// use some sort of simple tokenizer that is compatible
145
146	LS tokenize(S text) {
147	ret letterDigitSeqOnlyTok(text);
148	}
149
150	S compress(S text) {
151	ret concatMapStrings encode(tokenize(text));
152	}
153
154	S decompress(S text) {
155	ret concatMapStrings decode(tokenize(text));
156	}
157
158	ItIt<S> createCodeMaker() {
159	ret LexicographicIterator(codeAlphabet, minCodeLength);
160	}
161	}

Snippet ID:	#1034535
Snippet name:	AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.]
Eternal ID of this version:	#1034535/6
Text MD5:	997fb7b617df8f033bcaa741d6b0a3cd
Transpilation MD5:	9a5817d0865522186f0a76f92501e494
Author:	stefan
Category:	javax / compressing text
Type:	JavaX fragment (include)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2022-02-11 22:11:16
Source code size:	3718 bytes / 161 lines
Pitched / IR pitched:	No / No
Views / Downloads:	141 / 240
Version history:	5 change(s)
Referenced in:	[show references]

< > BotCompany Repo | #1034535 // AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.]

JavaX fragment (include) [tags: use-pretranspiled]

Author comment