Transpiled version (5916L) is out of date.
// This is the class we are experimenting on, so compressed texts // should not be stored. // Use AdaptiveIdentifierCompression_LTS1 for that sclass AdaptiveIdentifierCompression { new SS shortenings; new MultiSet<S> tokenCount; new SS expansions; settable S escapeWord = "Z"; S codeAlphabet = lowerCaseAlphabet(); ItIt<S> newCodeMaker; S nextCode; settable int minCountToCompress = 1; settable bool skipSameSizeShortenings = false; int escapeWordsUsed; void init { if (newCodeMaker == null) { newCodeMaker = allWordsOfAlphabet(codeAlphabet); if (empty(newCode())) newCode(); } } S encode(S token) { if (!isIdentifier(token)) ret token; init(); ret compressIdentifier(token); } S compressIdentifier(S token) { tokenCount.add(token); S code = shortenings.get(token); //printVars compressIdentifier(+token, +code); if (code != null) ret code; // check if escapeWord appears in input text if (eq(token, escapeWord)) { code = createCodeFor(token); //print("Made code for escape word " + escapeWord + ": " + code); ++escapeWordsUsed; ret escapeWord + " " + token; } int count = tokenCount.get(token); if (count < minCountToCompress) ret token; if (eq(token, nextCode)) { newCode(); ++escapeWordsUsed; ret escapeWord + " " + token; } // check if token clashes with a code we created S existingMeaning = expansions.get(token); if (existingMeaning != null) { //printVars(+token, +existingMeaning); // It's not a problem - we just send the escape word // and the token will get a new code. createCodeFor(token); ++escapeWordsUsed; ret escapeWord + " " + token; } ret createCodeFor(token); } S createCodeFor(S token) { // create code for token // if out of codes (unlikely), return token as is if (nextCode == null) ret token; S code = nextCode; if (skipSameSizeShortenings && l(code) >= l(token)) { //printVars("Skipped shortening", +token, +code); ret token; } newCode(); //printVars("Made shortening", +token, +code); shortenings.put(token, code); expansions.put(code, token); // first time, so return original token ret token; } S newCode() { do { if (!newCodeMaker.hasNext()) null; nextCode = newCodeMaker.next(); } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord)); //print("Have next code", nextCode); ret nextCode; } /// decoder bool escape; S decode(S token) { if (!isIdentifier(token)) ret escape ? "" : token; init(); if (escape) { escape = false; ret decoded(token); } if (eq(token, escapeWord)) { set escape; ret ""; } S expanded = expansions.get(token); if (expanded != null) ret decoded(expanded); ret decoded(token); } S decoded(S token) { compressIdentifier(token); ret token; } // utils // use some sort of default tokenizer (letterSeqOnlyTok) S compress(S text) { LS tok = letterSeqOnlyTok(text); ret concatMapStrings encode(tok); } S decompress(S text) { LS tok = letterSeqOnlyTok(text); ret concatMapStrings decode(tok); } }
download show line numbers debug dex old transpilations
Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj
No comments. add comment
Snippet ID: | #1034370 |
Snippet name: | AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work] |
Eternal ID of this version: | #1034370/49 |
Text MD5: | 8cee69f775dddc26a04d47405e2a2535 |
Author: | stefan |
Category: | javax / compressing text |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2022-02-11 21:15:11 |
Source code size: | 3603 bytes / 153 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 213 / 424 |
Version history: | 48 change(s) |
Referenced in: | [show references] |