Libraryless. Click here for Pure Java version (6342L/34K).
// Adaptive, dictionary-based compressor for identifier-like tokens.
//
// Encoder and decoder build the same token <-> code tables on the fly:
// the first time a token is seen it is sent verbatim and gets a code
// assigned; later occurrences are sent as that code. The decoder replays
// compressIdentifier() on every token it reconstructs, so its tables
// evolve in lockstep with the encoder's. A token that could be mistaken
// for a code (or for the escape word itself) is sent as
// escapeWord + " " + token.
//
// One instance holds the state of ONE stream direction; use separate
// instances for compressing and for decompressing.
sclass AdaptiveIdentifierCompression_LTS1 {
  // token -> code assigned to it
  new SS shortenings;

  // how many times each token has been passed to compressIdentifier
  new MultiSet<S> tokenCount;

  // code -> token it stands for (always written together with shortenings)
  new SS expansions;

  // marker meaning "the next compressible token is literal"
  settable S escapeWord = "Z";

  // characters handed to the code generator
  settable S codeAlphabet = lowerAndUpperCaseAlphabet();

  // minimum code length handed to the code generator
  settable int minCodeLength = 2;

  // source of fresh codes; created lazily in init()
  ItIt<S> codeMaker;

  // the code that will be assigned next; null once codeMaker is exhausted
  S nextCode;

  // a token is only given a code once it has been seen this many times
  settable int minCountToCompress = 1;

  // if true, no code is assigned when the code is not shorter than the token
  settable bool skipSameSizeShortenings = false;

  // number of times compressIdentifier produced an escaped token
  int escapeWordsUsed;

  // Lazily creates the code generator and fetches the first code.
  void init {
    if (codeMaker == null) {
      codeMaker = createCodeMaker();
      newCode();
    }
  }

  // Encodes one token. Tokens that are not compressible are passed through
  // unchanged and do not touch the tables.
  S encode(S token) {
    if (!isCompressibleToken(token)) ret token;
    init();
    ret compressIdentifier(token);
  }

  // Core state transition, shared by encoder and decoder.
  // Records the token, updates the tables and returns what the encoder
  // should emit: the token's code, the token itself, or the escaped token.
  S compressIdentifier(S token) {
    tokenCount.add(token);

    // token already has a code -> emit the code
    S code = shortenings.get(token);
    if (code != null) ret code;

    // the escape word itself appears in the input text: send it escaped
    // (createCodeFor may assign it a code for later occurrences)
    if (eq(token, escapeWord)) {
      createCodeFor(token);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }

    int count = tokenCount.get(token);
    if (count < minCountToCompress) {
      // Not seen often enough to get a code yet. If the raw token is
      // identical to a code that is already in use, the decoder would
      // expand it, so it has to be escaped. No code is assigned here;
      // the decoder's replay takes this same branch, so tables stay in sync.
      if (expansions.containsKey(token)) {
        ++escapeWordsUsed;
        ret escapeWord + " " + token;
      }
      ret token;
    }

    // token is identical to the code we were about to hand out:
    // skip that code and send the token escaped
    if (eq(token, nextCode)) {
      newCode();
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }

    // check if token clashes with a code we created
    S existingMeaning = expansions.get(token);
    if (existingMeaning != null) {
      // It's not a problem - we just send the escape word
      // and the token will get a new code.
      createCodeFor(token);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }

    ret createCodeFor(token);
  }

  // Assigns the pending code (nextCode) to token, unless we are out of
  // codes or skipSameSizeShortenings rejects the code as not shorter.
  // Always returns the original token: on first sight the token is sent
  // verbatim.
  S createCodeFor(S token) {
    // if out of codes (unlikely), return token as is
    if (nextCode == null) ret token;

    S code = nextCode;
    if (skipSameSizeShortenings && l(code) >= l(token))
      ret token;

    newCode();
    shortenings.put(token, code);
    expansions.put(code, token);

    // first time, so return original token
    ret token;
  }

  // Advances nextCode to the next code from codeMaker, skipping codes that
  // equal the escape word or that are themselves keys of shortenings.
  // Returns the new nextCode, or null when codeMaker has no more codes.
  S newCode() {
    do {
      // Exhausted: clear nextCode so createCodeFor's null guard fires.
      // Leaving the previous value in place would assign an already-used
      // code to a second token and overwrite its expansion.
      if (!codeMaker.hasNext()) ret nextCode = null;
      nextCode = codeMaker.next();
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
    ret nextCode;
  }

  /// decoder

  // true after the escape word was read, until the escaped token arrives
  bool escape;

  // Decodes one token of a compressed stream and returns the text to output.
  // While an escape is pending, non-compressible tokens (the separator the
  // encoder put after the escape word) produce "".
  S decode(S token) {
    if (!isCompressibleToken(token)) ret escape ? "" : token;
    init();

    // token following the escape word is literal
    if (escape) {
      escape = false;
      ret decoded(token);
    }

    // escape word itself produces no output
    if (eq(token, escapeWord)) {
      set escape;
      ret "";
    }

    S expanded = expansions.get(token);
    if (expanded != null) ret decoded(expanded);
    ret decoded(token);
  }

  // Replays the encoder's state transition for a reconstructed token so the
  // decoder's tables match the encoder's, then returns the token.
  S decoded(S token) {
    compressIdentifier(token);
    ret token;
  }

  // must also be true for encoded tokens
  bool isCompressibleToken(S token) {
    ret startsWithLetter(token);
  }

  // utils

  // use some sort of simple tokenizer that is compatible
  LS tokenize(S text) {
    ret letterDigitSeqOnlyTok(text);
  }

  // Tokenizes text, encodes every token and concatenates the results.
  S compress(S text) {
    ret concatMapStrings encode(tokenize(text));
  }

  // Tokenizes compressed text, decodes every token and concatenates the results.
  S decompress(S text) {
    ret concatMapStrings decode(tokenize(text));
  }

  // Creates the code generator from codeAlphabet and minCodeLength.
  ItIt<S> createCodeMaker() {
    ret LexicographicIterator(codeAlphabet, minCodeLength);
  }
}
Began life as a copy of #1034370
download show line numbers debug dex old transpilations
Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj
No comments. add comment
Snippet ID: | #1034535 |
Snippet name: | AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.] |
Eternal ID of this version: | #1034535/6 |
Text MD5: | 997fb7b617df8f033bcaa741d6b0a3cd |
Transpilation MD5: | 9a5817d0865522186f0a76f92501e494 |
Author: | stefan |
Category: | javax / compressing text |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2022-02-11 22:11:16 |
Source code size: | 3718 bytes / 161 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 140 / 239 |
Version history: | 5 change(s) |
Referenced in: | [show references] |