// Adaptive, dictionary-based compressor for identifier-like tokens.
//
// Encoder and decoder build the same token <-> code dictionary on the fly:
// the decoder replays compressIdentifier() on every token it has decoded,
// so both sides go through identical state changes and no dictionary
// has to be transmitted.
//
// Output of the encoder for a compressible token is one of
//   - the token's code (if it already has one),
//   - the raw token (first sighting; a code may be assigned for next time),
//   - escapeWord + " " + token (when the raw token would be ambiguous).
sclass AdaptiveIdentifierCompression_LTS1 {
  // token -> code
  new SS shortenings;

  // number of times each token has been passed to compressIdentifier
  new MultiSet tokenCount;

  // code -> token (inverse of shortenings)
  new SS expansions;

  // word that announces "the next compressible token is literal"
  settable S escapeWord = "Z";

  // characters used to build codes; handed to createCodeMaker()
  settable S codeAlphabet = lowerAndUpperCaseAlphabet();

  // handed to createCodeMaker() together with codeAlphabet
  settable int minCodeLength = 2;

  // source of fresh codes; created lazily in init()
  ItIt codeMaker;

  // the code that will be assigned next, or null when codeMaker is exhausted
  S nextCode;

  // a token gets a code only once it has been seen this many times
  settable int minCountToCompress = 1;

  // if true, don't assign a code that is not shorter than the token
  settable bool skipSameSizeShortenings = false;

  // number of times an escaped token was produced
  // (counted in compressIdentifier, which both encoder and decoder call)
  int escapeWordsUsed;

  // Lazily creates the code maker and fetches the first code.
  void init {
    if (codeMaker == null) {
      codeMaker = createCodeMaker();
      newCode();
    }
  }

  // Encodes a single token. Tokens that are not compressible are
  // passed through unchanged.
  S encode(S token) {
    if (!isCompressibleToken(token)) ret token;
    init();
    ret compressIdentifier(token);
  }

  // Core state machine, shared by encoder and decoder.
  // Registers one occurrence of token, updates the dictionary and
  // returns the text the encoder should emit for this occurrence.
  S compressIdentifier(S token) {
    tokenCount.add(token);
    S code = shortenings.get(token);
    //printVars compressIdentifier(+token, +code);
    if (code != null) ret code;

    // check if escapeWord appears in input text
    if (eq(token, escapeWord)) {
      // try to give the escape word a code of its own for next time
      // (return value is the token itself, so it is not needed here)
      createCodeFor(token);
      //print("Made code for escape word " + escapeWord + ": " + code);
      ret escapedToken(token);
    }

    int count = tokenCount.get(token);
    if (count < minCountToCompress) {
      // Too rare to get a code yet. Still, if the token's text equals a
      // code that is already in use, sending it raw would make the decoder
      // expand it to the wrong identifier - so escape it.
      if (expansions.containsKey(token)) ret escapedToken(token);
      ret token;
    }

    if (eq(token, nextCode)) {
      // token looks exactly like the code we were about to hand out:
      // discard that code and send the token escaped
      newCode();
      ret escapedToken(token);
    }

    // check if token clashes with a code we created
    S existingMeaning = expansions.get(token);
    if (existingMeaning != null) {
      //printVars(+token, +existingMeaning);
      // It's not a problem - we just send the escape word
      // and the token will get a new code.
      createCodeFor(token);
      ret escapedToken(token);
    }

    ret createCodeFor(token);
  }

  // Counts one use of the escape word and returns the escaped form of token.
  S escapedToken(S token) {
    ++escapeWordsUsed;
    ret escapeWord + " " + token;
  }

  // Tries to assign the next free code to token.
  // Always returns the token itself: on first assignment the original
  // token is what gets sent; the code is used from the next occurrence on.
  S createCodeFor(S token) {
    // if out of codes (unlikely), return token as is
    if (nextCode == null) ret token;

    S code = nextCode;
    if (skipSameSizeShortenings && l(code) >= l(token)) {
      //printVars("Skipped shortening", +token, +code);
      ret token;
    }

    newCode();
    //printVars("Made shortening", +token, +code);
    shortenings.put(token, code);
    expansions.put(code, token);

    // first time, so return original token
    ret token;
  }

  // Advances nextCode to the next usable code and returns it.
  // Codes that equal the escape word or a token that already has a
  // shortening are skipped. When codeMaker has no more codes, nextCode
  // is set to null (which createCodeFor treats as "out of codes").
  S newCode() {
    do {
      if (!codeMaker.hasNext()) {
        // clear nextCode so an already-assigned code is never handed out twice
        nextCode = null;
        ret null;
      }
      nextCode = codeMaker.next();
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
    //print("Have next code", nextCode);
    ret nextCode;
  }

  /// decoder

  // true after the escape word was read; the next compressible token is literal
  bool escape;

  // Decodes a single token of the compressed stream.
  // Returns "" for the escape word itself and for any non-compressible
  // token read while the escape flag is set (i.e. the separator between
  // the escape word and the literal token).
  S decode(S token) {
    if (!isCompressibleToken(token)) ret escape ? "" : token;
    init();

    if (escape) {
      escape = false;
      ret decoded(token);
    }

    if (eq(token, escapeWord)) {
      set escape;
      ret "";
    }

    S expanded = expansions.get(token);
    if (expanded != null) ret decoded(expanded);
    ret decoded(token);
  }

  // Replays the encoder's bookkeeping for a decoded token so the decoder's
  // dictionary stays in sync, then returns the token.
  S decoded(S token) {
    compressIdentifier(token);
    ret token;
  }

  // must also be true for encoded tokens
  bool isCompressibleToken(S token) {
    ret startsWithLetter(token);
  }

  // utils

  // use some sort of simple tokenizer that is compatible
  LS tokenize(S text) {
    ret letterDigitSeqOnlyTok(text);
  }

  // Tokenizes text, encodes every token and concatenates the results.
  S compress(S text) {
    ret concatMapStrings encode(tokenize(text));
  }

  // Tokenizes text, decodes every token and concatenates the results.
  S decompress(S text) {
    ret concatMapStrings decode(tokenize(text));
  }

  // Creates the iterator that supplies codes, from codeAlphabet and minCodeLength.
  ItIt createCodeMaker() {
    ret LexicographicIterator(codeAlphabet, minCodeLength);
  }
}