// This is the class we are experimenting on, so compressed texts
// should not be stored.
// Use AdaptiveIdentifierCompression_LTS1 for that

// Adaptive dictionary compression for identifier tokens.
//
// The encoder emits an identifier verbatim the first time it assigns a
// code to it and emits the (short) code on later occurrences. No
// dictionary is transmitted: the decoder rebuilds the same tables by
// feeding every identifier it reconstructs through compressIdentifier(),
// i.e. through exactly the same state machine the encoder ran.
// Whenever a literal identifier could be mistaken for a code (or for the
// escape word itself), the encoder sends escapeWord + " " + identifier.
//
// Encoding and decoding both mutate the same tables, so an instance
// should be used for one direction only.
sclass AdaptiveIdentifierCompression {
  // identifier -> code assigned to it
  new SS shortenings;

  // how often each identifier has been seen so far
  new MultiSet<S> tokenCount;

  // code -> identifier it stands for (inverse of shortenings)
  new SS expansions;

  // marker announcing that the next identifier is to be taken literally
  settable S escapeWord = "Z";

  // characters that codes are built from
  S codeAlphabet = lowerCaseAlphabet();

  // source of candidate codes; created lazily in init()
  ItIt<S> newCodeMaker;

  // the code that will be handed out next; null when no codes are left
  S nextCode;

  // an identifier only gets a code once it has been seen this many times
  settable int minCountToCompress = 1;

  // if true, no code is assigned when it would not be shorter than the identifier
  settable bool skipSameSizeShortenings = false;

  // statistics: number of escape sequences emitted
  int escapeWordsUsed;

  // Lazily creates the code generator and fetches the first non-empty code.
  void init {
    if (newCodeMaker == null) {
      newCodeMaker = allWordsOfAlphabet(codeAlphabet);
      // skip a leading empty word, if the generator produces one
      if (empty(newCode())) newCode();
    }
  }

  // Encodes one token. Non-identifier tokens pass through unchanged.
  S encode(S token) {
    if (!isIdentifier(token)) ret token;
    init();
    ret compressIdentifier(token);
  }

  // Core state machine, run identically by encoder and decoder.
  // Counts the identifier, updates the code tables and returns the text
  // the encoder has to emit for it: the code, the identifier itself, or
  // an escaped identifier.
  S compressIdentifier(S token) {
    tokenCount.add(token);

    S code = shortenings.get(token);
    if (code != null) ret code;

    // The escape word appearing in the input text must itself be escaped.
    if (eq(token, escapeWord)) {
      createCodeFor(token);
      ret escapedToken(token);
    }

    bool frequentEnough = tokenCount.get(token) >= minCountToCompress;

    // The identifier looks like a code we already handed out. It must be
    // escaped no matter how often it has been seen, otherwise the decoder
    // would expand it to the identifier that code stands for.
    // It gets its own code only once it is frequent enough.
    if (expansions.get(token) != null) {
      if (frequentEnough) createCodeFor(token);
      ret escapedToken(token);
    }

    // Not in the decoder's tables yet, so sending it verbatim is safe.
    if (!frequentEnough) ret token;

    // The identifier equals the code we were about to hand out:
    // retire that code and send the identifier escaped.
    if (eq(token, nextCode)) {
      newCode();
      ret escapedToken(token);
    }

    ret createCodeFor(token);
  }

  // Counts one escape sequence and returns escapeWord + " " + token.
  private S escapedToken(S token) {
    ++escapeWordsUsed;
    ret escapeWord + " " + token;
  }

  // Tries to assign the pending code to the identifier.
  // Always returns the identifier itself: the first occurrence is sent
  // verbatim so the decoder can learn the mapping.
  S createCodeFor(S token) {
    // if out of codes (unlikely), return token as is
    if (nextCode == null) ret token;

    S code = nextCode;
    if (skipSameSizeShortenings && l(code) >= l(token))
      ret token; // not worth it; keep the code for a later identifier

    newCode();
    shortenings.put(token, code);
    expansions.put(code, token);

    // first time, so return original token
    ret token;
  }

  // Advances nextCode to the next usable candidate and returns it.
  // Candidates equal to the escape word or to an identifier that already
  // has a shortening are skipped.
  // Returns null, and clears nextCode, when the generator is exhausted,
  // so that createCodeFor() stops assigning codes instead of reusing a
  // stale one.
  S newCode() {
    do {
      if (!newCodeMaker.hasNext()) {
        nextCode = null;
        ret null;
      }
      nextCode = newCodeMaker.next();
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
    ret nextCode;
  }

  /// decoder

  // true after the escape word was read, until the escaped identifier arrives
  bool escape;

  // Decodes one token of compressed text and returns the text to output
  // for it (possibly empty).
  S decode(S token) {
    // While an escape is pending, separators between the escape word and
    // the escaped identifier are dropped (the encoder inserted them).
    if (!isIdentifier(token)) ret escape ? "" : token;
    init();

    // Identifier following the escape word: take it literally.
    if (escape) {
      escape = false;
      ret decoded(token);
    }

    // The escape word itself produces no output.
    if (eq(token, escapeWord)) {
      set escape;
      ret "";
    }

    S expanded = expansions.get(token);
    if (expanded != null) ret decoded(expanded);
    ret decoded(token);
  }

  // Replays a reconstructed identifier through the encoder's state
  // machine to keep the tables in sync, then returns it unchanged.
  S decoded(S token) {
    compressIdentifier(token);
    ret token;
  }

  // utils

  // use some sort of default tokenizer (letterSeqOnlyTok)
  S compress(S text) {
    LS tok = letterSeqOnlyTok(text);
    ret concatMapStrings encode(tok);
  }

  // Inverse of compress(); tokenizes with the same tokenizer.
  S decompress(S text) {
    LS tok = letterSeqOnlyTok(text);
    ret concatMapStrings decode(tok);
  }
}