AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.] [1034535]

// Adaptive, dictionary-building compressor for identifier-like tokens.
//
// Encoder and decoder are the same object type and evolve the same state:
// every token the encoder passes through compressIdentifier() is replayed
// through compressIdentifier() by the decoder (see decoded()), so both
// sides assign the same codes in the same order without transmitting the
// dictionary.
//
// Wire format as produced by encode():
//   - a token seen for the first time is emitted literally (and, if
//     eligible, gets a code for later occurrences),
//   - a token that already has a code is emitted as that code,
//   - a literal token that would be misread by the decoder is emitted as
//     escapeWord + " " + token.
sclass AdaptiveIdentifierCompression_LTS1 {
  // token -> code
  new SS shortenings;
  
  // how often each token went through compressIdentifier()
  new MultiSet<S> tokenCount;
  
  // code -> token (inverse of shortenings)
  new SS expansions;
  
  // marker announcing "the next compressible token is literal"
  settable S escapeWord = "Z";
  
  // characters codes are built from (handed to createCodeMaker())
  settable S codeAlphabet = lowerAndUpperCaseAlphabet();
  
  // handed to createCodeMaker() together with codeAlphabet
  settable int minCodeLength = 2;
  
  // source of fresh codes; created lazily in init()
  ItIt<S> codeMaker;
  
  // the code that the next createCodeFor() call will assign;
  // null when codeMaker has no more codes
  S nextCode;
  
  // a token is only considered for a code once its count reaches this
  settable int minCountToCompress = 1;
  
  // if true, no code is assigned when the code is not shorter than the token
  settable bool skipSameSizeShortenings = false;
  
  // statistics: number of times an escape sequence was produced
  // (counted on the decoder side too, as it replays compressIdentifier())
  int escapeWordsUsed;

  // Lazily creates the code maker and fetches the first code.
  void init {
    if (codeMaker == null) {
      codeMaker = createCodeMaker();
      newCode();
    }
  }
  
  // Encodes a single token. Tokens that are not compressible
  // (see isCompressibleToken) are passed through unchanged.
  S encode(S token) {
    if (!isCompressibleToken(token)) ret token;
    
    init();
    ret compressIdentifier(token);
  }
  
  // Core state transition. Returns what the encoder should emit for
  // the token. The decoder calls this too (ignoring the result) to keep
  // its dictionary in sync, so every branch must update state
  // independently of who is calling.
  S compressIdentifier(S token) {
    tokenCount.add(token);
    
    S code = shortenings.get(token);
    //printVars compressIdentifier(+token, +code);
    if (code != null)
      ret code;
      
    // check if escapeWord appears in input text
    
    if (eq(token, escapeWord)) {
      // Return value of createCodeFor is the token itself; only the
      // side effect (registering a code) matters here.
      createCodeFor(token);
      //print("Made code for escape word " + escapeWord + ": " + code);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    int count = tokenCount.get(token);
    if (count < minCountToCompress) {
      // A literal token that equals an already assigned code would be
      // expanded to a different identifier by decode(). Escape it.
      // No code is created here, so the state transitions stay the same
      // as for any other below-threshold token.
      if (expansions.containsKey(token)) {
        ++escapeWordsUsed;
        ret escapeWord + " " + token;
      }
      ret token;
    }
      
    if (eq(token, nextCode)) {
      // Token collides with the code we were about to hand out:
      // skip that code and send the token escaped.
      newCode();
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    // check if token clashes with a code we created
    
    S existingMeaning = expansions.get(token);
    
    if (existingMeaning != null) {
      //printVars(+token, +existingMeaning);
      
      // It's not a problem - we just send the escape word
      // and the token will get a new code.
      createCodeFor(token);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    ret createCodeFor(token);
  }
  
  // Assigns the pending code (nextCode) to the token, if possible.
  // Always returns the token itself, since a first occurrence is
  // transmitted literally.
  S createCodeFor(S token) {
    // create code for token

    // if out of codes (unlikely), return token as is
    if (nextCode == null)
      ret token;
    
    S code = nextCode;
    
    if (skipSameSizeShortenings && l(code) >= l(token)) {
      //printVars("Skipped shortening", +token, +code);
      ret token;
    }
      
    newCode();
    
    //printVars("Made shortening", +token, +code);
    shortenings.put(token, code);
    expansions.put(code, token);
    
    // first time, so return original token
    ret token;
  }
  
  // Advances nextCode to the next usable code and returns it.
  // Codes equal to the escape word or to a token that already has a
  // shortening are skipped. When the code maker is exhausted, nextCode
  // is set to null so that createCodeFor() stops assigning codes
  // (rather than re-using the previous, already assigned code).
  S newCode() {
    do {
      if (!codeMaker.hasNext())
        ret nextCode = null;
      nextCode = codeMaker.next();
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
    //print("Have next code", nextCode);
    ret nextCode;
  }
  
  /// decoder
  
  // true after the escape word was read, until the escaped token arrives
  bool escape;
  
  // Decodes a single token of the encoded stream.
  // Returns "" for the escape word itself and for non-compressible
  // tokens (e.g. the separating space) read while an escape is pending.
  S decode(S token) {
    if (!isCompressibleToken(token)) ret escape ? "" : token;
    
    init();
    
    if (escape) {
      // escaped token: take it literally
      escape = false;
      ret decoded(token);
    }
    
    if (eq(token, escapeWord)) {
      set escape;
      ret "";
    }
    
    S expanded = expansions.get(token);
    if (expanded != null)
      ret decoded(expanded);
      
    ret decoded(token);
  }
  
  // Replays the encoder's state transition for an original token
  // and returns the token.
  S decoded(S token) {
    compressIdentifier(token);
    ret token;
  }
  
  // must also be true for encoded tokens
  bool isCompressibleToken(S token) {
    ret startsWithLetter(token);
  }
  
  // utils
  
  // use some sort of simple tokenizer that is compatible
  
  LS tokenize(S text) {
    ret letterDigitSeqOnlyTok(text);
  }
  
  // Tokenizes the text and concatenates the encoded tokens.
  S compress(S text) {
    ret concatMapStrings encode(tokenize(text));
  }
  
  // Tokenizes the encoded text and concatenates the decoded tokens.
  S decompress(S text) {
    ret concatMapStrings decode(tokenize(text));
  }
  
  // Creates the iterator that supplies codes, built from
  // codeAlphabet and minCodeLength.
  ItIt<S> createCodeMaker() {
    ret LexicographicIterator(codeAlphabet, minCodeLength);
  }
}

Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj

Snippet ID:	#1034535
Snippet name:	AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.]
Eternal ID of this version:	#1034535/6
Text MD5:	997fb7b617df8f033bcaa741d6b0a3cd
Transpilation MD5:	9a5817d0865522186f0a76f92501e494
Author:	stefan
Category:	javax / compressing text
Type:	JavaX fragment (include)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2022-02-11 22:11:16
Source code size:	3718 bytes / 161 lines
Pitched / IR pitched:	No / No
Views / Downloads:	142 / 242
Version history:	5 change(s)
Referenced in:	#1003674 - Standard Classes + Interfaces (LIVE continued in #1034167)

< > BotCompany Repo | #1034535 // AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.]

JavaX fragment (include) [tags: use-pretranspiled]

Author comment