Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

153
LINES

< > BotCompany Repo | #1034370 // AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work]

JavaX fragment (include) [tags: use-pretranspiled]

Transpiled version (5916L) is out of date.

// This is the class we experiment on, so its output format may change.
// Do not store texts compressed with it.
// Use AdaptiveIdentifierCompression_LTS1 for texts that need to be stored.

// Adaptive, dictionary-based compression of identifier tokens.
//
// Encoder and decoder build the same token <-> code tables on the fly:
// the first occurrence of an identifier is sent verbatim and silently
// gets a short code assigned; later occurrences are sent as that code.
// The decoder replays every decoded token through compressIdentifier()
// so its tables evolve exactly like the encoder's. Whenever a verbatim
// token could be mistaken for a code (or for the escape word itself),
// the encoder sends escapeWord + " " + token instead.
//
// One instance holds the state of one stream; the same instance carries
// both encoder state and the decoder's pending-escape flag.
sclass AdaptiveIdentifierCompression {
  // token -> code assigned to it
  new SS shortenings;
  // number of times each token went through compressIdentifier()
  new MultiSet<S> tokenCount;
  // code -> token (inverse of shortenings)
  new SS expansions;
  // marker announcing that the following token is to be taken literally
  settable S escapeWord = "Z";
  // characters the codes are built from
  S codeAlphabet = lowerCaseAlphabet();
  // generator of fresh codes, created lazily by init()
  ItIt<S> newCodeMaker;
  // code the next shortening will receive; null if none is available
  S nextCode;
  // a token is only given a code once it has been seen this many times
  settable int minCountToCompress = 1;
  // if true, no shortening is created when the code is not shorter than the token
  settable bool skipSameSizeShortenings = false;
  // statistics: number of escape sequences produced
  int escapeWordsUsed;

  // Creates the code generator and fetches the first usable code.
  // Does nothing if the generator already exists.
  void init {
    if (newCodeMaker == null) {
      newCodeMaker = allWordsOfAlphabet(codeAlphabet);
      // an empty first code is unusable, so fetch another one
      // (presumably the generator starts with the empty word)
      if (empty(newCode())) newCode();
    }
  }
  
  // Encodes one token. Non-identifiers are returned unchanged;
  // identifiers go through compressIdentifier().
  S encode(S token) {
    if (!isIdentifier(token)) ret token;
    
    init();
    ret compressIdentifier(token);
  }
  
  // Core step shared by encoder and decoder: counts the token, updates
  // the code tables and returns the text the encoder emits for it
  // (the code, the plain token, or escapeWord + " " + token).
  // The decoder calls this only for the table updates.
  S compressIdentifier(S token) {
    tokenCount.add(token);
    
    // known token: emit its code
    S code = shortenings.get(token);
    if (code != null)
      ret code;
      
    // the escape word itself appears in the input text:
    // it must be escaped; try to give it a code for later occurrences
    if (eq(token, escapeWord)) {
      createCodeFor(token);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    int count = tokenCount.get(token);
    if (count < minCountToCompress) {
      // Not frequent enough to get a code yet. If the token looks like a
      // code that is already in use, the decoder would expand it to
      // something else, so it has to be escaped. (Never reached with the
      // default minCountToCompress of 1.)
      if (expansions.containsKey(token)) {
        ++escapeWordsUsed;
        ret escapeWord + " " + token;
      }
      ret token;
    }
      
    // token is identical to the code we would hand out next:
    // discard that code and send the token escaped
    if (eq(token, nextCode)) {
      newCode();
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    // check if token clashes with a code we created
    S existingMeaning = expansions.get(token);
    
    if (existingMeaning != null) {
      // It's not a problem - we just send the escape word
      // and the token will get a new code.
      createCodeFor(token);
      ++escapeWordsUsed;
      ret escapeWord + " " + token;
    }
    
    ret createCodeFor(token);
  }
  
  // Tries to assign the next free code to token and always returns the
  // token itself (on first occurrence the original text is sent).
  // No code is assigned when no code is available or when
  // skipSameSizeShortenings is set and the code would not be shorter.
  S createCodeFor(S token) {
    // if out of codes (unlikely), return token as is
    if (nextCode == null)
      ret token;
    
    S code = nextCode;
    
    if (skipSameSizeShortenings && l(code) >= l(token))
      ret token;
      
    // code is consumed - line up the following one
    newCode();
    
    shortenings.put(token, code);
    expansions.put(code, token);
    
    // first time, so return original token
    ret token;
  }
  
  // Advances nextCode to the next generated word that is neither the
  // escape word nor a token that already has a shortening, and returns it.
  // When the generator is exhausted, nextCode becomes null (so that
  // createCodeFor() stops handing out codes) and null is returned.
  S newCode() {
    do {
      if (!newCodeMaker.hasNext()) {
        // do not leave a stale or rejected code behind
        nextCode = null;
        ret null;
      }
      nextCode = newCodeMaker.next();
    } while (shortenings.containsKey(nextCode) || eq(nextCode, escapeWord));
    ret nextCode;
  }
  
  /// decoder
  
  // true after the escape word was read, until the escaped identifier arrives
  bool escape;
  
  // Decodes one token of the compressed stream and returns the text to
  // output for it. Returns "" for the escape word and for any
  // non-identifier token read while an escape is pending (this drops the
  // separator the encoder put between escape word and token).
  S decode(S token) {
    if (!isIdentifier(token)) ret escape ? "" : token;
    
    init();
    
    // token after the escape word is literal
    if (escape) {
      escape = false;
      ret decoded(token);
    }
    
    if (eq(token, escapeWord)) {
      set escape;
      ret "";
    }
    
    // a known code expands to its token, anything else is literal
    S expanded = expansions.get(token);
    if (expanded != null)
      ret decoded(expanded);
      
    ret decoded(token);
  }
  
  // Replays a decoded token through compressIdentifier() so the decoder's
  // tables stay in step with the encoder's, then returns the token.
  S decoded(S token) {
    compressIdentifier(token);
    ret token;
  }
  
  // utils
  
  // use some sort of default tokenizer (letterSeqOnlyTok)
  
  // Tokenizes text with letterSeqOnlyTok, encodes every token and
  // concatenates the results.
  S compress(S text) {
    LS tok = letterSeqOnlyTok(text);
    ret concatMapStrings encode(tok);
  }
  
  // Tokenizes compressed text with letterSeqOnlyTok, decodes every token
  // and concatenates the results.
  S decompress(S text) {
    LS tok = letterSeqOnlyTok(text);
    ret concatMapStrings decode(tok);
  }
}

download  show line numbers  debug dex  old transpilations   

Travelled to 4 computer(s): bhatertpkbcr, ekrmjmnbrukm, mowyntqkapby, mqqgnosmbjvj

No comments. add comment

Snippet ID: #1034370
Snippet name: AdaptiveIdentifierCompression [a simple compression for identifiers e.g. in a structure text - seems to work]
Eternal ID of this version: #1034370/49
Text MD5: 8cee69f775dddc26a04d47405e2a2535
Author: stefan
Category: javax / compressing text
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2022-02-11 21:15:11
Source code size: 3603 bytes / 153 lines
Pitched / IR pitched: No / No
Views / Downloads: 214 / 425
Version history: 48 change(s)
Referenced in: #1003674 - Standard Classes + Interfaces (LIVE continued in #1034167)
#1034535 - AdaptiveIdentifierCompression_LTS1 - long-term stable version 1 [dev.]