Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

135
LINES

< > BotCompany Repo | #1032583 // SpecialCharsHierarchicalBitMap - find special characters quickly in long string (e.g. brackets, quotes, backslash)

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (4081L/24K).

// supports 8 different character classes
sclass SpecialCharsHierarchicalBitMap {
  S specialChars;
  int minLevel = 1, maxLevel = 8;

  byte[] input;
  byte[] charIndex = new byte[256];
  byte[][] tables;
  int reachedLevel;
  
  // how many computation steps it took
  int steps;
  
  *() {}
  *(byte[] *input) {}
  *(S specialChars, byte[] *input) {}
  
  selfType specialChar(S s, int mask) {
    int n = l(s);
    for i to n: specialChar(s.charAt(i), mask);
    this;
  }
  
  selfType specialChar(char c, int mask) {
    if (!isUByte(mask)) fail("Mask out of range (must be u-byte): " + intToHex(mask));
    charIndex[ubyteToInt(c)] |= (byte) mask;
    this;
  }
  
  // These probably must all be in the first character page.
  // TODO: check that
  selfType specialChars(S s) {
    specialChars = s;
    
    // make charIndex table
    
    var m = l(specialChars);
    if (m > 8) fail("Too many special chars (max: 8) - " + m);
    for i to m:
      charIndex[ubyteToInt(specialChars.charAt(i))] = (byte) (1 << i);
    this;
  }
  
  void analyze(S input) {
    this.input = toUtf8(input); // XXX probably wrong
    analyze();
  }
  
  void analyze(byte[] input) {
    this.input = input;
    analyze();
  }
  
  void analyze {
    // allocate logarithmic tables
      
    var input = this.input;
    // n is actual input length
    // allLevels is size of n in bits
    // n2 is n rounded up to power of two
    int n = l(input), allLevels = ld_iceil(n), n2 = 1 << allLevels;
    int minLevel = this.minLevel;
    
    // reachedLevel is the minimum of allLevels and maxLevel
    reachedLevel = min(allLevels, maxLevel);
    
    // representedLevels is how many array elements we need
    int representedLevels = max(0, reachedLevel+1-minLevel);
    
    int n3 = (n+(1 << minLevel)-1) >> minLevel;
    ifdef SpecialCharsHierarchicalBitMap_debug
      printVars(+n, +allLevels, +minLevel, +maxLevel, +representedLevels, +n2, +n3);
    endifdef
  
    tables = new byte[representedLevels][];
    for level to representedLevels: {
      tables[level] = new byte[n3];
      ifdef SpecialCharsHierarchicalBitMap_debug
        printVars(+level, +n3);
      endifdef
      n3 = (n3+1) >> 1;
    }
      
    int steps = n; // for each outer iteration
    
    for i to n: {
      int c = ubyteToInt(input[i]);
      byte mask = charIndex[c];
      ifdef SpecialCharsHierarchicalBitMap_debug
        printVars(+i, +c, +mask);
      endifdef
      if (mask == 0) continue;
      
      int j = i >> minLevel;
      for level to representedLevels: {
        ++steps;
        var tbl = tables[level];
        ifdef SpecialCharsHierarchicalBitMap_debug
          printVars(+level, len := l(tables[level]), +j);
        endifdef
        var val = tbl[j];
        byte val2 = (byte) (val | mask);
        
        ifdef SpecialCharsHierarchicalBitMap_debug
          printVars(+val, +val2);
        endifdef
        
        if (val == val2) break;
        
        tbl[j] = val2;
        j >>= 1;
      }
    }
    
    this.steps = steps;
  }
  
  S stats() {
    ret renderVars(
      inputLength := l(input),
      +steps,
      representedLevels := l(tables),
      cells := lengthLevel2_byteArrays(tables));
  }
  
  int representedLevels() { ret l(tables); }
  int minLevel() { ret minLevel; }
  int reachedLevel() { ret reachedLevel; }
  
  byte[] table(int level) { ret tables[level-minLevel]; }
  
  int maskForChar(byte inputByte) {
    ret charIndex[ubyteToInt(inputByte)];
  }
}

download  show line numbers  debug dex  old transpilations   

Travelled to 3 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj

No comments. add comment

Snippet ID: #1032583
Snippet name: SpecialCharsHierarchicalBitMap - find special characters quickly in long string (e.g. brackets, quotes, backslash)
Eternal ID of this version: #1032583/40
Text MD5: a259ae52c0ddbf74320ecf3230079fc4
Transpilation MD5: afd9b09cf11ac1842686e94955533e71
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2021-09-23 08:36:54
Source code size: 3625 bytes / 135 lines
Pitched / IR pitched: No / No
Views / Downloads: 217 / 444
Version history: 39 change(s)
Referenced in: [show references]