Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

135
LINES

< > BotCompany Repo | #1032583 // SpecialCharsHierarchicalBitMap - find special characters quickly in long string (e.g. brackets, quotes, backslash)

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (4081L/24K).

1  
// supports 8 different character classes
2  
sclass SpecialCharsHierarchicalBitMap {
3  
  S specialChars;
4  
  int minLevel = 1, maxLevel = 8;
5  
6  
  byte[] input;
7  
  byte[] charIndex = new byte[256];
8  
  byte[][] tables;
9  
  int reachedLevel;
10  
  
11  
  // how many computation steps it took
12  
  int steps;
13  
  
14  
  *() {}
15  
  *(byte[] *input) {}
16  
  *(S specialChars, byte[] *input) {}
17  
  
18  
  selfType specialChar(S s, int mask) {
19  
    int n = l(s);
20  
    for i to n: specialChar(s.charAt(i), mask);
21  
    this;
22  
  }
23  
  
24  
  selfType specialChar(char c, int mask) {
25  
    if (!isUByte(mask)) fail("Mask out of range (must be u-byte): " + intToHex(mask));
26  
    charIndex[ubyteToInt(c)] |= (byte) mask;
27  
    this;
28  
  }
29  
  
30  
  // These probably must all be in the first character page.
31  
  // TODO: check that
32  
  selfType specialChars(S s) {
33  
    specialChars = s;
34  
    
35  
    // make charIndex table
36  
    
37  
    var m = l(specialChars);
38  
    if (m > 8) fail("Too many special chars (max: 8) - " + m);
39  
    for i to m:
40  
      charIndex[ubyteToInt(specialChars.charAt(i))] = (byte) (1 << i);
41  
    this;
42  
  }
43  
  
44  
  void analyze(S input) {
45  
    this.input = toUtf8(input); // XXX probably wrong
46  
    analyze();
47  
  }
48  
  
49  
  void analyze(byte[] input) {
50  
    this.input = input;
51  
    analyze();
52  
  }
53  
  
54  
  void analyze {
55  
    // allocate logarithmic tables
56  
      
57  
    var input = this.input;
58  
    // n is actual input length
59  
    // allLevels is size of n in bits
60  
    // n2 is n rounded up to power of two
61  
    int n = l(input), allLevels = ld_iceil(n), n2 = 1 << allLevels;
62  
    int minLevel = this.minLevel;
63  
    
64  
    // reachedLevel is the minimum of allLevels and maxLevel
65  
    reachedLevel = min(allLevels, maxLevel);
66  
    
67  
    // representedLevels is how many array elements we need
68  
    int representedLevels = max(0, reachedLevel+1-minLevel);
69  
    
70  
    int n3 = (n+(1 << minLevel)-1) >> minLevel;
71  
    ifdef SpecialCharsHierarchicalBitMap_debug
72  
      printVars(+n, +allLevels, +minLevel, +maxLevel, +representedLevels, +n2, +n3);
73  
    endifdef
74  
  
75  
    tables = new byte[representedLevels][];
76  
    for level to representedLevels: {
77  
      tables[level] = new byte[n3];
78  
      ifdef SpecialCharsHierarchicalBitMap_debug
79  
        printVars(+level, +n3);
80  
      endifdef
81  
      n3 = (n3+1) >> 1;
82  
    }
83  
      
84  
    int steps = n; // for each outer iteration
85  
    
86  
    for i to n: {
87  
      int c = ubyteToInt(input[i]);
88  
      byte mask = charIndex[c];
89  
      ifdef SpecialCharsHierarchicalBitMap_debug
90  
        printVars(+i, +c, +mask);
91  
      endifdef
92  
      if (mask == 0) continue;
93  
      
94  
      int j = i >> minLevel;
95  
      for level to representedLevels: {
96  
        ++steps;
97  
        var tbl = tables[level];
98  
        ifdef SpecialCharsHierarchicalBitMap_debug
99  
          printVars(+level, len := l(tables[level]), +j);
100  
        endifdef
101  
        var val = tbl[j];
102  
        byte val2 = (byte) (val | mask);
103  
        
104  
        ifdef SpecialCharsHierarchicalBitMap_debug
105  
          printVars(+val, +val2);
106  
        endifdef
107  
        
108  
        if (val == val2) break;
109  
        
110  
        tbl[j] = val2;
111  
        j >>= 1;
112  
      }
113  
    }
114  
    
115  
    this.steps = steps;
116  
  }
117  
  
118  
  S stats() {
119  
    ret renderVars(
120  
      inputLength := l(input),
121  
      +steps,
122  
      representedLevels := l(tables),
123  
      cells := lengthLevel2_byteArrays(tables));
124  
  }
125  
  
126  
  int representedLevels() { ret l(tables); }
127  
  int minLevel() { ret minLevel; }
128  
  int reachedLevel() { ret reachedLevel; }
129  
  
130  
  byte[] table(int level) { ret tables[level-minLevel]; }
131  
  
132  
  int maskForChar(byte inputByte) {
133  
    ret charIndex[ubyteToInt(inputByte)];
134  
  }
135  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 3 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj

No comments. add comment

Snippet ID: #1032583
Snippet name: SpecialCharsHierarchicalBitMap - find special characters quickly in long string (e.g. brackets, quotes, backslash)
Eternal ID of this version: #1032583/40
Text MD5: a259ae52c0ddbf74320ecf3230079fc4
Transpilation MD5: afd9b09cf11ac1842686e94955533e71
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2021-09-23 08:36:54
Source code size: 3625 bytes / 135 lines
Pitched / IR pitched: No / No
Views / Downloads: 157 / 352
Version history: 39 change(s)
Referenced in: [show references]