Libraryless. Click here for Pure Java version (4081L/24K).
// supports 8 different character classes sclass SpecialCharsHierarchicalBitMap { S specialChars; int minLevel = 1, maxLevel = 8; byte[] input; byte[] charIndex = new byte[256]; byte[][] tables; int reachedLevel; // how many computation steps it took int steps; *() {} *(byte[] *input) {} *(S specialChars, byte[] *input) {} selfType specialChar(S s, int mask) { int n = l(s); for i to n: specialChar(s.charAt(i), mask); this; } selfType specialChar(char c, int mask) { if (!isUByte(mask)) fail("Mask out of range (must be u-byte): " + intToHex(mask)); charIndex[ubyteToInt(c)] |= (byte) mask; this; } // These probably must all be in the first character page. // TODO: check that selfType specialChars(S s) { specialChars = s; // make charIndex table var m = l(specialChars); if (m > 8) fail("Too many special chars (max: 8) - " + m); for i to m: charIndex[ubyteToInt(specialChars.charAt(i))] = (byte) (1 << i); this; } void analyze(S input) { this.input = toUtf8(input); // XXX probably wrong analyze(); } void analyze(byte[] input) { this.input = input; analyze(); } void analyze { // allocate logarithmic tables var input = this.input; // n is actual input length // allLevels is size of n in bits // n2 is n rounded up to power of two int n = l(input), allLevels = ld_iceil(n), n2 = 1 << allLevels; int minLevel = this.minLevel; // reachedLevel is the minimum of allLevels and maxLevel reachedLevel = min(allLevels, maxLevel); // representedLevels is how many array elements we need int representedLevels = max(0, reachedLevel+1-minLevel); int n3 = (n+(1 << minLevel)-1) >> minLevel; ifdef SpecialCharsHierarchicalBitMap_debug printVars(+n, +allLevels, +minLevel, +maxLevel, +representedLevels, +n2, +n3); endifdef tables = new byte[representedLevels][]; for level to representedLevels: { tables[level] = new byte[n3]; ifdef SpecialCharsHierarchicalBitMap_debug printVars(+level, +n3); endifdef n3 = (n3+1) >> 1; } int steps = n; // for each outer iteration for i to n: { int c = ubyteToInt(input[i]); byte mask = charIndex[c]; ifdef SpecialCharsHierarchicalBitMap_debug printVars(+i, +c, +mask); endifdef if (mask == 0) continue; int j = i >> minLevel; for level to representedLevels: { ++steps; var tbl = tables[level]; ifdef SpecialCharsHierarchicalBitMap_debug printVars(+level, len := l(tables[level]), +j); endifdef var val = tbl[j]; byte val2 = (byte) (val | mask); ifdef SpecialCharsHierarchicalBitMap_debug printVars(+val, +val2); endifdef if (val == val2) break; tbl[j] = val2; j >>= 1; } } this.steps = steps; } S stats() { ret renderVars( inputLength := l(input), +steps, representedLevels := l(tables), cells := lengthLevel2_byteArrays(tables)); } int representedLevels() { ret l(tables); } int minLevel() { ret minLevel; } int reachedLevel() { ret reachedLevel; } byte[] table(int level) { ret tables[level-minLevel]; } int maskForChar(byte inputByte) { ret charIndex[ubyteToInt(inputByte)]; } }
download show line numbers debug dex old transpilations
Travelled to 3 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj
No comments. add comment
Snippet ID: | #1032583 |
Snippet name: | SpecialCharsHierarchicalBitMap - find special characters quickly in long string (e.g. brackets, quotes, backslash) |
Eternal ID of this version: | #1032583/40 |
Text MD5: | a259ae52c0ddbf74320ecf3230079fc4 |
Transpilation MD5: | afd9b09cf11ac1842686e94955533e71 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2021-09-23 08:36:54 |
Source code size: | 3625 bytes / 135 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 218 / 446 |
Version history: | 39 change(s) |
Referenced in: | #1032590 - test_SpecialCharsHierarchicalBitMap #1032592 - JSONSpecialIntegral - find special characters quickly in long string (e.g. brackets, quotes, backslash) #1034167 - Standard Classes + Interfaces (LIVE, continuation of #1003674) |