Libraryless. Click here for Pure Java version (4081L/24K).
| 1 | // supports 8 different character classes | 
| 2 | sclass SpecialCharsHierarchicalBitMap {
 | 
| 3 | S specialChars; | 
| 4 | int minLevel = 1, maxLevel = 8; | 
| 5 | |
| 6 | byte[] input; | 
| 7 | byte[] charIndex = new byte[256]; | 
| 8 | byte[][] tables; | 
| 9 | int reachedLevel; | 
| 10 | |
| 11 | // how many computation steps it took | 
| 12 | int steps; | 
| 13 | |
| 14 |   *() {}
 | 
| 15 |   *(byte[] *input) {}
 | 
| 16 |   *(S specialChars, byte[] *input) {}
 | 
| 17 | |
| 18 |   selfType specialChar(S s, int mask) {
 | 
| 19 | int n = l(s); | 
| 20 | for i to n: specialChar(s.charAt(i), mask); | 
| 21 | this; | 
| 22 | } | 
| 23 | |
| 24 |   selfType specialChar(char c, int mask) {
 | 
| 25 |     if (!isUByte(mask)) fail("Mask out of range (must be u-byte): " + intToHex(mask));
 | 
| 26 | charIndex[ubyteToInt(c)] |= (byte) mask; | 
| 27 | this; | 
| 28 | } | 
| 29 | |
| 30 | // These probably must all be in the first character page. | 
| 31 | // TODO: check that | 
| 32 |   selfType specialChars(S s) {
 | 
| 33 | specialChars = s; | 
| 34 | |
| 35 | // make charIndex table | 
| 36 | |
| 37 | var m = l(specialChars); | 
| 38 |     if (m > 8) fail("Too many special chars (max: 8) - " + m);
 | 
| 39 | for i to m: | 
| 40 | charIndex[ubyteToInt(specialChars.charAt(i))] = (byte) (1 << i); | 
| 41 | this; | 
| 42 | } | 
| 43 | |
| 44 |   void analyze(S input) {
 | 
| 45 | this.input = toUtf8(input); // XXX probably wrong | 
| 46 | analyze(); | 
| 47 | } | 
| 48 | |
| 49 |   void analyze(byte[] input) {
 | 
| 50 | this.input = input; | 
| 51 | analyze(); | 
| 52 | } | 
| 53 | |
| 54 |   void analyze {
 | 
| 55 | // allocate logarithmic tables | 
| 56 | |
| 57 | var input = this.input; | 
| 58 | // n is actual input length | 
| 59 | // allLevels is size of n in bits | 
| 60 | // n2 is n rounded up to power of two | 
| 61 | int n = l(input), allLevels = ld_iceil(n), n2 = 1 << allLevels; | 
| 62 | int minLevel = this.minLevel; | 
| 63 | |
| 64 | // reachedLevel is the minimum of allLevels and maxLevel | 
| 65 | reachedLevel = min(allLevels, maxLevel); | 
| 66 | |
| 67 | // representedLevels is how many array elements we need | 
| 68 | int representedLevels = max(0, reachedLevel+1-minLevel); | 
| 69 | |
| 70 | int n3 = (n+(1 << minLevel)-1) >> minLevel; | 
| 71 | ifdef SpecialCharsHierarchicalBitMap_debug | 
| 72 | printVars(+n, +allLevels, +minLevel, +maxLevel, +representedLevels, +n2, +n3); | 
| 73 | endifdef | 
| 74 | |
| 75 | tables = new byte[representedLevels][]; | 
| 76 |     for level to representedLevels: {
 | 
| 77 | tables[level] = new byte[n3]; | 
| 78 | ifdef SpecialCharsHierarchicalBitMap_debug | 
| 79 | printVars(+level, +n3); | 
| 80 | endifdef | 
| 81 | n3 = (n3+1) >> 1; | 
| 82 | } | 
| 83 | |
| 84 | int steps = n; // for each outer iteration | 
| 85 | |
| 86 |     for i to n: {
 | 
| 87 | int c = ubyteToInt(input[i]); | 
| 88 | byte mask = charIndex[c]; | 
| 89 | ifdef SpecialCharsHierarchicalBitMap_debug | 
| 90 | printVars(+i, +c, +mask); | 
| 91 | endifdef | 
| 92 | if (mask == 0) continue; | 
| 93 | |
| 94 | int j = i >> minLevel; | 
| 95 |       for level to representedLevels: {
 | 
| 96 | ++steps; | 
| 97 | var tbl = tables[level]; | 
| 98 | ifdef SpecialCharsHierarchicalBitMap_debug | 
| 99 | printVars(+level, len := l(tables[level]), +j); | 
| 100 | endifdef | 
| 101 | var val = tbl[j]; | 
| 102 | byte val2 = (byte) (val | mask); | 
| 103 | |
| 104 | ifdef SpecialCharsHierarchicalBitMap_debug | 
| 105 | printVars(+val, +val2); | 
| 106 | endifdef | 
| 107 | |
| 108 | if (val == val2) break; | 
| 109 | |
| 110 | tbl[j] = val2; | 
| 111 | j >>= 1; | 
| 112 | } | 
| 113 | } | 
| 114 | |
| 115 | this.steps = steps; | 
| 116 | } | 
| 117 | |
| 118 |   S stats() {
 | 
| 119 | ret renderVars( | 
| 120 | inputLength := l(input), | 
| 121 | +steps, | 
| 122 | representedLevels := l(tables), | 
| 123 | cells := lengthLevel2_byteArrays(tables)); | 
| 124 | } | 
| 125 | |
| 126 |   int representedLevels() { ret l(tables); }
 | 
| 127 |   int minLevel() { ret minLevel; }
 | 
| 128 |   int reachedLevel() { ret reachedLevel; }
 | 
| 129 | |
| 130 |   byte[] table(int level) { ret tables[level-minLevel]; }
 | 
| 131 | |
| 132 |   int maskForChar(byte inputByte) {
 | 
| 133 | ret charIndex[ubyteToInt(inputByte)]; | 
| 134 | } | 
| 135 | } | 
download show line numbers debug dex old transpilations
Travelled to 3 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj
No comments. add comment
| Snippet ID: | #1032583 | 
| Snippet name: | SpecialCharsHierarchicalBitMap - find special characters quickly in long string (e.g. brackets, quotes, backslash) | 
| Eternal ID of this version: | #1032583/40 | 
| Text MD5: | a259ae52c0ddbf74320ecf3230079fc4 | 
| Transpilation MD5: | afd9b09cf11ac1842686e94955533e71 | 
| Author: | stefan | 
| Category: | javax | 
| Type: | JavaX fragment (include) | 
| Public (visible to everyone): | Yes | 
| Archived (hidden from active list): | No | 
| Created/modified: | 2021-09-23 08:36:54 | 
| Source code size: | 3625 bytes / 135 lines | 
| Pitched / IR pitched: | No / No | 
| Views / Downloads: | 456 / 712 | 
| Version history: | 39 change(s) | 
| Referenced in: | [show references] |