Libraryless. Click here for Pure Java version (4081L/24K).
1 | // supports 8 different character classes |
2 | sclass SpecialCharsHierarchicalBitMap { |
3 | S specialChars; |
4 | int minLevel = 1, maxLevel = 8; |
5 | |
6 | byte[] input; |
7 | byte[] charIndex = new byte[256]; |
8 | byte[][] tables; |
9 | int reachedLevel; |
10 | |
11 | // how many computation steps it took |
12 | int steps; |
13 | |
14 | *() {} |
15 | *(byte[] *input) {} |
16 | *(S specialChars, byte[] *input) {} |
17 | |
18 | selfType specialChar(S s, int mask) { |
19 | int n = l(s); |
20 | for i to n: specialChar(s.charAt(i), mask); |
21 | this; |
22 | } |
23 | |
24 | selfType specialChar(char c, int mask) { |
25 | if (!isUByte(mask)) fail("Mask out of range (must be u-byte): " + intToHex(mask)); |
26 | charIndex[ubyteToInt(c)] |= (byte) mask; |
27 | this; |
28 | } |
29 | |
30 | // These probably must all be in the first character page. |
31 | // TODO: check that |
32 | selfType specialChars(S s) { |
33 | specialChars = s; |
34 | |
35 | // make charIndex table |
36 | |
37 | var m = l(specialChars); |
38 | if (m > 8) fail("Too many special chars (max: 8) - " + m); |
39 | for i to m: |
40 | charIndex[ubyteToInt(specialChars.charAt(i))] = (byte) (1 << i); |
41 | this; |
42 | } |
43 | |
44 | void analyze(S input) { |
45 | this.input = toUtf8(input); // XXX probably wrong |
46 | analyze(); |
47 | } |
48 | |
49 | void analyze(byte[] input) { |
50 | this.input = input; |
51 | analyze(); |
52 | } |
53 | |
54 | void analyze { |
55 | // allocate logarithmic tables |
56 | |
57 | var input = this.input; |
58 | // n is actual input length |
59 | // allLevels is size of n in bits |
60 | // n2 is n rounded up to power of two |
61 | int n = l(input), allLevels = ld_iceil(n), n2 = 1 << allLevels; |
62 | int minLevel = this.minLevel; |
63 | |
64 | // reachedLevel is the minimum of allLevels and maxLevel |
65 | reachedLevel = min(allLevels, maxLevel); |
66 | |
67 | // representedLevels is how many array elements we need |
68 | int representedLevels = max(0, reachedLevel+1-minLevel); |
69 | |
70 | int n3 = (n+(1 << minLevel)-1) >> minLevel; |
71 | ifdef SpecialCharsHierarchicalBitMap_debug |
72 | printVars(+n, +allLevels, +minLevel, +maxLevel, +representedLevels, +n2, +n3); |
73 | endifdef |
74 | |
75 | tables = new byte[representedLevels][]; |
76 | for level to representedLevels: { |
77 | tables[level] = new byte[n3]; |
78 | ifdef SpecialCharsHierarchicalBitMap_debug |
79 | printVars(+level, +n3); |
80 | endifdef |
81 | n3 = (n3+1) >> 1; |
82 | } |
83 | |
84 | int steps = n; // for each outer iteration |
85 | |
86 | for i to n: { |
87 | int c = ubyteToInt(input[i]); |
88 | byte mask = charIndex[c]; |
89 | ifdef SpecialCharsHierarchicalBitMap_debug |
90 | printVars(+i, +c, +mask); |
91 | endifdef |
92 | if (mask == 0) continue; |
93 | |
94 | int j = i >> minLevel; |
95 | for level to representedLevels: { |
96 | ++steps; |
97 | var tbl = tables[level]; |
98 | ifdef SpecialCharsHierarchicalBitMap_debug |
99 | printVars(+level, len := l(tables[level]), +j); |
100 | endifdef |
101 | var val = tbl[j]; |
102 | byte val2 = (byte) (val | mask); |
103 | |
104 | ifdef SpecialCharsHierarchicalBitMap_debug |
105 | printVars(+val, +val2); |
106 | endifdef |
107 | |
108 | if (val == val2) break; |
109 | |
110 | tbl[j] = val2; |
111 | j >>= 1; |
112 | } |
113 | } |
114 | |
115 | this.steps = steps; |
116 | } |
117 | |
118 | S stats() { |
119 | ret renderVars( |
120 | inputLength := l(input), |
121 | +steps, |
122 | representedLevels := l(tables), |
123 | cells := lengthLevel2_byteArrays(tables)); |
124 | } |
125 | |
126 | int representedLevels() { ret l(tables); } |
127 | int minLevel() { ret minLevel; } |
128 | int reachedLevel() { ret reachedLevel; } |
129 | |
130 | byte[] table(int level) { ret tables[level-minLevel]; } |
131 | |
132 | int maskForChar(byte inputByte) { |
133 | ret charIndex[ubyteToInt(inputByte)]; |
134 | } |
135 | } |
download show line numbers debug dex old transpilations
Travelled to 3 computer(s): bhatertpkbcr, mowyntqkapby, mqqgnosmbjvj
No comments. add comment
Snippet ID: | #1032583 |
Snippet name: | SpecialCharsHierarchicalBitMap - find special characters quickly in long string (e.g. brackets, quotes, backslash) |
Eternal ID of this version: | #1032583/40 |
Text MD5: | a259ae52c0ddbf74320ecf3230079fc4 |
Transpilation MD5: | afd9b09cf11ac1842686e94955533e71 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2021-09-23 08:36:54 |
Source code size: | 3625 bytes / 135 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 219 / 447 |
Version history: | 39 change(s) |
Referenced in: | [show references] |