Libraryless. Click here for Pure Java version (2563L/16K).
1 | // clever index of tokenized strings |
2 | // maintaining one CI-sorted list per token position |
3 | sclass PositionalTokenIndex {
|
4 | L<LLS> sorts = new L; // token index -> entries sorted by token at index |
5 | bool dirty; |
6 | |
7 | *() {}
|
8 | *(Iterable<LS> toks) { addAll(toks); }
|
9 | |
10 | void addAll(Iterable<LS> toks) {
|
11 | fOr (LS tok : toks) |
12 | add(tok); |
13 | } |
14 | |
15 | void add(LS tok) {
|
16 | for (int i = 1; i < l(tok); i += 2) {
|
17 | LL<S> idx = listGetOrCreate_List(sorts, i/2); |
18 | idx.add(tok); |
19 | } |
20 | set dirty; |
21 | } |
22 | |
23 | private void sort {
|
24 | for i over sorts: {
|
25 | int j = i*2+1; |
26 | sortInPlaceByCalculatedFieldIC(sorts.get(i), |
27 | func(LS tok) -> S { tok.get(j) });
|
28 | } |
29 | dirty = false; |
30 | } |
31 | |
32 | // return all tokenizations with token t in position i |
33 | // may return null |
34 | LLS byToken(int i, S t) {
|
35 | clean(); |
36 | LLS list = get(sorts, i); |
37 | if (list == null) null; |
38 | |
39 | int start = binarySearchForStart(list, i, t); |
40 | if (start < 0) null; |
41 | int end = binarySearchForEnd(list, i, t, start); |
42 | ret subList(list, start, end); |
43 | } |
44 | |
45 | private int binarySearchForStart(LLS list, int position, S t) {
|
46 | int low = 0, high = list.size()-1; |
47 | while (low <= high) {
|
48 | int mid = (low + high) >>> 1; |
49 | S token = list.get(mid).get(position*2+1); |
50 | int cmp = compareIC(token, t); |
51 | |
52 | if (cmp < 0) |
53 | low = mid + 1; |
54 | else if (cmp > 0) |
55 | high = mid - 1; |
56 | else // we found the target token, now check if we're at start |
57 | if (mid == 0 || !eqic(list.get(mid-1).get(position*2+1), t)) |
58 | ret mid; // yep, it's the start |
59 | else |
60 | high = mid - 1; // continue to search higher up |
61 | } |
62 | ret -1; |
63 | } |
64 | |
65 | private int binarySearchForEnd(LLS list, int position, S t, int start) {
|
66 | int low = start+1, high = list.size()-1; |
67 | while (low <= high) {
|
68 | int mid = (low + high) >>> 1; |
69 | S token = list.get(mid).get(position*2+1); |
70 | |
71 | if (eqic(token, t)) |
72 | low = mid + 1; // still in target range, go lower |
73 | else // we found a different token, now check if we're at the end |
74 | if (mid-1 == start || eqic(list.get(mid-1).get(position*2+1), t)) |
75 | ret mid; // yep, it's the end |
76 | else |
77 | high = mid - 1; // continue to search higher up |
78 | } |
79 | ret low; |
80 | } |
81 | |
82 | LLS fullList() { ret first(sorts); }
|
83 | |
84 | // if you want to sort it ahead of time |
85 | void clean() {
|
86 | if (dirty) sort(); |
87 | } |
88 | } |
download show line numbers debug dex old transpilations
Travelled to 6 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1025688 |
| Snippet name: | PositionalTokenIndex [OK] |
| Eternal ID of this version: | #1025688/22 |
| Text MD5: | 8f04881744542f85c46fddd354f95789 |
| Transpilation MD5: | 0aa7216c098c99abd973bb522ab773d2 |
| Author: | stefan |
| Category: | javax / agi.blue |
| Type: | JavaX fragment (include) |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2019-10-13 23:15:53 |
| Source code size: | 2496 bytes / 88 lines |
| Pitched / IR pitched: | No / No |
| Views / Downloads: | 509 / 1008 |
| Version history: | 21 change(s) |
| Referenced in: | [show references] |