Download Jar. Libraryless. Click here for Pure Java version (3242L/21K).
1 | transient sclass SingleTextWordIndex { |
2 | S regexp = "\\w+"; |
3 | new ElementInstanceMatrix<Int, S> wordMatrix; |
4 | int length; |
5 | |
6 | *(S text) { |
7 | init(text); |
8 | } |
9 | |
10 | *(S *regexp, S text) { |
11 | init(text); |
12 | } |
13 | |
14 | void init(S text) { |
15 | length = l(text); |
16 | wordMatrix.numberToInstance = wordMatrix.instanceToNumber = i -> i; |
17 | for (IntRange r : regexpFindRanges(regexp, text)) |
18 | wordMatrix.add(r.start, ll(upper(substring(text, r)))); |
19 | wordMatrix.doneAdding(); |
20 | } |
21 | |
22 | LPair<S, Int> wordsAndOffsets(S text) { |
23 | ret map(regexpFindRanges(regexp, text), |
24 | r -> pair(upper(substring(text, r)), r.start)); |
25 | } |
26 | |
27 | // assumes word boundaries left and right of query |
28 | int[] preSearch(S query) { |
29 | ret indicesOfWordCombination(wordsAndOffsets(query)); |
30 | } |
31 | |
32 | int[] indicesOfWordCombination(LPair<S, Int> wordsWithOffsets) { |
33 | int n = l(wordsWithOffsets); |
34 | if (n == 0) null; |
35 | if (n == 1) ret intArray_minus(first(wordsWithOffsets).b, wordMatrix.instancesContainingElement_intArray(first(wordsWithOffsets).a); |
36 | |
37 | // get entries for words, exit when a word is unknown |
38 | ElementInstanceMatrix.Entry[] entries = new ElementInstanceMatrix.Entry[n]; |
39 | for i to n: { |
40 | ElementInstanceMatrix.Entry e = wordMatrix.index.get(wordsWithOffsets.get(i).a); |
41 | if (e == null) null; |
42 | entries[i] = e; |
43 | } |
44 | |
45 | // go through words again, shift & AND-combine all bit sets |
46 | BitSet bs = leftShiftBitSet(wordsWithOffsets.get(0).b, cloneBitSet(entries[0].bitSet())); |
47 | for (int i = 1; i < n; i++) |
48 | bs.and(leftShiftBitSet(wordsWithOffsets.get(i).b, entries[i].bitSet())); |
49 | |
50 | ret bitSetToIntArray(bs); |
51 | } |
52 | } |
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029078 |
Snippet name: | SingleTextWordIndex |
Eternal ID of this version: | #1029078/17 |
Text MD5: | 0700c3e516890f194314dae86bd68c81 |
Transpilation MD5: | 45deeedf03bfcaab885d3fb179d76ea2 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code (desktop) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-19 02:36:35 |
Source code size: | 1689 bytes / 52 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 345 / 1184 |
Version history: | 16 change(s) |
Referenced in: | [show references] |