Libraryless. Click here for Pure Java version (3461L/22K).
1 | // Generalized by: ElementInstanceMatrix |
2 | |
3 | // A represents the id of a document to be indexed (e.g. a snippet ID) |
4 | transient sclass WordIndexWithBitSets<A> { |
5 | S regexp = "\\w+"; |
6 | new UniqueList<A> documents; |
7 | Map<S, Entry> index = ciMap(); |
8 | |
9 | // each word can be represented as either an int array, a bit set or both |
10 | // for now, the int array is always there |
11 | sclass Entry { |
12 | new Set<Int> intSet; // while building the index |
13 | int[] intArray; |
14 | BitSet bitSet; |
15 | |
16 | BitSet bitSet() { |
17 | if (bitSet == null) |
18 | bitSet = intArrayToBitSet(intArray); |
19 | ret bitSet; |
20 | } |
21 | |
22 | int count() { ret intArray.length; } |
23 | } |
24 | |
25 | *() {} |
26 | *(Map<A, S> map) { fOr (A a, S text : map) add(a, text); } |
27 | |
28 | void add(A a, S text) { |
29 | int idx = documents.addOrGetIndex(a); |
30 | Set<S> words = extractWords(text); |
31 | for (S word : words) addWord(idx, word); |
32 | } |
33 | |
34 | void addWord(int idx, S word) { |
35 | Entry e = index.get(word); |
36 | if (e == null) index.put(word, e = new Entry); |
37 | e.intSet.add(idx); |
38 | } |
39 | |
40 | // Call this exactly once before doing queries on the index |
41 | void doneAdding() { |
42 | for (Entry e : values(index)) { |
43 | e.intArray = sortIntArrayInPlace(toIntArray(e.intSet)); |
44 | e.intSet = null; |
45 | } |
46 | } |
47 | |
48 | Set<S> extractWords(S text) { |
49 | ret asCISet(extractWords_list(text)); |
50 | } |
51 | |
52 | LS extractWords_list(S text) { |
53 | ret regexpExtractAll(regexp, text); |
54 | } |
55 | |
56 | L<IntRange> wordRanges(S text) { |
57 | ret regexpFindRanges(regexp, text); |
58 | } |
59 | |
60 | L<A> documentsContainingWord(S word) { |
61 | Entry e = index.get(word); |
62 | ret e == null ? null : documentListFromIntArray(e.intArray); |
63 | } |
64 | |
65 | L<A> documentListFromIntArray(int[] array) { |
66 | ret listFromFunction(array.length, i -> documents.get(array[i])); |
67 | } |
68 | |
69 | // may return null (=empty list) |
70 | L<A> documentsContainingAllWords(Cl<S> words) { |
71 | int n = l(words; |
72 | if (n == 0) ret documents; |
73 | if (n == 1) ret documentsContainingWord(first(words)); |
74 | |
75 | // get entries for words, exit when a word is unknown |
76 | Entry[] entries = new[n]; |
77 | Iterator<S> itWords = iterator(words); |
78 | for i to n: { |
79 | Entry e = index.get(itWords.next()); |
80 | if (e == null) null; |
81 | entries[i] = e; |
82 | } |
83 | |
84 | // go through words again, AND-combine all bit sets |
85 | BitSet bs = cloneBitSet(entries[0].bitSet()); |
86 | for (int i = 1; i < n; i++) |
87 | bs.and(entries[i].bitSet()); |
88 | |
89 | ret documentListFromIntArray(bitSetToIntArray(bs)); |
90 | } |
91 | |
92 | L<A> documentsContainingAllWords(S text, O... _) { |
93 | ret documentsContainingAllWords(extractWords(text)); |
94 | } |
95 | |
96 | NavigableSet<S> words() { ret (NavigableSet) keys(index); } |
97 | |
98 | int numWords() { ret l(index); } |
99 | |
100 | // This method only works when A = S |
101 | void add(S s) { add((A) s, s); } |
102 | } |
Began life as a copy of #1024242
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029068 |
Snippet name: | WordIndexWithBitSets - index a list by words (case-insensitive by default) |
Eternal ID of this version: | #1029068/20 |
Text MD5: | 98cfe0948c6afe02a7feb923599d48f3 |
Transpilation MD5: | 2a2456c1641df2f34b94aff95eeaeee2 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-19 01:22:59 |
Source code size: | 2836 bytes / 102 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 292 / 632 |
Version history: | 19 change(s) |
Referenced in: | [show references] |