| 1 | // Generalized by: ElementInstanceMatrix | 
| 2 | |
| 3 | // A represents the id of a document to be indexed (e.g. a snippet ID) | 
| 4 | transient sclass WordIndexWithBitSets<A> {
 | 
| 5 | S regexp = "\\w+"; | 
| 6 | new UniqueList<A> documents; | 
| 7 | Map<S, Entry> index = ciMap(); | 
| 8 | |
| 9 | // each word can be represented as either an int array, a bit set or both | 
| 10 | // for now, the int array is always there | 
| 11 |   sclass Entry {
 | 
| 12 | new Set<Int> intSet; // while building the index | 
| 13 | int[] intArray; | 
| 14 | BitSet bitSet; | 
| 15 | |
| 16 |     BitSet bitSet() {
 | 
| 17 | if (bitSet == null) | 
| 18 | bitSet = intArrayToBitSet(intArray); | 
| 19 | ret bitSet; | 
| 20 | } | 
| 21 | |
| 22 |     int count() { ret intArray.length; }
 | 
| 23 | } | 
| 24 | |
| 25 |   *() {}
 | 
| 26 |   *(Map<A, S> map) { fOr (A a, S text : map) add(a, text); }
 | 
| 27 | |
| 28 |   void add(A a, S text) {
 | 
| 29 | int idx = documents.addOrGetIndex(a); | 
| 30 | Set<S> words = extractWords(text); | 
| 31 | for (S word : words) addWord(idx, word); | 
| 32 | } | 
| 33 | |
| 34 |   void addWord(int idx, S word) {
 | 
| 35 | Entry e = index.get(word); | 
| 36 | if (e == null) index.put(word, e = new Entry); | 
| 37 | e.intSet.add(idx); | 
| 38 | } | 
| 39 | |
| 40 | // Call this exactly once before doing queries on the index | 
| 41 |   void doneAdding() {
 | 
| 42 |     for (Entry e : values(index)) {
 | 
| 43 | e.intArray = sortIntArrayInPlace(toIntArray(e.intSet)); | 
| 44 | e.intSet = null; | 
| 45 | } | 
| 46 | } | 
| 47 | |
| 48 |   Set<S> extractWords(S text) {
 | 
| 49 | ret asCISet(extractWords_list(text)); | 
| 50 | } | 
| 51 | |
| 52 |   LS extractWords_list(S text) {
 | 
| 53 | ret regexpExtractAll(regexp, text); | 
| 54 | } | 
| 55 | |
| 56 |   L<IntRange> wordRanges(S text) {
 | 
| 57 | ret regexpFindRanges(regexp, text); | 
| 58 | } | 
| 59 | |
| 60 |   L<A> documentsContainingWord(S word) {
 | 
| 61 | Entry e = index.get(word); | 
| 62 | ret e == null ? null : documentListFromIntArray(e.intArray); | 
| 63 | } | 
| 64 | |
| 65 |   L<A> documentListFromIntArray(int[] array) {
 | 
| 66 | ret listFromFunction(array.length, i -> documents.get(array[i])); | 
| 67 | } | 
| 68 | |
| 69 | // may return null (=empty list) | 
| 70 |   L<A> documentsContainingAllWords(Cl<S> words) {
 | 
| 71 | int n = l(words; | 
| 72 | if (n == 0) ret documents; | 
| 73 | if (n == 1) ret documentsContainingWord(first(words)); | 
| 74 | |
| 75 | // get entries for words, exit when a word is unknown | 
| 76 | Entry[] entries = new[n]; | 
| 77 | Iterator<S> itWords = iterator(words); | 
| 78 |     for i to n: {
 | 
| 79 | Entry e = index.get(itWords.next()); | 
| 80 | if (e == null) null; | 
| 81 | entries[i] = e; | 
| 82 | } | 
| 83 | |
| 84 | // go through words again, AND-combine all bit sets | 
| 85 | BitSet bs = cloneBitSet(entries[0].bitSet()); | 
| 86 | for (int i = 1; i < n; i++) | 
| 87 | bs.and(entries[i].bitSet()); | 
| 88 | |
| 89 | ret documentListFromIntArray(bitSetToIntArray(bs)); | 
| 90 | } | 
| 91 | |
| 92 |   L<A> documentsContainingAllWords(S text, O... _) {
 | 
| 93 | ret documentsContainingAllWords(extractWords(text)); | 
| 94 | } | 
| 95 | |
| 96 |   NavigableSet<S> words() { ret (NavigableSet) keys(index); }
 | 
| 97 | |
| 98 |   int numWords() { ret l(index); }
 | 
| 99 | |
| 100 | // This method only works when A = S | 
| 101 |   void add(S s) { add((A) s, s); }
 | 
| 102 | } | 
Began life as a copy of #1029068
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
| Snippet ID: | #1029081 | 
| Snippet name: | WordIndexWithBitSets [backup] | 
| Eternal ID of this version: | #1029081/1 | 
| Text MD5: | 98cfe0948c6afe02a7feb923599d48f3 | 
| Author: | stefan | 
| Category: | javax | 
| Type: | JavaX fragment (include) | 
| Public (visible to everyone): | Yes | 
| Archived (hidden from active list): | No | 
| Created/modified: | 2020-07-19 02:07:42 | 
| Source code size: | 2836 bytes / 102 lines | 
| Pitched / IR pitched: | No / No | 
| Views / Downloads: | 369 / 400 | 
| Referenced in: | [show references] |