Libraryless. Click here for Pure Java version (3461L/22K).
// Generalized by: ElementInstanceMatrix // A represents the id of a document to be indexed (e.g. a snippet ID) transient sclass WordIndexWithBitSets<A> { S regexp = "\\w+"; new UniqueList<A> documents; Map<S, Entry> index = ciMap(); // each word can be represented as either an int array, a bit set or both // for now, the int array is always there sclass Entry { new Set<Int> intSet; // while building the index int[] intArray; BitSet bitSet; BitSet bitSet() { if (bitSet == null) bitSet = intArrayToBitSet(intArray); ret bitSet; } int count() { ret intArray.length; } } *() {} *(Map<A, S> map) { fOr (A a, S text : map) add(a, text); } void add(A a, S text) { int idx = documents.addOrGetIndex(a); Set<S> words = extractWords(text); for (S word : words) addWord(idx, word); } void addWord(int idx, S word) { Entry e = index.get(word); if (e == null) index.put(word, e = new Entry); e.intSet.add(idx); } // Call this exactly once before doing queries on the index void doneAdding() { for (Entry e : values(index)) { e.intArray = sortIntArrayInPlace(toIntArray(e.intSet)); e.intSet = null; } } Set<S> extractWords(S text) { ret asCISet(extractWords_list(text)); } LS extractWords_list(S text) { ret regexpExtractAll(regexp, text); } L<IntRange> wordRanges(S text) { ret regexpFindRanges(regexp, text); } L<A> documentsContainingWord(S word) { Entry e = index.get(word); ret e == null ? null : documentListFromIntArray(e.intArray); } L<A> documentListFromIntArray(int[] array) { ret listFromFunction(array.length, i -> documents.get(array[i])); } // may return null (=empty list) L<A> documentsContainingAllWords(Cl<S> words) { int n = l(words; if (n == 0) ret documents; if (n == 1) ret documentsContainingWord(first(words)); // get entries for words, exit when a word is unknown Entry[] entries = new[n]; Iterator<S> itWords = iterator(words); for i to n: { Entry e = index.get(itWords.next()); if (e == null) null; entries[i] = e; } // go through words again, AND-combine all bit sets BitSet bs = cloneBitSet(entries[0].bitSet()); for (int i = 1; i < n; i++) bs.and(entries[i].bitSet()); ret documentListFromIntArray(bitSetToIntArray(bs)); } L<A> documentsContainingAllWords(S text, O... _) { ret documentsContainingAllWords(extractWords(text)); } NavigableSet<S> words() { ret (NavigableSet) keys(index); } int numWords() { ret l(index); } // This method only works when A = S void add(S s) { add((A) s, s); } }
Began life as a copy of #1024242
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029068 |
Snippet name: | WordIndexWithBitSets - index a list by words (case-insensitive by default) |
Eternal ID of this version: | #1029068/20 |
Text MD5: | 98cfe0948c6afe02a7feb923599d48f3 |
Transpilation MD5: | 2a2456c1641df2f34b94aff95eeaeee2 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-19 01:22:59 |
Source code size: | 2836 bytes / 102 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 291 / 630 |
Version history: | 19 change(s) |
Referenced in: | [show references] |