// Generalized by: ElementInstanceMatrix
// A represents the id of a document to be indexed (e.g. a snippet ID)
transient sclass WordIndexWithBitSets {
S regexp = "\\w+";
new UniqueList documents;
Map index = ciMap();
// each word can be represented as either an int array, a bit set or both
// for now, the int array is always there
sclass Entry {
new Set intSet; // while building the index
int[] intArray;
BitSet bitSet;
BitSet bitSet() {
if (bitSet == null)
bitSet = intArrayToBitSet(intArray);
ret bitSet;
}
int count() { ret intArray.length; }
}
*() {}
*(Map map) { fOr (A a, S text : map) add(a, text); }
void add(A a, S text) {
int idx = documents.addOrGetIndex(a);
Set words = extractWords(text);
for (S word : words) addWord(idx, word);
}
void addWord(int idx, S word) {
Entry e = index.get(word);
if (e == null) index.put(word, e = new Entry);
e.intSet.add(idx);
}
// Call this exactly once before doing queries on the index
void doneAdding() {
for (Entry e : values(index)) {
e.intArray = sortIntArrayInPlace(toIntArray(e.intSet));
e.intSet = null;
}
}
Set extractWords(S text) {
ret asCISet(extractWords_list(text));
}
LS extractWords_list(S text) {
ret regexpExtractAll(regexp, text);
}
L wordRanges(S text) {
ret regexpFindRanges(regexp, text);
}
L documentsContainingWord(S word) {
Entry e = index.get(word);
ret e == null ? null : documentListFromIntArray(e.intArray);
}
L documentListFromIntArray(int[] array) {
ret listFromFunction(array.length, i -> documents.get(array[i]));
}
// may return null (=empty list)
L documentsContainingAllWords(Cl words) {
int n = l(words;
if (n == 0) ret documents;
if (n == 1) ret documentsContainingWord(first(words));
// get entries for words, exit when a word is unknown
Entry[] entries = new[n];
Iterator itWords = iterator(words);
for i to n: {
Entry e = index.get(itWords.next());
if (e == null) null;
entries[i] = e;
}
// go through words again, AND-combine all bit sets
BitSet bs = cloneBitSet(entries[0].bitSet());
for (int i = 1; i < n; i++)
bs.and(entries[i].bitSet());
ret documentListFromIntArray(bitSetToIntArray(bs));
}
L documentsContainingAllWords(S text, O... _) {
ret documentsContainingAllWords(extractWords(text));
}
NavigableSet words() { ret (NavigableSet) keys(index); }
int numWords() { ret l(index); }
// This method only works when A = S
void add(S s) { add((A) s, s); }
}