// Two-level word index over documents identified by keys of type A.
// mainIndex answers "which documents contain all of these words";
// singleTextIndices keeps one SingleTextWordIndex per document, which
// preSearch uses to confirm the candidates found through mainIndex.
transient sclass DeepBitSetWordIndex<A> {
  // Pattern that defines what counts as a word.
  S regexp = "\\w+";

  // Per-document word index, keyed by document.
  // Must be initialized here: addDocument calls put() on it right away.
  new Map<A, SingleTextWordIndex> singleTextIndices;

  // Global index relating (upper-cased) words to the documents added.
  new ElementInstanceMatrix mainIndex;

  // Registers document a with the given text in both indices.
  // Words are upper-cased before they go into mainIndex.
  void addDocument(A a, S text) {
    singleTextIndices.put(a, new SingleTextWordIndex(regexp, text));
    mainIndex.add(a, mapToSet upper(regexpExtractAll(regexp, text)));
  }

  // Returns one pair per regexp match in text:
  // (upper-cased matched word, start offset of the match).
  LPair<S, Int> wordsAndOffsets(S text) {
    ret map(regexpFindRanges(regexp, text),
      r -> pair(upper(substring(text, r)), r.start));
  }

  // Returns the documents that pass both index levels for the query.
  // assumes word boundaries left and right of query
  Cl<A> preSearch(S query) {
    LPair<S, Int> l = wordsAndOffsets(query);

    // Coarse pass: documents that contain every query word
    // (pairsA presumably yields the word halves of the pairs - confirm).
    Cl<A> candidates = mainIndex.instancesContainingAllElements(pairsA(l));
    print(nCandidates(candidates));

    // Fine pass: keep documents whose own index reports at least one
    // result for the word/offset combination.
    ret filter(candidates, a -> nempty(singleTextIndices.get(a).indicesOfWordCombination(l)));
  }

  // Number of elements (words) held by the main index.
  int numWords() { ret mainIndex.numElements(); }
}