// Word-based search index over a collection of texts, each identified by a key of type A.
//
// Two structures are maintained side by side:
//  - singleTextIndices: one SingleTextWordIndex per key, built from that key's text
//  - mainIndex: an ElementInstanceMatrix that receives, per key, the set of
//    upper-cased words extracted from the text
//
// Usage: call add() for every text, then doneAdding(), then preSearch().
transient sclass DeepBitSetWordIndex<A> {
  // Pattern that defines what counts as a word (used for indexing and for queries).
  S regexp = "\\w+";

  // Per-key word index, consulted in the second (filtering) stage of preSearch.
  new Map<A, SingleTextWordIndex> singleTextIndices;

  // Cross-text index used to find candidate keys in the first stage of preSearch.
  // NOTE(review): ElementInstanceMatrix's type parameters are not visible from
  // here, so the declaration is left raw - add them if the class is generic.
  new ElementInstanceMatrix mainIndex;

  // Registers the text for key a in both indices.
  // The words handed to mainIndex are upper-cased; the SingleTextWordIndex
  // receives the text unchanged.
  void add(A a, S text) {
    singleTextIndices.put(a, new SingleTextWordIndex(regexp, text));
    mainIndex.add(a, mapToSet upper(regexpExtractAll(regexp, text)));
  }

  // Signals to mainIndex that no more texts will be added.
  void doneAdding {
    mainIndex.doneAdding();
  }

  // Splits text into words according to regexp and returns, for each match,
  // a pair of (upper-cased word, start offset of the match within text).
  LPair<S, Int> wordsAndOffsets(S text) {
    ret map(regexpFindRanges(regexp, text),
      r -> pair(upper(substring(text, r)), r.start));
  }

  // Returns the keys whose text matches the query.
  // Stage 1 asks mainIndex for all keys containing every query word;
  // stage 2 keeps only those candidates whose SingleTextWordIndex reports
  // a non-empty result for the word combination (words plus offsets).
  // Pass the optional parameter "debug" to print candidate statistics.
  //
  // assumes word boundaries left and right of query
  Cl<A> preSearch(S query, O... _) {
    optPar bool debug;
    LPair<S, Int> l = wordsAndOffsets(query);

    // presumably pairsA yields the first components (the words) - verify against helper
    Cl<A> candidates = mainIndex.instancesContainingAllElements(pairsA(l));

    if (debug) {
      L<Int> lengths = map(candidates, a -> singleTextIndices.get(a).length);
      print(nCandidates(candidates) + ", total length: " + n2(intSum(lengths)) + ", lengths: " + lengths);
    }

    ret filter(candidates, a -> nempty(singleTextIndices.get(a).indicesOfWordCombination(l)));
  }

  // Number of elements held by mainIndex (as reported by its numElements()).
  int numWords() { ret mainIndex.numElements(); }
}