transient sclass DeepBitSetWordIndex {
// Regular expression defining what counts as a word: one or more word characters.
S regexp = "\\w+";
// NOTE(review): the type arguments on this declaration look truncated ("Map>") - restore them from the original source.
// Not referenced by the code visible in this part of the file.
Map> singleTextIndices;
// Not referenced by the code visible in this part of the file.
new ElementInstanceMatrix mainIndex;
void addDocument(A a, S text) {
// Indexes a single text: every match of regexp in text is registered in
// wordMatrix under the match's start offset, keyed by the word as returned
// by upper(...) (presumably upper-cased - confirm against the helper).
// NOTE(review): wordMatrix is not declared in the visible part of this file.
*(S text) {
  // Instances are identified by plain numbers here, so a single identity
  // mapping is shared by both conversion directions.
  wordMatrix.numberToInstance = wordMatrix.instanceToNumber = i -> i;

  for (IntRange match : regexpFindRanges(regexp, text)) {
    S word = upper(substring(text, match));
    wordMatrix.add(match.start, ll(word));
  }

  // Tell the matrix that no more entries will follow.
  wordMatrix.doneAdding();
}
// Splits text into its words (matches of regexp) and pairs each word, passed
// through upper(...), with the start offset of its match within text.
// The result is in the order regexpFindRanges reports the matches.
LPair wordsAndOffsets(S text) {
  ret map(regexpFindRanges(regexp, text), range -> {
    S word = upper(substring(text, range));
    ret pair(word, range.start);
  });
}
// Splits query into (word, offset) pairs via wordsAndOffsets and returns
// whatever indicesOfWordCombination computes for them (which can be null,
// e.g. when query contains no word).
// Assumes word boundaries left and right of query.
int[] preSearch(S query) {
ret indicesOfWordCombination(wordsAndOffsets(query));
}
// Finds the indices at which all given words occur together at their
// respective offsets.
//
// wordsWithOffsets: pairs of (word, offset of that word within the query),
//   as produced by wordsAndOffsets.
//
// Returns null when the list is empty or when any word has no entry in
// wordMatrix.index; otherwise an int array of matching indices.
int[] indicesOfWordCombination(LPair wordsWithOffsets) {
  int n = l(wordsWithOffsets);
  if (n == 0) ret null;

  // Single word: no combining needed, just adjust the word's indices by its
  // offset (presumably intArray_minus subtracts the offset from every
  // element - confirm against the helper).
  if (n == 1)
    ret intArray_minus(first(wordsWithOffsets).b,
      wordMatrix.instancesContainingElement_intArray(first(wordsWithOffsets).a));

  // get entries for words, exit when a word is unknown
  ElementInstanceMatrix.Entry[] entries = new ElementInstanceMatrix.Entry[n];
  for (int i = 0; i < n; i++) {
    ElementInstanceMatrix.Entry e = wordMatrix.index.get(wordsWithOffsets.get(i).a);
    if (e == null) ret null;
    entries[i] = e;
  }

  // go through words again, shift & AND-combine all bit sets.
  // The first bit set is cloned before shifting so that the in-place and()
  // calls below cannot modify the bit set held by the index entry.
  BitSet bs = leftShiftBitSet(wordsWithOffsets.get(0).b, cloneBitSet(entries[0].bitSet()));
  for (int i = 1; i < n; i++)
    bs.and(leftShiftBitSet(wordsWithOffsets.get(i).b, entries[i].bitSet()));

  ret bitSetToIntArray(bs);
}
}