// A represents the id of a document to be indexed (e.g. a snippet ID)
transient sclass WordIndexWithBitSets {
S regexp = "\\w+";
new UniqueList documents;
Map index = ciMap();
// each word can be represented as either an int array, a bit set or both
// for now, the int array is always there
sclass Entry {
new Set intSet; // while building the index
int[] intArray;
BitSet bitSet;
BitSet bitSet() {
if (bitSet == null)
bitSet = intArrayToBitSet(intArray);
ret bitSet;
}
int count() { ret intArray.length; }
}
*() {}
*(Map map) { fOr (A a, S text : map) add(a, text); }
void add(A a, S text) {
int idx = documents.addOrGetIndex(a);
Set words = extractWords(text);
for (S word : words) addWord(idx, word);
}
void addWord(int idx, S word) {
Entry e = index.get(word);
if (e == null) index.put(word, e = new Entry);
e.intSet.add(idx);
}
// Call this exactly once before doing queries on the index
void doneAdding() {
for (Entry e : values(index)) {
e.intArray = sortIntArrayInPlace(toIntArray(e.intSet));
e.intSet = null;
}
}
Set extractWords(S text) {
ret asCISet(extractWords_list(text));
}
LS extractWords_list(S text) {
ret regexpExtractAll(regexp, text);
}
L wordRanges(S text) {
ret regexpFindRanges(regexp, text);
}
L documentsContainingWord(S word) {
Entry e = index.get(word);
ret e == null ? null : listFromFunction(e.intArray.length, i -> documents.get(e.intArray[i]));
}
// may return null (=empty list)
L documentsContainingAllWords(L words) {
if (empty(words)) ret documents;
// find word with least entries
int n = words.length, iShortest = 0;
Entry e = index.get(words.get(0));
if (e == null) null;
int lShortest = e.count();
for (int i = 1; i < n; i++) {
Entry e = index.get(word);
if (e == null) null;
int count = e.count();
if (count < lShortest) {
iShortest = i;
lShortest = count;
}
}
// convert everything to bit sets and AND them
}
NavigableSet words() { ret (NavigableSet) keys(index); }
int numWords() { ret l(index); }
// This method only works when A = S
void add(S s) { add((A) s, s); }
}