// A represents the id of a document to be indexed (e.g. a snippet ID)
transient sclass WordIndexWithBitSets {
S regexp = "\\w+";
new UniqueList documents;
Map index = ciMap();
// each word can be represented as either an int array, a bit set or both
sclass Entry {
new Set intSet; // while building the index
int[] intArray;
BitSet bitSet;
}
*() {}
*(Map map) { fOr (A a, S text : map) add(a, text); }
void add(A a, S text) {
int idx = documents.addOrGetIndex(a);
Set words = extractWords(text);
for (S word : words) addWord(idx, word);
}
void addWord(int idx, S word) {
Entry e = index.get(a);
if (e == null) index.put(a, e = new Entry);
e.intSet.add(idx);
}
// Call this exactly once before doing queries on the index
void doneAdding() {
for (Entry e : values(index)) {
e.intArray = sortIntArrayPlace(toIntArray(e.intSet));
e.bitSet = intArrayToBitSet(e.intArray);
e.intSet = null;
}
}
Set extractWords(S text) {
ret asCISet(extractWords_list(text));
}
LS extractWords_list(S text) {
ret regexpExtractAll(regexp, text);
}
L wordRanges(S text) {
ret regexpFindRanges(regexp, text);
}
Set get(S word) {
ret index.get(word);
}
NavigableSet words() { ret (NavigableSet) keys(index); }
int numWords() { ret index.keysSize(); }
// These methods only work when A = S
void add(S s) { add((A) s, s); }
void remove(S s) { remove((A) s, s); }
}