transient sclass WordIndex {
// if this is set, it's used for sorting the values
// which can speed up lookups
Comparator valueComparator;
S regexp = "\\w+";
MultiSetMap index = ciMultiSetMap(); // sets are better for lookups
*() {}
*(Comparator *valueComparator) { index = ciMultiSetMap_innerTreeSet(valueComparator); }
*(Map map) { fOr (A a, S text : map) add(a, text); }
void add(A a, S text) {
Set words = extractWords(text);
for (S word : words) addWord(a, word);
}
void addWord(A a, S word) {
index.add(word, a);
}
Set extractWords(S text) {
ret asCISet(extractWords_list(text));
}
LS extractWords_list(S text) {
ret regexpExtractAll(regexp, text);
}
L wordRanges(S text) {
ret regexpFindRanges(regexp, text);
}
Set get(S word) {
ret index.get(word);
}
void remove(A a, S text) {
Set words = extractWords(text);
for (S word : words) index.remove(word, a);
}
NavigableSet words() { ret (NavigableSet) keys(index); }
int numWords() { ret index.keysSize(); }
// These methods only work when A = S
void add(S s) { add((A) s, s); }
void remove(S s) { remove((A) s, s); }
}