// A represents the id of a document to be indexed (e.g. a snippet ID) transient sclass WordIndexWithBitSets { S regexp = "\\w+"; new UniqueList documents; Map index = ciMap(); // each word can be represented as either an int array, a bit set or both sclass Entry { new Set intSet; // while building the index int[] intArray; BitSet bitSet; } *() {} *(Map map) { fOr (A a, S text : map) add(a, text); } void add(A a, S text) { int idx = documents.addOrGetIndex(a); Set words = extractWords(text); for (S word : words) addWord(idx, word); } void addWord(int idx, S word) { Entry e = index.get(word); if (e == null) index.put(word, e = new Entry); e.intSet.add(idx); } // Call this exactly once before doing queries on the index void doneAdding() { for (Entry e : values(index)) { e.intArray = sortIntArrayInPlace(toIntArray(e.intSet)); e.bitSet = intArrayToBitSet(e.intArray); e.intSet = null; } } Set extractWords(S text) { ret asCISet(extractWords_list(text)); } LS extractWords_list(S text) { ret regexpExtractAll(regexp, text); } L wordRanges(S text) { ret regexpFindRanges(regexp, text); } L documentsContainingWord(S word) { Entry e = index.get(word); ret e == null ? null : listFromFunction(e.intArray.length, i -> documents.get(e.intArray[i])); } NavigableSet words() { ret (NavigableSet) keys(index); } int numWords() { ret l(index); } // These methods only work when A = S void add(S s) { add((A) s, s); } void remove(S s) { remove((A) s, s); } }