// A represents the id of a document to be indexed (e.g. a snippet ID) transient sclass WordIndexWithBitSets { S regexp = "\\w+"; new UniqueList documents; Map index = ciMap(); // each word can be represented as either an int array, a bit set or both // for now, the int array is always there sclass Entry { new Set intSet; // while building the index int[] intArray; BitSet bitSet; BitSet bitSet() { if (bitSet == null) bitSet = intArrayToBitSet(intArray); ret bitSet; } int count() { ret intArray.length; } } *() {} *(Map map) { fOr (A a, S text : map) add(a, text); } void add(A a, S text) { int idx = documents.addOrGetIndex(a); Set words = extractWords(text); for (S word : words) addWord(idx, word); } void addWord(int idx, S word) { Entry e = index.get(word); if (e == null) index.put(word, e = new Entry); e.intSet.add(idx); } // Call this exactly once before doing queries on the index void doneAdding() { for (Entry e : values(index)) { e.intArray = sortIntArrayInPlace(toIntArray(e.intSet)); e.intSet = null; } } Set extractWords(S text) { ret asCISet(extractWords_list(text)); } LS extractWords_list(S text) { ret regexpExtractAll(regexp, text); } L wordRanges(S text) { ret regexpFindRanges(regexp, text); } L documentsContainingWord(S word) { Entry e = index.get(word); ret e == null ? null : documentListFromIntArray(e.intArray); } L documentListFromIntArray(int[] array) { ret listFromFunction(array.length, i -> documents.get(array[i])); } // may return null (=empty list) L documentsContainingAllWords(LS words) { int n = l(words; if (n == 0) ret documents; if (n == 1) ret documentsContainingWord(first(words)); // get entries for words, exit when a word is unknown Entry[] entries = new[n]; for i to n: { Entry e = index.get(word); if (e == null) null; entries[i] = e; } // go through words again, AND-combine all bit sets BitSet bs = new(entries[0].bitSet()); for (int i = 1; i < n; i++) bs.and(entries[i].bitSet()); int n = bs.cardinality(); int[] result = new[n]; int j = 0; for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i+1)) result[j++] == i; ret documentListFromIntArray(result); } NavigableSet words() { ret (NavigableSet) keys(index); } int numWords() { ret l(index); } // This method only works when A = S void add(S s) { add((A) s, s); } }