// Generalized by: ElementInstanceMatrix // A represents the id of a document to be indexed (e.g. a snippet ID) transient sclass WordIndexWithBitSets { S regexp = "\\w+"; new UniqueList documents; Map index = ciMap(); // each word can be represented as either an int array, a bit set or both // for now, the int array is always there sclass Entry { new Set intSet; // while building the index int[] intArray; BitSet bitSet; BitSet bitSet() { if (bitSet == null) bitSet = intArrayToBitSet(intArray); ret bitSet; } int count() { ret intArray.length; } } *() {} *(Map map) { fOr (A a, S text : map) add(a, text); } void add(A a, S text) { int idx = documents.addOrGetIndex(a); Set words = extractWords(text); for (S word : words) addWord(idx, word); } void addWord(int idx, S word) { Entry e = index.get(word); if (e == null) index.put(word, e = new Entry); e.intSet.add(idx); } // Call this exactly once before doing queries on the index void doneAdding() { for (Entry e : values(index)) { e.intArray = sortIntArrayInPlace(toIntArray(e.intSet)); e.intSet = null; } } Set extractWords(S text) { ret asCISet(extractWords_list(text)); } LS extractWords_list(S text) { ret regexpExtractAll(regexp, text); } L wordRanges(S text) { ret regexpFindRanges(regexp, text); } L documentsContainingWord(S word) { Entry e = index.get(word); ret e == null ? null : documentListFromIntArray(e.intArray); } L documentListFromIntArray(int[] array) { ret listFromFunction(array.length, i -> documents.get(array[i])); } // may return null (=empty list) L documentsContainingAllWords(Cl words) { int n = l(words; if (n == 0) ret documents; if (n == 1) ret documentsContainingWord(first(words)); // get entries for words, exit when a word is unknown Entry[] entries = new[n]; Iterator itWords = iterator(words); for i to n: { Entry e = index.get(itWords.next()); if (e == null) null; entries[i] = e; } // go through words again, AND-combine all bit sets BitSet bs = cloneBitSet(entries[0].bitSet()); for (int i = 1; i < n; i++) bs.and(entries[i].bitSet()); ret documentListFromIntArray(bitSetToIntArray(bs)); } L documentsContainingAllWords(S text, O... _) { ret documentsContainingAllWords(extractWords(text)); } NavigableSet words() { ret (NavigableSet) keys(index); } int numWords() { ret l(index); } // This method only works when A = S void add(S s) { add((A) s, s); } }