transient sclass DeepWordIndex {
S regexp = "\\w+";
new Map> entries;
MultiSetMap> entriesByWord = ciMultiSetMap();
sclass Entry extends Var {
Map wordPositions = ciMap(); // int array is sorted
*(A id) { super(id); }
}
L wordRanges(S text) {
ret regexpFindRanges(regexp, text);
}
void add(A a, S text) {
Entry e = new Entry(a);
if (entries.put(a, e) != null) fail("Double insertion");
MultiMap wordPositions = ciMultiMap();
for (IntRange r : wordRanges(text)) {
S word = substring(text, r);
wordPositions.put(word, r.start);
entriesByWord.put(word, e);
}
for (S word : keys(wordPositions))
e.wordPositions.put(word, toIntArray(wordPositions.get(word)));
}
Set> get(S word) { ret entriesByWord.get(word); }
int numWords() { ret entriesByWord.keysSize(); }
Iterable>>lookupString_withPositions(S query, O... _) {
optPar bool debug;
L ranges = wordRanges(query);
if (empty(ranges)) null;
int nRanges = l(ranges);
if (nRanges >= 3) {
IntRange r = ranges.get(1);
S word = substring(query, r);
Set> entries = entriesByWord.get(word);
if (entries == null) ret emptyList();
new LPair> out;
entrySearch: for (Entry entry : entries) {
int[] positions = entry.wordPositions.get(word);
for (int iWord = 2; iWord < nRanges-1; iWord++) {
IntRange r2 = ranges.get(iWord);
S word2 = substring(query, r2);
int[] positions2 = entry.wordPositions.get(word2);
if (positions2 == null) continue entrySearch;
int ofs = r2.start-r.start;
positions = intersectSortedIntArrays_ofs(positions, positions2, ofs);
if (empty(positions)) continue entrySearch;
}
out.add(pair(entry!, toList(positions)));
}
ret out;
}
null;
}
}