transient sclass DeepWordIndex {
S regexp = "\\w+";
bool useHashMaps; // makes it case-sensitive and doesn't allow partial word searches
bool sortEntries; // if A implements Comparable
new Map> entries;
//new L> entriesList;
MultiSetMap> entriesByWord;
Map>> entriesByWord_lists;
sclass Entry extends Var implements Comparable> {
Map wordPositions = ciMap(); // int array is sorted
*(A id) { super(id); }
public int compareTo(Entry e) {
ret ((Comparable) get()).compareTo(e!);
}
public bool equals(O o) {
ret o instanceof Entry && get().equals(((Entry) o)!);
}
}
void init() {
if (entriesByWord != null) ret;
entriesByWord = useHashMaps
? sortEntries ? multiSetMap_innerTreeSet() : new MultiSetMap
: sortEntries ? ciMultiSetMap_innerTreeSet() : ciMultiSetMap();
}
L wordRanges(S text) {
ret regexpFindRanges(regexp, text);
}
void add(A a, S text) {
init();
Entry e = new Entry(a);
if (useHashMaps) {
e.wordPositions = new HashMap;
text = upper(text);
}
if (entries.put(a, e) != null) fail("Double insertion");
MultiMap wordPositions = ciMultiMap();
for (IntRange r : wordRanges(text)) {
S word = substring(text, r);
wordPositions.put(word, r.start);
entriesByWord.put(word, e);
}
for (S word : keys(wordPositions))
e.wordPositions.put(word, toIntArray(wordPositions.get(word)));
}
Set> get(S word) { ret entriesByWord.get(word); }
int numWords() { ret entriesByWord.keysSize(); }
void doneAdding() {
if (entriesByWord_lists != null) ret;
entriesByWord_lists = mapValues asList(entriesByWord.data);
// TODO: release entriesByWord
}
Iterable>>lookupString_withPositions(S query, O... _) {
optPar bool debug;
doneAdding();
if (useHashMaps) query = upper(query);
S _query = query;
L ranges = wordRanges(query);
if (empty(ranges)) null;
int nRanges = l(ranges);
int iFirstComplete = first(ranges).start == 0 ? 1 : 0;
int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges;
LS words = map(ranges, r -> substring(_query, r));
LL> entriesAtIndex = map(words, word -> entriesByWord_lists.get(word));
if (iLastComplete >= iFirstComplete+1) {
int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest));
if (nBest == 0) { /*print("No results for " + words.get(shortest));*/ ret emptyList(); }
for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) {
int n = l(entriesAtIndex.get(iWord));
if (n == 0) { /*print("No results for " + words.get(iWord));*/ ret emptyList(); }
if (n < nBest) {
shortest = iWord;
nBest = n;
}
}
int _shortest = shortest;
Iterable> entries = sortEntries
? intersectMultipleSortedCollectionsI(subList(entriesAtIndex, iFirstComplete, iLastComplete))
: entriesAtIndex.get(shortest);
int startShortest = ranges.get(shortest).start;
S shortestWord = words.get(shortest); // not the shortest word, but the word with the shortest result list
/*if (debug)*/ print("shortest: " + shortestWord + ", words: " + zipTwoLists(words, lmap l(entriesAtIndex)));
new IntBuffer intBuffer;
ret mapI_nonNulls_if1(entries, entry -> {
int[] positions = entry.wordPositions.get(shortestWord);
for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) {
continue if iWord == _shortest;
IntRange r2 = ranges.get(iWord);
S word2 = words.get(iWord);
int[] positions2 = entry.wordPositions.get(word2);
if (positions2 == null) null;
int ofs = startShortest-r2.start;
int len = l(positions);
positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer);
print("Intersected " + len + "/" + l(positions2) + " => " + l(positions));
//if (debug) print("Got " + asList(positions));
if (empty(positions)) null;
}
ret pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest));
});
}
null;
}
}