transient sclass DeepWordPairIndex {
S regexp = "\\w+";
bool useHashMaps = true; // makes it case-sensitive and doesn't allow partial word searches
new Map> entries;
new MultiSetMap> entriesByPair;
sclass Entry extends Var {
new Map pairPositions; // int array is sorted
*(A id) { super(id); }
}
void useHashMaps() {
set useHashMaps;
entriesByPair = new MultiSetMap;
}
L wordRanges(S text) {
ret regexpFindRanges(regexp, text);
}
void add(A a, S text) {
Entry e = new Entry(a);
if (useHashMaps) {
e.pairPositions = new HashMap;
text = upper(text);
}
if (entries.put(a, e) != null) fail("Double insertion");
new MultiMap pairPositions;
for (Pair p : overlappingPairs(wordRanges(text))) {
PairS pair = pair(substring(text, p.a), substring(text, p.b));
pairPositions.put(pair, p.a.start);
entriesByPair.put(pair, e);
}
for (PairS pair : keys(pairPositions))
e.pairPositions.put(pair, toIntArray(pairPositions.get(pair)));
}
Set> get(PairS pair) { ret entriesByPair.get(pair); }
int numPairs() { ret entriesByPair.keysSize(); }
Iterable>>lookupString_withPositions(S query, O... _) {
optPar bool debug;
if (useHashMaps) query = upper(query);
S _query = query;
L ranges = wordRanges(query);
if (empty(ranges)) null;
int nRanges = l(ranges);
int iFirstComplete = first(ranges).start == 0 ? 1 : 0;
int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges;
--iLastComplete; // because pairs
LS words = map(ranges, r -> substring(_query, r));
LPairS pairs = overlappingPairs(words);
L>> entriesAtIndex = map(pairs, pair -> entriesByPair.get(pair));
if (iLastComplete >= iFirstComplete+1) {
int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest));
if (nBest == 0) ret emptyList();
for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) {
int n = l(entriesAtIndex.get(iWord));
if (n == 0) ret emptyList();
if (n < nBest) {
shortest = iWord;
nBest = n;
}
}
/*if (debug)*/ print("pairs: " + zipTwoLists(pairs, lmap l(entriesAtIndex)));
Set> entries = entriesAtIndex.get(shortest);
int startShortest = ranges.get(shortest).start;
PairS shortestPair = pairs.get(shortest); // not the shortest pair, but the pair with the shortest result list
new IntBuffer intBuffer;
new LPair> out;
entrySearch: for (Entry entry : entries) {
int[] positions = entry.pairPositions.get(shortestPair);
for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) {
continue if iWord == shortest;
IntRange r2 = ranges.get(iWord);
PairS pair2 = pairs.get(iWord);
int[] positions2 = entry.pairPositions.get(pair2);
if (positions2 == null) continue entrySearch;
int ofs = startShortest-r2.start;
//if (debug) print("Intersecting " + asList(positions) + "/" + asList(positions2) + " with ofs " + ofs);
positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer);
//if (debug) print("Got " + asList(positions));
if (empty(positions)) continue entrySearch;
}
out.add(pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest)));
}
ret out;
}
null;
}
}