transient sclass DeepWordIndex { S regexp = "\\w+"; bool useHashMaps = true; // makes it case-sensitive and doesn't allow partial word searches new Map> entries; MultiSetMap> entriesByWord = ciMultiSetMap(); sclass Entry extends Var implements Comparable> { Map wordPositions = ciMap(); // int array is sorted *(A id) { super(id); } public int compareTo(Entry e) { ret ((Comparable) get()).compareTo(e!); } public bool equals(O o) { ret o instanceof Entry && get().equals(((Entry) o)!); } } void useHashMaps() { set useHashMaps; entriesByWord = new MultiSetMap; } L wordRanges(S text) { ret regexpFindRanges(regexp, text); } void add(A a, S text) { Entry e = new Entry(a); if (useHashMaps) { e.wordPositions = new HashMap; text = upper(text); } if (entries.put(a, e) != null) fail("Double insertion"); MultiMap wordPositions = ciMultiMap(); for (IntRange r : wordRanges(text)) { S word = substring(text, r); wordPositions.put(word, r.start); entriesByWord.put(word, e); } for (S word : keys(wordPositions)) e.wordPositions.put(word, toIntArray(wordPositions.get(word))); } Set> get(S word) { ret entriesByWord.get(word); } int numWords() { ret entriesByWord.keysSize(); } Iterable>>lookupString_withPositions(S query, O... _) { optPar bool debug; if (useHashMaps) query = upper(query); S _query = query; L ranges = wordRanges(query); if (empty(ranges)) null; int nRanges = l(ranges); int iFirstComplete = first(ranges).start == 0 ? 1 : 0; int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges; LS words = map(ranges, r -> substring(_query, r)); L>> entriesAtIndex = map(words, word -> entriesByWord.get(word)); if (iLastComplete >= iFirstComplete+1) { int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest)); if (nBest == 0) ret emptyList(); for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) { int n = l(entriesAtIndex.get(iWord)); if (n == 0) ret emptyList(); if (n < nBest) { shortest = iWord; nBest = n; } } /*if (debug)*/ print("words: " + zipTwoLists(words, lmap l(entriesAtIndex))); Set> entries = entriesAtIndex.get(shortest); int startShortest = ranges.get(shortest).start; S shortestWord = words.get(shortest); // not the shortest word, but the word with the shortest result list new IntBuffer intBuffer; new LPair> out; entrySearch: for (Entry entry : entries) { int[] positions = entry.wordPositions.get(shortestWord); for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) { continue if iWord == shortest; IntRange r2 = ranges.get(iWord); S word2 = words.get(iWord); int[] positions2 = entry.wordPositions.get(word2); if (positions2 == null) continue entrySearch; int ofs = startShortest-r2.start; //if (debug) print("Intersecting " + asList(positions) + "/" + asList(positions2) + " with ofs " + ofs); positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer); //if (debug) print("Got " + asList(positions)); if (empty(positions)) continue entrySearch; } out.add(pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest))); } ret out; } null; } }