Libraryless. Click here for Pure Java version (3999L/25K).
transient sclass DeepWordIndex<A> { S regexp = "\\w+"; bool useHashMaps; // makes it case-sensitive and doesn't allow partial word searches bool sortEntries; // if A implements Comparable new Map<A, Entry<A>> entries; //new L<Entry<A>> entriesList; MultiSetMap<S, Entry<A>> entriesByWord; Map<S, L<Entry<A>>> entriesByWord_lists; sclass Entry<A> extends Var<A> implements Comparable<Entry<A>> { Map<S, int[]> wordPositions = ciMap(); // int array is sorted *(A id) { super(id); } public int compareTo(Entry<A> e) { ret ((Comparable) get()).compareTo(e!); } public bool equals(O o) { ret o instanceof Entry && get().equals(((Entry) o)!); } } void init() { if (entriesByWord != null) ret; entriesByWord = useHashMaps ? sortEntries ? multiSetMap_innerTreeSet() : new MultiSetMap : sortEntries ? ciMultiSetMap_innerTreeSet() : ciMultiSetMap(); } L<IntRange> wordRanges(S text) { ret regexpFindRanges(regexp, text); } void add(A a, S text) { init(); Entry<A> e = new Entry<A>(a); if (useHashMaps) { e.wordPositions = new HashMap; text = upper(text); } if (entries.put(a, e) != null) fail("Double insertion"); MultiMap<S, Int> wordPositions = ciMultiMap(); for (IntRange r : wordRanges(text)) { S word = substring(text, r); wordPositions.put(word, r.start); entriesByWord.put(word, e); } for (S word : keys(wordPositions)) e.wordPositions.put(word, toIntArray(wordPositions.get(word))); } Set<Entry<A>> get(S word) { ret entriesByWord.get(word); } int numWords() { ret entriesByWord.keysSize(); } void doneAdding() { if (entriesByWord_lists != null) ret; entriesByWord_lists = mapValues asList(entriesByWord.data); // TODO: release entriesByWord } Iterable<Pair<A, Cl<Int>>>lookupString_withPositions(S query, O... _) { optPar bool debug; doneAdding(); if (useHashMaps) query = upper(query); S _query = query; L<IntRange> ranges = wordRanges(query); if (empty(ranges)) null; int nRanges = l(ranges); int iFirstComplete = first(ranges).start == 0 ? 1 : 0; int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges; LS words = map(ranges, r -> substring(_query, r)); LL<Entry<A>> entriesAtIndex = map(words, word -> entriesByWord_lists.get(word)); if (iLastComplete >= iFirstComplete+1) { int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest)); if (nBest == 0) { /*print("No results for " + words.get(shortest));*/ ret emptyList(); } for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) { int n = l(entriesAtIndex.get(iWord)); if (n == 0) { /*print("No results for " + words.get(iWord));*/ ret emptyList(); } if (n < nBest) { shortest = iWord; nBest = n; } } int _shortest = shortest; Iterable<Entry<A>> entries = sortEntries ? intersectMultipleSortedCollectionsI(subList(entriesAtIndex, iFirstComplete, iLastComplete)) : entriesAtIndex.get(shortest); int startShortest = ranges.get(shortest).start; S shortestWord = words.get(shortest); // not the shortest word, but the word with the shortest result list /*if (debug)*/ print("shortest: " + shortestWord + ", words: " + zipTwoLists(words, lmap l(entriesAtIndex))); new IntBuffer intBuffer; ret mapI_nonNulls_if1(entries, entry -> { int[] positions = entry.wordPositions.get(shortestWord); for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) { continue if iWord == _shortest; IntRange r2 = ranges.get(iWord); S word2 = words.get(iWord); int[] positions2 = entry.wordPositions.get(word2); if (positions2 == null) null; int ofs = startShortest-r2.start; int len = l(positions); positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer); print("Intersected " + len + "/" + l(positions2) + " => " + l(positions)); //if (debug) print("Got " + asList(positions)); if (empty(positions)) null; } ret pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest)); }); } null; } }
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029024 |
Snippet name: | DeepWordIndex |
Eternal ID of this version: | #1029024/57 |
Text MD5: | 92af6cd15ba2d76d30e8974f29f55d4b |
Transpilation MD5: | ea371a277ae8de4cd8f6e19b5081c335 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-17 19:18:50 |
Source code size: | 4432 bytes / 123 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 297 / 736 |
Version history: | 56 change(s) |
Referenced in: | [show references] |