Libraryless. Click here for Pure Java version (3999L/25K).
1 | transient sclass DeepWordIndex<A> { |
2 | S regexp = "\\w+"; |
3 | bool useHashMaps; // makes it case-sensitive and doesn't allow partial word searches |
4 | bool sortEntries; // if A implements Comparable |
5 | new Map<A, Entry<A>> entries; |
6 | //new L<Entry<A>> entriesList; |
7 | MultiSetMap<S, Entry<A>> entriesByWord; |
8 | Map<S, L<Entry<A>>> entriesByWord_lists; |
9 | |
10 | sclass Entry<A> extends Var<A> implements Comparable<Entry<A>> { |
11 | Map<S, int[]> wordPositions = ciMap(); // int array is sorted |
12 | |
13 | *(A id) { super(id); } |
14 | |
15 | public int compareTo(Entry<A> e) { |
16 | ret ((Comparable) get()).compareTo(e!); |
17 | } |
18 | |
19 | public bool equals(O o) { |
20 | ret o instanceof Entry && get().equals(((Entry) o)!); |
21 | } |
22 | } |
23 | |
24 | void init() { |
25 | if (entriesByWord != null) ret; |
26 | entriesByWord = useHashMaps |
27 | ? sortEntries ? multiSetMap_innerTreeSet() : new MultiSetMap |
28 | : sortEntries ? ciMultiSetMap_innerTreeSet() : ciMultiSetMap(); |
29 | } |
30 | |
31 | L<IntRange> wordRanges(S text) { |
32 | ret regexpFindRanges(regexp, text); |
33 | } |
34 | |
35 | void add(A a, S text) { |
36 | init(); |
37 | Entry<A> e = new Entry<A>(a); |
38 | if (useHashMaps) { |
39 | e.wordPositions = new HashMap; |
40 | text = upper(text); |
41 | } |
42 | if (entries.put(a, e) != null) fail("Double insertion"); |
43 | MultiMap<S, Int> wordPositions = ciMultiMap(); |
44 | for (IntRange r : wordRanges(text)) { |
45 | S word = substring(text, r); |
46 | wordPositions.put(word, r.start); |
47 | entriesByWord.put(word, e); |
48 | } |
49 | for (S word : keys(wordPositions)) |
50 | e.wordPositions.put(word, toIntArray(wordPositions.get(word))); |
51 | } |
52 | |
53 | Set<Entry<A>> get(S word) { ret entriesByWord.get(word); } |
54 | |
55 | int numWords() { ret entriesByWord.keysSize(); } |
56 | |
57 | void doneAdding() { |
58 | if (entriesByWord_lists != null) ret; |
59 | entriesByWord_lists = mapValues asList(entriesByWord.data); |
60 | // TODO: release entriesByWord |
61 | } |
62 | |
63 | Iterable<Pair<A, Cl<Int>>>lookupString_withPositions(S query, O... _) { |
64 | optPar bool debug; |
65 | doneAdding(); |
66 | if (useHashMaps) query = upper(query); |
67 | S _query = query; |
68 | L<IntRange> ranges = wordRanges(query); |
69 | if (empty(ranges)) null; |
70 | int nRanges = l(ranges); |
71 | int iFirstComplete = first(ranges).start == 0 ? 1 : 0; |
72 | int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges; |
73 | |
74 | LS words = map(ranges, r -> substring(_query, r)); |
75 | LL<Entry<A>> entriesAtIndex = map(words, word -> entriesByWord_lists.get(word)); |
76 | |
77 | if (iLastComplete >= iFirstComplete+1) { |
78 | int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest)); |
79 | if (nBest == 0) { /*print("No results for " + words.get(shortest));*/ ret emptyList(); } |
80 | for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) { |
81 | int n = l(entriesAtIndex.get(iWord)); |
82 | if (n == 0) { /*print("No results for " + words.get(iWord));*/ ret emptyList(); } |
83 | if (n < nBest) { |
84 | shortest = iWord; |
85 | nBest = n; |
86 | } |
87 | } |
88 | int _shortest = shortest; |
89 | |
90 | Iterable<Entry<A>> entries = sortEntries |
91 | ? intersectMultipleSortedCollectionsI(subList(entriesAtIndex, iFirstComplete, iLastComplete)) |
92 | : entriesAtIndex.get(shortest); |
93 | |
94 | int startShortest = ranges.get(shortest).start; |
95 | S shortestWord = words.get(shortest); // not the shortest word, but the word with the shortest result list |
96 | /*if (debug)*/ print("shortest: " + shortestWord + ", words: " + zipTwoLists(words, lmap l(entriesAtIndex))); |
97 | |
98 | new IntBuffer intBuffer; |
99 | |
100 | ret mapI_nonNulls_if1(entries, entry -> { |
101 | int[] positions = entry.wordPositions.get(shortestWord); |
102 | |
103 | for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) { |
104 | continue if iWord == _shortest; |
105 | IntRange r2 = ranges.get(iWord); |
106 | S word2 = words.get(iWord); |
107 | int[] positions2 = entry.wordPositions.get(word2); |
108 | if (positions2 == null) null; |
109 | int ofs = startShortest-r2.start; |
110 | int len = l(positions); |
111 | positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer); |
112 | print("Intersected " + len + "/" + l(positions2) + " => " + l(positions)); |
113 | //if (debug) print("Got " + asList(positions)); |
114 | if (empty(positions)) null; |
115 | } |
116 | |
117 | ret pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest)); |
118 | }); |
119 | } |
120 | |
121 | null; |
122 | } |
123 | } |
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029024 |
Snippet name: | DeepWordIndex |
Eternal ID of this version: | #1029024/57 |
Text MD5: | 92af6cd15ba2d76d30e8974f29f55d4b |
Transpilation MD5: | ea371a277ae8de4cd8f6e19b5081c335 |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-17 19:18:50 |
Source code size: | 4432 bytes / 123 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 298 / 738 |
Version history: | 56 change(s) |
Referenced in: | [show references] |