Libraryless. Click here for Pure Java version (3730L/23K).
1 | transient sclass DeepWordPairIndex<A> { |
2 | S regexp = "\\w+"; |
3 | bool useHashMaps = true; // makes it case-sensitive and doesn't allow partial word searches |
4 | new Map<A, Entry<A>> entries; |
5 | new MultiSetMap<PairS, Entry<A>> entriesByPair; |
6 | |
7 | sclass Entry<A> extends Var<A> { |
8 | new Map<PairS, int[]> pairPositions; // int array is sorted |
9 | |
10 | *(A id) { super(id); } |
11 | } |
12 | |
13 | void useHashMaps() { |
14 | set useHashMaps; |
15 | entriesByPair = new MultiSetMap; |
16 | } |
17 | |
18 | L<IntRange> wordRanges(S text) { |
19 | ret regexpFindRanges(regexp, text); |
20 | } |
21 | |
22 | void add(A a, S text) { |
23 | Entry<A> e = new Entry<A>(a); |
24 | if (useHashMaps) { |
25 | e.pairPositions = new HashMap; |
26 | text = upper(text); |
27 | } |
28 | if (entries.put(a, e) != null) fail("Double insertion"); |
29 | new MultiMap<PairS, Int> pairPositions; |
30 | for (Pair<IntRange> p : overlappingPairs(wordRanges(text))) { |
31 | PairS pair = pair(substring(text, p.a), substring(text, p.b)); |
32 | pairPositions.put(pair, p.a.start); |
33 | entriesByPair.put(pair, e); |
34 | } |
35 | for (PairS pair : keys(pairPositions)) |
36 | e.pairPositions.put(pair, toIntArray(pairPositions.get(pair))); |
37 | } |
38 | |
39 | Set<Entry<A>> get(PairS pair) { ret entriesByPair.get(pair); } |
40 | |
41 | int numPairs() { ret entriesByPair.keysSize(); } |
42 | |
43 | Iterable<Pair<A, Cl<Int>>>lookupString_withPositions(S query, O... _) { |
44 | optPar bool debug; |
45 | if (useHashMaps) query = upper(query); |
46 | S _query = query; |
47 | L<IntRange> ranges = wordRanges(query); |
48 | if (empty(ranges)) null; |
49 | int nRanges = l(ranges); |
50 | int iFirstComplete = first(ranges).start == 0 ? 1 : 0; |
51 | int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges; |
52 | --iLastComplete; // because pairs |
53 | |
54 | LS words = map(ranges, r -> substring(_query, r)); |
55 | LPairS pairs = overlappingPairs(words); |
56 | L<Set<Entry<A>>> entriesAtIndex = map(pairs, pair -> entriesByPair.get(pair)); |
57 | |
58 | if (iLastComplete >= iFirstComplete+1) { |
59 | int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest)); |
60 | if (nBest == 0) ret emptyList(); |
61 | for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) { |
62 | int n = l(entriesAtIndex.get(iWord)); |
63 | if (n == 0) ret emptyList(); |
64 | if (n < nBest) { |
65 | shortest = iWord; |
66 | nBest = n; |
67 | } |
68 | } |
69 | |
70 | /*if (debug)*/ print("pairs: " + zipTwoLists(pairs, lmap l(entriesAtIndex))); |
71 | |
72 | Set<Entry<A>> entries = entriesAtIndex.get(shortest); |
73 | int startShortest = ranges.get(shortest).start; |
74 | PairS shortestPair = pairs.get(shortest); // not the shortest pair, but the pair with the shortest result list |
75 | new IntBuffer intBuffer; |
76 | |
77 | new LPair<A, Cl<Int>> out; |
78 | entrySearch: for (Entry<A> entry : entries) { |
79 | int[] positions = entry.pairPositions.get(shortestPair); |
80 | |
81 | for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) { |
82 | continue if iWord == shortest; |
83 | IntRange r2 = ranges.get(iWord); |
84 | PairS pair2 = pairs.get(iWord); |
85 | int[] positions2 = entry.pairPositions.get(pair2); |
86 | if (positions2 == null) continue entrySearch; |
87 | int ofs = startShortest-r2.start; |
88 | //if (debug) print("Intersecting " + asList(positions) + "/" + asList(positions2) + " with ofs " + ofs); |
89 | positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer); |
90 | //if (debug) print("Got " + asList(positions)); |
91 | if (empty(positions)) continue entrySearch; |
92 | } |
93 | |
94 | out.add(pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest))); |
95 | } |
96 | ret out; |
97 | } |
98 | |
99 | null; |
100 | } |
101 | } |
Began life as a copy of #1029024
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029042 |
Snippet name: | DeepWordPairIndex [dev.] |
Eternal ID of this version: | #1029042/13 |
Text MD5: | 031dd215db5e9dc11c14d88ea68dca35 |
Transpilation MD5: | 2cd245965f98520999330cd9f397c38a |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-17 16:29:02 |
Source code size: | 3776 bytes / 101 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 259 / 558 |
Version history: | 12 change(s) |
Referenced in: | [show references] |