Libraryless. Click here for Pure Java version (3730L/23K).
// Index over pairs of adjacent words. For every entry (identified by an id of
// type A) it stores, per word pair, the character positions at which that pair
// starts in the entry's text. Queries are answered by intersecting the position
// lists of the complete word pairs contained in the query.
transient sclass DeepWordPairIndex<A> {
  // Pattern that defines what counts as a word.
  S regexp = "\\w+";

  bool useHashMaps = true; // makes it case-sensitive and doesn't allow partial word searches
  // NOTE(review): when this flag is set, add() and lookupString_withPositions()
  // upper-case their input, so matching effectively ignores case; the
  // "case-sensitive" wording above presumably refers to the hash maps themselves - confirm.

  // All entries, keyed by their id.
  new Map<A, Entry<A>> entries;

  // For each word pair, the set of entries whose text contains that pair.
  new MultiSetMap<PairS, Entry<A>> entriesByPair;

  // One indexed text: holds the id (through Var) plus the start positions of
  // every word pair found in the text.
  sclass Entry<A> extends Var<A> {
    new Map<PairS, int[]> pairPositions; // int array is sorted

    *(A id) { super(id); }
  }

  // Switches the index to hash-map mode and replaces entriesByPair with a fresh, empty map.
  void useHashMaps() {
    set useHashMaps;
    entriesByPair = new MultiSetMap;
  }

  // Character ranges of all words in text, as matched by regexp.
  L<IntRange> wordRanges(S text) {
    ret regexpFindRanges(regexp, text);
  }

  // Indexes text under id a. Fails with "Double insertion" if a is already indexed.
  void add(A a, S text) {
    // Check for duplicates before mutating anything, so that a rejected add
    // leaves entries and entriesByPair consistent with each other.
    if (entries.containsKey(a))
      fail("Double insertion");

    Entry<A> e = new Entry<A>(a);
    if (useHashMaps) {
      e.pairPositions = new HashMap;
      text = upper(text);
    }
    entries.put(a, e);

    // Collect the start position of the first word of every adjacent word pair.
    new MultiMap<PairS, Int> pairPositions;
    for (Pair<IntRange> p : overlappingPairs(wordRanges(text))) {
      PairS pair = pair(substring(text, p.a), substring(text, p.b));
      pairPositions.put(pair, p.a.start);
      entriesByPair.put(pair, e);
    }

    // Store the collected positions per pair as int arrays on the entry.
    for (PairS pair : keys(pairPositions))
      e.pairPositions.put(pair, toIntArray(pairPositions.get(pair)));
  }

  // Entries whose text contains the given word pair.
  Set<Entry<A>> get(PairS pair) {
    ret entriesByPair.get(pair);
  }

  // Number of keys (word pairs) in entriesByPair.
  int numPairs() { ret entriesByPair.keysSize(); }

  // Looks up query in the index.
  //
  // Returns null if the query contains no words or no pair of complete words
  // (a word touching the start or the end of the query is not treated as
  // complete). Returns an empty list if one of the examined pairs occurs in no
  // entry. Otherwise returns one (id, positions) pair per matching entry.
  //
  // Optional parameter (presumably passed through the varargs): debug - when
  // true, prints the number of candidate entries per pair.
  Iterable<Pair<A, Cl<Int>>> lookupString_withPositions(S query, O... _) {
    optPar bool debug;
    if (useHashMaps) query = upper(query);
    S _query = query; // effectively-final copy for use inside the lambda below

    L<IntRange> ranges = wordRanges(query);
    if (empty(ranges)) null;

    int nRanges = l(ranges);
    // A word starting at index 0 or ending at the end of the query is skipped.
    int iFirstComplete = first(ranges).start == 0 ? 1 : 0;
    int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges;
    --iLastComplete; // because pairs
    LS words = map(ranges, r -> substring(_query, r));
    LPairS pairs = overlappingPairs(words);
    L<Set<Entry<A>>> entriesAtIndex = map(pairs, pair -> entriesByPair.get(pair));

    if (iLastComplete >= iFirstComplete+1) {
      // Find the pair with the fewest candidate entries; bail out as soon as
      // one of the examined pairs has no entries at all.
      int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest));
      if (nBest == 0) ret emptyList();
      for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) {
        int n = l(entriesAtIndex.get(iWord));
        if (n == 0) ret emptyList();
        if (n < nBest) { shortest = iWord; nBest = n; }
      }

      if (debug) print("pairs: " + zipTwoLists(pairs, lmap l(entriesAtIndex)));
      Set<Entry<A>> entries = entriesAtIndex.get(shortest);
      int startShortest = ranges.get(shortest).start;
      PairS shortestPair = pairs.get(shortest); // not the shortest pair, but the pair with the shortest result list
      new IntBuffer intBuffer;
      new LPair<A, Cl<Int>> out;

      entrySearch: for (Entry<A> entry : entries) {
        int[] positions = entry.pairPositions.get(shortestPair);
        // Narrow the positions down using every other complete pair of the query.
        for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) {
          continue if iWord == shortest;
          IntRange r2 = ranges.get(iWord);
          PairS pair2 = pairs.get(iWord);
          int[] positions2 = entry.pairPositions.get(pair2);
          if (positions2 == null) continue entrySearch;
          // Distance within the query between the reference pair and this pair.
          int ofs = startShortest-r2.start;
          //if (debug) print("Intersecting " + asList(positions) + "/" + asList(positions2) + " with ofs " + ofs);
          positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer);
          //if (debug) print("Got " + asList(positions));
          if (empty(positions)) continue entrySearch;
        }
        // The positions are wrapped with offset -startShortest; presumably this
        // turns them into positions of the query start within the text - confirm.
        out.add(pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest)));
      }
      ret out;
    }

    null;
  }
}
Began life as a copy of #1029024
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029042 |
Snippet name: | DeepWordPairIndex [dev.] |
Eternal ID of this version: | #1029042/13 |
Text MD5: | 031dd215db5e9dc11c14d88ea68dca35 |
Transpilation MD5: | 2cd245965f98520999330cd9f397c38a |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-17 16:29:02 |
Source code size: | 3776 bytes / 101 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 258 / 556 |
Version history: | 12 change(s) |
Referenced in: | #1034167 - Standard Classes + Interfaces (LIVE, continuation of #1003674) |