Download Jar. Libraryless. Click here for Pure Java version (3631L/23K).
1 | // Word index over texts, each registered under a key of type A. Texts are split into chunks of a maximum size (chunkLength) and every chunk is indexed separately. |
    // Searching is two-stage: mainIndex narrows down candidate chunks by the words they contain,
    // then the per-chunk SingleTextWordIndex checks the actual word combination (see preSearch_chunks).
2 | transient sclass ChunkedDeepBitSetWordIndex<A> { |
    // One chunk of an added text: a = the key passed to add(), nr = index of the chunk in the list returned by textToChunks().
    // NOTE(review): Chunk declares no equals/hashCode here, so lookups in singleTextIndices presumably rely on
    // mainIndex handing back the very same Chunk instances that add() created — confirm.
3 | class Chunk { A a; int nr; *(A *a, int *nr) {} } |
4 | |
    // chunk size passed to stringToChunksWithOverlap (presumably measured in characters — confirm)
5 | int chunkLength = 32768; |
6 | int maxQueryLength = 32; // max query length = overlap between chunks (passed as the overlap argument to stringToChunksWithOverlap in textToChunks) |
    // pattern that defines what a "word" is; applied to chunk texts in add() and to queries in wordsAndOffsets()
7 | S regexp = "\\w+"; |
    // per-chunk index, consulted in preSearch_chunks to verify candidate chunks
8 | new Map<Chunk, SingleTextWordIndex> singleTextIndices; |
    // associates each chunk with the set of upper-cased words found in it; used to find candidate chunks
9 | new ElementInstanceMatrix<Chunk, S> mainIndex; |
10 | |
    // Indexes text under key a: splits it into chunks and registers every chunk
    // in both singleTextIndices and mainIndex.
11 | void add(A a, S text) { |
12 | LS chunks = textToChunks(text); |
13 | for i over chunks: { |
14 | S chunkText = chunks.get(i); |
15 | Chunk c = new Chunk(a, i); |
16 | singleTextIndices.put(c, new SingleTextWordIndex(regexp, chunkText)); |
    // words are upper-cased before going into mainIndex; wordsAndOffsets() upper-cases query words the same way
17 | mainIndex.add(c, mapToSet upper(regexpExtractAll(regexp, chunkText))); |
18 | } |
19 | } |
20 | |
    // Splits a text into chunks by delegating to stringToChunksWithOverlap with chunkLength and maxQueryLength.
    // Presumably consecutive chunks share maxQueryLength characters so that a short enough query
    // always lies fully inside one chunk — TODO confirm against stringToChunksWithOverlap.
21 | LS textToChunks(S text) { |
22 | ret stringToChunksWithOverlap(text, chunkLength, maxQueryLength); |
23 | } |
24 | |
    // Forwards to mainIndex.doneAdding().
    // Presumably meant to be called after the last add() and before searching — TODO confirm against ElementInstanceMatrix.
25 | void doneAdding { |
26 | mainIndex.doneAdding(); |
27 | } |
28 | |
    // For every regexp match in text, returns a pair of (upper-cased matched word, start offset of the match in text).
29 | LPair<S, Int> wordsAndOffsets(S text) { |
30 | ret map(regexpFindRanges(regexp, text), |
31 | r -> pair(upper(substring(text, r)), r.start)); |
32 | } |
33 | |
    // Returns the chunks matching the query. First asks mainIndex for chunks containing all query words,
    // then keeps only those whose SingleTextWordIndex reports a non-empty result for the word/offset combination.
    // An optional "debug" flag is declared via optPar (presumably read from the varargs _ — JavaX convention, confirm);
    // when set, the number of candidates and their index lengths are printed.
34 | // assumes word boundaries left and right of query |
35 | Cl<Chunk> preSearch_chunks(S query, O... _) { |
36 | optPar bool debug; |
37 | LPair<S, Int> l = wordsAndOffsets(query); |
    // pairsA(l) presumably extracts the first component (the word) of each pair — confirm
38 | Cl<Chunk> candidates = mainIndex.instancesContainingAllElements(pairsA(l)); |
39 | if (debug) { |
40 | L<Int> lengths = map(candidates, a -> singleTextIndices.get(a).length); |
41 | print(nCandidates(candidates) + ", total length: " + n2(intSum(lengths)) + ", lengths: " + lengths); |
42 | } |
    // second stage: drop candidates in which the word combination does not actually occur
43 | ret /*parallelFilter*/filter(candidates, a -> nempty(singleTextIndices.get(a).indicesOfWordCombination(l))); |
44 | } |
45 | |
    // Same search as preSearch_chunks, but returns the keys (Chunk.a) of the matching chunks, collected via mapToSet.
46 | Cl<A> preSearch(S query, O... _) { |
47 | ret mapToSet(c -> c.a, preSearch_chunks(query, _)); |
48 | } |
49 | |
    // Returns whatever mainIndex.numElements() reports (elements of mainIndex are the upper-cased words added in add()).
50 | int numWords() { ret mainIndex.numElements(); } |
51 | } |
Began life as a copy of #1029082
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029087 |
Snippet name: | ChunkedDeepBitSetWordIndex |
Eternal ID of this version: | #1029087/12 |
Text MD5: | 8e7cf39e788f179bb8251443c49a6b1a |
Transpilation MD5: | 786a453c659874d4eefaadceef815e55 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code (desktop) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-19 03:23:54 |
Source code size: | 1784 bytes / 51 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 284 / 1042 |
Version history: | 11 change(s) |
Referenced in: | [show references] |