Download Jar. Libraryless. Click here for Pure Java version (3631L/23K).
// texts are split into chunks of a maximum size transient sclass ChunkedDeepBitSetWordIndex<A> { class Chunk { A a; int nr; *(A *a, int *nr) {} } int chunkLength = 32768; int maxQueryLength = 32; // max query length = overlap between chunks S regexp = "\\w+"; new Map<Chunk, SingleTextWordIndex> singleTextIndices; new ElementInstanceMatrix<Chunk, S> mainIndex; void add(A a, S text) { LS chunks = textToChunks(text); for i over chunks: { S chunkText = chunks.get(i); Chunk c = new Chunk(a, i); singleTextIndices.put(c, new SingleTextWordIndex(regexp, chunkText)); mainIndex.add(c, mapToSet upper(regexpExtractAll(regexp, chunkText))); } } LS textToChunks(S text) { ret stringToChunksWithOverlap(text, chunkLength, maxQueryLength); } void doneAdding { mainIndex.doneAdding(); } LPair<S, Int> wordsAndOffsets(S text) { ret map(regexpFindRanges(regexp, text), r -> pair(upper(substring(text, r)), r.start)); } // assumes word boundaries left and right of query Cl<Chunk> preSearch_chunks(S query, O... _) { optPar bool debug; LPair<S, Int> l = wordsAndOffsets(query); Cl<Chunk> candidates = mainIndex.instancesContainingAllElements(pairsA(l)); if (debug) { L<Int> lengths = map(candidates, a -> singleTextIndices.get(a).length); print(nCandidates(candidates) + ", total length: " + n2(intSum(lengths)) + ", lengths: " + lengths); } ret /*parallelFilter*/filter(candidates, a -> nempty(singleTextIndices.get(a).indicesOfWordCombination(l))); } Cl<A> preSearch(S query, O... _) { ret mapToSet(c -> c.a, preSearch_chunks(query, _)); } int numWords() { ret mainIndex.numElements(); } }
Began life as a copy of #1029082
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029087 |
Snippet name: | ChunkedDeepBitSetWordIndex |
Eternal ID of this version: | #1029087/12 |
Text MD5: | 8e7cf39e788f179bb8251443c49a6b1a |
Transpilation MD5: | 786a453c659874d4eefaadceef815e55 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code (desktop) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-19 03:23:54 |
Source code size: | 1784 bytes / 51 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 283 / 1041 |
Version history: | 11 change(s) |
Referenced in: | [show references] |