// Texts are split into chunks of a maximum size; chunks overlap by
// maxQueryLength so a query can never straddle a chunk boundary unseen.
transient sclass ChunkedDeepBitSetWordIndex<A> {
  // A chunk is identified by the text it came from plus its position in that text
  record Chunk(A a, int nr) {}

  int chunkLength = 65536;
  int maxQueryLength = 32; // max query length = overlap between chunks
  S regexp = "\\w+";
  new Map<Chunk, SingleTextWordIndex> singleTextIndices;
  new ElementInstanceMatrix mainIndex; // maps uppercased words to the chunks containing them

  void add(A a, S text) {
    LS chunks = textToChunks(text);
    for i over chunks: {
      S chunkText = chunks.get(i);
      Chunk c = new Chunk(a, i);
      singleTextIndices.put(c, new SingleTextWordIndex(regexp, chunkText));
      mainIndex.add(c, mapToSet upper(regexpExtractAll(regexp, chunkText)));
    }
  }

  LS textToChunks(S text) {
    ret stringToChunksWithOverlap(text, chunkLength, maxQueryLength);
  }

  void doneAdding { mainIndex.doneAdding(); }

  // uppercased words of the text together with their start offsets
  LPair wordsAndOffsets(S text) {
    ret map(regexpFindRanges(regexp, text), r -> pair(upper(substring(text, r)), r.start));
  }

  // assumes word boundaries left and right of query
  Cl<Chunk> preSearch_chunks(S query, O... _) {
    optPar bool debug;
    LPair l = wordsAndOffsets(query);
    // candidate chunks contain all query words (in any order and position)
    Cl<Chunk> candidates = mainIndex.instancesContainingAllElements(pairsA(l));
    if (debug) {
      L lengths = map(candidates, c -> singleTextIndices.get(c).length);
      print(nCandidates(candidates) + ", total length: " + n2(intSum(lengths)) + ", lengths: " + lengths);
    }
    // keep only chunks where the words actually occur with the query's relative offsets
    ret filter(candidates, c -> nempty(singleTextIndices.get(c).indicesOfWordCombination(l)));
  }

  Cl<A> preSearch(S query, O... _) {
    // collapse matching chunks back to the texts (.a) they belong to
    ret pairsASet(preSearch_chunks(query, _));
  }

  int numWords() { ret mainIndex.numElements(); }
}
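
// Hypothetical usage sketch (kept in comments, not part of the class). It assumes
// the helpers referenced above (SingleTextWordIndex, ElementInstanceMatrix,
// stringToChunksWithOverlap, ...) are available from the surrounding library;
// names like "doc1" are illustrative only.
//
//   ChunkedDeepBitSetWordIndex<S> index = new ChunkedDeepBitSetWordIndex;
//   index.add("doc1", "the quick brown fox jumps over the lazy dog");
//   index.add("doc2", "some other, much longer text ...");
//   index.doneAdding();
//   // candidate texts whose chunks contain the query words in this combination
//   Cl<S> hits = index.preSearch("quick brown fox");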