Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

102
LINES

< > BotCompany Repo | #1029068 // WordIndexWithBitSets - index a list by words (case-insensitive by default)

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (3461L/22K).

// Generalized by: ElementInstanceMatrix

// A represents the id of a document to be indexed (e.g. a snippet ID)
transient sclass WordIndexWithBitSets<A> {
  S regexp = "\\w+";
  new UniqueList<A> documents;
  Map<S, Entry> index = ciMap();

  // each word can be represented as either an int array, a bit set or both
  // for now, the int array is always there
  sclass Entry {
    new Set<Int> intSet; // while building the index
    int[] intArray;
    BitSet bitSet;
    
    BitSet bitSet() {
      if (bitSet == null)
        bitSet = intArrayToBitSet(intArray);
      ret bitSet;
    }
    
    int count() { ret intArray.length; }
  }
  
  *() {}
  *(Map<A, S> map) { fOr (A a, S text : map) add(a, text); }
  
  void add(A a, S text) {
    int idx = documents.addOrGetIndex(a);
    Set<S> words = extractWords(text);
    for (S word : words) addWord(idx, word);
  }
  
  void addWord(int idx, S word) {
    Entry e = index.get(word);
    if (e == null) index.put(word, e = new Entry);
    e.intSet.add(idx);
  }
  
  // Call this exactly once before doing queries on the index
  void doneAdding() {
    for (Entry e : values(index)) {
      e.intArray = sortIntArrayInPlace(toIntArray(e.intSet));
      e.intSet = null;
    }
  }
  
  Set<S> extractWords(S text) {
    ret asCISet(extractWords_list(text));
  }
  
  LS extractWords_list(S text) {
    ret regexpExtractAll(regexp, text);
  }
  
  L<IntRange> wordRanges(S text) {
    ret regexpFindRanges(regexp, text);
  }
  
  L<A> documentsContainingWord(S word) {
    Entry e = index.get(word);
    ret e == null ? null : documentListFromIntArray(e.intArray);
  }
  
  L<A> documentListFromIntArray(int[] array) {
    ret listFromFunction(array.length, i -> documents.get(array[i]));
  }
  
  // may return null (=empty list)
  L<A> documentsContainingAllWords(Cl<S> words) {
    int n = l(words;
    if (n == 0) ret documents;
    if (n == 1) ret documentsContainingWord(first(words));

    // get entries for words, exit when a word is unknown
    Entry[] entries = new[n];
    Iterator<S> itWords = iterator(words);
    for i to n: {
      Entry e = index.get(itWords.next());
      if (e == null) null;
      entries[i] = e;
    }
    
    // go through words again, AND-combine all bit sets
    BitSet bs = cloneBitSet(entries[0].bitSet());
    for (int i = 1; i < n; i++)
      bs.and(entries[i].bitSet());

    ret documentListFromIntArray(bitSetToIntArray(bs)); 
  }
  
  L<A> documentsContainingAllWords(S text, O... _) {
    ret documentsContainingAllWords(extractWords(text));
  }
  
  NavigableSet<S> words() { ret (NavigableSet) keys(index); }
  
  int numWords() { ret l(index); }
  
  // This method only works when A = S
  void add(S s) { add((A) s, s); }
}

Author comment

Began life as a copy of #1024242

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029068
Snippet name: WordIndexWithBitSets - index a list by words (case-insensitive by default)
Eternal ID of this version: #1029068/20
Text MD5: 98cfe0948c6afe02a7feb923599d48f3
Transpilation MD5: 2a2456c1641df2f34b94aff95eeaeee2
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-19 01:22:59
Source code size: 2836 bytes / 102 lines
Pitched / IR pitched: No / No
Views / Downloads: 291 / 630
Version history: 19 change(s)
Referenced in: [show references]