Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

102
LINES

< > BotCompany Repo | #1029068 // WordIndexWithBitSets - index a list by words (case-insensitive by default)

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (3461L/22K).

1  
// Generalized by: ElementInstanceMatrix
2  
3  
// A represents the id of a document to be indexed (e.g. a snippet ID)
4  
transient sclass WordIndexWithBitSets<A> {
5  
  S regexp = "\\w+";
6  
  new UniqueList<A> documents;
7  
  Map<S, Entry> index = ciMap();
8  
9  
  // each word can be represented as either an int array, a bit set or both
10  
  // for now, the int array is always there
11  
  sclass Entry {
12  
    new Set<Int> intSet; // while building the index
13  
    int[] intArray;
14  
    BitSet bitSet;
15  
    
16  
    BitSet bitSet() {
17  
      if (bitSet == null)
18  
        bitSet = intArrayToBitSet(intArray);
19  
      ret bitSet;
20  
    }
21  
    
22  
    int count() { ret intArray.length; }
23  
  }
24  
  
25  
  *() {}
26  
  *(Map<A, S> map) { fOr (A a, S text : map) add(a, text); }
27  
  
28  
  void add(A a, S text) {
29  
    int idx = documents.addOrGetIndex(a);
30  
    Set<S> words = extractWords(text);
31  
    for (S word : words) addWord(idx, word);
32  
  }
33  
  
34  
  void addWord(int idx, S word) {
35  
    Entry e = index.get(word);
36  
    if (e == null) index.put(word, e = new Entry);
37  
    e.intSet.add(idx);
38  
  }
39  
  
40  
  // Call this exactly once before doing queries on the index
41  
  void doneAdding() {
42  
    for (Entry e : values(index)) {
43  
      e.intArray = sortIntArrayInPlace(toIntArray(e.intSet));
44  
      e.intSet = null;
45  
    }
46  
  }
47  
  
48  
  Set<S> extractWords(S text) {
49  
    ret asCISet(extractWords_list(text));
50  
  }
51  
  
52  
  LS extractWords_list(S text) {
53  
    ret regexpExtractAll(regexp, text);
54  
  }
55  
  
56  
  L<IntRange> wordRanges(S text) {
57  
    ret regexpFindRanges(regexp, text);
58  
  }
59  
  
60  
  L<A> documentsContainingWord(S word) {
61  
    Entry e = index.get(word);
62  
    ret e == null ? null : documentListFromIntArray(e.intArray);
63  
  }
64  
  
65  
  L<A> documentListFromIntArray(int[] array) {
66  
    ret listFromFunction(array.length, i -> documents.get(array[i]));
67  
  }
68  
  
69  
  // may return null (=empty list)
70  
  L<A> documentsContainingAllWords(Cl<S> words) {
71  
    int n = l(words;
72  
    if (n == 0) ret documents;
73  
    if (n == 1) ret documentsContainingWord(first(words));
74  
75  
    // get entries for words, exit when a word is unknown
76  
    Entry[] entries = new[n];
77  
    Iterator<S> itWords = iterator(words);
78  
    for i to n: {
79  
      Entry e = index.get(itWords.next());
80  
      if (e == null) null;
81  
      entries[i] = e;
82  
    }
83  
    
84  
    // go through words again, AND-combine all bit sets
85  
    BitSet bs = cloneBitSet(entries[0].bitSet());
86  
    for (int i = 1; i < n; i++)
87  
      bs.and(entries[i].bitSet());
88  
89  
    ret documentListFromIntArray(bitSetToIntArray(bs)); 
90  
  }
91  
  
92  
  L<A> documentsContainingAllWords(S text, O... _) {
93  
    ret documentsContainingAllWords(extractWords(text));
94  
  }
95  
  
96  
  NavigableSet<S> words() { ret (NavigableSet) keys(index); }
97  
  
98  
  int numWords() { ret l(index); }
99  
  
100  
  // This method only works when A = S
101  
  void add(S s) { add((A) s, s); }
102  
}

Author comment

Began life as a copy of #1024242

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029068
Snippet name: WordIndexWithBitSets - index a list by words (case-insensitive by default)
Eternal ID of this version: #1029068/20
Text MD5: 98cfe0948c6afe02a7feb923599d48f3
Transpilation MD5: 2a2456c1641df2f34b94aff95eeaeee2
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-19 01:22:59
Source code size: 2836 bytes / 102 lines
Pitched / IR pitched: No / No
Views / Downloads: 292 / 632
Version history: 19 change(s)
Referenced in: [show references]