Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

123
LINES

< > BotCompany Repo | #1029024 // DeepWordIndex

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (3999L/25K).

1  
transient sclass DeepWordIndex<A> {
2  
  S regexp = "\\w+";
3  
  bool useHashMaps; // makes it case-sensitive and doesn't allow partial word searches
4  
  bool sortEntries; // if A implements Comparable
5  
  new Map<A, Entry<A>> entries;
6  
  //new L<Entry<A>> entriesList;
7  
  MultiSetMap<S, Entry<A>> entriesByWord;
8  
  Map<S, L<Entry<A>>> entriesByWord_lists;
9  
  
10  
  sclass Entry<A> extends Var<A> implements Comparable<Entry<A>> {
11  
    Map<S, int[]> wordPositions = ciMap(); // int array is sorted
12  
    
13  
    *(A id) { super(id); }
14  
    
15  
    public int compareTo(Entry<A> e) {
16  
      ret ((Comparable) get()).compareTo(e!);
17  
    }
18  
    
19  
    public bool equals(O o) {
20  
      ret o instanceof Entry && get().equals(((Entry) o)!);
21  
    }
22  
  }
23  
  
24  
  void init() {
25  
    if (entriesByWord != null) ret;
26  
    entriesByWord = useHashMaps
27  
      ? sortEntries ? multiSetMap_innerTreeSet() : new MultiSetMap
28  
      : sortEntries ? ciMultiSetMap_innerTreeSet() : ciMultiSetMap();
29  
  }
30  
  
31  
  L<IntRange> wordRanges(S text) {
32  
    ret regexpFindRanges(regexp, text);
33  
  }
34  
   
35  
  void add(A a, S text) {
36  
    init();
37  
    Entry<A> e = new Entry<A>(a);
38  
    if (useHashMaps) {
39  
      e.wordPositions = new HashMap;
40  
      text = upper(text);
41  
    }
42  
    if (entries.put(a, e) != null) fail("Double insertion");
43  
    MultiMap<S, Int> wordPositions = ciMultiMap();
44  
    for (IntRange r : wordRanges(text)) {
45  
      S word = substring(text, r);
46  
      wordPositions.put(word, r.start);
47  
      entriesByWord.put(word, e);
48  
    }
49  
    for (S word : keys(wordPositions))
50  
      e.wordPositions.put(word, toIntArray(wordPositions.get(word)));
51  
  }
52  
  
53  
  Set<Entry<A>> get(S word) { ret entriesByWord.get(word); }
54  
  
55  
  int numWords() { ret entriesByWord.keysSize(); }
56  
57  
  void doneAdding() {
58  
    if (entriesByWord_lists != null) ret;
59  
    entriesByWord_lists = mapValues asList(entriesByWord.data);
60  
    // TODO: release entriesByWord
61  
  }
62  
  
63  
  Iterable<Pair<A, Cl<Int>>>lookupString_withPositions(S query, O... _) {
64  
    optPar bool debug;
65  
    doneAdding();
66  
    if (useHashMaps) query = upper(query);
67  
    S _query = query;
68  
    L<IntRange> ranges = wordRanges(query);
69  
    if (empty(ranges)) null;
70  
    int nRanges = l(ranges);
71  
    int iFirstComplete = first(ranges).start == 0 ? 1 : 0;
72  
    int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges;
73  
    
74  
    LS words = map(ranges, r -> substring(_query, r));
75  
    LL<Entry<A>> entriesAtIndex = map(words, word -> entriesByWord_lists.get(word));
76  
      
77  
    if (iLastComplete >= iFirstComplete+1) {
78  
      int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest));
79  
      if (nBest == 0) { /*print("No results for " + words.get(shortest));*/ ret emptyList(); }
80  
      for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) {
81  
        int n = l(entriesAtIndex.get(iWord));
82  
        if (n == 0) { /*print("No results for " + words.get(iWord));*/ ret emptyList(); }
83  
        if (n < nBest) {
84  
          shortest = iWord;
85  
          nBest = n;
86  
        }
87  
      }
88  
      int _shortest = shortest;
89  
      
90  
      Iterable<Entry<A>> entries = sortEntries
91  
        ? intersectMultipleSortedCollectionsI(subList(entriesAtIndex, iFirstComplete, iLastComplete))
92  
        : entriesAtIndex.get(shortest);
93  
        
94  
      int startShortest = ranges.get(shortest).start;
95  
      S shortestWord = words.get(shortest); // not the shortest word, but the word with the shortest result list
96  
      /*if (debug)*/ print("shortest: " + shortestWord + ", words: " + zipTwoLists(words, lmap l(entriesAtIndex)));
97  
      
98  
      new IntBuffer intBuffer;
99  
      
100  
      ret mapI_nonNulls_if1(entries, entry -> {
101  
        int[] positions = entry.wordPositions.get(shortestWord);
102  
103  
        for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) {
104  
          continue if iWord == _shortest;
105  
          IntRange r2 = ranges.get(iWord);
106  
          S word2 = words.get(iWord);
107  
          int[] positions2 = entry.wordPositions.get(word2);
108  
          if (positions2 == null) null;
109  
          int ofs = startShortest-r2.start;
110  
          int len = l(positions);
111  
          positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer);
112  
          print("Intersected " + len + "/" + l(positions2) + " => " + l(positions));
113  
          //if (debug) print("Got " + asList(positions));
114  
          if (empty(positions)) null;
115  
        }
116  
        
117  
        ret pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest));
118  
      });
119  
    }
120  
    
121  
    null;
122  
  }
123  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029024
Snippet name: DeepWordIndex
Eternal ID of this version: #1029024/57
Text MD5: 92af6cd15ba2d76d30e8974f29f55d4b
Transpilation MD5: ea371a277ae8de4cd8f6e19b5081c335
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-17 19:18:50
Source code size: 4432 bytes / 123 lines
Pitched / IR pitched: No / No
Views / Downloads: 298 / 738
Version history: 56 change(s)
Referenced in: [show references]