Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

101
LINES

< > BotCompany Repo | #1029042 // DeepWordPairIndex [dev.]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (3730L/23K).

1  
// Index over adjacent word pairs. For every text added, it records which
// pairs of neighbouring words occur and at which character offsets, so a
// multi-word query can be answered by intersecting the position lists of
// its pairs.
transient sclass DeepWordPairIndex<A> {
  // Pattern passed to regexpFindRanges to locate the words of a text/query.
  S regexp = "\\w+";

  // When set, add() gives each entry a HashMap and both add() and
  // lookupString_withPositions() upper-case their input before indexing/searching.
  // Original note: doesn't allow partial word searches.
  bool useHashMaps = true;

  // All indexed entries, keyed by the id passed to add().
  new Map<A, Entry<A>> entries;

  // For each word pair, the set of entries whose text contains that pair.
  new MultiSetMap<PairS, Entry<A>> entriesByPair;

  // One indexed text. The Var value is the id given to add().
  sclass Entry<A> extends Var<A> {
    // Word pair -> offsets (start of the pair's first word) in the entry's text.
    new Map<PairS, int[]> pairPositions; // int array is sorted

    *(A id) { super(id); }
  }

  // Switches the index to hash-map mode.
  // Note: replaces entriesByPair with a fresh MultiSetMap, so anything
  // registered there before this call is no longer reachable through it.
  void useHashMaps() {
    set useHashMaps;
    entriesByPair = new MultiSetMap;
  }

  // Character ranges of all matches of `regexp` in `text`.
  L<IntRange> wordRanges(S text) {
    ret regexpFindRanges(regexp, text);
  }

  // Indexes `text` under id `a`.
  // Fails with "Double insertion" if `a` was already added.
  void add(A a, S text) {
    // Check for duplicates before mutating anything, so a rejected call
    // leaves the existing entry (and the pair index) untouched.
    if (entries.containsKey(a)) fail("Double insertion");

    Entry<A> e = new Entry<A>(a);
    if (useHashMaps) {
      e.pairPositions = new HashMap;
      text = upper(text);
    }
    entries.put(a, e);

    // Collect the start offsets per pair first; they are converted to
    // int arrays in one go below.
    new MultiMap<PairS, Int> pairPositions;
    for (Pair<IntRange> p : overlappingPairs(wordRanges(text))) {
      PairS pair = pair(substring(text, p.a), substring(text, p.b));
      pairPositions.put(pair, p.a.start); // offset of the pair's first word
      entriesByPair.put(pair, e);
    }
    for (PairS pair : keys(pairPositions))
      e.pairPositions.put(pair, toIntArray(pairPositions.get(pair)));
  }

  // Entries whose text contains the given word pair (as stored in entriesByPair).
  Set<Entry<A>> get(PairS pair) { ret entriesByPair.get(pair); }

  // Number of keys in entriesByPair.
  int numPairs() { ret entriesByPair.keysSize(); }

  // Looks up `query` via its word pairs.
  //
  // Optional parameter (through `_`): debug - print per-pair candidate counts.
  //
  // Returns
  // - null if the query contains no words or no "complete" word pair
  //   (i.e. the index cannot answer this query),
  // - an empty list if some complete pair has no entries at all,
  // - otherwise one (id, positions) pair per entry in which all complete
  //   pairs line up. Positions are passed through
  //   wrapIntArrayAsImmutableList_ofs with offset -startShortest
  //   (presumably making them relative to the query start - TODO confirm).
  Iterable<Pair<A, Cl<Int>>> lookupString_withPositions(S query, O... _) {
    optPar bool debug;
    if (useHashMaps) query = upper(query);
    S _query = query; // effectively-final copy for the lambda below
    L<IntRange> ranges = wordRanges(query);
    if (empty(ranges)) null;
    int nRanges = l(ranges);

    // A word touching the very start or end of the query is excluded
    // (presumably because it may be a fragment of a longer word in the text).
    int iFirstComplete = first(ranges).start == 0 ? 1 : 0;
    int iLastComplete = last(ranges).end == l(query) ? nRanges-1 : nRanges;
    --iLastComplete; // because pairs

    LS words = map(ranges, r -> substring(_query, r));
    LPairS pairs = overlappingPairs(words);
    L<Set<Entry<A>>> entriesAtIndex = map(pairs, pair -> entriesByPair.get(pair));

    if (iLastComplete >= iFirstComplete+1) {
      // Find the complete pair with the fewest candidate entries; bail out
      // early if any complete pair has none.
      int shortest = iFirstComplete, nBest = l(entriesAtIndex.get(shortest));
      if (nBest == 0) ret emptyList();
      for (int iWord = iFirstComplete+1; iWord < iLastComplete; iWord++) {
        int n = l(entriesAtIndex.get(iWord));
        if (n == 0) ret emptyList();
        if (n < nBest) {
          shortest = iWord;
          nBest = n;
        }
      }

      if (debug) print("pairs: " + zipTwoLists(pairs, lmap l(entriesAtIndex)));

      Set<Entry<A>> entries = entriesAtIndex.get(shortest);
      int startShortest = ranges.get(shortest).start;
      PairS shortestPair = pairs.get(shortest); // not the shortest pair, but the pair with the shortest result list
      // NOTE(review): this buffer is shared across all entries and pairs;
      // assumes the intersect helper returns an array that is not
      // invalidated by later reuse of the buffer - confirm.
      new IntBuffer intBuffer;

      new LPair<A, Cl<Int>> out;
      entrySearch: for (Entry<A> entry : entries) {
        int[] positions = entry.pairPositions.get(shortestPair);

        // Narrow the candidate positions by every other complete pair.
        for (int iWord = iFirstComplete; iWord < iLastComplete; iWord++) {
          continue if iWord == shortest;
          IntRange r2 = ranges.get(iWord);
          PairS pair2 = pairs.get(iWord);
          int[] positions2 = entry.pairPositions.get(pair2);
          if (positions2 == null) continue entrySearch; // entry lacks this pair
          // Distance in the query between the reference pair and this pair.
          int ofs = startShortest-r2.start;
          //if (debug) print("Intersecting " + asList(positions) + "/" + asList(positions2) + " with ofs " + ofs);
          positions = intersectSortedIntArrays_ofs_optimized2(positions, positions2, ofs, intBuffer);
          //if (debug) print("Got " + asList(positions));
          if (empty(positions)) continue entrySearch;
        }

        out.add(pair(entry!, wrapIntArrayAsImmutableList_ofs(positions, -startShortest)));
      }
      ret out;
    }

    null;
  }
}

Author comment

Began life as a copy of #1029024

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029042
Snippet name: DeepWordPairIndex [dev.]
Eternal ID of this version: #1029042/13
Text MD5: 031dd215db5e9dc11c14d88ea68dca35
Transpilation MD5: 2cd245965f98520999330cd9f397c38a
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-17 16:29:02
Source code size: 3776 bytes / 101 lines
Pitched / IR pitched: No / No
Views / Downloads: 259 / 558
Version history: 12 change(s)
Referenced in: [show references]