Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

88
LINES

< > BotCompany Repo | #1025688 // PositionalTokenIndex [OK]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (2563L/16K).

1  
// clever index of tokenized strings
2  
// maintaining one CI-sorted list per token position
3  
sclass PositionalTokenIndex {
4  
  L<LLS> sorts = new L; // token index -> entries sorted by token at index
5  
  bool dirty;
6  
  
7  
  *() {}
8  
  *(Iterable<LS> toks) { addAll(toks); }
9  
  
10  
  void addAll(Iterable<LS> toks) {
11  
    fOr (LS tok : toks)
12  
      add(tok);
13  
  }
14  
  
15  
  void add(LS tok) {
16  
    for (int i = 1; i < l(tok); i += 2) {
17  
      LL<S> idx = listGetOrCreate_List(sorts, i/2);
18  
      idx.add(tok);
19  
    }
20  
    set dirty;
21  
  }
22  
  
23  
  private void sort {
24  
    for i over sorts: {
25  
      int j = i*2+1;
26  
      sortInPlaceByCalculatedFieldIC(sorts.get(i),
27  
        func(LS tok) -> S { tok.get(j) });
28  
    }
29  
    dirty = false;
30  
  }
31  
  
32  
  // return all tokenizations with token t in position i
33  
  // may return null
34  
  LLS byToken(int i, S t) {
35  
    clean();
36  
    LLS list = get(sorts, i);
37  
    if (list == null) null;
38  
    
39  
    int start = binarySearchForStart(list, i, t);
40  
    if (start < 0) null;
41  
    int end = binarySearchForEnd(list, i, t, start);
42  
    ret subList(list, start, end);
43  
  }
44  
  
45  
  private int binarySearchForStart(LLS list, int position, S t) {
46  
    int low = 0, high = list.size()-1;
47  
    while (low <= high) {
48  
      int mid = (low + high) >>> 1;
49  
      S token = list.get(mid).get(position*2+1);
50  
      int cmp = compareIC(token, t);
51  
52  
      if (cmp < 0)
53  
        low = mid + 1;
54  
      else if (cmp > 0)
55  
        high = mid - 1;
56  
      else // we found the target token, now check if we're at start
57  
        if (mid == 0 || !eqic(list.get(mid-1).get(position*2+1), t))
58  
          ret mid; // yep, it's the start
59  
        else
60  
          high = mid - 1; // continue to search higher up
61  
    }
62  
    ret -1;
63  
  }
64  
  
65  
  private int binarySearchForEnd(LLS list, int position, S t, int start) {
66  
    int low = start+1, high = list.size()-1;
67  
    while (low <= high) {
68  
      int mid = (low + high) >>> 1;
69  
      S token = list.get(mid).get(position*2+1);
70  
      
71  
      if (eqic(token, t))
72  
        low = mid + 1; // still in target range, go lower
73  
      else // we found a different token, now check if we're at the end
74  
        if (mid-1 == start || eqic(list.get(mid-1).get(position*2+1), t))
75  
          ret mid; // yep, it's the end
76  
        else
77  
          high = mid - 1; // continue to search higher up
78  
    }
79  
    ret low;
80  
  }
81  
  
82  
  LLS fullList() { ret first(sorts); }
83  
  
84  
  // if you want to sort it ahead of time
85  
  void clean() {
86  
    if (dirty) sort();
87  
  }
88  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 6 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1025688
Snippet name: PositionalTokenIndex [OK]
Eternal ID of this version: #1025688/22
Text MD5: 8f04881744542f85c46fddd354f95789
Transpilation MD5: 0aa7216c098c99abd973bb522ab773d2
Author: stefan
Category: javax / agi.blue
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2019-10-13 23:15:53
Source code size: 2496 bytes / 88 lines
Pitched / IR pitched: No / No
Views / Downloads: 286 / 750
Version history: 21 change(s)
Referenced in: [show references]