Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

256
LINES

< > BotCompany Repo | #1029263 // IntSuffixTree_managed [use ints instead of chars]

JavaX fragment (include) [tags: use-pretranspiled]

Uses 11335K of libraries. Click here for Pure Java version (18648L/123K).

1  
sclass IntSuffixTree_managed implements IManagedObject, AutoCloseable {
2  
  replace Addr with int.
3  
  
4  
  new ManagedIntObjects_v1 mem;
5  
  Node root;
6  
  L<Int> fullText;
7  
  int nodeCount;
8  
  Cl<AutoCloseable> dependentCloseables;
9  
  int printEvery = 1000000;
10  
  
11  
  Comparator<Node> childComparator = (a, b) -> cmp(a.firstCharOrMinus1(this), b.firstCharOrMinus1(this));
12  
  
13  
  class Node extends AbstractManagedObject {
14  
    // children is a pointer to an int "array" (length field + ints)
15  
    final static int ofs_from = 0, ofs_to = 1, ofs_children = 2;
16  
    final static int objectSize = ofs_children+1;
17  
    
18  
    // initialize as wrapper for existing object
19  
    *(Addr addr) { super(addr); }
20  
    
21  
    // initialize as new object
22  
    *() { addr = mem.alloc(objectSize); }
23  
    
24  
    // create object from arguments
25  
    *(int from, int to) {
26  
      this();
27  
      from(from);
28  
      to(to);
29  
    }
30  
    
31  
    *(IntRange r) { this(r.start, r.end); }
32  
    
33  
    // getters + setters
34  
    int from() { ret mem.get(addr+ofs_from); }
35  
    void from(int from) { mem.set(addr+ofs_from, from); }
36  
    int to() { ret mem.get(addr+ofs_to); }
37  
    void to(int to) { mem.set(addr+ofs_to, to); }
38  
    Addr children_ptr() { ret mem.get(addr+ofs_children); }
39  
    L<Node> children() {
40  
      ret mem.objectArray(children_ptr(), addr -> new Node(addr));
41  
    }
42  
    void children(Addr addr) { mem.set(this.addr+ofs_children, addr); }
43  
44  
    // GC handling
45  
    public void scanForCollection(IManagedObjectCollector gc) {
46  
      gc.noteObject(addr, objectSize, this == root ? newAddr -> { addr = newAddr; } : null);
47  
      gc.notePointer(addr+ofs_children);
48  
      gc.notePointerArray(children_ptr());
49  
      for (Node n : children())
50  
        n.scanForCollection(gc);
51  
    }
52  
    
53  
    // other methods
54  
    
55  
    L<Int> text(IntSuffixTree_managed tree) { ret subList(tree.fullText, from(), to()); }
56  
    
57  
    int lText() { ret to()-from(); }
58  
    
59  
    bool isTerminal(IntSuffixTree_managed tree) { ret to() == l(tree.fullText); }
60  
    
61  
    void setChildren(IntSuffixTree_managed tree, L<Node> nodes) {
62  
      mem.freePointerArray(children_ptr());
63  
      children(mem.newPointerArray(l(nodes));
64  
      for i over nodes:
65  
        children().set(i, nodes.get(i));
66  
      sortInPlace(children(), tree.childComparator);
67  
    }
68  
    
69  
    void addChild(IntSuffixTree_managed tree, Node n) {
70  
      Addr oldChildren = children_ptr();
71  
      int i = l(children());
72  
      //print(children_ptr := children_ptr() + ", l=" + l(children()));
73  
      children(mem.resizePointerArray(oldChildren, i+1);
74  
      //print(children_ptr := children_ptr() + ", l=" + l(children()));
75  
      mem.freePointerArray(oldChildren);
76  
      children().set(i, n);
77  
      sortInPlace(children(), tree.childComparator);
78  
    }
79  
    
80  
    public int firstCharOrMinus1(IntSuffixTree_managed tree) {
81  
      ret firstOrMinus1(text(tree));
82  
    }
83  
    
84  
    Node getChild(IntSuffixTree_managed tree, int c) {
85  
      L<Node> children = children();
86  
      int i = generalizedBinarySearch2(children, n -> cmp(n.firstCharOrMinus1(tree), c));
87  
      ret i >= 0 ? children.get(i) : null;
88  
    }
89  
    
90  
    toString { ret "Node@" + addr + ", " + from() + "/" + lText() + ". " + nChildren(children()); }
91  
  }
92  
93  
  *() {}
94  
  *(L<Int> fullText) {
95  
    process(fullText);
96  
  }
97  
  
98  
  // load
99  
  *(IIntMemory mem, int root) {
100  
    this.mem = new ManagedIntObjects_v1_virtual(mem);
101  
    this.root = new Node(root);
102  
  }
103  
  
104  
  void process(L<Int> fullText) {
105  
    this.fullText = fullText;
106  
    root = new Node(0, 0);
107  
    ++nodeCount;
108  
    for i over fullText: {
109  
      ping();
110  
      addSuffix(IntRange(i, l(fullText)));
111  
      if (((i+1) % printEvery) == 0) {
112  
        print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")");
113  
        mem.printStats();
114  
      }
115  
    }
116  
  }
117  
118  
  void addSuffix(IntRange s) {
119  
    Node node = root;
120  
    
121  
    while (!empty(s)) {
122  
      int _n = lCommonPrefix_lists(node.text(IntSuffixTree_managed.this), subList(fullText, s));
123  
      s.start += _n;
124  
      if (_n >= node.lText()) { // node text exhausted
125  
        if (empty(s)) { // pattern also exhausted - done
126  
          //print("Exhausted: " + node.from() + "/" + node.to());
127  
          // add a new termination node
128  
          Node nNew = new Node(s);
129  
          ++nodeCount;
130  
          node.addChild(IntSuffixTree_managed.this, nNew);
131  
          ret;
132  
        } else {
133  
          Node n = node.getChild(IntSuffixTree_managed.this, firstOrMinus1(subList(fullText, s)));
134  
          if (n == null) {
135  
            n = new Node(s);
136  
            ++nodeCount;
137  
            node.addChild(IntSuffixTree_managed.this, n);
138  
            ret;
139  
          } else
140  
            node = n;
141  
        }
142  
      } else { // node text not exhausted
143  
        // split node. first, move all the node's vitals to a new node nOld
144  
        Node nOld = new Node(node.from()+_n, node.to());
145  
        ++nodeCount;
146  
        nOld.children(node.children_ptr());
147  
        node.children(mem.nullPtr());
148  
        node.to(node.from()+_n);
149  
        node.addChild(IntSuffixTree_managed.this, nOld);
150  
151  
        // now add a new node
152  
        Node nNew = new Node(s);
153  
        ++nodeCount;
154  
        node.addChild(IntSuffixTree_managed.this, nNew);
155  
        ret;
156  
      }
157  
    }
158  
  }
159  
160  
  public L<Int> indicesOf(L<Int> pattern) {
161  
    ret asList(indicesOf_iterator(pattern));
162  
  }
163  
164  
  public ItIt<Int> indicesOf_iterator(L<Int> pattern) {
165  
    ret mapI_notNull(allNodesUnder(scanDown(root, pattern)),
166  
      nad -> nad.node.isTerminal(this) ? nad.position() : null);
167  
  }
168  
  
169  
  srecord NodeAndDepth(Node node, int depth) {
170  
    int position() {
171  
      int position = node.from()-depth;
172  
      //print("from=" + node.from() + ", to=" + node.to() + ", depth=" + depth + ", position=" + position);
173  
      ret position;
174  
    }
175  
    
176  
    Cl<NodeAndDepth> children() {
177  
      ret lmap wrapChild(node.children());
178  
    }
179  
    
180  
    NodeAndDepth wrapChild(Node n) {
181  
      ret n == null ? null : new NodeAndDepth(n, depth+node.lText());
182  
    }
183  
    
184  
    NodeAndDepth getChild(IntSuffixTree_managed tree, int c) {
185  
      ret wrapChild(node.getChild(tree, c));
186  
    }
187  
  }
188  
189  
  NodeAndDepth scanDown(Node node, L<Int> pattern) {
190  
    int lPattern = l(pattern), iPattern = 0;
191  
    NodeAndDepth nad = new(node, 0);
192  
    while true {
193  
      int n = lCommonPrefix_lists(nad.node.text(this), subList(pattern, iPattern));
194  
      iPattern += n;
195  
      if (iPattern >= lPattern) break; // pattern exhausted - done
196  
      if (n < nad.node.lText()) null; // mismatch, exit
197  
      NodeAndDepth child = nad.getChild(IntSuffixTree_managed.this, or(get(pattern, iPattern), -1));
198  
      if (child != null) continue with nad = child;
199  
      null;
200  
    }
201  
    
202  
    ret nad;
203  
  }
204  
  
205  
  void printMe() {
206  
    printNode("", "", new NodeAndDepth(root, 0));
207  
  }
208  
  
209  
  void printNode(S indent, S pre, NodeAndDepth nad) {
210  
    print(indent + pre + takeFirst(20, nad.node.text(this)) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]"));
211  
    fOr (NodeAndDepth n : nad.children()) {
212  
      printNode(indent + "  ", "", n);
213  
    }
214  
  }
215  
  
216  
  ItIt<NodeAndDepth> allNodes() {
217  
    ret allNodesUnder(new NodeAndDepth(root, 0));
218  
  }
219  
  
220  
  // includes the node itself
221  
  ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) {
222  
    new L<Iterator<NodeAndDepth>> stack;
223  
    if (nad != null)
224  
      stack.add(iteratorLL(nad));
225  
    ret iteratorFromFunction_if0(() -> {
226  
      while (nempty(stack)) {
227  
        if (!last(stack).hasNext())
228  
          popLast(stack);
229  
        else {
230  
          NodeAndDepth n = last(stack).next();
231  
          stack.add((Iterator) iterator(n.children()));
232  
          ret n;
233  
        }
234  
      }
235  
      null;
236  
    });
237  
  }
238  
  
239  
  public void scanForCollection(IManagedObjectCollector gc) {
240  
    if (root == null) ret;
241  
    root.scanForCollection(gc);
242  
  }
243  
  
244  
  public void close() {
245  
    closeAll(dependentCloseables);
246  
  }
247  
  
248  
  void saveToFile(File f) {
249  
    mem.set(0, root.addr);
250  
    mem.saveToFile(f);
251  
  }
252  
  
253  
  Node newNode(int from, int to) {
254  
    ret new Node(from, to);
255  
  }
256  
}

Author comment

Began life as a copy of #1029230

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029263
Snippet name: IntSuffixTree_managed [use ints instead of chars]
Eternal ID of this version: #1029263/24
Text MD5: eca6c7df63523f3e8af8cd6efd75319d
Transpilation MD5: 1b25dcc4276b1188cab6bbc2525bcbcd
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2021-06-23 22:18:03
Source code size: 8096 bytes / 256 lines
Pitched / IR pitched: No / No
Views / Downloads: 374 / 731
Version history: 23 change(s)
Referenced in: [show references]