Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

245
LINES

< > BotCompany Repo | #1029230 // SuffixTree_managed [compact version without object headers]

JavaX fragment (include) [tags: use-pretranspiled]

Uses 11335K of libraries. Click here for Pure Java version (15704L/107K).

1  
sclass SuffixTree_managed implements IManagedObject, AutoCloseable {
2  
  replace Addr with int.
3  
  
4  
  new ManagedIntObjects_v1 mem;
5  
  Node root;
6  
  CharSequence fullText;
7  
  int nodeCount;
8  
  Cl<AutoCloseable> dependentCloseables;
9  
  
10  
  Comparator<Node> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this);
11  
  
12  
  class Node extends AbstractManagedObject {
13  
    // children is a pointer to an int "array" (length field + ints)
14  
    final static int ofs_from = 0, ofs_to = 1, ofs_children = 2;
15  
    final static int objectSize = ofs_children+1;
16  
    
17  
    *(Addr addr) { super(addr); }
18  
    
19  
    // create new object
20  
    *() {
21  
      addr = mem.alloc(objectSize);
22  
    }
23  
    
24  
    // create new object
25  
    *(int from, int to) {
26  
      this();
27  
      from(from);
28  
      to(to);
29  
    }
30  
    
31  
    // create new object
32  
    *(Substring s) {
33  
      this(s.startIndex(), s.endIndex());
34  
    }
35  
    
36  
    // fields & GC handling
37  
    
38  
    int from() { ret mem.get(addr+ofs_from); }
39  
    int to() { ret mem.get(addr+ofs_to); }
40  
    Addr children_ptr() { ret mem.get(addr+ofs_children); }
41  
    L<Node> children() {
42  
      ret convertListElementsBothWays(
43  
        n -> new Node(n),
44  
        n -> n.addr,
45  
        mem.pointerArray(children_ptr()));
46  
    }
47  
    
48  
    void from(int from) { mem.set(addr+ofs_from, from); }
49  
    void to(int to) { mem.set(addr+ofs_to, to); }
50  
    void children(Addr addr) { mem.set(this.addr+ofs_children, addr); }
51  
    
52  
    public void scanForCollection(IManagedObjectCollector gc) {
53  
      gc.noteObject(addr, objectSize, this == root ? newAddr -> { addr = newAddr; } : null);
54  
      gc.notePointer(addr+ofs_children);
55  
      gc.notePointerArray(children_ptr());
56  
      for (Node n : children())
57  
        n.scanForCollection(gc);
58  
    }
59  
    
60  
    // other methods
61  
    
62  
    Substring text(SuffixTree_managed tree) { ret Substring(tree.fullText, from(), to()); }
63  
    
64  
    int lText() { ret to()-from(); }
65  
    
66  
    bool isTerminal(SuffixTree_managed tree) { ret to() == l(tree.fullText); }
67  
    
68  
    void addChild(SuffixTree_managed tree, Node n) {
69  
      Addr oldChildren = children_ptr();
70  
      int i = l(children());
71  
      //print(children_ptr := children_ptr() + ", l=" + l(children()));
72  
      children(mem.resizePointerArray(oldChildren, i+1);
73  
      //print(children_ptr := children_ptr() + ", l=" + l(children()));
74  
      mem.freePointerArray(oldChildren);
75  
      children().set(i, n);
76  
      sortInPlace(children(), tree.childComparator);
77  
    }
78  
    
79  
    public int firstCharOrMinus1(SuffixTree_managed tree) {
80  
      ret charToIntOrMinus1(text(tree));
81  
    }
82  
    
83  
    Node getChild(SuffixTree_managed tree, int c) {
84  
      L<Node> children = children();
85  
      int i = generalizedBinarySearch2(children, n -> cmp(n.firstCharOrMinus1(tree), c));
86  
      ret i >= 0 ? children.get(i) : null;
87  
    }
88  
    
89  
    toString { ret "Node@" + addr + ", " + from() + "/" + lText() + ". " + nChildren(children()); }
90  
  }
91  
92  
  *() {}
93  
  *(S *fullText) {
94  
    process(fullText);
95  
  }
96  
  
97  
  // load
98  
  *(IIntMemory mem, int root) {
99  
    this.mem = new ManagedIntObjects_v1_virtual(mem);
100  
    this.root = new Node(root);
101  
  }
102  
  
103  
  void process(S fullText) {
104  
    this.fullText = fullText;
105  
    root = new Node(0, 0);
106  
    ++nodeCount;
107  
    for i over fullText: {
108  
      addSuffix(Substring(fullText, i));
109  
      if (((i+1) % 1000000) == 0) {
110  
        print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")");
111  
        mem.printStats();
112  
      }
113  
    }
114  
  }
115  
116  
  void addSuffix(Substring s) {
117  
    Node node = root;
118  
    
119  
    while (!empty(s)) {
120  
      int _n = lCommonPrefix_CharSequence(node.text(SuffixTree_managed.this), s);
121  
      s = s.substring(_n);
122  
      if (_n >= node.lText()) { // node text exhausted
123  
        if (empty(s)) { // pattern also exhausted - done
124  
          //print("Exhausted: " + node.from() + "/" + node.to());
125  
          // add a new termination node
126  
          Node nNew = new Node(s);
127  
          ++nodeCount;
128  
          node.addChild(SuffixTree_managed.this, nNew);
129  
          ret;
130  
        } else {
131  
          Node n = node.getChild(SuffixTree_managed.this, charToIntOrMinus1(s));
132  
          if (n == null) {
133  
            n = new Node(s);
134  
            ++nodeCount;
135  
            node.addChild(SuffixTree_managed.this, n);
136  
            ret;
137  
          } else
138  
            node = n;
139  
        }
140  
      } else { // node text not exhausted
141  
        // split node. first, move all the node's vitals to a new node nOld
142  
        Node nOld = new Node(node.from()+_n, node.to());
143  
        ++nodeCount;
144  
        nOld.children(node.children_ptr());
145  
        node.children(mem.nullPtr());
146  
        node.to(node.from()+_n);
147  
        node.addChild(SuffixTree_managed.this, nOld);
148  
149  
        // now add a new node
150  
        Node nNew = new Node(s);
151  
        ++nodeCount;
152  
        node.addChild(SuffixTree_managed.this, nNew);
153  
        ret;
154  
      }
155  
    }
156  
  }
157  
158  
  public L<Int> indicesOf(S pattern) {
159  
    ret asList(indicesOf_iterator(pattern));
160  
  }
161  
162  
  public ItIt<Int> indicesOf_iterator(S pattern) {
163  
    ret mapI_notNull(allNodesUnder(scanDown(root, pattern)),
164  
      nad -> nad.node.isTerminal(this) ? nad.position() : null);
165  
  }
166  
  
167  
  srecord NodeAndDepth(Node node, int depth) {
168  
    int position() {
169  
      int position = node.from()-depth;
170  
      //print("from=" + node.from() + ", to=" + node.to() + ", depth=" + depth + ", position=" + position);
171  
      ret position;
172  
    }
173  
    
174  
    Cl<NodeAndDepth> children() {
175  
      ret lmap wrapChild(node.children());
176  
    }
177  
    
178  
    NodeAndDepth wrapChild(Node n) {
179  
      ret n == null ? null : new NodeAndDepth(n, depth+node.lText());
180  
    }
181  
    
182  
    NodeAndDepth getChild(SuffixTree_managed tree, int c) {
183  
      ret wrapChild(node.getChild(tree, c));
184  
    }
185  
  }
186  
187  
  NodeAndDepth scanDown(Node node, S pattern) {
188  
    int lPattern = l(pattern), iPattern = 0;
189  
    NodeAndDepth nad = new(node, 0);
190  
    while true {
191  
      int n = lCommonPrefix_CharSequence(nad.node.text(this), Substring(pattern, iPattern));
192  
      iPattern += n;
193  
      if (iPattern >= lPattern) break; // pattern exhausted - done
194  
      if (n < nad.node.lText()) null; // mismatch, exit
195  
      NodeAndDepth child = nad.getChild(SuffixTree_managed.this, charAtAsIntOrMinus1(pattern, iPattern));
196  
      if (child != null) continue with nad = child;
197  
      null;
198  
    }
199  
    
200  
    ret nad;
201  
  }
202  
  
203  
  void printMe() {
204  
    printNode("", "", new NodeAndDepth(root, 0));
205  
  }
206  
  
207  
  void printNode(S indent, S pre, NodeAndDepth nad) {
208  
    print(indent + pre + quote(shorten(20, nad.node.text(this))) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]"));
209  
    fOr (NodeAndDepth n : nad.children()) {
210  
      printNode(indent + "  ", "[" + (n.node.lText() == 0 ? "end" : quote(n.node.text(this).charAt(0))) + "] ", n);
211  
    }
212  
  }
213  
  
214  
  ItIt<NodeAndDepth> allNodes() {
215  
    ret allNodesUnder(new NodeAndDepth(root, 0));
216  
  }
217  
  
218  
  // includes the node itself
219  
  ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) {
220  
    new L<Iterator<NodeAndDepth>> stack;
221  
    if (nad != null)
222  
      stack.add(iteratorLL(nad));
223  
    ret iteratorFromFunction_if0(() -> {
224  
      while (nempty(stack)) {
225  
        if (!last(stack).hasNext())
226  
          popLast(stack);
227  
        else {
228  
          NodeAndDepth n = last(stack).next();
229  
          stack.add((Iterator) iterator(n.children()));
230  
          ret n;
231  
        }
232  
      }
233  
      null;
234  
    });
235  
  }
236  
  
237  
  public void scanForCollection(IManagedObjectCollector gc) {
238  
    if (root == null) ret;
239  
    root.scanForCollection(gc);
240  
  }
241  
  
242  
  public void close() {
243  
    closeAll(dependentCloseables);
244  
  }
245  
}

Author comment

Began life as a copy of #1029202

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029230
Snippet name: SuffixTree_managed [compact version without object headers]
Eternal ID of this version: #1029230/51
Text MD5: ddff76e4b37841f09d8cf1d94f24c194
Transpilation MD5: 34bb38366c54dfe113cc2de4289aceac
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-27 18:14:59
Source code size: 7673 bytes / 245 lines
Pitched / IR pitched: No / No
Views / Downloads: 251 / 697
Version history: 50 change(s)
Referenced in: [show references]