Uses 11335K of libraries. Click here for Pure Java version (15704L/107K).
1 | sclass SuffixTree_managed implements IManagedObject, AutoCloseable { |
2 | replace Addr with int. |
3 | |
4 | new ManagedIntObjects_v1 mem; |
5 | Node root; |
6 | CharSequence fullText; |
7 | int nodeCount; |
8 | Cl<AutoCloseable> dependentCloseables; |
9 | |
10 | Comparator<Node> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this); |
11 | |
12 | class Node extends AbstractManagedObject { |
13 | // children is a pointer to an int "array" (length field + ints) |
14 | final static int ofs_from = 0, ofs_to = 1, ofs_children = 2; |
15 | final static int objectSize = ofs_children+1; |
16 | |
17 | *(Addr addr) { super(addr); } |
18 | |
19 | // create new object |
20 | *() { |
21 | addr = mem.alloc(objectSize); |
22 | } |
23 | |
24 | // create new object |
25 | *(int from, int to) { |
26 | this(); |
27 | from(from); |
28 | to(to); |
29 | } |
30 | |
31 | // create new object |
32 | *(Substring s) { |
33 | this(s.startIndex(), s.endIndex()); |
34 | } |
35 | |
36 | // fields & GC handling |
37 | |
38 | int from() { ret mem.get(addr+ofs_from); } |
39 | int to() { ret mem.get(addr+ofs_to); } |
40 | Addr children_ptr() { ret mem.get(addr+ofs_children); } |
41 | L<Node> children() { |
42 | ret convertListElementsBothWays( |
43 | n -> new Node(n), |
44 | n -> n.addr, |
45 | mem.pointerArray(children_ptr())); |
46 | } |
47 | |
48 | void from(int from) { mem.set(addr+ofs_from, from); } |
49 | void to(int to) { mem.set(addr+ofs_to, to); } |
50 | void children(Addr addr) { mem.set(this.addr+ofs_children, addr); } |
51 | |
52 | public void scanForCollection(IManagedObjectCollector gc) { |
53 | gc.noteObject(addr, objectSize, this == root ? newAddr -> { addr = newAddr; } : null); |
54 | gc.notePointer(addr+ofs_children); |
55 | gc.notePointerArray(children_ptr()); |
56 | for (Node n : children()) |
57 | n.scanForCollection(gc); |
58 | } |
59 | |
60 | // other methods |
61 | |
62 | Substring text(SuffixTree_managed tree) { ret Substring(tree.fullText, from(), to()); } |
63 | |
64 | int lText() { ret to()-from(); } |
65 | |
66 | bool isTerminal(SuffixTree_managed tree) { ret to() == l(tree.fullText); } |
67 | |
68 | void addChild(SuffixTree_managed tree, Node n) { |
69 | Addr oldChildren = children_ptr(); |
70 | int i = l(children()); |
71 | //print(children_ptr := children_ptr() + ", l=" + l(children())); |
72 | children(mem.resizePointerArray(oldChildren, i+1); |
73 | //print(children_ptr := children_ptr() + ", l=" + l(children())); |
74 | mem.freePointerArray(oldChildren); |
75 | children().set(i, n); |
76 | sortInPlace(children(), tree.childComparator); |
77 | } |
78 | |
79 | public int firstCharOrMinus1(SuffixTree_managed tree) { |
80 | ret charToIntOrMinus1(text(tree)); |
81 | } |
82 | |
83 | Node getChild(SuffixTree_managed tree, int c) { |
84 | L<Node> children = children(); |
85 | int i = generalizedBinarySearch2(children, n -> cmp(n.firstCharOrMinus1(tree), c)); |
86 | ret i >= 0 ? children.get(i) : null; |
87 | } |
88 | |
89 | toString { ret "Node@" + addr + ", " + from() + "/" + lText() + ". " + nChildren(children()); } |
90 | } |
91 | |
92 | *() {} |
93 | *(S *fullText) { |
94 | process(fullText); |
95 | } |
96 | |
97 | // load |
98 | *(IIntMemory mem, int root) { |
99 | this.mem = new ManagedIntObjects_v1_virtual(mem); |
100 | this.root = new Node(root); |
101 | } |
102 | |
103 | void process(S fullText) { |
104 | this.fullText = fullText; |
105 | root = new Node(0, 0); |
106 | ++nodeCount; |
107 | for i over fullText: { |
108 | addSuffix(Substring(fullText, i)); |
109 | if (((i+1) % 1000000) == 0) { |
110 | print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")"); |
111 | mem.printStats(); |
112 | } |
113 | } |
114 | } |
115 | |
116 | void addSuffix(Substring s) { |
117 | Node node = root; |
118 | |
119 | while (!empty(s)) { |
120 | int _n = lCommonPrefix_CharSequence(node.text(SuffixTree_managed.this), s); |
121 | s = s.substring(_n); |
122 | if (_n >= node.lText()) { // node text exhausted |
123 | if (empty(s)) { // pattern also exhausted - done |
124 | //print("Exhausted: " + node.from() + "/" + node.to()); |
125 | // add a new termination node |
126 | Node nNew = new Node(s); |
127 | ++nodeCount; |
128 | node.addChild(SuffixTree_managed.this, nNew); |
129 | ret; |
130 | } else { |
131 | Node n = node.getChild(SuffixTree_managed.this, charToIntOrMinus1(s)); |
132 | if (n == null) { |
133 | n = new Node(s); |
134 | ++nodeCount; |
135 | node.addChild(SuffixTree_managed.this, n); |
136 | ret; |
137 | } else |
138 | node = n; |
139 | } |
140 | } else { // node text not exhausted |
141 | // split node. first, move all the node's vitals to a new node nOld |
142 | Node nOld = new Node(node.from()+_n, node.to()); |
143 | ++nodeCount; |
144 | nOld.children(node.children_ptr()); |
145 | node.children(mem.nullPtr()); |
146 | node.to(node.from()+_n); |
147 | node.addChild(SuffixTree_managed.this, nOld); |
148 | |
149 | // now add a new node |
150 | Node nNew = new Node(s); |
151 | ++nodeCount; |
152 | node.addChild(SuffixTree_managed.this, nNew); |
153 | ret; |
154 | } |
155 | } |
156 | } |
157 | |
158 | public L<Int> indicesOf(S pattern) { |
159 | ret asList(indicesOf_iterator(pattern)); |
160 | } |
161 | |
162 | public ItIt<Int> indicesOf_iterator(S pattern) { |
163 | ret mapI_notNull(allNodesUnder(scanDown(root, pattern)), |
164 | nad -> nad.node.isTerminal(this) ? nad.position() : null); |
165 | } |
166 | |
167 | srecord NodeAndDepth(Node node, int depth) { |
168 | int position() { |
169 | int position = node.from()-depth; |
170 | //print("from=" + node.from() + ", to=" + node.to() + ", depth=" + depth + ", position=" + position); |
171 | ret position; |
172 | } |
173 | |
174 | Cl<NodeAndDepth> children() { |
175 | ret lmap wrapChild(node.children()); |
176 | } |
177 | |
178 | NodeAndDepth wrapChild(Node n) { |
179 | ret n == null ? null : new NodeAndDepth(n, depth+node.lText()); |
180 | } |
181 | |
182 | NodeAndDepth getChild(SuffixTree_managed tree, int c) { |
183 | ret wrapChild(node.getChild(tree, c)); |
184 | } |
185 | } |
186 | |
187 | NodeAndDepth scanDown(Node node, S pattern) { |
188 | int lPattern = l(pattern), iPattern = 0; |
189 | NodeAndDepth nad = new(node, 0); |
190 | while true { |
191 | int n = lCommonPrefix_CharSequence(nad.node.text(this), Substring(pattern, iPattern)); |
192 | iPattern += n; |
193 | if (iPattern >= lPattern) break; // pattern exhausted - done |
194 | if (n < nad.node.lText()) null; // mismatch, exit |
195 | NodeAndDepth child = nad.getChild(SuffixTree_managed.this, charAtAsIntOrMinus1(pattern, iPattern)); |
196 | if (child != null) continue with nad = child; |
197 | null; |
198 | } |
199 | |
200 | ret nad; |
201 | } |
202 | |
203 | void printMe() { |
204 | printNode("", "", new NodeAndDepth(root, 0)); |
205 | } |
206 | |
207 | void printNode(S indent, S pre, NodeAndDepth nad) { |
208 | print(indent + pre + quote(shorten(20, nad.node.text(this))) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]")); |
209 | fOr (NodeAndDepth n : nad.children()) { |
210 | printNode(indent + " ", "[" + (n.node.lText() == 0 ? "end" : quote(n.node.text(this).charAt(0))) + "] ", n); |
211 | } |
212 | } |
213 | |
214 | ItIt<NodeAndDepth> allNodes() { |
215 | ret allNodesUnder(new NodeAndDepth(root, 0)); |
216 | } |
217 | |
218 | // includes the node itself |
219 | ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) { |
220 | new L<Iterator<NodeAndDepth>> stack; |
221 | if (nad != null) |
222 | stack.add(iteratorLL(nad)); |
223 | ret iteratorFromFunction_if0(() -> { |
224 | while (nempty(stack)) { |
225 | if (!last(stack).hasNext()) |
226 | popLast(stack); |
227 | else { |
228 | NodeAndDepth n = last(stack).next(); |
229 | stack.add((Iterator) iterator(n.children())); |
230 | ret n; |
231 | } |
232 | } |
233 | null; |
234 | }); |
235 | } |
236 | |
237 | public void scanForCollection(IManagedObjectCollector gc) { |
238 | if (root == null) ret; |
239 | root.scanForCollection(gc); |
240 | } |
241 | |
242 | public void close() { |
243 | closeAll(dependentCloseables); |
244 | } |
245 | } |
Began life as a copy of #1029202
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029230 |
Snippet name: | SuffixTree_managed [compact version without object headers] |
Eternal ID of this version: | #1029230/51 |
Text MD5: | ddff76e4b37841f09d8cf1d94f24c194 |
Transpilation MD5: | 34bb38366c54dfe113cc2de4289aceac |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-27 18:14:59 |
Source code size: | 7673 bytes / 245 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 331 / 805 |
Version history: | 50 change(s) |
Referenced in: | [show references] |