Libraryless. Click here for Pure Java version (14542L/98K).
1 | // A naive suffix tree implementation |
2 | sclass SuffixTree { |
3 | Node root; |
4 | S fullText; |
5 | int nodeCount; |
6 | |
7 | Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this); |
8 | new DummyNode dummyNode; |
9 | |
10 | sinterface IFirstChar { |
11 | int firstCharOrMinus1(SuffixTree tree); |
12 | } |
13 | |
14 | sclass DummyNode implements IFirstChar { |
15 | int firstChar; |
16 | |
17 | public int firstCharOrMinus1(SuffixTree tree) { ret firstChar; } |
18 | } |
19 | |
20 | sclass Node implements IFirstChar { |
21 | int from, to; // our range in fullText |
22 | O children; // null, a Node or Node[] sorted by first character |
23 | |
24 | *() {} |
25 | *(int *from, int *to) {} |
26 | *(Substring s) { from = s.startIndex(); to = s.endIndex(); } |
27 | |
28 | Substring text(SuffixTree tree) { ret Substring(tree.fullText, from, to); } |
29 | |
30 | int lText() { ret to-from; } |
31 | |
32 | bool isTerminal(SuffixTree tree) { ret to == l(tree.fullText); } |
33 | |
34 | void addChild(SuffixTree tree, Node n) { |
35 | if (children == null) children = n; |
36 | else if (children cast Node) |
37 | children = sortArrayInPlace(new Node[] {children, n}, tree.childComparator); |
38 | else { |
39 | Node[] c = cast children; |
40 | Node[] x = new[l(c)+1]; |
41 | arraycopy(c, 0, x, 0, l(c)); |
42 | x[l(x)-1] = n; |
43 | children = sortArrayInPlace(x, tree.childComparator); |
44 | } |
45 | } |
46 | |
47 | Cl<Node> children() { |
48 | if (children == null) ret emptyList(); |
49 | if (children cast Node) ret ll(children); |
50 | ret wrapArrayAsList((Node[]) children); |
51 | } |
52 | |
53 | public int firstCharOrMinus1(SuffixTree tree) { |
54 | ret charToIntOrMinus1(text(tree)); |
55 | } |
56 | |
57 | Node getChild(SuffixTree tree, int c) { |
58 | if (children == null) null; |
59 | if (children cast Node) |
60 | ret children.firstCharOrMinus1(tree) == c ? children : null; |
61 | tree.dummyNode.firstChar = c; |
62 | Node[] x = cast children; |
63 | int i = Arrays.binarySearch(x, tree.dummyNode, tree.childComparator); |
64 | ret i >= 0 ? x[i] : null; |
65 | } |
66 | } |
67 | |
68 | *() {} |
69 | *(S *fullText) { |
70 | root = new Node(0, 0); |
71 | ++nodeCount; |
72 | for i over fullText: { |
73 | addSuffix(Substring(fullText, i)); |
74 | if (((i+1) % 1000000) == 0) |
75 | print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")"); |
76 | } |
77 | } |
78 | |
79 | void addSuffix(Substring s) { |
80 | Node node = root; |
81 | |
82 | while (!empty(s)) { |
83 | int _n = lCommonPrefix_CharSequence(node.text(SuffixTree.this), s); |
84 | s = s.substring(_n); |
85 | if (_n >= node.lText()) { // node text exhausted |
86 | if (empty(s)) { // pattern also exhausted - done |
87 | //print("Exhausted: " + node.from + "/" + node.to); |
88 | // add a new termination node |
89 | Node nNew = new Node(s); |
90 | ++nodeCount; |
91 | node.addChild(SuffixTree.this, nNew); |
92 | ret; |
93 | } else { |
94 | Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s)); |
95 | if (n == null) { |
96 | n = new Node(s); |
97 | ++nodeCount; |
98 | node.addChild(SuffixTree.this, n); |
99 | ret; |
100 | } else |
101 | node = n; |
102 | } |
103 | } else { // node text not exhausted |
104 | // split node. first, move all the node's vitals to a new node nOld |
105 | Node nOld = new Node(node.from+_n, node.to); |
106 | ++nodeCount; |
107 | nOld.children = node.children; |
108 | node.children = null; |
109 | node.to = node.from+_n; |
110 | node.addChild(SuffixTree.this, nOld); |
111 | |
112 | // now add a new node |
113 | Node nNew = new Node(s); |
114 | ++nodeCount; |
115 | node.addChild(SuffixTree.this, nNew); |
116 | ret; |
117 | } |
118 | } |
119 | } |
120 | |
121 | public L<Int> indicesOf(S pattern) { |
122 | ret asList(indicesOf_iterator(pattern)); |
123 | } |
124 | |
125 | public ItIt<Int> indicesOf_iterator(S pattern) { |
126 | ret mapI_notNull(allNodesUnder(scanDown(root, pattern)), |
127 | nad -> nad.node.isTerminal(this) ? nad.position() : null); |
128 | } |
129 | |
130 | srecord NodeAndDepth(Node node, int depth) { |
131 | int position() { |
132 | int position = node.from-depth; |
133 | //print("from=" + node.from + ", to=" + node.to + ", depth=" + depth + ", position=" + position); |
134 | ret position; |
135 | } |
136 | |
137 | Cl<NodeAndDepth> children() { |
138 | ret lmap wrapChild(node.children()); |
139 | } |
140 | |
141 | NodeAndDepth wrapChild(Node n) { |
142 | ret n == null ? null : new NodeAndDepth(n, depth+node.lText()); |
143 | } |
144 | |
145 | NodeAndDepth getChild(SuffixTree tree, int c) { |
146 | ret wrapChild(node.getChild(tree, c)); |
147 | } |
148 | } |
149 | |
150 | NodeAndDepth scanDown(Node node, S pattern) { |
151 | int lPattern = l(pattern), iPattern = 0; |
152 | NodeAndDepth nad = new(node, 0); |
153 | while true { |
154 | int n = lCommonPrefix_CharSequence(nad.node.text(this), Substring(pattern, iPattern)); |
155 | iPattern += n; |
156 | if (iPattern >= lPattern) break; // pattern exhausted - done |
157 | if (n < nad.node.lText()) null; // mismatch, exit |
158 | NodeAndDepth child = nad.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern)); |
159 | if (child != null) continue with nad = child; |
160 | null; |
161 | } |
162 | |
163 | ret nad; |
164 | } |
165 | |
166 | void printMe() { |
167 | printNode("", "", new NodeAndDepth(root, 0)); |
168 | } |
169 | |
170 | void printNode(S indent, S pre, NodeAndDepth nad) { |
171 | print(indent + pre + quote(shorten(20, nad.node.text(this))) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]")); |
172 | fOr (NodeAndDepth n : nad.children()) { |
173 | printNode(indent + " ", "[" + (n.node.lText() == 0 ? "end" : quote(n.node.text(this).charAt(0))) + "] ", n); |
174 | } |
175 | } |
176 | |
177 | ItIt<NodeAndDepth> allNodes() { |
178 | ret allNodesUnder(new NodeAndDepth(root, 0)); |
179 | } |
180 | |
181 | // includes the node itself |
182 | ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) { |
183 | new L<Iterator<NodeAndDepth>> stack; |
184 | if (nad != null) |
185 | stack.add(iteratorLL(nad)); |
186 | ret iteratorFromFunction_if0(() -> { |
187 | while (nempty(stack)) { |
188 | if (!last(stack).hasNext()) |
189 | popLast(stack); |
190 | else { |
191 | NodeAndDepth n = last(stack).next(); |
192 | stack.add((Iterator) iterator(n.children())); |
193 | ret n; |
194 | } |
195 | } |
196 | null; |
197 | }); |
198 | } |
199 | } |
Began life as a copy of #1029202
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029234 |
Snippet name: | SuffixTree [LIVE] |
Eternal ID of this version: | #1029234/2 |
Text MD5: | 8c5e7742ad2b749364b7723e6745c9ba |
Transpilation MD5: | 0d0678082061b40b0d102be8999ad145 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-26 19:48:36 |
Source code size: | 6169 bytes / 199 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 237 / 525 |
Version history: | 1 change(s) |
Referenced in: | [show references] |