Libraryless. Click here for Pure Java version (14578L/99K).
1 | // A naive suffix tree implementation |
2 | sclass SuffixTree {
|
3 | Node root; |
4 | S fullText; |
5 | int nodeCount; |
6 | bool useTailOptimization; // doesn't work yet |
7 | |
8 | Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this); |
9 | new DummyNode dummyNode; |
10 | new Map<Int, Node> tailMap; // position -> node already made |
11 | |
12 | sinterface IFirstChar {
|
13 | int firstCharOrMinus1(SuffixTree tree); |
14 | } |
15 | |
16 | sclass DummyNode implements IFirstChar {
|
17 | int firstChar; |
18 | |
19 | public int firstCharOrMinus1(SuffixTree tree) { ret firstChar; }
|
20 | } |
21 | |
22 | sclass Node implements IFirstChar {
|
23 | int from, to; // our range in fullText |
24 | O children; // null, a Node or Node[] sorted by first character |
25 | |
26 | *() {}
|
27 | *(int *from, int *to) {}
|
28 | *(Substring s) { from = s.startIndex(); to = s.endIndex(); }
|
29 | |
30 | Substring text(SuffixTree tree) { ret Substring(tree.fullText, from, to); }
|
31 | |
32 | int lText() { ret to-from; }
|
33 | |
34 | bool isTerminal(SuffixTree tree) { ret to == l(tree.fullText); }
|
35 | |
36 | void addChild(SuffixTree tree, Node n) {
|
37 | if (children == null) children = n; |
38 | else if (children cast Node) |
39 | children = sortArrayInPlace(new Node[] {children, n}, tree.childComparator);
|
40 | else {
|
41 | Node[] c = cast children; |
42 | Node[] x = new[l(c)+1]; |
43 | arraycopy(c, 0, x, 0, l(c)); |
44 | x[l(x)-1] = n; |
45 | children = sortArrayInPlace(x, tree.childComparator); |
46 | } |
47 | } |
48 | |
49 | Cl<Node> children() {
|
50 | if (children == null) ret emptyList(); |
51 | if (children cast Node) ret ll(children); |
52 | ret wrapArrayAsList((Node[]) children); |
53 | } |
54 | |
55 | public int firstCharOrMinus1(SuffixTree tree) {
|
56 | ret charToIntOrMinus1(text(tree)); |
57 | } |
58 | |
59 | Node getChild(SuffixTree tree, int c) {
|
60 | if (children == null) null; |
61 | if (children cast Node) |
62 | ret children.firstCharOrMinus1(tree) == c ? children : null; |
63 | tree.dummyNode.firstChar = c; |
64 | Node[] x = cast children; |
65 | int i = Arrays.binarySearch(x, tree.dummyNode, tree.childComparator); |
66 | ret i >= 0 ? x[i] : null; |
67 | } |
68 | } |
69 | |
70 | *() {}
|
71 | *(S fullText) {
|
72 | processText(fullText); |
73 | } |
74 | |
75 | void processText(S fullText) {
|
76 | this.fullText = fullText; |
77 | root = new Node(0, 0); |
78 | ++nodeCount; |
79 | for i over fullText: {
|
80 | //for (int i = l(fullText)-1; i >= 0; i--) {
|
81 | addSuffix(Substring(fullText, i)); |
82 | if (((i+1) % 1000000) == 0) |
83 | print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")");
|
84 | } |
85 | doneBuilding(); |
86 | } |
87 | |
88 | void doneBuilding {
|
89 | tailMap = null; |
90 | } |
91 | |
92 | void addSuffix(Substring s) {
|
93 | Node node = root; |
94 | |
95 | while (!empty(s)) {
|
96 | int _n = lCommonPrefix_CharSequence(node.text(SuffixTree.this), s); |
97 | s = s.substring(_n); |
98 | if (_n >= node.lText()) { // node text exhausted
|
99 | if (empty(s)) { // pattern also exhausted - done
|
100 | //print("Exhausted: " + node.from + "/" + node.to);
|
101 | // add a new termination node |
102 | Node nNew = newNode(s); |
103 | node.addChild(SuffixTree.this, nNew); |
104 | ret; |
105 | } else { // node text exhausted, pattern not exhausted
|
106 | Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s)); |
107 | if (n == null) {
|
108 | n = newNode(s); |
109 | node.addChild(SuffixTree.this, n); |
110 | ret; |
111 | } else |
112 | node = n; |
113 | } |
114 | } else { // node text not exhausted, pattern not exhausted
|
115 | // split node. first, move all the node's vitals to a new node nOld |
116 | Node nOld = newNode(node.from+_n, node.to); |
117 | nOld.children = node.children; |
118 | node.children = null; |
119 | node.to = node.from+_n; |
120 | node.addChild(SuffixTree.this, nOld); |
121 | |
122 | // now add a new node |
123 | Node nNew = newNode(s); |
124 | node.addChild(SuffixTree.this, nNew); |
125 | ret; |
126 | } |
127 | } |
128 | } |
129 | |
130 | public L<Int> indicesOf(S pattern) {
|
131 | ret asList(indicesOf_iterator(pattern)); |
132 | } |
133 | |
134 | public ItIt<Int> indicesOf_iterator(S pattern) {
|
135 | ret mapI_notNull(allNodesUnder(scanDown(root, pattern)), |
136 | nad -> nad.node.isTerminal(this) ? nad.position() : null); |
137 | } |
138 | |
139 | srecord NodeAndDepth(Node node, int depth) {
|
140 | int position() {
|
141 | int position = node.from-depth; |
142 | //print("from=" + node.from + ", to=" + node.to + ", depth=" + depth + ", position=" + position);
|
143 | ret position; |
144 | } |
145 | |
146 | Cl<NodeAndDepth> children() {
|
147 | ret lmap wrapChild(node.children()); |
148 | } |
149 | |
150 | NodeAndDepth wrapChild(Node n) {
|
151 | ret n == null ? null : new NodeAndDepth(n, depth+node.lText()); |
152 | } |
153 | |
154 | NodeAndDepth getChild(SuffixTree tree, int c) {
|
155 | ret wrapChild(node.getChild(tree, c)); |
156 | } |
157 | } |
158 | |
159 | NodeAndDepth scanDown(Node node, S pattern) {
|
160 | int lPattern = l(pattern), iPattern = 0; |
161 | NodeAndDepth nad = new(node, 0); |
162 | while true {
|
163 | int n = lCommonPrefix_CharSequence(nad.node.text(this), Substring(pattern, iPattern)); |
164 | iPattern += n; |
165 | if (iPattern >= lPattern) break; // pattern exhausted - done |
166 | if (n < nad.node.lText()) null; // mismatch, exit |
167 | NodeAndDepth child = nad.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern)); |
168 | if (child != null) continue with nad = child; |
169 | null; |
170 | } |
171 | |
172 | ret nad; |
173 | } |
174 | |
175 | void printMe() {
|
176 | printNode("", "", new NodeAndDepth(root, 0));
|
177 | } |
178 | |
179 | void printNode(S indent, S pre, NodeAndDepth nad) {
|
180 | print(indent + pre + quote(shorten(20, nad.node.text(this))) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]")); |
181 | fOr (NodeAndDepth n : nad.children()) {
|
182 | printNode(indent + " ", "[" + (n.node.lText() == 0 ? "end" : quote(n.node.text(this).charAt(0))) + "] ", n); |
183 | } |
184 | } |
185 | |
186 | ItIt<NodeAndDepth> allNodes() {
|
187 | ret allNodesUnder(new NodeAndDepth(root, 0)); |
188 | } |
189 | |
190 | // includes the node itself |
191 | ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) {
|
192 | new L<Iterator<NodeAndDepth>> stack; |
193 | if (nad != null) |
194 | stack.add(iteratorLL(nad)); |
195 | ret iteratorFromFunction_if0(() -> {
|
196 | while (nempty(stack)) {
|
197 | if (!last(stack).hasNext()) |
198 | popLast(stack); |
199 | else {
|
200 | NodeAndDepth n = last(stack).next(); |
201 | stack.add((Iterator) iterator(n.children())); |
202 | ret n; |
203 | } |
204 | } |
205 | null; |
206 | }); |
207 | } |
208 | |
209 | // also query & update tailMap |
210 | Node newNode(Substring s) {
|
211 | Node nNew = useTailOptimization && s.endIndex() == l(fullText) ? mapGet(tailMap, s.startIndex()) : null; |
212 | if (nNew == null) {
|
213 | nNew = new Node(s); |
214 | ++nodeCount; |
215 | if (useTailOptimization) |
216 | mapPut(tailMap, s.startIndex(), nNew); |
217 | } else {
|
218 | print("Using existing node for index " + s.startIndex() + ":");
|
219 | printNode(" ", "", new NodeAndDepth(nNew, 0));
|
220 | } |
221 | ret nNew; |
222 | } |
223 | |
224 | Node newNode(int from, int to) {
|
225 | ret newNode(Substring(fullText, from, to)); |
226 | } |
227 | } |
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
| Snippet ID: | #1029202 |
| Snippet name: | SuffixTree [graph optimization doesn't work yet, dev.] |
| Eternal ID of this version: | #1029202/95 |
| Text MD5: | c3467518c4ca65c5fc109d16d114fef0 |
| Transpilation MD5: | c43f4a33bde6daa13d936bf050ea280b |
| Author: | stefan |
| Category: | javax / text searching |
| Type: | JavaX fragment (include) |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2020-07-26 19:47:59 |
| Source code size: | 7072 bytes / 227 lines |
| Pitched / IR pitched: | No / No |
| Views / Downloads: | 586 / 1000 |
| Version history: | 94 change(s) |
| Referenced in: | [show references] |