Libraryless. Click here for Pure Java version (14953L/102K).
1 | // A naive suffix tree implementation |
2 | sclass SuffixTree { |
3 | Node root; |
4 | S fullText; |
5 | int nodeCount; |
6 | |
7 | Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this); |
8 | new DummyNode dummyNode; |
9 | |
10 | sinterface IFirstChar { |
11 | int firstCharOrMinus1(SuffixTree tree); |
12 | } |
13 | |
14 | sclass DummyNode implements IFirstChar { |
15 | int firstChar; |
16 | |
17 | public int firstCharOrMinus1(SuffixTree tree) { ret firstChar; } |
18 | } |
19 | |
20 | sclass Node implements IFirstChar { |
21 | int from, to; // our range in fullText |
22 | O children; // null, a Node or Node[] sorted by first character |
23 | int position = -1; |
24 | |
25 | *() {} |
26 | *(int *from, int *to) {} |
27 | *(Substring s) { from = s.startIndex(); to = s.endIndex(); } |
28 | |
29 | Substring text(SuffixTree tree) { ret Substring(tree.fullText, from, to); } |
30 | |
31 | int lText() { ret to-from; } |
32 | |
33 | void addPosition(int i) { |
34 | if (position >= 0) fail("Already have position"); |
35 | position = i; |
36 | } |
37 | |
38 | void addChild(SuffixTree tree, Node n) { |
39 | if (children == null) children = n; |
40 | else if (children cast Node) |
41 | children = sortArrayInPlace(new Node[] {children, n}, tree.childComparator); |
42 | else { |
43 | Node[] c = cast children; |
44 | Node[] x = new[l(c)+1]; |
45 | arraycopy(c, 0, x, 0, l(c)); |
46 | x[l(x)-1] = n; |
47 | children = sortArrayInPlace(x, tree.childComparator); |
48 | } |
49 | } |
50 | |
51 | Cl<Node> children() { |
52 | if (children == null) ret emptyList(); |
53 | if (children cast Node) ret ll(children); |
54 | ret wrapArrayAsList((Node[]) children); |
55 | } |
56 | |
57 | CompactTreeSet<IFirstChar> makeEmptyChildrenSet(SuffixTree tree) { ret new CompactTreeSet<>(tree.childComparator); } |
58 | |
59 | public int firstCharOrMinus1(SuffixTree tree) { |
60 | ret charToIntOrMinus1(text(tree)); |
61 | } |
62 | |
63 | Node getChild(SuffixTree tree, int c) { |
64 | if (children == null) null; |
65 | if (children cast Node) |
66 | ret children.firstCharOrMinus1(tree) == c ? children : null; |
67 | tree.dummyNode.firstChar = c; |
68 | Node[] x = cast children; |
69 | int i = Arrays.binarySearch(x, tree.dummyNode, tree.childComparator); |
70 | ret i >= 0 ? x[i] : null; |
71 | } |
72 | } |
73 | |
74 | *() {} |
75 | *(S *fullText) { |
76 | root = new Node(0, 0); |
77 | ++nodeCount; |
78 | for i over fullText: { |
79 | addSuffix(Substring(fullText, i), i); |
80 | if (((i+1) % 1000000) == 0) |
81 | print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")"); |
82 | } |
83 | } |
84 | |
85 | void addSuffix(Substring s, int position) { |
86 | Node node = root; |
87 | |
88 | while (!empty(s)) { |
89 | int _n = lCommonPrefix_CharSequence(node.text(SuffixTree.this), s); |
90 | s = s.substring(_n); |
91 | if (_n >= node.lText()) { // node text exhausted |
92 | if (empty(s)) { // pattern also exhausted - done |
93 | node.addPosition(position); |
94 | ret; |
95 | } else { |
96 | Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s)); |
97 | if (n == null) { |
98 | n = new Node(s); |
99 | ++nodeCount; |
100 | n.addPosition(position); |
101 | node.addChild(SuffixTree.this, n); |
102 | ret; |
103 | } else |
104 | node = n; |
105 | } |
106 | } else { // node text not exhausted |
107 | // split node. first, move all the node's vitals to a new node nOld |
108 | Node nOld = new Node(node.from+_n, node.to); |
109 | ++nodeCount; |
110 | nOld.position = node.position; |
111 | node.position = -1; |
112 | nOld.children = node.children; |
113 | node.children = null; |
114 | node.to = node.from+_n; |
115 | node.addChild(SuffixTree.this, nOld); |
116 | |
117 | // now add a new node |
118 | Node nNew = new Node(s); |
119 | ++nodeCount; |
120 | nNew.addPosition(position); |
121 | node.addChild(SuffixTree.this, nNew); |
122 | ret; |
123 | } |
124 | } |
125 | } |
126 | |
127 | public L<Int> indicesOf(S pattern) { |
128 | ret asList(indicesOf_iterator(pattern)); |
129 | } |
130 | |
131 | public ItIt<Int> indicesOf_iterator(S pattern) { |
132 | ret mapI_notNull(allNodesUnder(scanDown(root, pattern)), |
133 | n -> n.position >= 0 ? n.position : null); |
134 | } |
135 | |
136 | Node scanDown(Node node, S pattern) { |
137 | int lPattern = l(pattern), iPattern = 0; |
138 | while true { |
139 | int n = lCommonPrefix_CharSequence(node.text(this), Substring(pattern, iPattern)); |
140 | iPattern += n; |
141 | if (iPattern >= lPattern) break; // pattern exhausted - done |
142 | if (n < node.lText()) null; // mismatch, exit |
143 | Node child = node.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern)); |
144 | if (child != null) continue with node = child; |
145 | null; |
146 | } |
147 | |
148 | ret node; |
149 | } |
150 | |
151 | L<Int> getPositions(Node node) { |
152 | new IntBuffer out; |
153 | collectPositions(out, node); |
154 | ret out.asVirtualList(); |
155 | } |
156 | |
157 | void collectPositions(IntBuffer out, Node node) { |
158 | if (node == null) ret; |
159 | if (node.position >= 0) out.add(node.position); |
160 | fOr (IFirstChar n : node.children()) |
161 | collectPositions(out, (Node) n); |
162 | } |
163 | |
164 | void printMe() { |
165 | printNode("", "", root); |
166 | } |
167 | |
168 | void printNode(S indent, S pre, Node node) { |
169 | print(indent + pre + quote(shorten(20, node.text(this))) + (node.position < 0 ? "" : " [" + node.position + "]")); |
170 | fOr (IFirstChar _n : node.children()) { |
171 | Node n = cast _n; |
172 | printNode(indent + " ", "[" + (n.lText() == 0 ? "end" : quote(n.text(this).charAt(0))) + "] ", n); |
173 | } |
174 | } |
175 | |
176 | ItIt<Node> allNodes() { |
177 | ret allNodesUnder(root); |
178 | } |
179 | |
180 | // includes the node itself |
181 | ItIt<Node> allNodesUnder(Node node) { |
182 | new L<Iterator<Node>> stack; |
183 | if (node != null) |
184 | stack.add(iteratorLL(node)); |
185 | ret iteratorFromFunction_if0(() -> { |
186 | while (nempty(stack)) { |
187 | if (!last(stack).hasNext()) |
188 | popLast(stack); |
189 | else { |
190 | Node n = last(stack).next(); |
191 | stack.add((Iterator) iterator(n.children())); |
192 | ret n; |
193 | } |
194 | } |
195 | null; |
196 | }); |
197 | } |
198 | } |
Began life as a copy of #1029202
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029220 |
Snippet name: | SuffixTree [yet another backup] |
Eternal ID of this version: | #1029220/1 |
Text MD5: | 3e74f9b249616e509a9799f94c228150 |
Transpilation MD5: | 23457315c064e89c68204da3f09bc5b8 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-26 00:20:19 |
Source code size: | 5975 bytes / 198 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 187 / 259 |
Referenced in: | [show references] |