1 | // A naive suffix tree implementation |
2 | sclass SuffixTree { |
3 | Node root; |
4 | S fullText; |
5 | int nodeCount; |
6 | |
7 | Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1()-b.firstCharOrMinus1(); |
8 | new DummyNode dummyNode; |
9 | |
10 | sinterface IFirstChar { |
11 | int firstCharOrMinus1(); |
12 | } |
13 | |
14 | sclass DummyNode implements IFirstChar { |
15 | int firstChar; |
16 | |
17 | public int firstCharOrMinus1() { ret firstChar; } |
18 | } |
19 | |
20 | sclass Node implements IFirstChar { |
21 | Substring text; |
22 | CompactTreeSet<IFirstChar> children; // actually, <Node> |
23 | IntBuffer positions; |
24 | |
25 | *() {} |
26 | *(Substring *text) {} |
27 | |
28 | void addPosition(int i) { |
29 | if (positions == null) positions = new IntBuffer; |
30 | positions.add(i); |
31 | } |
32 | |
33 | void addChild(SuffixTree tree, Node n) { |
34 | if (children == null) children = makeEmptyChildrenSet(tree); |
35 | children.add(n); |
36 | } |
37 | |
38 | CompactTreeSet<IFirstChar> makeEmptyChildrenSet(SuffixTree tree) { ret new CompactTreeSet<>(tree.childComparator); } |
39 | |
40 | public int firstCharOrMinus1() { |
41 | ret charToIntOrMinus1(text); |
42 | } |
43 | |
44 | Node getChild(SuffixTree tree, int c) { |
45 | if (children == null) null; |
46 | tree.dummyNode.firstChar = c; |
47 | ret (Node) children.find(tree.dummyNode); |
48 | } |
49 | } |
50 | |
51 | *() {} |
52 | *(S *fullText) { |
53 | root = new Node(Substring(fullText, 0, 0)); |
54 | ++nodeCount; |
55 | for i over fullText: { |
56 | addSuffix(Substring(fullText, i), i); |
57 | if (((i+1) % 1000000) == 0) |
58 | print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")"); |
59 | } |
60 | } |
61 | |
62 | void addSuffix(Substring s, int position) { |
63 | Node node = root; |
64 | |
65 | while (!empty(s)) { |
66 | int _n = lCommonPrefix_CharSequence(node.text, s); |
67 | s = s.substring(_n); |
68 | if (_n >= l(node.text)) { // node text exhausted |
69 | if (empty(s)) { // pattern also exhausted - done |
70 | node.addPosition(position); |
71 | ret; |
72 | } else { |
73 | Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s)); |
74 | if (n == null) { |
75 | n = new Node(s); |
76 | ++nodeCount; |
77 | n.addPosition(position); |
78 | node.addChild(SuffixTree.this, n); |
79 | ret; |
80 | } else |
81 | node = n; |
82 | } |
83 | } else { // node text not exhausted |
84 | // split node. first, move all the node's vitals to a new node nOld |
85 | Node nOld = new Node(node.text.substring(_n)); |
86 | ++nodeCount; |
87 | nOld.positions = node.positions; |
88 | node.positions = null; |
89 | nOld.children = node.children; |
90 | node.children = null; |
91 | node.text = node.text.substring(0, _n); |
92 | node.addChild(SuffixTree.this, nOld); |
93 | |
94 | // now add a new node |
95 | Node nNew = new Node(s); |
96 | ++nodeCount; |
97 | nNew.addPosition(position); |
98 | node.addChild(SuffixTree.this, nNew); |
99 | ret; |
100 | } |
101 | } |
102 | } |
103 | |
104 | public L<Int> indicesOf(S pattern) { |
105 | new IntBuffer out; |
106 | getIndicesOf(out, root, pattern, 0); |
107 | ret out.asVirtualList(); |
108 | } |
109 | |
110 | void getIndicesOf(IntBuffer out, Node node, S pattern, int iPattern) { |
111 | int lPattern = l(pattern); |
112 | while true { |
113 | int n = lCommonPrefix_CharSequence(node.text, Substring(pattern, iPattern)); |
114 | iPattern += n; |
115 | if (iPattern >= lPattern) break; // pattern exhausted - done |
116 | if (n < l(node.text)) ret; // mismatch, exit |
117 | Node child = node.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern)); |
118 | if (child != null) continue with node = child; |
119 | ret; |
120 | } |
121 | |
122 | collectPositions(out, node); |
123 | } |
124 | |
125 | L<Int> getPositions(Node node) { |
126 | new IntBuffer out; |
127 | collectPositions(out, node); |
128 | ret out.asVirtualList(); |
129 | } |
130 | |
131 | void collectPositions(IntBuffer out, Node node) { |
132 | if (node == null) ret; |
133 | out.addAll(node.positions); |
134 | fOr (IFirstChar n : node.children) |
135 | collectPositions(out, (Node) n); |
136 | } |
137 | |
138 | void printMe() { |
139 | printNode("", "", root); |
140 | } |
141 | |
142 | void printNode(S indent, S pre, Node node) { |
143 | print(indent + pre + quote(shorten(20, node.text)) + (empty(node.positions) ? "" : " " + node.positions)); |
144 | fOr (IFirstChar _n : node.children) { |
145 | Node n = cast _n; |
146 | printNode(indent + " ", "[" + (empty(n.text) ? "end" : quote(n.text.charAt(0))) + "] ", n); |
147 | } |
148 | } |
149 | |
150 | ItIt<Node> allNodes() { |
151 | new L<Iterator<Node>> stack; |
152 | stack.add(iteratorLL(root)); |
153 | ret iteratorFromFunction_if0(() -> { |
154 | while (nempty(stack)) { |
155 | if (!last(stack).hasNext()) |
156 | popLast(stack); |
157 | else { |
158 | Node n = last(stack).next(); |
159 | stack.add((Iterator) iterator(n.children)); |
160 | ret n; |
161 | } |
162 | } |
163 | null; |
164 | }); |
165 | } |
166 | } |
Began life as a copy of #1029202
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029213 |
Snippet name: | SuffixTree [partly optimized, not sure if correct] |
Eternal ID of this version: | #1029213/1 |
Text MD5: | 14996bbefb96fbd55e8a2f99d75f03ed |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-25 14:25:38 |
Source code size: | 4791 bytes / 166 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 184 / 199 |
Referenced in: | [show references] |