Libraryless. Click here for Pure Java version (14874L/101K).
1 | // A naive suffix tree implementation |
2 | sclass IntSuffixTree { |
3 | Node root; |
4 | L<Int> fullText; |
5 | int nodeCount; |
6 | |
7 | Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this); |
8 | new DummyNode dummyNode; |
9 | int printEvery = 1000000; |
10 | |
11 | sinterface IFirstChar { |
12 | int firstCharOrMinus1(IntSuffixTree tree); |
13 | } |
14 | |
15 | sclass DummyNode implements IFirstChar { |
16 | int firstChar; |
17 | |
18 | public int firstCharOrMinus1(IntSuffixTree tree) { ret firstChar; } |
19 | } |
20 | |
21 | sclass Node implements IFirstChar { |
22 | int from, to; // our range in fullText |
23 | CompactTreeSet<IFirstChar> children; |
24 | |
25 | *() {} |
26 | *(int *from, int *to) {} |
27 | *(IntRange s) { from = s.start; to = s.end; } |
28 | |
29 | L<Int> text(IntSuffixTree tree) { ret subList(tree.fullText, from, to); } |
30 | |
31 | int lText() { ret to-from; } |
32 | |
33 | bool isTerminal(IntSuffixTree tree) { ret to == l(tree.fullText); } |
34 | |
35 | void addChild(IntSuffixTree tree, Node n) { |
36 | if (children == null) children = makeEmptyChildrenSet(tree); |
37 | children.add(n); |
38 | } |
39 | |
40 | CompactTreeSet<IFirstChar> makeEmptyChildrenSet(IntSuffixTree tree) { |
41 | ret new CompactTreeSet<>(tree.childComparator); |
42 | } |
43 | |
44 | Cl<IFirstChar> children() { |
45 | ret children; |
46 | } |
47 | |
48 | public int firstCharOrMinus1(IntSuffixTree tree) { |
49 | ret getOrMinus1(tree.fullText, from); |
50 | } |
51 | |
52 | Node getChild(IntSuffixTree tree, int c) { |
53 | if (children == null) null; |
54 | tree.dummyNode.firstChar = c; |
55 | ret (Node) children.find(tree.dummyNode); |
56 | } |
57 | } |
58 | |
59 | *() {} |
60 | *(L<Int> fullText) { |
61 | process(fullText); |
62 | } |
63 | |
64 | void process(L<Int> fullText) { |
65 | root = new Node(0, 0); |
66 | ++nodeCount; |
67 | for i over fullText: { |
68 | ping(); |
69 | addSuffix(IntRange(i, l(fullText))); |
70 | if (((i+1) % printEvery) == 0) |
71 | print((i+1) + "/" + l(fullText ) + " suffixes added (" + nNodes(nodeCount) + ")"); |
72 | } |
73 | } |
74 | |
75 | void addSuffix(IntRange s) { |
76 | Node node = root; |
77 | |
78 | while (!empty(s)) { |
79 | int _n = lCommonPrefix_lists_ext(fullText, node.from, fullText, s.start); |
80 | s.start += _n; |
81 | if (_n >= node.lText()) { // node text exhausted |
82 | if (empty(s)) { // pattern also exhausted - done |
83 | //print("Exhausted: " + node.from + "/" + node.to); |
84 | // add a new termination node |
85 | Node nNew = new Node(s); |
86 | ++nodeCount; |
87 | node.addChild(IntSuffixTree.this, nNew); |
88 | ret; |
89 | } else { |
90 | Node n = node.getChild(IntSuffixTree.this, getOrMinus1(fullText, s.start)); |
91 | if (n == null) { |
92 | n = new Node(s); |
93 | ++nodeCount; |
94 | node.addChild(IntSuffixTree.this, n); |
95 | ret; |
96 | } else |
97 | node = n; |
98 | } |
99 | } else { // node text not exhausted |
100 | // split node. first, move all the node's vitals to a new node nOld |
101 | Node nOld = new Node(node.from+_n, node.to); |
102 | ++nodeCount; |
103 | nOld.children = node.children; |
104 | node.children = null; |
105 | node.to = node.from+_n; |
106 | node.addChild(IntSuffixTree.this, nOld); |
107 | |
108 | // now add a new node |
109 | Node nNew = new Node(s); |
110 | ++nodeCount; |
111 | node.addChild(IntSuffixTree.this, nNew); |
112 | ret; |
113 | } |
114 | } |
115 | } |
116 | |
117 | public L<Int> indicesOf(L<Int> pattern) { |
118 | ret asList(indicesOf_iterator(pattern)); |
119 | } |
120 | |
121 | public ItIt<Int> indicesOf_iterator(L<Int> pattern) { |
122 | ret mapI_notNull(allNodesUnder(scanDown(root, pattern)), |
123 | nad -> nad.node.isTerminal(this) ? nad.position() : null); |
124 | } |
125 | |
126 | srecord NodeAndDepth(Node node, int depth) { |
127 | int position() { |
128 | int position = node.from-depth; |
129 | //print("from=" + node.from + ", to=" + node.to + ", depth=" + depth + ", position=" + position); |
130 | ret position; |
131 | } |
132 | |
133 | Cl<NodeAndDepth> children() { |
134 | ret lmap wrapChild((Cl<Node>) (Cl) node.children()); |
135 | } |
136 | |
137 | NodeAndDepth wrapChild(Node n) { |
138 | ret n == null ? null : new NodeAndDepth(n, depth+node.lText()); |
139 | } |
140 | |
141 | NodeAndDepth getChild(IntSuffixTree tree, int c) { |
142 | ret wrapChild(node.getChild(tree, c)); |
143 | } |
144 | } |
145 | |
146 | NodeAndDepth scanDown(Node node, L<Int> pattern) { |
147 | int lPattern = l(pattern), iPattern = 0; |
148 | NodeAndDepth nad = new(node, 0); |
149 | while true { |
150 | int n = lCommonPrefix_lists(nad.node.text(this), subList(pattern, iPattern)); |
151 | iPattern += n; |
152 | if (iPattern >= lPattern) break; // pattern exhausted - done |
153 | if (n < nad.node.lText()) null; // mismatch, exit |
154 | NodeAndDepth child = nad.getChild(IntSuffixTree.this, or(get(pattern, iPattern), -1)); |
155 | if (child != null) continue with nad = child; |
156 | null; |
157 | } |
158 | |
159 | ret nad; |
160 | } |
161 | |
162 | void printMe() { |
163 | printNode("", "", new NodeAndDepth(root, 0)); |
164 | } |
165 | |
166 | void printNode(S indent, S pre, NodeAndDepth nad) { |
167 | print(indent + pre + takeFirst(20, nad.node.text(this)) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]")); |
168 | fOr (NodeAndDepth n : nad.children()) { |
169 | printNode(indent + " ", takeFirst(1, n.node.text(this)) + " ", n); |
170 | } |
171 | } |
172 | |
173 | ItIt<NodeAndDepth> allNodes() { |
174 | ret allNodesUnder(new NodeAndDepth(root, 0)); |
175 | } |
176 | |
177 | // includes the node itself |
178 | ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) { |
179 | new L<Iterator<NodeAndDepth>> stack; |
180 | if (nad != null) |
181 | stack.add(iteratorLL(nad)); |
182 | ret iteratorFromFunction_if0(() -> { |
183 | while (nempty(stack)) { |
184 | if (!last(stack).hasNext()) |
185 | popLast(stack); |
186 | else { |
187 | NodeAndDepth n = last(stack).next(); |
188 | stack.add((Iterator) iterator(n.children())); |
189 | ret n; |
190 | } |
191 | } |
192 | null; |
193 | }); |
194 | } |
195 | } |
Began life as a copy of #1029234
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029269 |
Snippet name: | IntSuffixTree [OK, but slow creation algorithm] |
Eternal ID of this version: | #1029269/27 |
Text MD5: | 5fea51137dfa0e7387a3bf19bc548a98 |
Transpilation MD5: | cd8ca402060f10eb680c26e8d4928d81 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-29 20:41:19 |
Source code size: | 5841 bytes / 195 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 336 / 708 |
Version history: | 26 change(s) |
Referenced in: | [show references] |