Uses 911K of libraries. Click here for Pure Java version (4306L/23K).
1 | !7 |
2 | |
3 | // adapted from https://github.com/eugenp/tutorials/tree/master/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree |
4 | |
5 | extend Substring { |
6 | bool endsAtEnd() { ret i+l == s.length(); } |
7 | } |
8 | |
9 | cmodule SuffixTreeSpike > DynPrintLog { |
10 | switchable S text = "hello world what is up in the world"; |
11 | switchable S pattern = "world"; |
12 | transient SuffixTree tree; |
13 | |
14 | sclass Node { |
15 | Substring text; |
16 | new L<Node> children; |
17 | int position; // Note: position is either -1 or same or smaller than text.startIndex() |
18 | static int nodeCounter; |
19 | int nodeNr = ++nodeCounter; // for printing |
20 | |
21 | *(Substring *text, int *position) {} |
22 | |
23 | Substring getText() { ret text; } |
24 | void setText(Substring text) { |
25 | print("Setting text: " + quote(this.text) + " => " + quote(text); |
26 | this.text = text; |
27 | } |
28 | |
29 | int getPosition() { ret position; } |
30 | void setPosition(int position) { this.position = position; } |
31 | L<Node> getChildren() { ret children; } |
32 | void setChildren(L<Node> children) { this.children = children; } |
33 | |
34 | S printTree(S depthIndicator) { |
35 | S str = ""; |
36 | S positionStr = position > -1 ? " [" + position + "]" : ""; |
37 | str += depthIndicator + "Node " + nodeNr + " " + quote(text) + positionStr + "\n"; |
38 | |
39 | for (int i = 0; i < children.size(); i++) |
40 | str += children.get(i).printTree(depthIndicator + " "); |
41 | return str; |
42 | } |
43 | |
44 | toString { ret printTree(""); } |
45 | } |
46 | |
47 | sclass SuffixTree { |
48 | private static final String WORD_TERMINATION = "$"; |
49 | private static final int POSITION_UNDEFINED = -1; |
50 | Node root; |
51 | S fullText; |
52 | S textWithTerminator; // all Substrings are constructed off this |
53 | |
54 | *(S text) { |
55 | textWithTerminator = text + WORD_TERMINATION; |
56 | root = new Node(Substring(textWithTerminator, 0, 0), POSITION_UNDEFINED); |
57 | for i over text: |
58 | addSuffix(Substring(textWithTerminator, i), i); |
59 | fullText = text; |
60 | } |
61 | |
62 | public LS searchText(S pattern) { |
63 | //LOGGER.info("Searching for pattern \"{}\"", pattern); |
64 | new LS result; |
65 | L<Node> nodes = getAllNodesInTraversePath(pattern, root, false); |
66 | |
67 | if (nodes.size() > 0) { |
68 | Node lastNode = nodes.get(nodes.size() - 1); |
69 | if (lastNode != null) { |
70 | L<Int> positions = getPositions(lastNode); |
71 | sortInPlace(positions); |
72 | positions.forEach(m -> result.add((markPatternInText(m, pattern)))); |
73 | } |
74 | } |
75 | return result; |
76 | } |
77 | |
78 | private void addSuffix(Substring suffix, int position) { |
79 | print("Adding new suffix: " + quote(suffix)); |
80 | List<Node> nodes = getAllNodesInTraversePath(suffix, root, true); |
81 | if (nodes.size() == 0) { |
82 | addChildNode(root, suffix, position); |
83 | //LOGGER.info("{}", printTree()); |
84 | } else { |
85 | Node lastNode = nodes.remove(nodes.size() - 1); |
86 | Substring newText = suffix; |
87 | if (nodes.size() > 0) { |
88 | Substring existingSuffixUptoLastNode = joinSubstringObjects(map(nodes, a -> a.getText())); |
89 | |
90 | // Remove prefix from newText already included in parent |
91 | newText = newText.substring(existingSuffixUptoLastNode.length()); |
92 | } |
93 | extendNode(lastNode, newText, position); |
94 | //LOGGER.info("{}", printTree()); |
95 | } |
96 | } |
97 | |
98 | private List<Integer> getPositions(Node node) { |
99 | List<Integer> positions = new ArrayList<>(); |
100 | if (node.getText() |
101 | /*.endsWith(WORD_TERMINATION)*/.endsAtEnd()) { |
102 | positions.add(node.getPosition()); |
103 | } |
104 | for (int i = 0; i < node.getChildren() |
105 | .size(); i++) { |
106 | positions.addAll(getPositions(node.getChildren() |
107 | .get(i))); |
108 | } |
109 | return positions; |
110 | } |
111 | |
112 | private String markPatternInText(Integer startPosition, String pattern) { |
113 | String matchingTextLHS = fullText.substring(0, startPosition); |
114 | String matchingText = fullText.substring(startPosition, startPosition + pattern.length()); |
115 | String matchingTextRHS = fullText.substring(startPosition + pattern.length()); |
116 | return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS; |
117 | } |
118 | |
119 | private void addChildNode(Node parentNode, Substring text, int position) { |
120 | parentNode.getChildren() |
121 | .add(new Node(text, position)); |
122 | } |
123 | |
124 | private void extendNode(Node node, Substring newText, int position) { |
125 | Substring currentText = node.getText(); |
126 | Substring commonPrefix = getLongestCommonPrefix(currentText, newText); |
127 | |
128 | if (neq(commonPrefix, currentText)) { |
129 | Substring parentText = currentText.substring(0, commonPrefix.length()); |
130 | Substring childText = currentText.substring(commonPrefix.length()); |
131 | splitNodeToParentAndChild(node, parentText, childText); |
132 | } |
133 | |
134 | Substring remainingText = newText.substring(commonPrefix.length()); |
135 | addChildNode(node, remainingText, position); |
136 | } |
137 | |
138 | private void splitNodeToParentAndChild(Node parentNode, Substring parentNewText, Substring childNewText) { |
139 | Node childNode = new Node(childNewText, parentNode.getPosition()); |
140 | |
141 | if (parentNode.getChildren() |
142 | .size() > 0) { |
143 | while (parentNode.getChildren() |
144 | .size() > 0) { |
145 | childNode.getChildren() |
146 | .add(parentNode.getChildren() |
147 | .remove(0)); |
148 | } |
149 | } |
150 | |
151 | parentNode.getChildren().add(childNode); |
152 | parentNode.setText(parentNewText); |
153 | parentNode.setPosition(POSITION_UNDEFINED); |
154 | } |
155 | |
156 | private Substring getLongestCommonPrefix(Substring str1, Substring str2) { |
157 | int compareLength = Math.min(str1.length(), str2.length()); |
158 | for (int i = 0; i < compareLength; i++) |
159 | if (str1.charAt(i) != str2.charAt(i)) |
160 | ret str1.substring(0, i); |
161 | ret str1.substring(0, compareLength); |
162 | } |
163 | |
164 | private List<Node> getAllNodesInTraversePath(CharSequence pattern, Node startNode, boolean isAllowPartialMatch) { |
165 | List<Node> nodes = new ArrayList<>(); |
166 | for (int i = 0; i < startNode.getChildren().size(); i++) { |
167 | Node currentNode = startNode.getChildren().get(i); |
168 | CharSequence nodeText = currentNode.getText(); |
169 | if (empty(nodeText)) fail("Node text empty: " + currentNode); |
170 | if (pattern.charAt(0) == nodeText.charAt(0)) { |
171 | if (isAllowPartialMatch && pattern.length() <= nodeText.length()) { |
172 | nodes.add(currentNode); |
173 | return nodes; |
174 | } |
175 | |
176 | int compareLength = Math.min(nodeText.length(), pattern.length()); |
177 | for (int j = 1; j < compareLength; j++) { |
178 | if (pattern.charAt(j) != nodeText.charAt(j)) { |
179 | if (isAllowPartialMatch) { |
180 | nodes.add(currentNode); |
181 | } |
182 | return nodes; |
183 | } |
184 | } |
185 | |
186 | nodes.add(currentNode); |
187 | if (pattern.length() > compareLength) { |
188 | List<Node> nodes2 = getAllNodesInTraversePath(subCharSequence(pattern, compareLength), currentNode, isAllowPartialMatch); |
189 | if (nodes2.size() > 0) { |
190 | nodes.addAll(nodes2); |
191 | } else if (!isAllowPartialMatch) { |
192 | nodes.add(null); |
193 | } |
194 | } |
195 | return nodes; |
196 | } |
197 | } |
198 | return nodes; |
199 | } |
200 | |
201 | S printTree() { ret root.printTree(""); } |
202 | } |
203 | |
204 | Cl<Node> allChildren(Node root) { |
205 | ret transitiveHullOfFunction_inOrder(a -> a.getChildren(), root); |
206 | } |
207 | |
208 | start-thread { |
209 | Node.nodeCounter = 0; |
210 | tree = new SuffixTree(text); |
211 | print(tree.printTree()); |
212 | //print(structWordWrap(tree)); |
213 | for (Node node : allChildren(tree.root)) |
214 | print("pos=" + node.position, ", str: " + struct(node.text)); |
215 | print("Estimated tree size: " + nBytes(guessObjectSize(tree))); |
216 | print(+text); |
217 | print(+pattern); |
218 | pnl(tree.searchText(pattern)); |
219 | } |
220 | } |
Began life as a copy of #1028130
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028134 |
Snippet name: | Suffix Tree Spike v3 [simplifying further, dev.] |
Eternal ID of this version: | #1028134/8 |
Text MD5: | 595fb74721b37adb34ff705482bd8aab |
Transpilation MD5: | d72ee5c1abc33b92a374c147d1e63222 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX source code (Dynamic Module) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-05-23 12:52:41 |
Source code size: | 8372 bytes / 220 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 278 / 460 |
Version history: | 7 change(s) |
Referenced in: | [show references] |