Uses 911K of libraries. Click here for Pure Java version (4028L/21K).
1 | !7 |
2 | |
3 | // adapted from https://github.com/eugenp/tutorials/tree/master/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree |
4 | |
5 | extend Substring { |
6 | bool endsAtEnd() { ret i+l == s.length(); } |
7 | } |
8 | |
9 | cmodule SuffixTreeSpike > DynPrintLog { |
10 | switchable S text = "hello world what is up in the world"; |
11 | switchable S pattern = "world"; |
12 | transient SuffixTree tree; |
13 | |
14 | sclass Node { |
15 | Substring text; |
16 | new L<Node> children; |
17 | int position; |
18 | |
19 | *(Substring *text, int *position) {} |
20 | |
21 | Substring getText() { ret text; } |
22 | void setText(Substring text) { |
23 | print("Setting text: " + quote(this.text) + " => " + quote(text); |
24 | this.text = text; |
25 | } |
26 | |
27 | int getPosition() { ret position; } |
28 | void setPosition(int position) { this.position = position; } |
29 | L<Node> getChildren() { ret children; } |
30 | void setChildren(L<Node> children) { this.children = children; } |
31 | |
32 | S printTree(String depthIndicator) { |
33 | S str = ""; |
34 | S positionStr = position > -1 ? "[" + position + "]" : ""; |
35 | str += depthIndicator + text + positionStr + "\n"; |
36 | |
37 | for (int i = 0; i < children.size(); i++) |
38 | str += children.get(i).printTree(depthIndicator + "\t"); |
39 | return str; |
40 | } |
41 | |
42 | toString { ret printTree(""); } |
43 | } |
44 | |
45 | sclass SuffixTree { |
46 | private static final String WORD_TERMINATION = "$"; |
47 | private static final int POSITION_UNDEFINED = -1; |
48 | Node root; |
49 | S fullText; |
50 | S textWithTerminator; // all Substrings are constructed off this |
51 | |
52 | *(S text) { |
53 | textWithTerminator = text + WORD_TERMINATION; |
54 | root = new Node(Substring(textWithTerminator, 0, 0), POSITION_UNDEFINED); |
55 | for i over text: |
56 | addSuffix(Substring(textWithTerminator, i), i); |
57 | fullText = text; |
58 | } |
59 | |
60 | public LS searchText(S pattern) { |
61 | //LOGGER.info("Searching for pattern \"{}\"", pattern); |
62 | new LS result; |
63 | L<Node> nodes = getAllNodesInTraversePath(pattern, root, false); |
64 | |
65 | if (nodes.size() > 0) { |
66 | Node lastNode = nodes.get(nodes.size() - 1); |
67 | if (lastNode != null) { |
68 | L<Int> positions = getPositions(lastNode); |
69 | sortInPlace(positions); |
70 | positions.forEach(m -> result.add((markPatternInText(m, pattern)))); |
71 | } |
72 | } |
73 | return result; |
74 | } |
75 | |
76 | private void addSuffix(Substring suffix, int position) { |
77 | print("Adding new suffix: " + quote(suffix)); |
78 | List<Node> nodes = getAllNodesInTraversePath(suffix, root, true); |
79 | if (nodes.size() == 0) { |
80 | addChildNode(root, suffix, position); |
81 | //LOGGER.info("{}", printTree()); |
82 | } else { |
83 | Node lastNode = nodes.remove(nodes.size() - 1); |
84 | Substring newText = suffix; |
85 | if (nodes.size() > 0) { |
86 | Substring existingSuffixUptoLastNode = joinSubstringObjects(map(nodes, a -> a.getText())); |
87 | |
88 | // Remove prefix from newText already included in parent |
89 | newText = newText.substring(existingSuffixUptoLastNode.length()); |
90 | } |
91 | extendNode(lastNode, newText, position); |
92 | //LOGGER.info("{}", printTree()); |
93 | } |
94 | } |
95 | |
96 | private List<Integer> getPositions(Node node) { |
97 | List<Integer> positions = new ArrayList<>(); |
98 | if (node.getText() |
99 | /*.endsWith(WORD_TERMINATION)*/.endsAtEnd()) { |
100 | positions.add(node.getPosition()); |
101 | } |
102 | for (int i = 0; i < node.getChildren() |
103 | .size(); i++) { |
104 | positions.addAll(getPositions(node.getChildren() |
105 | .get(i))); |
106 | } |
107 | return positions; |
108 | } |
109 | |
110 | private String markPatternInText(Integer startPosition, String pattern) { |
111 | String matchingTextLHS = fullText.substring(0, startPosition); |
112 | String matchingText = fullText.substring(startPosition, startPosition + pattern.length()); |
113 | String matchingTextRHS = fullText.substring(startPosition + pattern.length()); |
114 | return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS; |
115 | } |
116 | |
117 | private void addChildNode(Node parentNode, Substring text, int position) { |
118 | parentNode.getChildren() |
119 | .add(new Node(text, position)); |
120 | } |
121 | |
122 | private void extendNode(Node node, Substring newText, int position) { |
123 | Substring currentText = node.getText(); |
124 | Substring commonPrefix = getLongestCommonPrefix(currentText, newText); |
125 | |
126 | if (neq(commonPrefix, currentText)) { |
127 | Substring parentText = currentText.substring(0, commonPrefix.length()); |
128 | Substring childText = currentText.substring(commonPrefix.length()); |
129 | splitNodeToParentAndChild(node, parentText, childText); |
130 | } |
131 | |
132 | Substring remainingText = newText.substring(commonPrefix.length()); |
133 | addChildNode(node, remainingText, position); |
134 | } |
135 | |
136 | private void splitNodeToParentAndChild(Node parentNode, Substring parentNewText, Substring childNewText) { |
137 | Node childNode = new Node(childNewText, parentNode.getPosition()); |
138 | |
139 | if (parentNode.getChildren() |
140 | .size() > 0) { |
141 | while (parentNode.getChildren() |
142 | .size() > 0) { |
143 | childNode.getChildren() |
144 | .add(parentNode.getChildren() |
145 | .remove(0)); |
146 | } |
147 | } |
148 | |
149 | parentNode.getChildren().add(childNode); |
150 | parentNode.setText(parentNewText); |
151 | parentNode.setPosition(POSITION_UNDEFINED); |
152 | } |
153 | |
154 | private Substring getLongestCommonPrefix(Substring str1, Substring str2) { |
155 | int compareLength = Math.min(str1.length(), str2.length()); |
156 | for (int i = 0; i < compareLength; i++) |
157 | if (str1.charAt(i) != str2.charAt(i)) |
158 | ret str1.substring(0, i); |
159 | ret str1.substring(0, compareLength); |
160 | } |
161 | |
162 | private List<Node> getAllNodesInTraversePath(CharSequence pattern, Node startNode, boolean isAllowPartialMatch) { |
163 | List<Node> nodes = new ArrayList<>(); |
164 | for (int i = 0; i < startNode.getChildren().size(); i++) { |
165 | Node currentNode = startNode.getChildren().get(i); |
166 | CharSequence nodeText = currentNode.getText(); |
167 | if (empty(nodeText)) fail("Node text empty: " + currentNode); |
168 | if (pattern.charAt(0) == nodeText.charAt(0)) { |
169 | if (isAllowPartialMatch && pattern.length() <= nodeText.length()) { |
170 | nodes.add(currentNode); |
171 | return nodes; |
172 | } |
173 | |
174 | int compareLength = Math.min(nodeText.length(), pattern.length()); |
175 | for (int j = 1; j < compareLength; j++) { |
176 | if (pattern.charAt(j) != nodeText.charAt(j)) { |
177 | if (isAllowPartialMatch) { |
178 | nodes.add(currentNode); |
179 | } |
180 | return nodes; |
181 | } |
182 | } |
183 | |
184 | nodes.add(currentNode); |
185 | if (pattern.length() > compareLength) { |
186 | List<Node> nodes2 = getAllNodesInTraversePath(subCharSequence(pattern, compareLength), currentNode, isAllowPartialMatch); |
187 | if (nodes2.size() > 0) { |
188 | nodes.addAll(nodes2); |
189 | } else if (!isAllowPartialMatch) { |
190 | nodes.add(null); |
191 | } |
192 | } |
193 | return nodes; |
194 | } |
195 | } |
196 | return nodes; |
197 | } |
198 | |
199 | S printTree() { ret root.printTree(""); } |
200 | } |
201 | |
202 | start-thread { |
203 | tree = new SuffixTree(text); |
204 | print(tree.printTree()); |
205 | print("Estimated tree size: " + nBytes(guessObjectSize(tree))); |
206 | print(+text); |
207 | print(+pattern); |
208 | pnl(tree.searchText(pattern)); |
209 | } |
210 | } |
Began life as a copy of #1028128
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028130 |
Snippet name: | Suffix Tree Spike v2 [storing substrings efficiently, seems OK] |
Eternal ID of this version: | #1028130/18 |
Text MD5: | e425b97cf88c16bf50bdf77698633d61 |
Transpilation MD5: | 747e21e600e736ce64714cb27e224a25 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX source code (Dynamic Module) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-05-23 12:35:39 |
Source code size: | 7898 bytes / 210 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 327 / 2175 |
Version history: | 17 change(s) |
Referenced in: | [show references] |