Libraryless. Click here for Pure Java version (3173L/20K).
1 | sclass SuffixTree { |
2 | Node root; |
3 | S fullText; |
4 | S textWithTerminator; // all Substrings are constructed off this |
5 | |
6 | private static final String WORD_TERMINATION = "$"; |
7 | private static final int POSITION_UNDEFINED = -1; |
8 | |
9 | sclass Node { |
10 | Substring text; |
11 | new L<Node> children; |
12 | int position; |
13 | |
14 | *(Substring *text, int *position) {} |
15 | |
16 | Substring getText() { ret text; } |
17 | void setText(Substring text) { |
18 | //print("Setting text: " + quote(this.text) + " => " + quote(text); |
19 | this.text = text; |
20 | } |
21 | |
22 | int getPosition() { ret position; } |
23 | void setPosition(int position) { this.position = position; } |
24 | L<Node> getChildren() { ret children; } |
25 | void setChildren(L<Node> children) { this.children = children; } |
26 | |
27 | S printTree(String depthIndicator) { |
28 | S str = ""; |
29 | S positionStr = position > -1 ? "[" + position + "]" : ""; |
30 | str += depthIndicator + text + positionStr + "\n"; |
31 | |
32 | for (int i = 0; i < children.size(); i++) |
33 | str += children.get(i).printTree(depthIndicator + "\t"); |
34 | return str; |
35 | } |
36 | |
37 | toString { ret printTree(""); } |
38 | } |
39 | |
40 | *(S text) { |
41 | textWithTerminator = text + WORD_TERMINATION; |
42 | root = new Node(Substring(textWithTerminator, 0, 0), POSITION_UNDEFINED); |
43 | for i over text: |
44 | addSuffix(Substring(textWithTerminator, i), i); |
45 | fullText = text; |
46 | } |
47 | |
48 | public LS markText(S pattern) { |
49 | //LOGGER.info("Searching for pattern \"{}\"", pattern); |
50 | new LS result; |
51 | L<Node> nodes = getAllNodesInTraversePath(pattern, root, false); |
52 | |
53 | if (nodes.size() > 0) { |
54 | Node lastNode = nodes.get(nodes.size() - 1); |
55 | if (lastNode != null) { |
56 | L<Int> positions = getPositions(lastNode); |
57 | sortInPlace(positions); |
58 | positions.forEach(m -> result.add((markPatternInText(m, pattern)))); |
59 | } |
60 | } |
61 | return result; |
62 | } |
63 | |
64 | private void addSuffix(Substring suffix, int position) { |
65 | //print("Adding new suffix: " + quote(suffix)); |
66 | List<Node> nodes = getAllNodesInTraversePath(suffix, root, true); |
67 | if (nodes.size() == 0) { |
68 | addChildNode(root, suffix, position); |
69 | //LOGGER.info("{}", printTree()); |
70 | } else { |
71 | Node lastNode = nodes.remove(nodes.size() - 1); |
72 | Substring newText = suffix; |
73 | if (nodes.size() > 0) { |
74 | Substring existingSuffixUptoLastNode = joinSubstringObjects(map(nodes, a -> a.getText())); |
75 | |
76 | // Remove prefix from newText already included in parent |
77 | newText = newText.substring(existingSuffixUptoLastNode.length()); |
78 | } |
79 | extendNode(lastNode, newText, position); |
80 | //LOGGER.info("{}", printTree()); |
81 | } |
82 | } |
83 | |
84 | private List<Integer> getPositions(Node node) { |
85 | List<Integer> positions = new ArrayList<>(); |
86 | if (node.getText() |
87 | /*.endsWith(WORD_TERMINATION)*/.endsAtEnd()) { |
88 | positions.add(node.getPosition()); |
89 | } |
90 | for (int i = 0; i < node.getChildren() |
91 | .size(); i++) { |
92 | positions.addAll(getPositions(node.getChildren() |
93 | .get(i))); |
94 | } |
95 | return positions; |
96 | } |
97 | |
98 | private String markPatternInText(Integer startPosition, String pattern) { |
99 | String matchingTextLHS = fullText.substring(0, startPosition); |
100 | String matchingText = fullText.substring(startPosition, startPosition + pattern.length()); |
101 | String matchingTextRHS = fullText.substring(startPosition + pattern.length()); |
102 | return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS; |
103 | } |
104 | |
105 | private void addChildNode(Node parentNode, Substring text, int position) { |
106 | parentNode.getChildren() |
107 | .add(new Node(text, position)); |
108 | } |
109 | |
110 | private void extendNode(Node node, Substring newText, int position) { |
111 | Substring currentText = node.getText(); |
112 | Substring commonPrefix = getLongestCommonPrefix(currentText, newText); |
113 | |
114 | if (neq(commonPrefix, currentText)) { |
115 | Substring parentText = currentText.substring(0, commonPrefix.length()); |
116 | Substring childText = currentText.substring(commonPrefix.length()); |
117 | splitNodeToParentAndChild(node, parentText, childText); |
118 | } |
119 | |
120 | Substring remainingText = newText.substring(commonPrefix.length()); |
121 | addChildNode(node, remainingText, position); |
122 | } |
123 | |
124 | private void splitNodeToParentAndChild(Node parentNode, Substring parentNewText, Substring childNewText) { |
125 | Node childNode = new Node(childNewText, parentNode.getPosition()); |
126 | |
127 | while (parentNode.getChildren().size() > 0) |
128 | childNode.getChildren().add(parentNode.getChildren().remove(0)); |
129 | |
130 | parentNode.getChildren().add(childNode); |
131 | parentNode.setText(parentNewText); |
132 | parentNode.setPosition(POSITION_UNDEFINED); |
133 | } |
134 | |
135 | private Substring getLongestCommonPrefix(Substring str1, Substring str2) { |
136 | int compareLength = Math.min(str1.length(), str2.length()); |
137 | for (int i = 0; i < compareLength; i++) |
138 | if (str1.charAt(i) != str2.charAt(i)) |
139 | ret str1.substring(0, i); |
140 | ret str1.substring(0, compareLength); |
141 | } |
142 | |
143 | private L<Node> getAllNodesInTraversePath(CharSequence pattern, Node startNode, boolean isAllowPartialMatch) { |
144 | new L<Node> nodes; |
145 | for (int i = 0; i < startNode.getChildren().size(); i++) { |
146 | Node currentNode = startNode.getChildren().get(i); |
147 | CharSequence nodeText = currentNode.getText(); |
148 | if (empty(nodeText)) fail("Node text empty: " + currentNode); |
149 | if (pattern.charAt(0) == nodeText.charAt(0)) { |
150 | if (isAllowPartialMatch && pattern.length() <= nodeText.length()) { |
151 | nodes.add(currentNode); |
152 | return nodes; |
153 | } |
154 | |
155 | int compareLength = Math.min(nodeText.length(), pattern.length()); |
156 | for (int j = 1; j < compareLength; j++) { |
157 | if (pattern.charAt(j) != nodeText.charAt(j)) { |
158 | if (isAllowPartialMatch) { |
159 | nodes.add(currentNode); |
160 | } |
161 | return nodes; |
162 | } |
163 | } |
164 | |
165 | nodes.add(currentNode); |
166 | if (pattern.length() > compareLength) { |
167 | List<Node> nodes2 = getAllNodesInTraversePath(subCharSequence(pattern, compareLength), currentNode, isAllowPartialMatch); |
168 | if (nodes2.size() > 0) { |
169 | nodes.addAll(nodes2); |
170 | } else if (!isAllowPartialMatch) { |
171 | nodes.add(null); |
172 | } |
173 | } |
174 | return nodes; |
175 | } |
176 | } |
177 | return nodes; |
178 | } |
179 | |
180 | S printTree() { ret root.printTree(""); } |
181 | } |
Began life as a copy of #1029202
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029205 |
Snippet name: | SuffixTree [old version, probably broken] |
Eternal ID of this version: | #1029205/1 |
Text MD5: | 97646c453c7be91d03dd7ca415771b05 |
Transpilation MD5: | 221fe941078518b72620d6ece32b9954 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-25 12:02:56 |
Source code size: | 6827 bytes / 181 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 213 / 282 |
Referenced in: | [show references] |