Uses 911K of libraries. Click here for Pure Java version (4028L/21K).
!7 // adapted from https://github.com/eugenp/tutorials/tree/master/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree extend Substring { bool endsAtEnd() { ret i+l == s.length(); } } cmodule SuffixTreeSpike > DynPrintLog { switchable S text = "hello world what is up in the world"; switchable S pattern = "world"; transient SuffixTree tree; sclass Node { Substring text; new L<Node> children; int position; *(Substring *text, int *position) {} Substring getText() { ret text; } void setText(Substring text) { print("Setting text: " + quote(this.text) + " => " + quote(text); this.text = text; } int getPosition() { ret position; } void setPosition(int position) { this.position = position; } L<Node> getChildren() { ret children; } void setChildren(L<Node> children) { this.children = children; } S printTree(String depthIndicator) { S str = ""; S positionStr = position > -1 ? "[" + position + "]" : ""; str += depthIndicator + text + positionStr + "\n"; for (int i = 0; i < children.size(); i++) str += children.get(i).printTree(depthIndicator + "\t"); return str; } toString { ret printTree(""); } } sclass SuffixTree { private static final String WORD_TERMINATION = "$"; private static final int POSITION_UNDEFINED = -1; Node root; S fullText; S textWithTerminator; // all Substrings are constructed off this *(S text) { textWithTerminator = text + WORD_TERMINATION; root = new Node(Substring(textWithTerminator, 0, 0), POSITION_UNDEFINED); for i over text: addSuffix(Substring(textWithTerminator, i), i); fullText = text; } public LS searchText(S pattern) { //LOGGER.info("Searching for pattern \"{}\"", pattern); new LS result; L<Node> nodes = getAllNodesInTraversePath(pattern, root, false); if (nodes.size() > 0) { Node lastNode = nodes.get(nodes.size() - 1); if (lastNode != null) { L<Int> positions = getPositions(lastNode); sortInPlace(positions); positions.forEach(m -> result.add((markPatternInText(m, pattern)))); } } return result; } private void addSuffix(Substring suffix, int position) { print("Adding new suffix: " + quote(suffix)); List<Node> nodes = getAllNodesInTraversePath(suffix, root, true); if (nodes.size() == 0) { addChildNode(root, suffix, position); //LOGGER.info("{}", printTree()); } else { Node lastNode = nodes.remove(nodes.size() - 1); Substring newText = suffix; if (nodes.size() > 0) { Substring existingSuffixUptoLastNode = joinSubstringObjects(map(nodes, a -> a.getText())); // Remove prefix from newText already included in parent newText = newText.substring(existingSuffixUptoLastNode.length()); } extendNode(lastNode, newText, position); //LOGGER.info("{}", printTree()); } } private List<Integer> getPositions(Node node) { List<Integer> positions = new ArrayList<>(); if (node.getText() /*.endsWith(WORD_TERMINATION)*/.endsAtEnd()) { positions.add(node.getPosition()); } for (int i = 0; i < node.getChildren() .size(); i++) { positions.addAll(getPositions(node.getChildren() .get(i))); } return positions; } private String markPatternInText(Integer startPosition, String pattern) { String matchingTextLHS = fullText.substring(0, startPosition); String matchingText = fullText.substring(startPosition, startPosition + pattern.length()); String matchingTextRHS = fullText.substring(startPosition + pattern.length()); return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS; } private void addChildNode(Node parentNode, Substring text, int position) { parentNode.getChildren() .add(new Node(text, position)); } private void extendNode(Node node, Substring newText, int position) { Substring currentText = node.getText(); Substring commonPrefix = getLongestCommonPrefix(currentText, newText); if (neq(commonPrefix, currentText)) { Substring parentText = currentText.substring(0, commonPrefix.length()); Substring childText = currentText.substring(commonPrefix.length()); splitNodeToParentAndChild(node, parentText, childText); } Substring remainingText = newText.substring(commonPrefix.length()); addChildNode(node, remainingText, position); } private void splitNodeToParentAndChild(Node parentNode, Substring parentNewText, Substring childNewText) { Node childNode = new Node(childNewText, parentNode.getPosition()); if (parentNode.getChildren() .size() > 0) { while (parentNode.getChildren() .size() > 0) { childNode.getChildren() .add(parentNode.getChildren() .remove(0)); } } parentNode.getChildren().add(childNode); parentNode.setText(parentNewText); parentNode.setPosition(POSITION_UNDEFINED); } private Substring getLongestCommonPrefix(Substring str1, Substring str2) { int compareLength = Math.min(str1.length(), str2.length()); for (int i = 0; i < compareLength; i++) if (str1.charAt(i) != str2.charAt(i)) ret str1.substring(0, i); ret str1.substring(0, compareLength); } private List<Node> getAllNodesInTraversePath(CharSequence pattern, Node startNode, boolean isAllowPartialMatch) { List<Node> nodes = new ArrayList<>(); for (int i = 0; i < startNode.getChildren().size(); i++) { Node currentNode = startNode.getChildren().get(i); CharSequence nodeText = currentNode.getText(); if (empty(nodeText)) fail("Node text empty: " + currentNode); if (pattern.charAt(0) == nodeText.charAt(0)) { if (isAllowPartialMatch && pattern.length() <= nodeText.length()) { nodes.add(currentNode); return nodes; } int compareLength = Math.min(nodeText.length(), pattern.length()); for (int j = 1; j < compareLength; j++) { if (pattern.charAt(j) != nodeText.charAt(j)) { if (isAllowPartialMatch) { nodes.add(currentNode); } return nodes; } } nodes.add(currentNode); if (pattern.length() > compareLength) { List<Node> nodes2 = getAllNodesInTraversePath(subCharSequence(pattern, compareLength), currentNode, isAllowPartialMatch); if (nodes2.size() > 0) { nodes.addAll(nodes2); } else if (!isAllowPartialMatch) { nodes.add(null); } } return nodes; } } return nodes; } S printTree() { ret root.printTree(""); } } start-thread { tree = new SuffixTree(text); print(tree.printTree()); print("Estimated tree size: " + nBytes(guessObjectSize(tree))); print(+text); print(+pattern); pnl(tree.searchText(pattern)); } }
Began life as a copy of #1028128
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028130 |
Snippet name: | Suffix Tree Spike v2 [storing substrings efficiently, seems OK] |
Eternal ID of this version: | #1028130/18 |
Text MD5: | e425b97cf88c16bf50bdf77698633d61 |
Transpilation MD5: | 747e21e600e736ce64714cb27e224a25 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX source code (Dynamic Module) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-05-23 12:35:39 |
Source code size: | 7898 bytes / 210 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 326 / 2174 |
Version history: | 17 change(s) |
Referenced in: | #1028134 - Suffix Tree Spike v3 [simplifying further, dev.] #1029201 - Suffix Tree Spike v2.1 [dev.] |