Uses 911K of libraries. Click here for Pure Java version (3756L/20K).
!7 // adapted from https://github.com/eugenp/tutorials/tree/master/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree cmodule SuffixTreeSpike > DynPrintLog { switchable S text = "hello world what is up in the world"; switchable S pattern = "world"; transient SuffixTree tree; sclass Node { S text; new L<Node> children; int position; *(S *text, int *position) {} S getText() { ret text; } void setText(S text) { this.text = text; } int getPosition() { ret position; } void setPosition(int position) { this.position = position; } L<Node> getChildren() { ret children; } void setChildren(L<Node> children) { this.children = children; } S printTree(String depthIndicator) { S str = ""; S positionStr = position > -1 ? "[" + position + "]" : ""; str += depthIndicator + text + positionStr + "\n"; for (int i = 0; i < children.size(); i++) str += children.get(i).printTree(depthIndicator + "\t"); return str; } toString { ret printTree(""); } } sclass SuffixTree { private static final String WORD_TERMINATION = "$"; private static final int POSITION_UNDEFINED = -1; Node root; S fullText; *(S text) { root = new Node("", POSITION_UNDEFINED); for (int i = 0; i < text.length(); i++) { addSuffix(text.substring(i) + WORD_TERMINATION, i); } fullText = text; } public LS searchText(S pattern) { //LOGGER.info("Searching for pattern \"{}\"", pattern); List<String> result = new ArrayList<>(); List<Node> nodes = getAllNodesInTraversePath(pattern, root, false); if (nodes.size() > 0) { Node lastNode = nodes.get(nodes.size() - 1); if (lastNode != null) { List<Integer> positions = getPositions(lastNode); /*positions = positions.stream() .sorted() .collect(Collectors.toList());*/ sortInPlace(positions); positions.forEach(m -> result.add((markPatternInText(m, pattern)))); } } return result; } private void addSuffix(String suffix, int position) { //LOGGER.info(">>>>>>>>>>>> Adding new suffix {}", suffix); List<Node> nodes = getAllNodesInTraversePath(suffix, root, true); if (nodes.size() == 0) { addChildNode(root, suffix, position); //LOGGER.info("{}", printTree()); } else { Node lastNode = nodes.remove(nodes.size() - 1); String newText = suffix; if (nodes.size() > 0) { String existingSuffixUptoLastNode = nodes.stream() .map(a -> a.getText()) .reduce("", String::concat); // Remove prefix from newText already included in parent newText = newText.substring(existingSuffixUptoLastNode.length()); } extendNode(lastNode, newText, position); //LOGGER.info("{}", printTree()); } } private List<Integer> getPositions(Node node) { List<Integer> positions = new ArrayList<>(); if (node.getText() .endsWith(WORD_TERMINATION)) { positions.add(node.getPosition()); } for (int i = 0; i < node.getChildren() .size(); i++) { positions.addAll(getPositions(node.getChildren() .get(i))); } return positions; } private String markPatternInText(Integer startPosition, String pattern) { String matchingTextLHS = fullText.substring(0, startPosition); String matchingText = fullText.substring(startPosition, startPosition + pattern.length()); String matchingTextRHS = fullText.substring(startPosition + pattern.length()); return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS; } private void addChildNode(Node parentNode, String text, int position) { parentNode.getChildren() .add(new Node(text, position)); } private void extendNode(Node node, String newText, int position) { String currentText = node.getText(); String commonPrefix = getLongestCommonPrefix(currentText, newText); if (commonPrefix != currentText) { String parentText = currentText.substring(0, commonPrefix.length()); String childText = currentText.substring(commonPrefix.length()); splitNodeToParentAndChild(node, parentText, childText); } String remainingText = newText.substring(commonPrefix.length()); addChildNode(node, remainingText, position); } private void splitNodeToParentAndChild(Node parentNode, String parentNewText, String childNewText) { Node childNode = new Node(childNewText, parentNode.getPosition()); if (parentNode.getChildren() .size() > 0) { while (parentNode.getChildren() .size() > 0) { childNode.getChildren() .add(parentNode.getChildren() .remove(0)); } } parentNode.getChildren() .add(childNode); parentNode.setText(parentNewText); parentNode.setPosition(POSITION_UNDEFINED); } private String getLongestCommonPrefix(String str1, String str2) { int compareLength = Math.min(str1.length(), str2.length()); for (int i = 0; i < compareLength; i++) { if (str1.charAt(i) != str2.charAt(i)) { return str1.substring(0, i); } } return str1.substring(0, compareLength); } private List<Node> getAllNodesInTraversePath(String pattern, Node startNode, boolean isAllowPartialMatch) { List<Node> nodes = new ArrayList<>(); for (int i = 0; i < startNode.getChildren() .size(); i++) { Node currentNode = startNode.getChildren() .get(i); String nodeText = currentNode.getText(); if (pattern.charAt(0) == nodeText.charAt(0)) { if (isAllowPartialMatch && pattern.length() <= nodeText.length()) { nodes.add(currentNode); return nodes; } int compareLength = Math.min(nodeText.length(), pattern.length()); for (int j = 1; j < compareLength; j++) { if (pattern.charAt(j) != nodeText.charAt(j)) { if (isAllowPartialMatch) { nodes.add(currentNode); } return nodes; } } nodes.add(currentNode); if (pattern.length() > compareLength) { List<Node> nodes2 = getAllNodesInTraversePath(pattern.substring(compareLength), currentNode, isAllowPartialMatch); if (nodes2.size() > 0) { nodes.addAll(nodes2); } else if (!isAllowPartialMatch) { nodes.add(null); } } return nodes; } } return nodes; } S printTree() { ret root.printTree(""); } } start-thread { tree = new SuffixTree(text); print(tree.printTree()); print("Estimated tree size: " + nBytes(guessObjectSize(tree))); print(+text); print(+pattern); pnl(tree.searchText(pattern)); } }
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1028128 |
Snippet name: | Suffix Tree Spike v1 [seems OK] |
Eternal ID of this version: | #1028128/7 |
Text MD5: | 203020476937643ad2d171e987647b5e |
Transpilation MD5: | 3c6e660a3ace1f8d2035233f68cf00b8 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX source code (Dynamic Module) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-05-23 12:01:48 |
Source code size: | 7805 bytes / 211 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 235 / 416 |
Version history: | 6 change(s) |
Referenced in: | #1028130 - Suffix Tree Spike v2 [storing substrings efficiently, seems OK] |