Libraryless. Click here for Pure Java version (3173L/20K).
sclass SuffixTree { Node root; S fullText; S textWithTerminator; // all Substrings are constructed off this private static final String WORD_TERMINATION = "$"; private static final int POSITION_UNDEFINED = -1; sclass Node { Substring text; new L<Node> children; int position; *(Substring *text, int *position) {} Substring getText() { ret text; } void setText(Substring text) { //print("Setting text: " + quote(this.text) + " => " + quote(text); this.text = text; } int getPosition() { ret position; } void setPosition(int position) { this.position = position; } L<Node> getChildren() { ret children; } void setChildren(L<Node> children) { this.children = children; } S printTree(String depthIndicator) { S str = ""; S positionStr = position > -1 ? "[" + position + "]" : ""; str += depthIndicator + text + positionStr + "\n"; for (int i = 0; i < children.size(); i++) str += children.get(i).printTree(depthIndicator + "\t"); return str; } toString { ret printTree(""); } } *(S text) { textWithTerminator = text + WORD_TERMINATION; root = new Node(Substring(textWithTerminator, 0, 0), POSITION_UNDEFINED); for i over text: addSuffix(Substring(textWithTerminator, i), i); fullText = text; } public LS markText(S pattern) { //LOGGER.info("Searching for pattern \"{}\"", pattern); new LS result; L<Node> nodes = getAllNodesInTraversePath(pattern, root, false); if (nodes.size() > 0) { Node lastNode = nodes.get(nodes.size() - 1); if (lastNode != null) { L<Int> positions = getPositions(lastNode); sortInPlace(positions); positions.forEach(m -> result.add((markPatternInText(m, pattern)))); } } return result; } private void addSuffix(Substring suffix, int position) { //print("Adding new suffix: " + quote(suffix)); List<Node> nodes = getAllNodesInTraversePath(suffix, root, true); if (nodes.size() == 0) { addChildNode(root, suffix, position); //LOGGER.info("{}", printTree()); } else { Node lastNode = nodes.remove(nodes.size() - 1); Substring newText = suffix; if (nodes.size() > 0) { Substring existingSuffixUptoLastNode = joinSubstringObjects(map(nodes, a -> a.getText())); // Remove prefix from newText already included in parent newText = newText.substring(existingSuffixUptoLastNode.length()); } extendNode(lastNode, newText, position); //LOGGER.info("{}", printTree()); } } private List<Integer> getPositions(Node node) { List<Integer> positions = new ArrayList<>(); if (node.getText() /*.endsWith(WORD_TERMINATION)*/.endsAtEnd()) { positions.add(node.getPosition()); } for (int i = 0; i < node.getChildren() .size(); i++) { positions.addAll(getPositions(node.getChildren() .get(i))); } return positions; } private String markPatternInText(Integer startPosition, String pattern) { String matchingTextLHS = fullText.substring(0, startPosition); String matchingText = fullText.substring(startPosition, startPosition + pattern.length()); String matchingTextRHS = fullText.substring(startPosition + pattern.length()); return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS; } private void addChildNode(Node parentNode, Substring text, int position) { parentNode.getChildren() .add(new Node(text, position)); } private void extendNode(Node node, Substring newText, int position) { Substring currentText = node.getText(); Substring commonPrefix = getLongestCommonPrefix(currentText, newText); if (neq(commonPrefix, currentText)) { Substring parentText = currentText.substring(0, commonPrefix.length()); Substring childText = currentText.substring(commonPrefix.length()); splitNodeToParentAndChild(node, parentText, childText); } Substring remainingText = newText.substring(commonPrefix.length()); addChildNode(node, remainingText, position); } private void splitNodeToParentAndChild(Node parentNode, Substring parentNewText, Substring childNewText) { Node childNode = new Node(childNewText, parentNode.getPosition()); while (parentNode.getChildren().size() > 0) childNode.getChildren().add(parentNode.getChildren().remove(0)); parentNode.getChildren().add(childNode); parentNode.setText(parentNewText); parentNode.setPosition(POSITION_UNDEFINED); } private Substring getLongestCommonPrefix(Substring str1, Substring str2) { int compareLength = Math.min(str1.length(), str2.length()); for (int i = 0; i < compareLength; i++) if (str1.charAt(i) != str2.charAt(i)) ret str1.substring(0, i); ret str1.substring(0, compareLength); } private L<Node> getAllNodesInTraversePath(CharSequence pattern, Node startNode, boolean isAllowPartialMatch) { new L<Node> nodes; for (int i = 0; i < startNode.getChildren().size(); i++) { Node currentNode = startNode.getChildren().get(i); CharSequence nodeText = currentNode.getText(); if (empty(nodeText)) fail("Node text empty: " + currentNode); if (pattern.charAt(0) == nodeText.charAt(0)) { if (isAllowPartialMatch && pattern.length() <= nodeText.length()) { nodes.add(currentNode); return nodes; } int compareLength = Math.min(nodeText.length(), pattern.length()); for (int j = 1; j < compareLength; j++) { if (pattern.charAt(j) != nodeText.charAt(j)) { if (isAllowPartialMatch) { nodes.add(currentNode); } return nodes; } } nodes.add(currentNode); if (pattern.length() > compareLength) { List<Node> nodes2 = getAllNodesInTraversePath(subCharSequence(pattern, compareLength), currentNode, isAllowPartialMatch); if (nodes2.size() > 0) { nodes.addAll(nodes2); } else if (!isAllowPartialMatch) { nodes.add(null); } } return nodes; } } return nodes; } S printTree() { ret root.printTree(""); } }
Began life as a copy of #1029202
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029205 |
Snippet name: | SuffixTree [old version, probably broken] |
Eternal ID of this version: | #1029205/1 |
Text MD5: | 97646c453c7be91d03dd7ca415771b05 |
Transpilation MD5: | 221fe941078518b72620d6ece32b9954 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-25 12:02:56 |
Source code size: | 6827 bytes / 181 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 211 / 279 |
Referenced in: | [show references] |