Libraryless. Click here for Pure Java version (14578L/99K).
// A naive suffix tree implementation sclass SuffixTree { Node root; S fullText; int nodeCount; bool useTailOptimization; // doesn't work yet Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this); new DummyNode dummyNode; new Map<Int, Node> tailMap; // position -> node already made sinterface IFirstChar { int firstCharOrMinus1(SuffixTree tree); } sclass DummyNode implements IFirstChar { int firstChar; public int firstCharOrMinus1(SuffixTree tree) { ret firstChar; } } sclass Node implements IFirstChar { int from, to; // our range in fullText O children; // null, a Node or Node[] sorted by first character *() {} *(int *from, int *to) {} *(Substring s) { from = s.startIndex(); to = s.endIndex(); } Substring text(SuffixTree tree) { ret Substring(tree.fullText, from, to); } int lText() { ret to-from; } bool isTerminal(SuffixTree tree) { ret to == l(tree.fullText); } void addChild(SuffixTree tree, Node n) { if (children == null) children = n; else if (children cast Node) children = sortArrayInPlace(new Node[] {children, n}, tree.childComparator); else { Node[] c = cast children; Node[] x = new[l(c)+1]; arraycopy(c, 0, x, 0, l(c)); x[l(x)-1] = n; children = sortArrayInPlace(x, tree.childComparator); } } Cl<Node> children() { if (children == null) ret emptyList(); if (children cast Node) ret ll(children); ret wrapArrayAsList((Node[]) children); } public int firstCharOrMinus1(SuffixTree tree) { ret charToIntOrMinus1(text(tree)); } Node getChild(SuffixTree tree, int c) { if (children == null) null; if (children cast Node) ret children.firstCharOrMinus1(tree) == c ? children : null; tree.dummyNode.firstChar = c; Node[] x = cast children; int i = Arrays.binarySearch(x, tree.dummyNode, tree.childComparator); ret i >= 0 ? x[i] : null; } } *() {} *(S fullText) { processText(fullText); } void processText(S fullText) { this.fullText = fullText; root = new Node(0, 0); ++nodeCount; for i over fullText: { //for (int i = l(fullText)-1; i >= 0; i--) { addSuffix(Substring(fullText, i)); if (((i+1) % 1000000) == 0) print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")"); } doneBuilding(); } void doneBuilding { tailMap = null; } void addSuffix(Substring s) { Node node = root; while (!empty(s)) { int _n = lCommonPrefix_CharSequence(node.text(SuffixTree.this), s); s = s.substring(_n); if (_n >= node.lText()) { // node text exhausted if (empty(s)) { // pattern also exhausted - done //print("Exhausted: " + node.from + "/" + node.to); // add a new termination node Node nNew = newNode(s); node.addChild(SuffixTree.this, nNew); ret; } else { // node text exhausted, pattern not exhausted Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s)); if (n == null) { n = newNode(s); node.addChild(SuffixTree.this, n); ret; } else node = n; } } else { // node text not exhausted, pattern not exhausted // split node. first, move all the node's vitals to a new node nOld Node nOld = newNode(node.from+_n, node.to); nOld.children = node.children; node.children = null; node.to = node.from+_n; node.addChild(SuffixTree.this, nOld); // now add a new node Node nNew = newNode(s); node.addChild(SuffixTree.this, nNew); ret; } } } public L<Int> indicesOf(S pattern) { ret asList(indicesOf_iterator(pattern)); } public ItIt<Int> indicesOf_iterator(S pattern) { ret mapI_notNull(allNodesUnder(scanDown(root, pattern)), nad -> nad.node.isTerminal(this) ? nad.position() : null); } srecord NodeAndDepth(Node node, int depth) { int position() { int position = node.from-depth; //print("from=" + node.from + ", to=" + node.to + ", depth=" + depth + ", position=" + position); ret position; } Cl<NodeAndDepth> children() { ret lmap wrapChild(node.children()); } NodeAndDepth wrapChild(Node n) { ret n == null ? null : new NodeAndDepth(n, depth+node.lText()); } NodeAndDepth getChild(SuffixTree tree, int c) { ret wrapChild(node.getChild(tree, c)); } } NodeAndDepth scanDown(Node node, S pattern) { int lPattern = l(pattern), iPattern = 0; NodeAndDepth nad = new(node, 0); while true { int n = lCommonPrefix_CharSequence(nad.node.text(this), Substring(pattern, iPattern)); iPattern += n; if (iPattern >= lPattern) break; // pattern exhausted - done if (n < nad.node.lText()) null; // mismatch, exit NodeAndDepth child = nad.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern)); if (child != null) continue with nad = child; null; } ret nad; } void printMe() { printNode("", "", new NodeAndDepth(root, 0)); } void printNode(S indent, S pre, NodeAndDepth nad) { print(indent + pre + quote(shorten(20, nad.node.text(this))) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]")); fOr (NodeAndDepth n : nad.children()) { printNode(indent + " ", "[" + (n.node.lText() == 0 ? "end" : quote(n.node.text(this).charAt(0))) + "] ", n); } } ItIt<NodeAndDepth> allNodes() { ret allNodesUnder(new NodeAndDepth(root, 0)); } // includes the node itself ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) { new L<Iterator<NodeAndDepth>> stack; if (nad != null) stack.add(iteratorLL(nad)); ret iteratorFromFunction_if0(() -> { while (nempty(stack)) { if (!last(stack).hasNext()) popLast(stack); else { NodeAndDepth n = last(stack).next(); stack.add((Iterator) iterator(n.children())); ret n; } } null; }); } // also query & update tailMap Node newNode(Substring s) { Node nNew = useTailOptimization && s.endIndex() == l(fullText) ? mapGet(tailMap, s.startIndex()) : null; if (nNew == null) { nNew = new Node(s); ++nodeCount; if (useTailOptimization) mapPut(tailMap, s.startIndex(), nNew); } else { print("Using existing node for index " + s.startIndex() + ":"); printNode(" ", "", new NodeAndDepth(nNew, 0)); } ret nNew; } Node newNode(int from, int to) { ret newNode(Substring(fullText, from, to)); } }
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029202 |
Snippet name: | SuffixTree [graph optimization doesn't work yet, dev.] |
Eternal ID of this version: | #1029202/95 |
Text MD5: | c3467518c4ca65c5fc109d16d114fef0 |
Transpilation MD5: | c43f4a33bde6daa13d936bf050ea280b |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-26 19:47:59 |
Source code size: | 7072 bytes / 227 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 374 / 746 |
Version history: | 94 change(s) |
Referenced in: | [show references] |