Libraryless. Click here for Pure Java version (3312L/21K).
1 | sclass SuffixTree { |
2 | Node root; |
3 | S fullText; |
4 | |
5 | sclass Node { |
6 | Substring text; |
7 | Map<Int, Node> children; // key is first character or -1 for end of string |
8 | IntBuffer positions; |
9 | |
10 | *() {} |
11 | *(Substring *text) {} |
12 | |
13 | void addPosition(int i) { |
14 | if (positions == null) positions = new IntBuffer; |
15 | positions.add(i); |
16 | } |
17 | |
18 | void addChild(Node n) { |
19 | if (children == null) children = new Map; |
20 | children.put(charToIntOrMinus1(n.text), n); |
21 | } |
22 | |
23 | Node getChild(int c) { |
24 | ret mapGet(children, c); |
25 | } |
26 | } |
27 | |
28 | *() {} |
29 | *(S *fullText) { |
30 | root = new Node(Substring(fullText, 0, 0)); |
31 | for i over fullText: |
32 | addSuffix(Substring(fullText, i), i); |
33 | } |
34 | |
35 | void addSuffix(Substring s, int position) { |
36 | Node node = root; |
37 | |
38 | while (!empty(s)) { |
39 | int _n = lCommonPrefix_CharSequence(node.text, s); |
40 | s = s.substring(_n); |
41 | if (_n >= l(node.text)) { // node text exhausted |
42 | if (empty(s)) { |
43 | node.addPosition(position); |
44 | ret; |
45 | } else { |
46 | Node n = node.getChild(charToIntOrMinus1(s)); |
47 | if (n == null) { |
48 | n = new Node(s); |
49 | n.addPosition(position); |
50 | node.addChild(n); |
51 | ret; |
52 | } else |
53 | node = n; |
54 | } |
55 | } else { // node text not exhausted |
56 | // split node. first, move all the node's vitals to a new node nOld |
57 | Node nOld = new Node(node.text.substring(_n)); |
58 | nOld.positions = node.positions; |
59 | node.positions = null; |
60 | nOld.children = node.children; |
61 | node.children = null; |
62 | node.text = node.text.substring(0, _n); |
63 | node.addChild(nOld); |
64 | |
65 | // now add a new node |
66 | Node nNew = new Node(s); |
67 | nNew.addPosition(position); |
68 | node.addChild(nNew); |
69 | ret; |
70 | } |
71 | } |
72 | } |
73 | |
74 | public L<Int> indicesOf(S pattern) { |
75 | new IntBuffer out; |
76 | getIndicesOf(out, root, pattern, 0); |
77 | ret out.asVirtualList(); |
78 | } |
79 | |
80 | void getIndicesOf(IntBuffer out, Node node, S pattern, int iPattern) { |
81 | int lPattern = l(pattern); |
82 | while true { |
83 | int n = lCommonPrefix_CharSequence(node.text, Substring(pattern, iPattern)); |
84 | iPattern += n; |
85 | if (iPattern >= lPattern) break; |
86 | if (n < l(node.text)) break; |
87 | Node child = node.getChild(charAtAsIntOrMinus1(pattern, iPattern)); |
88 | if (child != null) node = child; |
89 | } |
90 | |
91 | collectPositions(out, node); |
92 | } |
93 | |
94 | L<Int> getPositions(Node node) { |
95 | new IntBuffer out; |
96 | collectPositions(out, node); |
97 | ret out.asVirtualList(); |
98 | } |
99 | |
100 | void collectPositions(IntBuffer out, Node node) { |
101 | if (node == null) ret; |
102 | out.addAll(node.positions); |
103 | fOr (Node n : values(node.children)) |
104 | collectPositions(out, n); |
105 | } |
106 | |
107 | void printMe() { |
108 | printNode("", "", root); |
109 | } |
110 | |
111 | void printNode(S indent, S pre, Node node) { |
112 | print(indent + pre + quote(shorten(20, node.text)) + (empty(node.positions) ? "" : " " + node.positions)); |
113 | fOr (int c, Node n : node.children) |
114 | printNode(indent + " ", "[" + (c < 0 ? "end" : quote((char) c)) + "] ", n); |
115 | } |
116 | } |
Began life as a copy of #1029202
download show line numbers debug dex old transpilations
Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1029210 |
Snippet name: | SuffixTree [OK, backup before optimization] |
Eternal ID of this version: | #1029210/1 |
Text MD5: | ddcf61d36ee3e6788d7ed71f2e696e39 |
Transpilation MD5: | 060b156144ed667832707870a23f1888 |
Author: | stefan |
Category: | javax / text searching |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2020-07-25 13:23:27 |
Source code size: | 3211 bytes / 116 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 188 / 253 |
Referenced in: | [show references] |