Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

199
LINES

< > BotCompany Repo | #1029234 // SuffixTree [LIVE]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (14542L/98K).

1  
// A naive suffix tree implementation
2  
sclass SuffixTree {
3  
  Node root;
4  
  S fullText;
5  
  int nodeCount;
6  
  
7  
  Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this);
8  
  new DummyNode dummyNode;
9  
  
10  
  sinterface IFirstChar {
11  
    int firstCharOrMinus1(SuffixTree tree);
12  
  }
13  
  
14  
  sclass DummyNode implements IFirstChar {
15  
    int firstChar;
16  
    
17  
    public int firstCharOrMinus1(SuffixTree tree) { ret firstChar; }
18  
  }
19  
20  
  sclass Node implements IFirstChar {
21  
    int from, to; // our range in fullText
22  
    O children; // null, a Node or Node[] sorted by first character
23  
24  
    *() {}
25  
    *(int *from, int *to) {}
26  
    *(Substring s) { from = s.startIndex(); to = s.endIndex(); }
27  
    
28  
    Substring text(SuffixTree tree) { ret Substring(tree.fullText, from, to); }
29  
    
30  
    int lText() { ret to-from; }
31  
    
32  
    bool isTerminal(SuffixTree tree) { ret to == l(tree.fullText); }
33  
    
34  
    void addChild(SuffixTree tree, Node n) {
35  
      if (children == null) children = n;
36  
      else if (children cast Node)
37  
        children = sortArrayInPlace(new Node[] {children, n}, tree.childComparator);
38  
      else {
39  
        Node[] c = cast children;
40  
        Node[] x = new[l(c)+1];
41  
        arraycopy(c, 0, x, 0, l(c));
42  
        x[l(x)-1] = n;
43  
        children = sortArrayInPlace(x, tree.childComparator);
44  
      }
45  
    }
46  
    
47  
    Cl<Node> children() {
48  
      if (children == null) ret emptyList();
49  
      if (children cast Node) ret ll(children);
50  
      ret wrapArrayAsList((Node[]) children);
51  
    }
52  
    
53  
    public int firstCharOrMinus1(SuffixTree tree) {
54  
      ret charToIntOrMinus1(text(tree));
55  
    }
56  
    
57  
    Node getChild(SuffixTree tree, int c) {
58  
      if (children == null) null;
59  
      if (children cast Node)
60  
        ret children.firstCharOrMinus1(tree) == c ? children : null;
61  
      tree.dummyNode.firstChar = c;
62  
      Node[] x = cast children;
63  
      int i = Arrays.binarySearch(x, tree.dummyNode, tree.childComparator);
64  
      ret i >= 0 ? x[i] : null;
65  
    }
66  
  }
67  
68  
  *() {}
69  
  *(S *fullText) {
70  
    root = new Node(0, 0);
71  
    ++nodeCount;
72  
    for i over fullText: {
73  
      addSuffix(Substring(fullText, i));
74  
      if (((i+1) % 1000000) == 0)
75  
        print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")");
76  
    }
77  
  }
78  
79  
  void addSuffix(Substring s) {
80  
    Node node = root;
81  
    
82  
    while (!empty(s)) {
83  
      int _n = lCommonPrefix_CharSequence(node.text(SuffixTree.this), s);
84  
      s = s.substring(_n);
85  
      if (_n >= node.lText()) { // node text exhausted
86  
        if (empty(s)) { // pattern also exhausted - done
87  
          //print("Exhausted: " + node.from + "/" + node.to);
88  
          // add a new termination node
89  
          Node nNew = new Node(s);
90  
          ++nodeCount;
91  
          node.addChild(SuffixTree.this, nNew);
92  
          ret;
93  
        } else {
94  
          Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s));
95  
          if (n == null) {
96  
            n = new Node(s);
97  
            ++nodeCount;
98  
            node.addChild(SuffixTree.this, n);
99  
            ret;
100  
          } else
101  
            node = n;
102  
        }
103  
      } else { // node text not exhausted
104  
        // split node. first, move all the node's vitals to a new node nOld
105  
        Node nOld = new Node(node.from+_n, node.to);
106  
        ++nodeCount;
107  
        nOld.children = node.children;
108  
        node.children = null;
109  
        node.to = node.from+_n;
110  
        node.addChild(SuffixTree.this, nOld);
111  
112  
        // now add a new node
113  
        Node nNew = new Node(s);
114  
        ++nodeCount;
115  
        node.addChild(SuffixTree.this, nNew);
116  
        ret;
117  
      }
118  
    }
119  
  }
120  
121  
  public L<Int> indicesOf(S pattern) {
122  
    ret asList(indicesOf_iterator(pattern));
123  
  }
124  
125  
  public ItIt<Int> indicesOf_iterator(S pattern) {
126  
    ret mapI_notNull(allNodesUnder(scanDown(root, pattern)),
127  
      nad -> nad.node.isTerminal(this) ? nad.position() : null);
128  
  }
129  
  
130  
  srecord NodeAndDepth(Node node, int depth) {
131  
    int position() {
132  
      int position = node.from-depth;
133  
      //print("from=" + node.from + ", to=" + node.to + ", depth=" + depth + ", position=" + position);
134  
      ret position;
135  
    }
136  
    
137  
    Cl<NodeAndDepth> children() {
138  
      ret lmap wrapChild(node.children());
139  
    }
140  
    
141  
    NodeAndDepth wrapChild(Node n) {
142  
      ret n == null ? null : new NodeAndDepth(n, depth+node.lText());
143  
    }
144  
    
145  
    NodeAndDepth getChild(SuffixTree tree, int c) {
146  
      ret wrapChild(node.getChild(tree, c));
147  
    }
148  
  }
149  
150  
  NodeAndDepth scanDown(Node node, S pattern) {
151  
    int lPattern = l(pattern), iPattern = 0;
152  
    NodeAndDepth nad = new(node, 0);
153  
    while true {
154  
      int n = lCommonPrefix_CharSequence(nad.node.text(this), Substring(pattern, iPattern));
155  
      iPattern += n;
156  
      if (iPattern >= lPattern) break; // pattern exhausted - done
157  
      if (n < nad.node.lText()) null; // mismatch, exit
158  
      NodeAndDepth child = nad.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern));
159  
      if (child != null) continue with nad = child;
160  
      null;
161  
    }
162  
    
163  
    ret nad;
164  
  }
165  
  
166  
  void printMe() {
167  
    printNode("", "", new NodeAndDepth(root, 0));
168  
  }
169  
  
170  
  void printNode(S indent, S pre, NodeAndDepth nad) {
171  
    print(indent + pre + quote(shorten(20, nad.node.text(this))) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]"));
172  
    fOr (NodeAndDepth n : nad.children()) {
173  
      printNode(indent + "  ", "[" + (n.node.lText() == 0 ? "end" : quote(n.node.text(this).charAt(0))) + "] ", n);
174  
    }
175  
  }
176  
  
177  
  ItIt<NodeAndDepth> allNodes() {
178  
    ret allNodesUnder(new NodeAndDepth(root, 0));
179  
  }
180  
  
181  
  // includes the node itself
182  
  ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) {
183  
    new L<Iterator<NodeAndDepth>> stack;
184  
    if (nad != null)
185  
      stack.add(iteratorLL(nad));
186  
    ret iteratorFromFunction_if0(() -> {
187  
      while (nempty(stack)) {
188  
        if (!last(stack).hasNext())
189  
          popLast(stack);
190  
        else {
191  
          NodeAndDepth n = last(stack).next();
192  
          stack.add((Iterator) iterator(n.children()));
193  
          ret n;
194  
        }
195  
      }
196  
      null;
197  
    });
198  
  }
199  
}

Author comment

Began life as a copy of #1029202

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029234
Snippet name: SuffixTree [LIVE]
Eternal ID of this version: #1029234/2
Text MD5: 8c5e7742ad2b749364b7723e6745c9ba
Transpilation MD5: 0d0678082061b40b0d102be8999ad145
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-26 19:48:36
Source code size: 6169 bytes / 199 lines
Pitched / IR pitched: No / No
Views / Downloads: 173 / 437
Version history: 1 change(s)
Referenced in: [show references]