Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

198
LINES

< > BotCompany Repo | #1029220 // SuffixTree [yet another backup]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (14953L/102K).

1  
// A naive suffix tree implementation
2  
sclass SuffixTree {
3  
  Node root;
4  
  S fullText;
5  
  int nodeCount;
6  
  
7  
  Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this);
8  
  new DummyNode dummyNode;
9  
  
10  
  sinterface IFirstChar {
11  
    int firstCharOrMinus1(SuffixTree tree);
12  
  }
13  
  
14  
  sclass DummyNode implements IFirstChar {
15  
    int firstChar;
16  
    
17  
    public int firstCharOrMinus1(SuffixTree tree) { ret firstChar; }
18  
  }
19  
20  
  sclass Node implements IFirstChar {
21  
    int from, to; // our range in fullText
22  
    O children; // null, a Node or Node[] sorted by first character
23  
    int position = -1;
24  
25  
    *() {}
26  
    *(int *from, int *to) {}
27  
    *(Substring s) { from = s.startIndex(); to = s.endIndex(); }
28  
    
29  
    Substring text(SuffixTree tree) { ret Substring(tree.fullText, from, to); }
30  
    
31  
    int lText() { ret to-from; }
32  
    
33  
    void addPosition(int i) {
34  
      if (position >= 0) fail("Already have position");
35  
      position = i;
36  
    }
37  
    
38  
    void addChild(SuffixTree tree, Node n) {
39  
      if (children == null) children = n;
40  
      else if (children cast Node)
41  
        children = sortArrayInPlace(new Node[] {children, n}, tree.childComparator);
42  
      else {
43  
        Node[] c = cast children;
44  
        Node[] x = new[l(c)+1];
45  
        arraycopy(c, 0, x, 0, l(c));
46  
        x[l(x)-1] = n;
47  
        children = sortArrayInPlace(x, tree.childComparator);
48  
      }
49  
    }
50  
    
51  
    Cl<Node> children() {
52  
      if (children == null) ret emptyList();
53  
      if (children cast Node) ret ll(children);
54  
      ret wrapArrayAsList((Node[]) children);
55  
    }
56  
    
57  
    CompactTreeSet<IFirstChar> makeEmptyChildrenSet(SuffixTree tree) { ret new CompactTreeSet<>(tree.childComparator); }
58  
    
59  
    public int firstCharOrMinus1(SuffixTree tree) {
60  
      ret charToIntOrMinus1(text(tree));
61  
    }
62  
    
63  
    Node getChild(SuffixTree tree, int c) {
64  
      if (children == null) null;
65  
      if (children cast Node)
66  
        ret children.firstCharOrMinus1(tree) == c ? children : null;
67  
      tree.dummyNode.firstChar = c;
68  
      Node[] x = cast children;
69  
      int i = Arrays.binarySearch(x, tree.dummyNode, tree.childComparator);
70  
      ret i >= 0 ? x[i] : null;
71  
    }
72  
  }
73  
74  
  *() {}
75  
  *(S *fullText) {
76  
    root = new Node(0, 0);
77  
    ++nodeCount;
78  
    for i over fullText: {
79  
      addSuffix(Substring(fullText, i), i);
80  
      if (((i+1) % 1000000) == 0)
81  
        print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")");
82  
    }
83  
  }
84  
85  
  void addSuffix(Substring s, int position) {
86  
    Node node = root;
87  
    
88  
    while (!empty(s)) {
89  
      int _n = lCommonPrefix_CharSequence(node.text(SuffixTree.this), s);
90  
      s = s.substring(_n);
91  
      if (_n >= node.lText()) { // node text exhausted
92  
        if (empty(s)) { // pattern also exhausted - done
93  
          node.addPosition(position);
94  
          ret;
95  
        } else {
96  
          Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s));
97  
          if (n == null) {
98  
            n = new Node(s);
99  
            ++nodeCount;
100  
            n.addPosition(position);
101  
            node.addChild(SuffixTree.this, n);
102  
            ret;
103  
          } else
104  
            node = n;
105  
        }
106  
      } else { // node text not exhausted
107  
        // split node. first, move all the node's vitals to a new node nOld
108  
        Node nOld = new Node(node.from+_n, node.to);
109  
        ++nodeCount;
110  
        nOld.position = node.position;
111  
        node.position = -1;
112  
        nOld.children = node.children;
113  
        node.children = null;
114  
        node.to = node.from+_n;
115  
        node.addChild(SuffixTree.this, nOld);
116  
117  
        // now add a new node
118  
        Node nNew = new Node(s);
119  
        ++nodeCount;
120  
        nNew.addPosition(position);
121  
        node.addChild(SuffixTree.this, nNew);
122  
        ret;
123  
      }
124  
    }
125  
  }
126  
127  
  public L<Int> indicesOf(S pattern) {
128  
    ret asList(indicesOf_iterator(pattern));
129  
  }
130  
131  
  public ItIt<Int> indicesOf_iterator(S pattern) {
132  
    ret mapI_notNull(allNodesUnder(scanDown(root, pattern)),
133  
      n -> n.position >= 0 ? n.position : null);
134  
  }
135  
136  
  Node scanDown(Node node, S pattern) {
137  
    int lPattern = l(pattern), iPattern = 0;
138  
    while true {
139  
      int n = lCommonPrefix_CharSequence(node.text(this), Substring(pattern, iPattern));
140  
      iPattern += n;
141  
      if (iPattern >= lPattern) break; // pattern exhausted - done
142  
      if (n < node.lText()) null; // mismatch, exit
143  
      Node child = node.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern));
144  
      if (child != null) continue with node = child;
145  
      null;
146  
    }
147  
    
148  
    ret node;
149  
  }
150  
  
151  
  L<Int> getPositions(Node node) {
152  
    new IntBuffer out;
153  
    collectPositions(out, node);
154  
    ret out.asVirtualList();
155  
  }
156  
  
157  
  void collectPositions(IntBuffer out, Node node) {
158  
    if (node == null) ret;
159  
    if (node.position >= 0) out.add(node.position);
160  
    fOr (IFirstChar n : node.children())
161  
      collectPositions(out, (Node) n);
162  
  }
163  
  
164  
  void printMe() {
165  
    printNode("", "", root);
166  
  }
167  
  
168  
  void printNode(S indent, S pre, Node node) {
169  
    print(indent + pre + quote(shorten(20, node.text(this))) + (node.position < 0 ? "" : " [" + node.position + "]"));
170  
    fOr (IFirstChar _n : node.children()) {
171  
      Node n = cast _n;
172  
      printNode(indent + "  ", "[" + (n.lText() == 0 ? "end" : quote(n.text(this).charAt(0))) + "] ", n);
173  
    }
174  
  }
175  
  
176  
  ItIt<Node> allNodes() {
177  
    ret allNodesUnder(root);
178  
  }
179  
  
180  
  // includes the node itself
181  
  ItIt<Node> allNodesUnder(Node node) {
182  
    new L<Iterator<Node>> stack;
183  
    if (node != null)
184  
      stack.add(iteratorLL(node));
185  
    ret iteratorFromFunction_if0(() -> {
186  
      while (nempty(stack)) {
187  
        if (!last(stack).hasNext())
188  
          popLast(stack);
189  
        else {
190  
          Node n = last(stack).next();
191  
          stack.add((Iterator) iterator(n.children()));
192  
          ret n;
193  
        }
194  
      }
195  
      null;
196  
    });
197  
  }
198  
}

Author comment

Began life as a copy of #1029202

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029220
Snippet name: SuffixTree [yet another backup]
Eternal ID of this version: #1029220/1
Text MD5: 3e74f9b249616e509a9799f94c228150
Transpilation MD5: 23457315c064e89c68204da3f09bc5b8
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-26 00:20:19
Source code size: 5975 bytes / 198 lines
Pitched / IR pitched: No / No
Views / Downloads: 187 / 259
Referenced in: [show references]