Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

166
LINES

< > BotCompany Repo | #1029214 // SuffixTree [another partly optimized backup]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (14683L/100K).

1  
// A naive suffix tree implementation
2  
sclass SuffixTree {
3  
  Node root;
4  
  S fullText;
5  
  int nodeCount;
6  
  
7  
  Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1()-b.firstCharOrMinus1();
8  
  new DummyNode dummyNode;
9  
  
10  
  sinterface IFirstChar {
11  
    int firstCharOrMinus1();
12  
  }
13  
  
14  
  sclass DummyNode implements IFirstChar {
15  
    int firstChar;
16  
    
17  
    public int firstCharOrMinus1() { ret firstChar; }
18  
  }
19  
20  
  sclass Node implements IFirstChar {
21  
    Substring text;
22  
    CompactTreeSet<IFirstChar> children; // actually, <Node>
23  
    int position = -1;
24  
25  
    *() {}
26  
    *(Substring *text) {}
27  
    
28  
    void addPosition(int i) {
29  
      if (position >= 0) fail("Already have position");
30  
      position = i;
31  
    }
32  
    
33  
    void addChild(SuffixTree tree, Node n) {
34  
      if (children == null) children = makeEmptyChildrenSet(tree);
35  
      children.add(n);
36  
    }
37  
    
38  
    CompactTreeSet<IFirstChar> makeEmptyChildrenSet(SuffixTree tree) { ret new CompactTreeSet<>(tree.childComparator); }
39  
    
40  
    public int firstCharOrMinus1() {
41  
      ret charToIntOrMinus1(text);
42  
    }
43  
    
44  
    Node getChild(SuffixTree tree, int c) {
45  
      if (children == null) null;
46  
      tree.dummyNode.firstChar = c;
47  
      ret (Node) children.find(tree.dummyNode);
48  
    }
49  
  }
50  
51  
  *() {}
52  
  *(S *fullText) {
53  
    root = new Node(Substring(fullText, 0, 0));
54  
    ++nodeCount;
55  
    for i over fullText: {
56  
      addSuffix(Substring(fullText, i), i);
57  
      if (((i+1) % 1000000) == 0)
58  
        print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")");
59  
    }
60  
  }
61  
62  
  void addSuffix(Substring s, int position) {
63  
    Node node = root;
64  
    
65  
    while (!empty(s)) {
66  
      int _n = lCommonPrefix_CharSequence(node.text, s);
67  
      s = s.substring(_n);
68  
      if (_n >= l(node.text)) { // node text exhausted
69  
        if (empty(s)) { // pattern also exhausted - done
70  
          node.addPosition(position);
71  
          ret;
72  
        } else {
73  
          Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s));
74  
          if (n == null) {
75  
            n = new Node(s);
76  
            ++nodeCount;
77  
            n.addPosition(position);
78  
            node.addChild(SuffixTree.this, n);
79  
            ret;
80  
          } else
81  
            node = n;
82  
        }
83  
      } else { // node text not exhausted
84  
        // split node. first, move all the node's vitals to a new node nOld
85  
        Node nOld = new Node(node.text.substring(_n));
86  
        ++nodeCount;
87  
        nOld.position = node.position;
88  
        node.position = -1;
89  
        nOld.children = node.children;
90  
        node.children = null;
91  
        node.text = node.text.substring(0, _n);
92  
        node.addChild(SuffixTree.this, nOld);
93  
94  
        // now add a new node
95  
        Node nNew = new Node(s);
96  
        ++nodeCount;
97  
        nNew.addPosition(position);
98  
        node.addChild(SuffixTree.this, nNew);
99  
        ret;
100  
      }
101  
    }
102  
  }
103  
104  
  public L<Int> indicesOf(S pattern) {
105  
    new IntBuffer out;
106  
    getIndicesOf(out, root, pattern, 0);
107  
    ret out.asVirtualList();
108  
  }
109  
  
110  
  void getIndicesOf(IntBuffer out, Node node, S pattern, int iPattern) {
111  
    int lPattern = l(pattern);
112  
    while true {
113  
      int n = lCommonPrefix_CharSequence(node.text, Substring(pattern, iPattern));
114  
      iPattern += n;
115  
      if (iPattern >= lPattern) break; // pattern exhausted - done
116  
      if (n < l(node.text)) ret; // mismatch, exit
117  
      Node child = node.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern));
118  
      if (child != null) continue with node = child;
119  
      ret;
120  
    }
121  
    
122  
    collectPositions(out, node);
123  
  }
124  
  
125  
  L<Int> getPositions(Node node) {
126  
    new IntBuffer out;
127  
    collectPositions(out, node);
128  
    ret out.asVirtualList();
129  
  }
130  
  
131  
  void collectPositions(IntBuffer out, Node node) {
132  
    if (node == null) ret;
133  
    if (node.position >= 0) out.add(node.position);
134  
    fOr (IFirstChar n : node.children)
135  
      collectPositions(out, (Node) n);
136  
  }
137  
  
138  
  void printMe() {
139  
    printNode("", "", root);
140  
  }
141  
  
142  
  void printNode(S indent, S pre, Node node) {
143  
    print(indent + pre + quote(shorten(20, node.text)) + (node.position < 0 ? "" : " [" + node.position + "]"));
144  
    fOr (IFirstChar _n : node.children) {
145  
      Node n = cast _n;
146  
      printNode(indent + "  ", "[" + (empty(n.text) ? "end" : quote(n.text.charAt(0))) + "] ", n);
147  
    }
148  
  }
149  
  
150  
  ItIt<Node> allNodes() {
151  
    new L<Iterator<Node>> stack;
152  
    stack.add(iteratorLL(root));
153  
    ret iteratorFromFunction_if0(() -> {
154  
      while (nempty(stack)) {
155  
        if (!last(stack).hasNext())
156  
          popLast(stack);
157  
        else {
158  
          Node n = last(stack).next();
159  
          stack.add((Iterator) iterator(n.children));
160  
          ret n;
161  
        }
162  
      }
163  
      null;
164  
    });
165  
  }
166  
}

Author comment

Began life as a copy of #1029202

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029214
Snippet name: SuffixTree [another partly optimized backup]
Eternal ID of this version: #1029214/1
Text MD5: 93ad16b5efe496ce64a96135f2480f78
Transpilation MD5: adf7c8bc884521faba14e8974ee8b449
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-25 14:29:12
Source code size: 4802 bytes / 166 lines
Pitched / IR pitched: No / No
Views / Downloads: 171 / 240
Referenced in: [show references]