Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

227
LINES

< > BotCompany Repo | #1029202 // SuffixTree [graph optimization doesn't work yet, dev.]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (14578L/99K).

1  
// A naive suffix tree implementation
2  
sclass SuffixTree {
3  
  Node root;
4  
  S fullText;
5  
  int nodeCount;
6  
  bool useTailOptimization; // doesn't work yet
7  
  
8  
  Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this);
9  
  new DummyNode dummyNode;
10  
  new Map<Int, Node> tailMap; // position -> node already made
11  
  
12  
  sinterface IFirstChar {
13  
    int firstCharOrMinus1(SuffixTree tree);
14  
  }
15  
  
16  
  sclass DummyNode implements IFirstChar {
17  
    int firstChar;
18  
    
19  
    public int firstCharOrMinus1(SuffixTree tree) { ret firstChar; }
20  
  }
21  
22  
  sclass Node implements IFirstChar {
23  
    int from, to; // our range in fullText
24  
    O children; // null, a Node or Node[] sorted by first character
25  
26  
    *() {}
27  
    *(int *from, int *to) {}
28  
    *(Substring s) { from = s.startIndex(); to = s.endIndex(); }
29  
    
30  
    Substring text(SuffixTree tree) { ret Substring(tree.fullText, from, to); }
31  
    
32  
    int lText() { ret to-from; }
33  
    
34  
    bool isTerminal(SuffixTree tree) { ret to == l(tree.fullText); }
35  
    
36  
    void addChild(SuffixTree tree, Node n) {
37  
      if (children == null) children = n;
38  
      else if (children cast Node)
39  
        children = sortArrayInPlace(new Node[] {children, n}, tree.childComparator);
40  
      else {
41  
        Node[] c = cast children;
42  
        Node[] x = new[l(c)+1];
43  
        arraycopy(c, 0, x, 0, l(c));
44  
        x[l(x)-1] = n;
45  
        children = sortArrayInPlace(x, tree.childComparator);
46  
      }
47  
    }
48  
    
49  
    Cl<Node> children() {
50  
      if (children == null) ret emptyList();
51  
      if (children cast Node) ret ll(children);
52  
      ret wrapArrayAsList((Node[]) children);
53  
    }
54  
    
55  
    public int firstCharOrMinus1(SuffixTree tree) {
56  
      ret charToIntOrMinus1(text(tree));
57  
    }
58  
    
59  
    Node getChild(SuffixTree tree, int c) {
60  
      if (children == null) null;
61  
      if (children cast Node)
62  
        ret children.firstCharOrMinus1(tree) == c ? children : null;
63  
      tree.dummyNode.firstChar = c;
64  
      Node[] x = cast children;
65  
      int i = Arrays.binarySearch(x, tree.dummyNode, tree.childComparator);
66  
      ret i >= 0 ? x[i] : null;
67  
    }
68  
  }
69  
70  
  *() {}
71  
  *(S fullText) {
72  
    processText(fullText);
73  
  }
74  
  
75  
  void processText(S fullText) {
76  
    this.fullText = fullText;
77  
    root = new Node(0, 0);
78  
    ++nodeCount;
79  
    for i over fullText: {
80  
    //for (int i = l(fullText)-1; i >= 0; i--) {
81  
      addSuffix(Substring(fullText, i));
82  
      if (((i+1) % 1000000) == 0)
83  
        print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")");
84  
    }
85  
    doneBuilding();
86  
  }
87  
  
88  
  void doneBuilding {
89  
    tailMap = null;
90  
  }
91  
92  
  void addSuffix(Substring s) {
93  
    Node node = root;
94  
    
95  
    while (!empty(s)) {
96  
      int _n = lCommonPrefix_CharSequence(node.text(SuffixTree.this), s);
97  
      s = s.substring(_n);
98  
      if (_n >= node.lText()) { // node text exhausted
99  
        if (empty(s)) { // pattern also exhausted - done
100  
          //print("Exhausted: " + node.from + "/" + node.to);
101  
          // add a new termination node
102  
          Node nNew = newNode(s);
103  
          node.addChild(SuffixTree.this, nNew);
104  
          ret;
105  
        } else { // node text exhausted, pattern not exhausted
106  
          Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s));
107  
          if (n == null) {
108  
            n = newNode(s);
109  
            node.addChild(SuffixTree.this, n);
110  
            ret;
111  
          } else
112  
            node = n;
113  
        }
114  
      } else { // node text not exhausted, pattern not exhausted
115  
        // split node. first, move all the node's vitals to a new node nOld
116  
        Node nOld = newNode(node.from+_n, node.to);
117  
        nOld.children = node.children;
118  
        node.children = null;
119  
        node.to = node.from+_n;
120  
        node.addChild(SuffixTree.this, nOld);
121  
122  
        // now add a new node
123  
        Node nNew = newNode(s);
124  
        node.addChild(SuffixTree.this, nNew);
125  
        ret;
126  
      }
127  
    }
128  
  }
129  
130  
  public L<Int> indicesOf(S pattern) {
131  
    ret asList(indicesOf_iterator(pattern));
132  
  }
133  
134  
  public ItIt<Int> indicesOf_iterator(S pattern) {
135  
    ret mapI_notNull(allNodesUnder(scanDown(root, pattern)),
136  
      nad -> nad.node.isTerminal(this) ? nad.position() : null);
137  
  }
138  
  
139  
  srecord NodeAndDepth(Node node, int depth) {
140  
    int position() {
141  
      int position = node.from-depth;
142  
      //print("from=" + node.from + ", to=" + node.to + ", depth=" + depth + ", position=" + position);
143  
      ret position;
144  
    }
145  
    
146  
    Cl<NodeAndDepth> children() {
147  
      ret lmap wrapChild(node.children());
148  
    }
149  
    
150  
    NodeAndDepth wrapChild(Node n) {
151  
      ret n == null ? null : new NodeAndDepth(n, depth+node.lText());
152  
    }
153  
    
154  
    NodeAndDepth getChild(SuffixTree tree, int c) {
155  
      ret wrapChild(node.getChild(tree, c));
156  
    }
157  
  }
158  
159  
  NodeAndDepth scanDown(Node node, S pattern) {
160  
    int lPattern = l(pattern), iPattern = 0;
161  
    NodeAndDepth nad = new(node, 0);
162  
    while true {
163  
      int n = lCommonPrefix_CharSequence(nad.node.text(this), Substring(pattern, iPattern));
164  
      iPattern += n;
165  
      if (iPattern >= lPattern) break; // pattern exhausted - done
166  
      if (n < nad.node.lText()) null; // mismatch, exit
167  
      NodeAndDepth child = nad.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern));
168  
      if (child != null) continue with nad = child;
169  
      null;
170  
    }
171  
    
172  
    ret nad;
173  
  }
174  
  
175  
  void printMe() {
176  
    printNode("", "", new NodeAndDepth(root, 0));
177  
  }
178  
  
179  
  void printNode(S indent, S pre, NodeAndDepth nad) {
180  
    print(indent + pre + quote(shorten(20, nad.node.text(this))) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]"));
181  
    fOr (NodeAndDepth n : nad.children()) {
182  
      printNode(indent + "  ", "[" + (n.node.lText() == 0 ? "end" : quote(n.node.text(this).charAt(0))) + "] ", n);
183  
    }
184  
  }
185  
  
186  
  ItIt<NodeAndDepth> allNodes() {
187  
    ret allNodesUnder(new NodeAndDepth(root, 0));
188  
  }
189  
  
190  
  // includes the node itself
191  
  ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) {
192  
    new L<Iterator<NodeAndDepth>> stack;
193  
    if (nad != null)
194  
      stack.add(iteratorLL(nad));
195  
    ret iteratorFromFunction_if0(() -> {
196  
      while (nempty(stack)) {
197  
        if (!last(stack).hasNext())
198  
          popLast(stack);
199  
        else {
200  
          NodeAndDepth n = last(stack).next();
201  
          stack.add((Iterator) iterator(n.children()));
202  
          ret n;
203  
        }
204  
      }
205  
      null;
206  
    });
207  
  }
208  
209  
  // also query & update tailMap
210  
  Node newNode(Substring s) {  
211  
    Node nNew = useTailOptimization && s.endIndex() == l(fullText) ? mapGet(tailMap, s.startIndex()) : null;
212  
    if (nNew == null) {
213  
      nNew = new Node(s);
214  
      ++nodeCount;
215  
      if (useTailOptimization)
216  
        mapPut(tailMap, s.startIndex(), nNew);
217  
    } else {
218  
      print("Using existing node for index " + s.startIndex() + ":");
219  
      printNode("  ", "", new NodeAndDepth(nNew, 0));
220  
    }
221  
    ret nNew;
222  
  }
223  
  
224  
  Node newNode(int from, int to) {  
225  
    ret newNode(Substring(fullText, from, to));
226  
  }
227  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029202
Snippet name: SuffixTree [graph optimization doesn't work yet, dev.]
Eternal ID of this version: #1029202/95
Text MD5: c3467518c4ca65c5fc109d16d114fef0
Transpilation MD5: c43f4a33bde6daa13d936bf050ea280b
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-26 19:47:59
Source code size: 7072 bytes / 227 lines
Pitched / IR pitched: No / No
Views / Downloads: 376 / 749
Version history: 94 change(s)
Referenced in: [show references]