Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

195
LINES

< > BotCompany Repo | #1029269 // IntSuffixTree [OK, but slow creation algorithm]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (14874L/101K).

1  
// A naive suffix tree implementation
2  
sclass IntSuffixTree {
3  
  Node root;
4  
  L<Int> fullText;
5  
  int nodeCount;
6  
  
7  
  Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1(this)-b.firstCharOrMinus1(this);
8  
  new DummyNode dummyNode;
9  
  int printEvery = 1000000;
10  
  
11  
  sinterface IFirstChar {
12  
    int firstCharOrMinus1(IntSuffixTree tree);
13  
  }
14  
  
15  
  sclass DummyNode implements IFirstChar {
16  
    int firstChar;
17  
    
18  
    public int firstCharOrMinus1(IntSuffixTree tree) { ret firstChar; }
19  
  }
20  
21  
  sclass Node implements IFirstChar {
22  
    int from, to; // our range in fullText
23  
    CompactTreeSet<IFirstChar> children;
24  
25  
    *() {}
26  
    *(int *from, int *to) {}
27  
    *(IntRange s) { from = s.start; to = s.end; }
28  
    
29  
    L<Int> text(IntSuffixTree tree) { ret subList(tree.fullText, from, to); }
30  
    
31  
    int lText() { ret to-from; }
32  
    
33  
    bool isTerminal(IntSuffixTree tree) { ret to == l(tree.fullText); }
34  
    
35  
    void addChild(IntSuffixTree tree, Node n) {
36  
      if (children == null) children = makeEmptyChildrenSet(tree);
37  
      children.add(n);
38  
    }
39  
    
40  
    CompactTreeSet<IFirstChar> makeEmptyChildrenSet(IntSuffixTree tree) {
41  
      ret new CompactTreeSet<>(tree.childComparator);
42  
    }
43  
        
44  
    Cl<IFirstChar> children() {
45  
      ret children;
46  
    }
47  
    
48  
    public int firstCharOrMinus1(IntSuffixTree tree) {
49  
      ret getOrMinus1(tree.fullText, from);
50  
    }
51  
    
52  
    Node getChild(IntSuffixTree tree, int c) {
53  
      if (children == null) null;
54  
      tree.dummyNode.firstChar = c;
55  
      ret (Node) children.find(tree.dummyNode);
56  
    }
57  
  }
58  
59  
  *() {}
60  
  *(L<Int> fullText) {
61  
    process(fullText);
62  
  }
63  
  
64  
  void process(L<Int> fullText) {
65  
    root = new Node(0, 0);
66  
    ++nodeCount;
67  
    for i over fullText: {
68  
      ping();
69  
      addSuffix(IntRange(i, l(fullText)));
70  
      if (((i+1) % printEvery) == 0)
71  
        print((i+1) + "/" + l(fullText ) + " suffixes added (" + nNodes(nodeCount) + ")");
72  
    }
73  
  }
74  
75  
  void addSuffix(IntRange s) {
76  
    Node node = root;
77  
    
78  
    while (!empty(s)) {
79  
      int _n = lCommonPrefix_lists_ext(fullText, node.from, fullText, s.start);
80  
      s.start += _n;
81  
      if (_n >= node.lText()) { // node text exhausted
82  
        if (empty(s)) { // pattern also exhausted - done
83  
          //print("Exhausted: " + node.from + "/" + node.to);
84  
          // add a new termination node
85  
          Node nNew = new Node(s);
86  
          ++nodeCount;
87  
          node.addChild(IntSuffixTree.this, nNew);
88  
          ret;
89  
        } else {
90  
          Node n = node.getChild(IntSuffixTree.this, getOrMinus1(fullText, s.start));
91  
          if (n == null) {
92  
            n = new Node(s);
93  
            ++nodeCount;
94  
            node.addChild(IntSuffixTree.this, n);
95  
            ret;
96  
          } else
97  
            node = n;
98  
        }
99  
      } else { // node text not exhausted
100  
        // split node. first, move all the node's vitals to a new node nOld
101  
        Node nOld = new Node(node.from+_n, node.to);
102  
        ++nodeCount;
103  
        nOld.children = node.children;
104  
        node.children = null;
105  
        node.to = node.from+_n;
106  
        node.addChild(IntSuffixTree.this, nOld);
107  
108  
        // now add a new node
109  
        Node nNew = new Node(s);
110  
        ++nodeCount;
111  
        node.addChild(IntSuffixTree.this, nNew);
112  
        ret;
113  
      }
114  
    }
115  
  }
116  
117  
  public L<Int> indicesOf(L<Int> pattern) {
118  
    ret asList(indicesOf_iterator(pattern));
119  
  }
120  
121  
  public ItIt<Int> indicesOf_iterator(L<Int> pattern) {
122  
    ret mapI_notNull(allNodesUnder(scanDown(root, pattern)),
123  
      nad -> nad.node.isTerminal(this) ? nad.position() : null);
124  
  }
125  
  
126  
  srecord NodeAndDepth(Node node, int depth) {
127  
    int position() {
128  
      int position = node.from-depth;
129  
      //print("from=" + node.from + ", to=" + node.to + ", depth=" + depth + ", position=" + position);
130  
      ret position;
131  
    }
132  
    
133  
    Cl<NodeAndDepth> children() {
134  
      ret lmap wrapChild((Cl<Node>) (Cl) node.children());
135  
    }
136  
    
137  
    NodeAndDepth wrapChild(Node n) {
138  
      ret n == null ? null : new NodeAndDepth(n, depth+node.lText());
139  
    }
140  
    
141  
    NodeAndDepth getChild(IntSuffixTree tree, int c) {
142  
      ret wrapChild(node.getChild(tree, c));
143  
    }
144  
  }
145  
146  
  NodeAndDepth scanDown(Node node, L<Int> pattern) {
147  
    int lPattern = l(pattern), iPattern = 0;
148  
    NodeAndDepth nad = new(node, 0);
149  
    while true {
150  
      int n = lCommonPrefix_lists(nad.node.text(this), subList(pattern, iPattern));
151  
      iPattern += n;
152  
      if (iPattern >= lPattern) break; // pattern exhausted - done
153  
      if (n < nad.node.lText()) null; // mismatch, exit
154  
      NodeAndDepth child = nad.getChild(IntSuffixTree.this, or(get(pattern, iPattern), -1));
155  
      if (child != null) continue with nad = child;
156  
      null;
157  
    }
158  
    
159  
    ret nad;
160  
  }
161  
  
162  
  void printMe() {
163  
    printNode("", "", new NodeAndDepth(root, 0));
164  
  }
165  
  
166  
  void printNode(S indent, S pre, NodeAndDepth nad) {
167  
    print(indent + pre + takeFirst(20, nad.node.text(this)) + (!nad.node.isTerminal(this) ? "" : " [" + nad.position() + "]"));
168  
    fOr (NodeAndDepth n : nad.children()) {
169  
      printNode(indent + "  ", takeFirst(1, n.node.text(this)) + " ", n);
170  
    }
171  
  }
172  
  
173  
  ItIt<NodeAndDepth> allNodes() {
174  
    ret allNodesUnder(new NodeAndDepth(root, 0));
175  
  }
176  
  
177  
  // includes the node itself
178  
  ItIt<NodeAndDepth> allNodesUnder(NodeAndDepth nad) {
179  
    new L<Iterator<NodeAndDepth>> stack;
180  
    if (nad != null)
181  
      stack.add(iteratorLL(nad));
182  
    ret iteratorFromFunction_if0(() -> {
183  
      while (nempty(stack)) {
184  
        if (!last(stack).hasNext())
185  
          popLast(stack);
186  
        else {
187  
          NodeAndDepth n = last(stack).next();
188  
          stack.add((Iterator) iterator(n.children()));
189  
          ret n;
190  
        }
191  
      }
192  
      null;
193  
    });
194  
  }
195  
}

Author comment

Began life as a copy of #1029234

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029269
Snippet name: IntSuffixTree [OK, but slow creation algorithm]
Eternal ID of this version: #1029269/27
Text MD5: 5fea51137dfa0e7387a3bf19bc548a98
Transpilation MD5: cd8ca402060f10eb680c26e8d4928d81
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-29 20:41:19
Source code size: 5841 bytes / 195 lines
Pitched / IR pitched: No / No
Views / Downloads: 229 / 561
Version history: 26 change(s)
Referenced in: [show references]