Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

166
LINES

< > BotCompany Repo | #1029213 // SuffixTree [partly optimized, not sure if correct]

JavaX fragment (include)

// A naive suffix tree implementation
sclass SuffixTree {
  Node root;
  S fullText;
  int nodeCount;
  
  Comparator<IFirstChar> childComparator = (a, b) -> a.firstCharOrMinus1()-b.firstCharOrMinus1();
  new DummyNode dummyNode;
  
  sinterface IFirstChar {
    int firstCharOrMinus1();
  }
  
  sclass DummyNode implements IFirstChar {
    int firstChar;
    
    public int firstCharOrMinus1() { ret firstChar; }
  }

  sclass Node implements IFirstChar {
    Substring text;
    CompactTreeSet<IFirstChar> children; // actually, <Node>
    IntBuffer positions;

    *() {}
    *(Substring *text) {}
    
    void addPosition(int i) {
      if (positions == null) positions = new IntBuffer;
      positions.add(i);
    }
    
    void addChild(SuffixTree tree, Node n) {
      if (children == null) children = makeEmptyChildrenSet(tree);
      children.add(n);
    }
    
    CompactTreeSet<IFirstChar> makeEmptyChildrenSet(SuffixTree tree) { ret new CompactTreeSet<>(tree.childComparator); }
    
    public int firstCharOrMinus1() {
      ret charToIntOrMinus1(text);
    }
    
    Node getChild(SuffixTree tree, int c) {
      if (children == null) null;
      tree.dummyNode.firstChar = c;
      ret (Node) children.find(tree.dummyNode);
    }
  }

  *() {}
  *(S *fullText) {
    root = new Node(Substring(fullText, 0, 0));
    ++nodeCount;
    for i over fullText: {
      addSuffix(Substring(fullText, i), i);
      if (((i+1) % 1000000) == 0)
        print((i+1) + " suffixes added (" + nNodes(nodeCount) + ")");
    }
  }

  void addSuffix(Substring s, int position) {
    Node node = root;
    
    while (!empty(s)) {
      int _n = lCommonPrefix_CharSequence(node.text, s);
      s = s.substring(_n);
      if (_n >= l(node.text)) { // node text exhausted
        if (empty(s)) { // pattern also exhausted - done
          node.addPosition(position);
          ret;
        } else {
          Node n = node.getChild(SuffixTree.this, charToIntOrMinus1(s));
          if (n == null) {
            n = new Node(s);
            ++nodeCount;
            n.addPosition(position);
            node.addChild(SuffixTree.this, n);
            ret;
          } else
            node = n;
        }
      } else { // node text not exhausted
        // split node. first, move all the node's vitals to a new node nOld
        Node nOld = new Node(node.text.substring(_n));
        ++nodeCount;
        nOld.positions = node.positions;
        node.positions = null;
        nOld.children = node.children;
        node.children = null;
        node.text = node.text.substring(0, _n);
        node.addChild(SuffixTree.this, nOld);

        // now add a new node
        Node nNew = new Node(s);
        ++nodeCount;
        nNew.addPosition(position);
        node.addChild(SuffixTree.this, nNew);
        ret;
      }
    }
  }

  public L<Int> indicesOf(S pattern) {
    new IntBuffer out;
    getIndicesOf(out, root, pattern, 0);
    ret out.asVirtualList();
  }
  
  void getIndicesOf(IntBuffer out, Node node, S pattern, int iPattern) {
    int lPattern = l(pattern);
    while true {
      int n = lCommonPrefix_CharSequence(node.text, Substring(pattern, iPattern));
      iPattern += n;
      if (iPattern >= lPattern) break; // pattern exhausted - done
      if (n < l(node.text)) ret; // mismatch, exit
      Node child = node.getChild(SuffixTree.this, charAtAsIntOrMinus1(pattern, iPattern));
      if (child != null) continue with node = child;
      ret;
    }
    
    collectPositions(out, node);
  }
  
  L<Int> getPositions(Node node) {
    new IntBuffer out;
    collectPositions(out, node);
    ret out.asVirtualList();
  }
  
  void collectPositions(IntBuffer out, Node node) {
    if (node == null) ret;
    out.addAll(node.positions);
    fOr (IFirstChar n : node.children)
      collectPositions(out, (Node) n);
  }
  
  void printMe() {
    printNode("", "", root);
  }
  
  void printNode(S indent, S pre, Node node) {
    print(indent + pre + quote(shorten(20, node.text)) + (empty(node.positions) ? "" : " " + node.positions));
    fOr (IFirstChar _n : node.children) {
      Node n = cast _n;
      printNode(indent + "  ", "[" + (empty(n.text) ? "end" : quote(n.text.charAt(0))) + "] ", n);
    }
  }
  
  ItIt<Node> allNodes() {
    new L<Iterator<Node>> stack;
    stack.add(iteratorLL(root));
    ret iteratorFromFunction_if0(() -> {
      while (nempty(stack)) {
        if (!last(stack).hasNext())
          popLast(stack);
        else {
          Node n = last(stack).next();
          stack.add((Iterator) iterator(n.children));
          ret n;
        }
      }
      null;
    });
  }
}

Author comment

Began life as a copy of #1029202

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029213
Snippet name: SuffixTree [partly optimized, not sure if correct]
Eternal ID of this version: #1029213/1
Text MD5: 14996bbefb96fbd55e8a2f99d75f03ed
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-25 14:25:38
Source code size: 4791 bytes / 166 lines
Pitched / IR pitched: No / No
Views / Downloads: 111 / 125
Referenced in: [show references]