Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

181
LINES

< > BotCompany Repo | #1029205 - SuffixTree [old version, probably broken]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (3173L/20K).

sclass SuffixTree {
  Node root;
  S fullText;
  S textWithTerminator; // all Substrings are constructed off this

  private static final String WORD_TERMINATION = "$";
  private static final int POSITION_UNDEFINED = -1;
  
  sclass Node {
    Substring text;
    new L<Node> children;
    int position;

    *(Substring *text, int *position) {}

    Substring getText() { ret text; }
    void setText(Substring text) {
      //print("Setting text: " + quote(this.text) + " => " + quote(text);
      this.text = text;
    }

    int getPosition() { ret position; }
    void setPosition(int position) { this.position = position; }
    L<Node> getChildren() { ret children; }
    void setChildren(L<Node> children) { this.children = children; }

    S printTree(String depthIndicator) {
        S str = "";
        S positionStr = position > -1 ? "[" + position + "]" : "";
        str += depthIndicator + text + positionStr + "\n";

        for (int i = 0; i < children.size(); i++)
          str += children.get(i).printTree(depthIndicator + "\t");
        return str;
    }

    toString { ret printTree(""); }
  }
  
  *(S text) {
    textWithTerminator = text + WORD_TERMINATION;
    root = new Node(Substring(textWithTerminator, 0, 0), POSITION_UNDEFINED);
    for i over text:
      addSuffix(Substring(textWithTerminator, i), i);
    fullText = text;
  }

  public LS markText(S pattern) {
    //LOGGER.info("Searching for pattern \"{}\"", pattern);
    new LS result;
    L<Node> nodes = getAllNodesInTraversePath(pattern, root, false);

    if (nodes.size() > 0) {
        Node lastNode = nodes.get(nodes.size() - 1);
        if (lastNode != null) {
            L<Int> positions = getPositions(lastNode);
            sortInPlace(positions);
            positions.forEach(m -> result.add((markPatternInText(m, pattern))));
        }
    }
    return result;
  }

  private void addSuffix(Substring suffix, int position) {
      //print("Adding new suffix: " + quote(suffix));
      List<Node> nodes = getAllNodesInTraversePath(suffix, root, true);
      if (nodes.size() == 0) {
          addChildNode(root, suffix, position);
          //LOGGER.info("{}", printTree());
      } else {
          Node lastNode = nodes.remove(nodes.size() - 1);
          Substring newText = suffix;
          if (nodes.size() > 0) {
              Substring existingSuffixUptoLastNode = joinSubstringObjects(map(nodes, a -> a.getText()));

              // Remove prefix from newText already included in parent
              newText = newText.substring(existingSuffixUptoLastNode.length());
          }
          extendNode(lastNode, newText, position);
          //LOGGER.info("{}", printTree());
      }
  }

  private List<Integer> getPositions(Node node) {
      List<Integer> positions = new ArrayList<>();
      if (node.getText()
          /*.endsWith(WORD_TERMINATION)*/.endsAtEnd()) {
          positions.add(node.getPosition());
      }
      for (int i = 0; i < node.getChildren()
          .size(); i++) {
          positions.addAll(getPositions(node.getChildren()
              .get(i)));
      }
      return positions;
  }

  private String markPatternInText(Integer startPosition, String pattern) {
      String matchingTextLHS = fullText.substring(0, startPosition);
      String matchingText = fullText.substring(startPosition, startPosition + pattern.length());
      String matchingTextRHS = fullText.substring(startPosition + pattern.length());
      return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS;
  }

  private void addChildNode(Node parentNode, Substring text, int position) {
      parentNode.getChildren()
          .add(new Node(text, position));
  }

  private void extendNode(Node node, Substring newText, int position) {
      Substring currentText = node.getText();
      Substring commonPrefix = getLongestCommonPrefix(currentText, newText);

      if (neq(commonPrefix, currentText)) {
          Substring parentText = currentText.substring(0, commonPrefix.length());
          Substring childText = currentText.substring(commonPrefix.length());
          splitNodeToParentAndChild(node, parentText, childText);
      }

      Substring remainingText = newText.substring(commonPrefix.length());
      addChildNode(node, remainingText, position);
  }

  private void splitNodeToParentAndChild(Node parentNode, Substring parentNewText, Substring childNewText) {
      Node childNode = new Node(childNewText, parentNode.getPosition());

      while (parentNode.getChildren().size() > 0)
        childNode.getChildren().add(parentNode.getChildren().remove(0));

      parentNode.getChildren().add(childNode);
      parentNode.setText(parentNewText);
      parentNode.setPosition(POSITION_UNDEFINED);
  }

  private Substring getLongestCommonPrefix(Substring str1, Substring str2) {
    int compareLength = Math.min(str1.length(), str2.length());
    for (int i = 0; i < compareLength; i++)
        if (str1.charAt(i) != str2.charAt(i))
            ret str1.substring(0, i);
    ret str1.substring(0, compareLength);
  }

  private L<Node> getAllNodesInTraversePath(CharSequence pattern, Node startNode, boolean isAllowPartialMatch) {
      new L<Node> nodes;
      for (int i = 0; i < startNode.getChildren().size(); i++) {
          Node currentNode = startNode.getChildren().get(i);
          CharSequence nodeText = currentNode.getText();
          if (empty(nodeText)) fail("Node text empty: " + currentNode);
          if (pattern.charAt(0) == nodeText.charAt(0)) {
              if (isAllowPartialMatch && pattern.length() <= nodeText.length()) {
                  nodes.add(currentNode);
                  return nodes;
              }

              int compareLength = Math.min(nodeText.length(), pattern.length());
              for (int j = 1; j < compareLength; j++) {
                  if (pattern.charAt(j) != nodeText.charAt(j)) {
                      if (isAllowPartialMatch) {
                          nodes.add(currentNode);
                      }
                      return nodes;
                  }
              }

              nodes.add(currentNode);
              if (pattern.length() > compareLength) {
                  List<Node> nodes2 = getAllNodesInTraversePath(subCharSequence(pattern, compareLength), currentNode, isAllowPartialMatch);
                  if (nodes2.size() > 0) {
                      nodes.addAll(nodes2);
                  } else if (!isAllowPartialMatch) {
                      nodes.add(null);
                  }
              }
              return nodes;
          }
      }
      return nodes;
  }

  S printTree() { ret root.printTree(""); }
}

Author comment

Began life as a copy of #1029202

download  show line numbers  debug dex   

Travelled to 6 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029205
Snippet name: SuffixTree [old version, probably broken]
Eternal ID of this version: #1029205/1
Text MD5: 97646c453c7be91d03dd7ca415771b05
Transpilation MD5: 221fe941078518b72620d6ece32b9954
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-25 12:02:56
Source code size: 6827 bytes / 181 lines
Pitched / IR pitched: No / No
Views / Downloads: 32 / 53
Referenced in: [show references]

Formerly at http://tinybrain.de/1029205 & http://1029205.tinybrain.de