Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

181
LINES

< > BotCompany Repo | #1029205 // SuffixTree [old version, probably broken]

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (3173L/20K).

1  
sclass SuffixTree {
2  
  Node root;
3  
  S fullText;
4  
  S textWithTerminator; // all Substrings are constructed off this
5  
6  
  private static final String WORD_TERMINATION = "$";
7  
  private static final int POSITION_UNDEFINED = -1;
8  
  
9  
  sclass Node {
10  
    Substring text;
11  
    new L<Node> children;
12  
    int position;
13  
14  
    *(Substring *text, int *position) {}
15  
16  
    Substring getText() { ret text; }
17  
    void setText(Substring text) {
18  
      //print("Setting text: " + quote(this.text) + " => " + quote(text);
19  
      this.text = text;
20  
    }
21  
22  
    int getPosition() { ret position; }
23  
    void setPosition(int position) { this.position = position; }
24  
    L<Node> getChildren() { ret children; }
25  
    void setChildren(L<Node> children) { this.children = children; }
26  
27  
    S printTree(String depthIndicator) {
28  
        S str = "";
29  
        S positionStr = position > -1 ? "[" + position + "]" : "";
30  
        str += depthIndicator + text + positionStr + "\n";
31  
32  
        for (int i = 0; i < children.size(); i++)
33  
          str += children.get(i).printTree(depthIndicator + "\t");
34  
        return str;
35  
    }
36  
37  
    toString { ret printTree(""); }
38  
  }
39  
  
40  
  *(S text) {
41  
    textWithTerminator = text + WORD_TERMINATION;
42  
    root = new Node(Substring(textWithTerminator, 0, 0), POSITION_UNDEFINED);
43  
    for i over text:
44  
      addSuffix(Substring(textWithTerminator, i), i);
45  
    fullText = text;
46  
  }
47  
48  
  public LS markText(S pattern) {
49  
    //LOGGER.info("Searching for pattern \"{}\"", pattern);
50  
    new LS result;
51  
    L<Node> nodes = getAllNodesInTraversePath(pattern, root, false);
52  
53  
    if (nodes.size() > 0) {
54  
        Node lastNode = nodes.get(nodes.size() - 1);
55  
        if (lastNode != null) {
56  
            L<Int> positions = getPositions(lastNode);
57  
            sortInPlace(positions);
58  
            positions.forEach(m -> result.add((markPatternInText(m, pattern))));
59  
        }
60  
    }
61  
    return result;
62  
  }
63  
64  
  private void addSuffix(Substring suffix, int position) {
65  
      //print("Adding new suffix: " + quote(suffix));
66  
      List<Node> nodes = getAllNodesInTraversePath(suffix, root, true);
67  
      if (nodes.size() == 0) {
68  
          addChildNode(root, suffix, position);
69  
          //LOGGER.info("{}", printTree());
70  
      } else {
71  
          Node lastNode = nodes.remove(nodes.size() - 1);
72  
          Substring newText = suffix;
73  
          if (nodes.size() > 0) {
74  
              Substring existingSuffixUptoLastNode = joinSubstringObjects(map(nodes, a -> a.getText()));
75  
76  
              // Remove prefix from newText already included in parent
77  
              newText = newText.substring(existingSuffixUptoLastNode.length());
78  
          }
79  
          extendNode(lastNode, newText, position);
80  
          //LOGGER.info("{}", printTree());
81  
      }
82  
  }
83  
84  
  private List<Integer> getPositions(Node node) {
85  
      List<Integer> positions = new ArrayList<>();
86  
      if (node.getText()
87  
          /*.endsWith(WORD_TERMINATION)*/.endsAtEnd()) {
88  
          positions.add(node.getPosition());
89  
      }
90  
      for (int i = 0; i < node.getChildren()
91  
          .size(); i++) {
92  
          positions.addAll(getPositions(node.getChildren()
93  
              .get(i)));
94  
      }
95  
      return positions;
96  
  }
97  
98  
  private String markPatternInText(Integer startPosition, String pattern) {
99  
      String matchingTextLHS = fullText.substring(0, startPosition);
100  
      String matchingText = fullText.substring(startPosition, startPosition + pattern.length());
101  
      String matchingTextRHS = fullText.substring(startPosition + pattern.length());
102  
      return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS;
103  
  }
104  
105  
  private void addChildNode(Node parentNode, Substring text, int position) {
106  
      parentNode.getChildren()
107  
          .add(new Node(text, position));
108  
  }
109  
110  
  private void extendNode(Node node, Substring newText, int position) {
111  
      Substring currentText = node.getText();
112  
      Substring commonPrefix = getLongestCommonPrefix(currentText, newText);
113  
114  
      if (neq(commonPrefix, currentText)) {
115  
          Substring parentText = currentText.substring(0, commonPrefix.length());
116  
          Substring childText = currentText.substring(commonPrefix.length());
117  
          splitNodeToParentAndChild(node, parentText, childText);
118  
      }
119  
120  
      Substring remainingText = newText.substring(commonPrefix.length());
121  
      addChildNode(node, remainingText, position);
122  
  }
123  
124  
  private void splitNodeToParentAndChild(Node parentNode, Substring parentNewText, Substring childNewText) {
125  
      Node childNode = new Node(childNewText, parentNode.getPosition());
126  
127  
      while (parentNode.getChildren().size() > 0)
128  
        childNode.getChildren().add(parentNode.getChildren().remove(0));
129  
130  
      parentNode.getChildren().add(childNode);
131  
      parentNode.setText(parentNewText);
132  
      parentNode.setPosition(POSITION_UNDEFINED);
133  
  }
134  
135  
  private Substring getLongestCommonPrefix(Substring str1, Substring str2) {
136  
    int compareLength = Math.min(str1.length(), str2.length());
137  
    for (int i = 0; i < compareLength; i++)
138  
        if (str1.charAt(i) != str2.charAt(i))
139  
            ret str1.substring(0, i);
140  
    ret str1.substring(0, compareLength);
141  
  }
142  
143  
  private L<Node> getAllNodesInTraversePath(CharSequence pattern, Node startNode, boolean isAllowPartialMatch) {
144  
      new L<Node> nodes;
145  
      for (int i = 0; i < startNode.getChildren().size(); i++) {
146  
          Node currentNode = startNode.getChildren().get(i);
147  
          CharSequence nodeText = currentNode.getText();
148  
          if (empty(nodeText)) fail("Node text empty: " + currentNode);
149  
          if (pattern.charAt(0) == nodeText.charAt(0)) {
150  
              if (isAllowPartialMatch && pattern.length() <= nodeText.length()) {
151  
                  nodes.add(currentNode);
152  
                  return nodes;
153  
              }
154  
155  
              int compareLength = Math.min(nodeText.length(), pattern.length());
156  
              for (int j = 1; j < compareLength; j++) {
157  
                  if (pattern.charAt(j) != nodeText.charAt(j)) {
158  
                      if (isAllowPartialMatch) {
159  
                          nodes.add(currentNode);
160  
                      }
161  
                      return nodes;
162  
                  }
163  
              }
164  
165  
              nodes.add(currentNode);
166  
              if (pattern.length() > compareLength) {
167  
                  List<Node> nodes2 = getAllNodesInTraversePath(subCharSequence(pattern, compareLength), currentNode, isAllowPartialMatch);
168  
                  if (nodes2.size() > 0) {
169  
                      nodes.addAll(nodes2);
170  
                  } else if (!isAllowPartialMatch) {
171  
                      nodes.add(null);
172  
                  }
173  
              }
174  
              return nodes;
175  
          }
176  
      }
177  
      return nodes;
178  
  }
179  
180  
  S printTree() { ret root.printTree(""); }
181  
}

Author comment

Began life as a copy of #1029202

download  show line numbers  debug dex  old transpilations   

Travelled to 7 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1029205
Snippet name: SuffixTree [old version, probably broken]
Eternal ID of this version: #1029205/1
Text MD5: 97646c453c7be91d03dd7ca415771b05
Transpilation MD5: 221fe941078518b72620d6ece32b9954
Author: stefan
Category: javax / text searching
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2020-07-25 12:02:56
Source code size: 6827 bytes / 181 lines
Pitched / IR pitched: No / No
Views / Downloads: 142 / 189
Referenced in: [show references]