Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

104
LINES

< > BotCompany Repo | #1005238 // sentencesFromHTML

JavaX fragment (include)

1  
// Convenience entry point: runs a fresh SentencesFromHTML over the given
// HTML text and hands back the sentence list it collected.
static L<S> sentencesFromHTML(S html) {
  SentencesFromHTML extractor = new SentencesFromHTML();
  extractor.parseHTML(html);
  ret extractor.sentences;
}
6  
7  
sclass SentencesFromHTML {
8  
  new L<S> sentences;
9  
  bool withQuestions = true;
10  
11  
  void parseHTML(S html) {
12  
    // tokenize and clean up html
13  
    L<S> tok = htmlcoarsetok(html);
14  
    tok = removeScripts(tok);
15  
    tok = dropTags(tok, "span");
16  
    tok = dropTags(tok, "a");
17  
    tok = dropTags(tok, "b");
18  
    tok = dropTags(tok, "small");
19  
    
20  
    // tok = dropAllTags(tok); // Too much!
21  
    //print(structure(tok));
22  
  
23  
    for (int i = 0; i < tok.size(); i += 2) {
24  
      S line = tok.get(i).trim();
25  
      if (line.startsWith("<!--")) continue; // HTML comment
26  
      if (line.length() == 0) continue;
27  
      //print("Line: " + quote(line));
28  
      interpretLine(line);
29  
    }
30  
  }
31  
32  
  static boolean hasCharacters(S s) {
33  
    for (int i = 0; i < s.length(); i++)
34  
      if (Character.isLetter(s.charAt(i)))
35  
        ret true;
36  
    ret false;
37  
  }
38  
  
39  
  void interpretLine(S s) {
40  
    for (S sentence : getSentences(s)) {
41  
      char first = sentence.charAt(0);
42  
      if (Character.isLowerCase(first) || ",;:=".indexOf(first) >= 0) continue;
43  
      if (!hasCharacters(sentence)) continue;
44  
      //L<S> tok = javaTok(sentence);
45  
      //print(structure(tok));
46  
      sentence = htmldecode(sentence);
47  
      //print("Sentence found: " + quote(sentence));
48  
      sentences.add(sentence);
49  
    }
50  
  }
51  
  
52  
  L<S> getSentences(S s) {
53  
    L<S> tok = javaTok(s); // To parse quoted things
54  
    fixSpaces(tok);
55  
    new L<S> list;
56  
    int i = 0;
57  
    while (true) {
58  
      int j = i;
59  
      do {
60  
        j = smartIndexOfMulti(tok, withQuestions ? ll(".", "?") : ll("."), j+1);
61  
        if (j >= l(tok)) return list;
62  
      } while (j+1 < tok.size()-1 && tok.get(j+1).equals("")); // matches stuff like "9.5"
63  
      
64  
      S sentence = join(tok.subList(i, j+1)).trim();
65  
      if (sentence.length() > 1)
66  
        list.add(sentence);
67  
      i = j+1;
68  
    }
69  
  }
70  
  
71  
  // So everything's on one line.
72  
  static void fixSpaces(L<S> tok) {
73  
    for (int i = 0; i < l(tok); i += 2)
74  
      tok.set(i, tok.get(i).equals("") ? "" : " ");
75  
  }
76  
  
77  
  static L<S> removeScripts(L<S> tok) {
78  
    tok = new ArrayList<S>(tok);
79  
    for (int i = 1; i < tok.size(); )
80  
      if (tagIs(tok.get(i), "script")) {
81  
        int j = i;
82  
        while (j < tok.size() && !tagIs(tok.get(j), "/script"))
83  
          j += 2;
84  
        while (j+1 > i-1) { // Yeah it's inefficient...
85  
          tok.remove(i-1); --j; 
86  
        }
87  
      } else
88  
        i += 2;
89  
    return tok;
90  
  }
91  
  
92  
  static L<S> dropTags(L<S> tok, S tag) {
93  
    new L<S> list;
94  
    for (int i = 0; i < tok.size(); i++) {
95  
      S t = tok.get(i);
96  
      if (tagIs(t, tag) || tagIs(t, "/" + tag)) {
97  
        list.set(list.size()-1, list.get(list.size()-1) + tok.get(i+1));
98  
        ++i;
99  
      } else
100  
        list.add(tok.get(i));
101  
    }
102  
    return list;
103  
  }
104  
}

Author comment

Began life as a copy of #1000858

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1005238
Snippet name: sentencesFromHTML
Eternal ID of this version: #1005238/1
Text MD5: 81d9b637720506fe3b03c5d7c2a9d0b3
Author: stefan
Category: javax / text analysis
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-10-26 17:08:44
Source code size: 2934 bytes / 104 lines
Pitched / IR pitched: No / No
Views / Downloads: 468 / 491
Referenced in: [show references]