Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

250
LINES

< > BotCompany Repo | #1002307 // NL Parser, hotwirable

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (3112L/21K/68K).

1  
// Idea: For every position, store the productions recognized to start there, then infer up to higher classes
2  
3  
!752
4  
5  
// a recognition is identified by (startPos, className, endPos)
6  
7  
8  
// key 1 = start position, key 2 = class name, value = end position
9  
// Recognition table. Replaced with a fresh TreeMap by every parseTop call and
// read by matchProd, explainMatch2 and prettierAnalysis.
static Map<Integer, MultiMap<S, Integer>> recog;
10  
// Token list of the most recent input, as returned by javaTok in parse().
// The parser visits every second index of it, starting at index 1.
static L<S> tok;
11  
12  
// Grammar: class name -> token lists of the productions that yield that class.
// Rebuilt from the rules text at the start of every parse(S, S, S) call.
static MultiMap<S, L<S>> productionMap;
13  
14  
// When true, debug(S, Pos) prints its message; otherwise it stays silent.
static boolean debug = false;
15  
16  
// Set by parse(): difference of now() after and before the parseTop call
// (presumably milliseconds - depends on what now() returns; TODO confirm).
static long timing;
17  
18  
// When non-null, parseTop seeds an "adjective" recognition for every token
// whose lower-cased text is contained in this set.
static Set<S> adjectives; // list of all adjectives - filled from outside
19  
20  
21  
// Snippet ID of the rules text that parse(S) passes on as parserRules.
static S defaultParserRules = "#1002281";
22  
// Class name that parse(S) passes on as the main (top-level) production.
static S defaultMainProd = "line";
23  
24  
// Program entry point. Starts the bot named "Snippet Text Bot" from snippet
// #1002084 - the same bot name loadSnippetThroughBot sends its question to.
p {
  startBot("Snippet Text Bot", "#1002084");
}
27  
28  
// Convenience overload: parses inputText with the default rules snippet
// (defaultParserRules) and the default top-level class (defaultMainProd).
static synchronized S parse(S inputText) {
  S rules = defaultParserRules, top = defaultMainProd;
  ret parse(inputText, rules, top);
}
31  
32  
static synchronized S parse(S inputText, S parserRules, S mainProd) {
33  
  S rulesText = loadSnippetThroughBot(parserRules);
34  
35  
  productionMap = new MultiMap;
36  
  for (S rule : toLinesFullTrim(rulesText)) pcall {
37  
    //printF("Processing rule: *", rule);
38  
    L<S> lr = splitAtJavaToken(rule, "=");
39  
    if (l(lr) != 2) {
40  
      print("Weird rule: " + rule);
41  
      continue;
42  
    }
43  
    S l = lr.get(0), r = lr.get(1);
44  
    L<S> tokr = javaTok(r);
45  
    assertEquals(structure(tokr), 3, l(tokr));
46  
    S className = assertIdentifier(get(tokr, 1));
47  
    L<S> tok = javaTok(l);
48  
    tok = mergeBracketThingies(tok);
49  
    //printStructure(tok);
50  
    productionMap.put(className, tok);
51  
  }
52  
  
53  
  print(n(productionMap.size(), "production") + ".");
54  
  print();
55  
  
56  
  timing = now();
57  
  tok = javaTok(inputText);
58  
  Pos pos = new Pos(tok);
59  
  L<Integer> x = parseTop(pos, mainProd);
60  
  S result;
61  
  timing = now()-timing;
62  
  if (x.contains(l(tok)))
63  
    result = "parsed";
64  
  else if (!empty(x))
65  
    result = "beginning matches";
66  
  else
67  
    ret "not parsed";
68  
  L out = explainMatch(new Pos(tok), x.get(0), mainProd);
69  
  ret result + ", detailed analysis: " + structure(out);
70  
}
71  
72  
// Fills the recognition table recog for the tokens of pos.tok from index
// pos.i onwards and returns the end positions recorded for mainProd at pos.i.
//
// recog is replaced by a fresh TreeMap holding one empty MultiMap per visited
// token index (pos.i, pos.i+2, ...). If adjectives is set, matching tokens get
// an "adjective" recognition ending two indices later. Afterwards every
// production of every class is matched at every position, again and again,
// until a full pass adds no new end position.
//
// Returns an empty list when there is no token at pos.i (for example for
// empty input) instead of failing with a NullPointerException.
static L<Integer> parseTop(Pos pos, S mainProd) {
  // init structures
  recog = new TreeMap;
  for (int i = pos.i; i < l(pos.tok); i += 2)
    recog.put(i, new MultiMap);
  print("parser: recog inited " + pos.i + " to " + l(pos.tok));

  // adjectives
  if (adjectives != null)
    for (int i = pos.i; i < l(pos.tok); i += 2) {
      S t = pos.tok.get(i);
      if (adjectives.contains(t.toLowerCase()))
        recog.get(i).setPut("adjective", i+2);
    }

  boolean anyChange;
  do {
    anyChange = false;
    for (int i = pos.i; i < l(pos.tok); i += 2) {
      Pos pos2 = new Pos(pos.tok, i);
      for (S className : productionMap.keySet()) {
        MultiMap<S, Integer> rr = recog.get(i);
        // getActual hands out the stored list, so matchProd's additions
        // land directly in recog.
        L<Integer> recs = rr.getActual(className);
        L<L<S>> prods = productionMap.get(className);
        for (L<S> prod : prods) {
          int n = l(recs);
          matchProd(pos2, new Pos(prod), className, recs);
          anyChange = anyChange || l(recs) > n;
        }
        rr.clean(className);
      }
    }
  } while (anyChange);

  // No entry exists for pos.i when the token list has no token there.
  MultiMap<S, Integer> atStart = recog.get(pos.i);
  if (atStart == null)
    ret new ArrayList<Integer>();
  ret atStart.get(mainProd);
}
109  
110  
// A cursor into a token list: the list itself plus a current index.
// The index defaults to 1 and the parser moves it in steps of 2.
static class Pos {
  L<S> tok;
  int i = 1;

  *() {}
  *(L<S> *tok) {}
  *(L<S> *tok, int *i) {}

  // True once the index has reached the last list element or lies beyond it.
  boolean end() { ret i >= l(tok)-1; }

  // The token at the current index.
  S get() { ret tok.get(i); }

  public Pos clone() { ret new Pos(tok, i); }

  // Two positions are equal when they point into the very same list object
  // (identity, not content) at the same index.
  public boolean equals(O o) {
    if (!(o instanceof Pos)) ret false;
    Pos pos = cast o;
    ret tok == pos.tok && i == pos.i;
  }

  // Kept consistent with equals: the list contributes its identity hash,
  // because equals compares the list by reference.
  public int hashCode() {
    ret 31 * System.identityHashCode(tok) + i;
  }

  // All list elements from the current index onwards, joined into one string.
  S rest() {
    ret join(subList(tok, i));
  }

  // A new position on the same list, x indices further on.
  Pos plus(int x) { ret new Pos(tok, i + x); }
}
133  
134  
static void copy(Pos a, Pos b) {
135  
  b.tok = a.tok;
136  
  b.i = a.i;
137  
}
138  
139  
// Debug output: if the debug flag is set, prints bla followed by the quoted
// remainder of the token list at pos.
static void debug(S bla, Pos pos) {
  if (!debug) ret;
  S remainder = quote(pos.rest());
  print(bla + " on " + remainder);
}
143  
144  
static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
145  
  if (prod.end())
146  
    setAdd(out, pos.i);
147  
  else if (pos.end())
148  
    ret;
149  
  else {
150  
    S p = prod.get();
151  
    
152  
    if (isBracketedID(p) && !specials.contains(p)) {
153  
      MultiMap<S, Integer> rr = recog.get(pos.i);
154  
      if (rr == null)
155  
        fail("parser: recog null at " + pos.i);
156  
      L<Integer> r = rr.get(unbracket(p));
157  
      
158  
      // keep parsing for every option
159  
  
160  
      for (int i : cloneList(r))
161  
        matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out);
162  
      
163  
    } else {
164  
      // it's a literal
165  
      S t = pos.get();
166  
      if (!matchToken(p, t))
167  
        ret;
168  
      
169  
      matchProd(pos.plus(2), prod.plus(2), forClass, out);
170  
    }
171  
  }
172  
}
173  
174  
// Bracketed production tokens that matchProd does NOT treat as references to
// grammar classes; matchToken checks them directly against the input token.
static L<S> specials = litlist("<quoted>", "<int>", "<identifier>");
175  
176  
static boolean matchToken(S p, S t) {
177  
  if (eq(p, "<quoted>")) {
178  
    if (!isQuoted(t)) ret false;
179  
  } else if (eq(p, "<int>")) {
180  
    if (!isInteger(t)) ret false;
181  
  } else if (eq(p, "<identifier>")) {
182  
    if (!isIdentifier(t)) ret false;
183  
  } else if (!(eq(p, "*") || eqic(p, t)))
184  
    ret false; // token mismatch
185  
  ret true;
186  
}
187  
188  
// assumes that there is a match (pos, class, endPos)
189  
// and gives explanations of how it was probably matched
190  
// Collects explanations for a recognition (pos, forClass, endPos) by trying
// every production registered for forClass. Each attempt starts with a fresh
// match list holding the class name and the joined production text.
static L explainMatch(Pos pos, int endPos, S forClass) {
  new L out;
  for (L<S> prod : productionMap.get(forClass)) {
    L match = litlist(forClass, join(prod));
    explainMatch2(pos, new Pos(prod), endPos, forClass, match, out);
  }
  ret out;
}
197  
198  
// same, but with fixed production
199  
static void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) {
200  
  if (prod.end()) {
201  
    if (pos.i == endPos)
202  
      out.add(cloneList(match));
203  
  } else if (pos.end())
204  
    ret;
205  
  else {
206  
    S p = prod.get();
207  
    
208  
    if (isBracketedID(p) && neq(p, "<quoted>")) {
209  
      S className = unbracket(p);
210  
      L<Integer> r = recog.get(pos.i).get(className);
211  
212  
      // keep parsing for every option
213  
  
214  
      for (int i : cloneList(r)) {
215  
        match.add(litlist(pos.i, i));
216  
        explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out);
217  
        removeLast(match);
218  
      }
219  
    } else {
220  
      // it's a literal
221  
      S t = pos.get();
222  
      if (!matchToken(p, t))
223  
        ret;
224  
      
225  
      //match.add(litlist(pos.i, p, pos.i+2));
226  
      explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out);
227  
      removeLast(match);
228  
    }
229  
  }
230  
}
231  
232  
// Asks the local "Snippet Text Bot" for the text of the given snippet.
// The answer has to match "the text of snippet * is *" (asserted); the second
// wildcard is returned through Matches.unq(1).
static S loadSnippetThroughBot(S snippetID) {
  S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
  new Matches m;
  boolean understood = match("the text of snippet * is *", answer, m);
  assertTrue(understood);
  ret m.unq(1);
}
238  
239  
// Renders the recognition table as text, one line per visited token index of
// tok: the token, " : ", and the structure of a list of "<class> <n>" entries,
// where n = (largest recorded end position - token index) / 2.
// Reads the static tok and recog left behind by the last parse.
static S prettierAnalysis() {
  new L<S> lines;
  for (int i = 1; i < l(tok); i += 2) {
    MultiMap<S, Integer> rr = recog.get(i);
    new L<S> entries;
    for (S prod : rr.keys()) {
      L<Integer> ends = rr.get(prod);
      entries.add(prod + " " + (max(ends)-i)/2);
    }
    lines.add(tok.get(i) + " : " + structure(entries));
  }
  ret fromLines(lines);
}

Author comment

Began life as a copy of #1002306

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1002307
Snippet name: NL Parser, hotwirable
Eternal ID of this version: #1002307/1
Text MD5: fb022bd644deed5964a7f11d615ae4e2
Transpilation MD5: c952e47f2f1ad33239c1a85663d4d687
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-01-05 17:52:41
Source code size: 6813 bytes / 250 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 715 / 815
Referenced in: [show references]