Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

508
LINES

< > BotCompany Repo | #1002719 // "Bottom-Up" NL Parser, faster explainMatch (LIVE EVERYWHERE)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (8299L/57K/186K).

1  
// Idea: For every position, store the productions recognized to start there, then infer up to higher classes
2  
3  
// Productions elements: literals (just write them), <classname>, <classname varname>
4  
5  
!7
6  
7  
set flag NoAI.
8  
9  
// a recognition is identified by (startPos, className, endPos)
10  
11  
// key 1 = start position, key 2 = class name, value = end position
12  
static Map<Integer, MultiMap<S, Integer>> recog;
13  
static L<S> tok;
14  
15  
static MultiMap<S, L<S>> productionMap;
16  
17  
static boolean caseSensitive = false;
18  
19  
static boolean debug = false;
20  
21  
static long timing;
22  
static int iterations;
23  
24  
static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples
25  
26  
27  
static S defaultParserRules = "#1002281"; // NL
28  
static S javaRules = "#1002329";
29  
static boolean keepRules = false;
30  
31  
static class ParseResult {
32  
  L<S> tok;
33  
  Map<Integer, MultiMap<S, Integer>> recog;
34  
  
35  
  L<S> fullMatchClasses() {
36  
    new L<S> l;
37  
    MultiMap<S, Integer> rr = recog.get(1);
38  
    for (S className : rr.keys())
39  
      if (rr.get(className).contains(l(tok)))
40  
        l.add(className);
41  
    ret l;
42  
  }
43  
  
44  
  boolean parsesAs(S className) {
45  
    MultiMap<S, Integer> rr = recog.get(1);
46  
    ret rr.get(className).contains(l(tok));
47  
  }
48  
  
49  
  L explainLongestMatch() {
50  
    S c = findLongestMatch(1);
51  
    if (c == null) ret null;
52  
    int n = max(recog.get(1).get(c));
53  
    if (n == 1) ret null;
54  
    L l = explainMatch(new Pos(tok), n, c);
55  
    ret shortenExplanation(l);
56  
  }
57  
58  
  S findLongestMatch(int i) {
59  
    MultiMap<S, Integer> rr = recog.get(i);
60  
    new Map<S, Integer> map;
61  
    for (S c : rr.keys())
62  
      map.put(c, max(rr.get(c)));
63  
    ret keyWithBiggestValue(map);
64  
  }
65  
  
66  
  L explain(S className) {
67  
    ret explain(1, l(tok), className);
68  
  }
69  
  
70  
  L explainFull(S className) {
71  
    ret explainFull(1, l(tok), className);
72  
  }
73  
  
74  
  // full = not shortened (include all subclassings)
75  
  L explainFull(int fromToken, int toToken, S className) {
76  
    ret explainMatch(new Pos(tok, fromToken), toToken, className);
77  
  }
78  
  
79  
  L explain(int fromToken, int toToken, S className) {
80  
    L l = explainFull(fromToken, toToken, className);
81  
    if (debug)
82  
      print("Full explanation: " + sfu(l));
83  
    l = shortenExplanation(l);
84  
    if (debug)
85  
      print("Shortened explanation: " + sfu(l));
86  
    ret l;
87  
  }
88  
  
89  
  // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]]
90  
  
91  
  L shortenExplanation(L m) {
92  
    if (m == null) ret null;
93  
    if (l(m) == 5) {
94  
      int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
95  
      L sub = (L) m.get(4);
96  
      int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1);
97  
      
98  
      // for primitives, inner class seems to be null currently
99  
      // - in this case, leave outer match.
100  
      
101  
      S innerClass = getString(sub, 2);
102  
      if (ii == II && jj == JJ && innerClass != null)
103  
        ret shortenExplanation(sub); // it's a subclassing  - just drop the outer match
104  
    }
105  
    for (int i = 4; i < l(m); i++)
106  
      m.set(i, shortenExplanation((L) m.get(i)));
107  
    ret m;
108  
  }
109  
  
110  
  // assumes that there is a match (pos, class, endPos)
111  
  // and gives explanations of how it was probably matched.
112  
  // result list format: [className, prodText, [i1, i2], [i3, i4], ...]
113  
  L explainMatch(Pos pos, int endPos, S forClass) {
114  
    L<L<S>> prods = productionMap.get(forClass);
115  
    for (L<S> prod : prods) {
116  
      L match = litlist(pos.i, endPos, forClass, join(prod));
117  
      if (debug)
118  
        print("Starting match: " + structure(match));
119  
      L out = explainMatch2(pos, new Pos(prod), endPos, forClass, match);
120  
      if (out != null) ret out;
121  
    }
122  
    ret null;
123  
  }
124  
  
125  
  // same, but with fixed production
126  
  L explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match) {
127  
    if (prod.end()) {
128  
      if (pos.i == endPos) {
129  
        // match done! add sub-explanations.
130  
        
131  
        match = cloneList(match);
132  
        if (debug)
133  
          print("Match done! " + structure(match));
134  
        for (int i = 4; i < l(match); i++) {
135  
          L m = cloneList((L) match.get(i));
136  
          //print("m=" + structure(m));
137  
          int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
138  
          S className = cast m.get(2);
139  
          L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className);
140  
          //print(format("Recursed: * * * => ", ii, jj, className) +  structure(subExplanation));
141  
          if (subExplanation == null) {
142  
            if (debug) print("weird in " + forClass);
143  
          } else
144  
            m.addAll(subList(subExplanation, 3));
145  
            
146  
          match.set(i, m);
147  
        }
148  
        
149  
        if (debug)
150  
          print("Match completed: " + structure(match));
151  
        ret match;
152  
      }
153  
    } else {
154  
      S p = prod.get();
155  
      
156  
      if (isBracketedID(p)) {
157  
        // get class name and possible var name
158  
        
159  
        S className = unbracket(p);
160  
        S varName = null;
161  
        if (hasSpace(className)) {
162  
          int idx = className.indexOf(' ');
163  
          varName = className.substring(idx+1).trim();
164  
          className = className.substring(0, idx).trim();
165  
        }
166  
        
167  
        // look up if class found at this place
168  
        
169  
        L<Integer> r = recog.get(pos.i).get(className);
170  
171  
        // keep parsing for every option
172  
    
173  
        for (int i : cloneList(r)) {
174  
          match.add(litlist(pos.i, i, className));
175  
          L out = explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match);
176  
          if (out != null) ret out;
177  
          removeLast(match);
178  
        }
179  
      } else if (!pos.end()) {
180  
        // it's a literal
181  
        S t = pos.get();
182  
        if (!matchToken(p, t))
183  
          ret null;
184  
        
185  
        //match.add(litlist(pos.i, pos.i+2, p));
186  
        L out = explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match);
187  
        if (out != null) ret out;
188  
        //removeLast(match);
189  
      }
190  
    }
191  
    ret null;
192  
  }
193  
  
194  
  S prettierAnalysis() {
195  
    new L<S> l;
196  
    new TreeSet<S> emptyClasses;
197  
    for (int i = 1; i < l(tok); i += 2) {
198  
      MultiMap<S, Integer> rr = recog.get(i);
199  
      new L<S> bla;
200  
      for (S className : rr.keys()) {
201  
        L<Integer> is = rr.get(className);
202  
        if (is.contains(i))
203  
          emptyClasses.add(className);
204  
        int n = (max(is)-i)/2;
205  
        if (n > 0)
206  
          bla.add(className + " " + n);
207  
      }
208  
      l.add(tok.get(i) + " : " + structure(bla));
209  
    }
210  
    if (!empty(emptyClasses))
211  
      l.add("  with empty classes " + structure(asList(emptyClasses)));
212  
    ret fromLines(l);
213  
  }
214  
}
215  
216  
p {
217  
  //startBot("Snippet Text Bot", "#1002084");
218  
}
219  
220  
static ParseResult parse(S inputText) {
221  
  ret parse(inputText, defaultParserRules);
222  
}
223  
224  
static ParseResult jparse(S inputText) {
225  
  ret parse(inputText, javaRules);
226  
}
227  
228  
static S keptRulesID;
229  
static S keptRulesText;
230  
231  
static ParseResult parse(S inputText, S parserRules) {
232  
  ret parse(javaTok(inputText), parserRules);
233  
}
234  
235  
static synchronized ParseResult parse(L<S> inputTok, S parserRules) {
236  
  S rulesText;
237  
  if (!isSnippetID(parserRules))
238  
    rulesText = parserRules;
239  
  else {
240  
    if (keepRules && sameSnippetID(parserRules, keptRulesID))
241  
      rulesText = keptRulesText;
242  
    else {
243  
      rulesText = loadSnippet/*ThroughBot*/(parserRules);
244  
      if (keepRules) {
245  
        keptRulesID = parserRules;
246  
        keptRulesText = rulesText;
247  
      }
248  
    }
249  
  }
250  
251  
  productionMap = new MultiMap;
252  
  caseSensitive = false;
253  
  for (S rule : toLinesFullTrim(rulesText)) pcall {
254  
    if (l(javaTok(rule)) == 1) continue; // just a comment
255  
    
256  
    if (debug)
257  
      printF("Processing rule: *", rule);
258  
    
259  
    if (rule.contains("=")) {  
260  
      L<S> lr = splitAtJavaToken(rule, "=");
261  
      S l, r;
262  
      if (l(lr) == 1) {
263  
        l = "";
264  
        r = lr.get(0);
265  
      } else if (l(lr) >= 2) {
266  
        l = join(" = ", lr.subList(0, l(lr)-1));
267  
        r = last(lr);
268  
        if (debug)
269  
          print("rule reformatted to: " + l + " .=. " + r);
270  
      } else {
271  
        print("Weird rule: " + rule);
272  
        continue;
273  
      }
274  
      L<S> tokr = mergeBracketThingies(javaTok(r));
275  
      assertEquals(structure(tokr), 3, l(tokr));
276  
      S className = assertIdentifier(unbracket(get(tokr, 1)));
277  
      L<S> tok = unquoteAll(javaTok(l));
278  
      tok = mergeBracketThingies(tok);
279  
      //printStructure(tok);
280  
      productionMap.put(className, tok);
281  
    } else if (match("case-sensitive", rule)) {
282  
      print("Now case sensitive.");
283  
      caseSensitive = true;
284  
    } else if (match("case-insensitive", rule)) {
285  
      caseSensitive = false;
286  
    } else
287  
      print("Weird rule: " + rule);
288  
  }
289  
  
290  
  if (debug) {
291  
    print(n(productionMap.size(), "production") + ".");
292  
    print();
293  
  }
294  
  
295  
  timing = now();
296  
  tok = inputTok;
297  
  /* HACK
298  
    tok.add("the bonk"); // add impossible token to find empty classes at the end... complicated though because l(tok) changes
299  
    tok.add("");
300  
  */
301  
  Pos pos = new Pos(tok);
302  
  parseTop(pos);
303  
  timing = now()-timing;
304  
  
305  
  int numTok = l(tok)/2;
306  
  /*if (printStats)
307  
    print("Iterations required for " + numTok + " tokens: ?, time: " + timing + " ms");*/
308  
  
309  
  new ParseResult res;
310  
  res.tok = tok;
311  
  res.recog = recog;
312  
  ret res;
313  
}
314  
315  
static void parseTop(Pos pos) {
316  
  // init structures
317  
  recog = new TreeMap;
318  
  for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end
319  
    recog.put(i, new MultiMap);
320  
  //print("parser: recog inited " + pos.i + " to " + l(pos.tok));
321  
  
322  
  // process dicts
323  
  for (S className : dicts.keySet()) {
324  
    Set<S> values = dicts.get(className);
325  
    for (int i = pos.i; i < l(pos.tok); i += 2) {
326  
      S t = pos.tok.get(i);
327  
      if (values.contains(t.toLowerCase()))
328  
        recog.get(i).setPut(className, i+2);
329  
    }
330  
  }
331  
  
332  
  matchSpecials();
333  
334  
  boolean anyChange;
335  
  iterations = 0;
336  
  iter(pos);
337  
}
338  
339  
static void iter(Pos pos) {
340  
  MultiMap<S, L<S>> higherProductions = withoutGroundProductions(productionMap);
341  
  
342  
  for (int i = l(pos.tok); i >= 1; i -= 2) {
343  
    Pos pos2 = new Pos(pos.tok, i);
344  
    int it = 0;
345  
    boolean anyChange = false;
346  
    do {
347  
      anyChange = false;
348  
      MultiMap<S, L<S>> mapToUse = it == 0 ? productionMap : higherProductions;
349  
      it++;
350  
      for (S className : mapToUse.keySet()) {
351  
        MultiMap<S, Integer> rr = recog.get(i);
352  
        L<Integer> recs = rr.getActual(className);
353  
        L<L<S>> prods = mapToUse.get(className);
354  
        for (L<S> prod : prods) {
355  
          int n = l(recs);
356  
          matchProd(pos2, new Pos(prod), className, recs);
357  
          anyChange = anyChange || l(recs) > n;
358  
        }
359  
        rr.clean(className);
360  
      }
361  
    } while (anyChange);
362  
  }
363  
}
364  
365  
static class Pos {
366  
  L<S> tok;
367  
  int i = 1;
368  
  
369  
  *() {}
370  
  *(L<S> *tok) {}
371  
  *(L<S> *tok, int *i) {}
372  
  
373  
  boolean end() { ret i >= l(tok)-1; }
374  
  S get() { ret tok.get(i); }
375  
  public Pos clone() { ret new Pos(tok, i); }
376  
  public boolean equals(O o) {
377  
    if (!(o instanceof Pos)) ret false;
378  
    Pos pos = cast o;
379  
    ret tok == pos.tok && i == pos.i;
380  
  }
381  
  
382  
  S rest() {
383  
    ret join(subList(tok, i));
384  
  }
385  
386  
  Pos plus(int x) { ret new Pos(tok, i + x); }
387  
}
388  
389  
static void copy(Pos a, Pos b) {
390  
  b.tok = a.tok;
391  
  b.i = a.i;
392  
}
393  
394  
static void debug(S bla, Pos pos) {
395  
  if (debug)
396  
    print(bla + " on " + quote(pos.rest()));
397  
}
398  
399  
static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
400  
  if (prod.end())
401  
    setAdd(out, pos.i);
402  
  else {
403  
    S p = prod.get();
404  
    
405  
    if (isBracketedID(p)/* && !specials.contains(p)*/) {
406  
      MultiMap<S, Integer> rr = recog.get(pos.i);
407  
      if (rr == null)
408  
        fail("parser: recog null at " + pos.i);
409  
        
410  
      // get class name
411  
      
412  
      S className = unbracket(p);
413  
      if (hasSpace(className)) {
414  
        int idx = className.indexOf(' ');
415  
        className = className.substring(0, idx).trim();
416  
      }
417  
      
418  
      L<Integer> r = rr.get(className);
419  
      
420  
      // keep parsing for every option
421  
  
422  
      for (int i : cloneList(r))
423  
        matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out);
424  
      
425  
    } else if (!pos.end()) { // it's a literal - check if at end of token stream first
426  
      
427  
      S t = pos.get();
428  
      if (!matchToken(p, t))
429  
        ret;
430  
      
431  
      matchProd(pos.plus(2), prod.plus(2), forClass, out);
432  
    }
433  
  }
434  
}
435  
436  
static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>", "<id>", "<extidentifier>", "ucid");
437  
438  
static boolean matchToken(S p, S t) {
439  
  if (eq(p, "<quoted>")) {
440  
    if (!isQuoted(t)) ret false;
441  
  } else if (eq(p, "<int>")) {
442  
    if (!isInteger(t)) ret false;
443  
  } else if (eqOneOf(p, "<identifier>", "<id>")) {
444  
    if (!isIdentifier(t)) ret false;
445  
  } else if (eq(p, "<ucid>")) {
446  
    if (!(isIdentifier(t) && startsWithUpperCase(t))) ret false;
447  
  } else if (eq(p, "<extidentifier>")) {
448  
    if (!isExtendedIdentifier(t)) ret false;
449  
  } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t))))
450  
    ret false; // token mismatch
451  
  ret true;
452  
}
453  
454  
static S loadSnippetThroughBot(S snippetID) {
455  
  print("Loading parser rules.");
456  
  time {
457  
    S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
458  
    new Matches m;
459  
    assertTrue(match("the text of snippet * is *", answer, m));
460  
    ret m.unq(1);
461  
  }
462  
}
463  
464  
// Non-recursing productions don't need to be repeatedly executed
465  
static MultiMap<S, L<S>> withoutGroundProductions(MultiMap<S, L<S>> productionMap) {
466  
  MultiMap<S, L<S>> newMap = new MultiMap;
467  
  
468  
  for (S className : new HashSet<S>(productionMap.keySet())) {
469  
    L<L<S>> prods = productionMap.get(className);
470  
    for (L<S> prod : prods)
471  
      if (!isGroundProduction(prod))
472  
        newMap.put(className, prod);
473  
  }
474  
  
475  
  ret newMap;
476  
}
477  
478  
static boolean isGroundProduction(L<S> prod) {
479  
  for (int i = 1; i < l(prod); i += 2) {
480  
    S t = prod.get(i);
481  
    if (isBracketedID(t) && !specials.contains(t))
482  
      ret false;
483  
  }
484  
  ret true;
485  
}
486  
487  
static void addDict(S className, Collection<S> entries) {
488  
  dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries));
489  
}
490  
491  
// TODO: do only those requested in grammar
492  
493  
static void matchSpecials() {
494  
  for (int i = 1; i < l(tok); i += 2) {
495  
    MultiMap<S, Integer> rr = recog.get(i);
496  
    for (S s : specials)
497  
      if (matchToken(s, tok.get(i)))
498  
        rr.put(unbracket(s), i+2);
499  
  }
500  
}
501  
502  
static L<S> unquoteAll(L<S> tok) {
503  
  L<S> t = cloneList(tok);
504  
  for (int i = 1; i < l(t); i += 2)
505  
    if (isQuoted(t.get(i)))
506  
      t.set(i, unquote(t.get(i)));
507  
  ret t;
508  
}

Author comment

Began life as a copy of #1002472

download  show line numbers  debug dex  old transpilations   

Travelled to 16 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1002719
Snippet name: "Bottom-Up" NL Parser, faster explainMatch (LIVE EVERYWHERE)
Eternal ID of this version: #1002719/12
Text MD5: f3bad2905ecdacbe90a91ada158fa2fe
Transpilation MD5: 1cf5b651a3557e7aded44fca248c0cbe
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2018-09-16 14:05:31
Source code size: 14775 bytes / 508 lines
Pitched / IR pitched: No / No
Views / Downloads: 845 / 3805
Version history: 11 change(s)
Referenced in: [show references]