Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

484
LINES

< > BotCompany Repo | #1002472 // NL Parser, implementing names & new eval order (superseded by #1002719)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (4249L/28K/89K).

1  
// Idea: For every position, store the productions recognized to start there, then infer up to higher classes
2  
3  
// Productions elements: literals (just write them), <classname>, <classname varname>
4  
5  
!759
6  
7  
// a recognition is identified by (startPos, className, endPos)
8  
9  
// key 1 = start position, key 2 = class name, value = end position
10  
static Map<Integer, MultiMap<S, Integer>> recog;
11  
static L<S> tok;
12  
13  
static MultiMap<S, L<S>> productionMap;
14  
15  
static boolean caseSensitive = false;
16  
17  
static boolean debug = false;
18  
19  
static long timing;
20  
static int iterations;
21  
22  
static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples
23  
24  
25  
static S defaultParserRules = "#1002281"; // NL
26  
static S javaRules = "#1002329";
27  
static boolean keepRules = false;
28  
29  
static class ParseResult {
30  
  L<S> tok;
31  
  Map<Integer, MultiMap<S, Integer>> recog;
32  
  
33  
  L<S> fullMatchClasses() {
34  
    new L<S> l;
35  
    MultiMap<S, Integer> rr = recog.get(1);
36  
    for (S className : rr.keys())
37  
      if (rr.get(className).contains(l(tok)))
38  
        l.add(className);
39  
    ret l;
40  
  }
41  
  
42  
  boolean parsesAs(S className) {
43  
    MultiMap<S, Integer> rr = recog.get(1);
44  
    ret rr.get(className).contains(l(tok));
45  
  }
46  
  
47  
  L explainLongestMatch() {
48  
    S c = findLongestMatch(1);
49  
    if (c == null) ret null;
50  
    int n = max(recog.get(1).get(c));
51  
    if (n == 1) ret null;
52  
    L l = cast explainMatch(new Pos(tok), n, c).get(0);
53  
    ret shortenExplanation(l);
54  
  }
55  
56  
  S findLongestMatch(int i) {
57  
    MultiMap<S, Integer> rr = recog.get(i);
58  
    new Map<S, Integer> map;
59  
    for (S c : rr.keys())
60  
      map.put(c, max(rr.get(c)));
61  
    ret keyWithBiggestValue(map);
62  
  }
63  
  
64  
  L explain(S className) {
65  
    ret explain(1, l(tok), className);
66  
  }
67  
  
68  
  L explainFull(S className) {
69  
    ret explainFull(1, l(tok), className);
70  
  }
71  
  
72  
  // full = not shortened (include all subclassings)
73  
  L explainFull(int fromToken, int toToken, S className) {
74  
    L<L> matches = explainMatch(new Pos(tok, fromToken), toToken, className);
75  
    ret get(matches, 0);
76  
  }
77  
  
78  
  L explain(int fromToken, int toToken, S className) {
79  
    L l = explainFull(fromToken, toToken, className);
80  
    if (debug)
81  
      print("Full explanation: " + structure(l));
82  
    ret shortenExplanation(l);
83  
  }
84  
  
85  
  // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]]
86  
  
87  
  L shortenExplanation(L m) {
88  
    if (m == null) ret null;
89  
    if (l(m) == 5) {
90  
      int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
91  
      L sub = (L) m.get(4);
92  
      int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1);
93  
      
94  
      // for primitives, inner class seems to be null currently
95  
      // - in this case, leave outer match.
96  
      
97  
      S innerClass = getString(sub, 2);
98  
      if (ii == II && jj == JJ && innerClass != null)
99  
        ret shortenExplanation(sub); // it's a subclassing  - just drop the outer match
100  
    }
101  
    for (int i = 4; i < l(m); i++)
102  
      m.set(i, shortenExplanation((L) m.get(i)));
103  
    ret m;
104  
  }
105  
  
106  
  // assumes that there is a match (pos, class, endPos)
107  
  // and gives explanations of how it was probably matched.
108  
  // result list format: [className, prodText, [i1, i2], [i3, i4], ...]
109  
  L explainMatch(Pos pos, int endPos, S forClass) {
110  
    L<L<S>> prods = productionMap.get(forClass);
111  
    new L out;
112  
    for (L<S> prod : prods) {
113  
      L match = litlist(pos.i, endPos, forClass, join(prod));
114  
      if (debug)
115  
        print("Starting match: " + structure(match));
116  
      explainMatch2(pos, new Pos(prod), endPos, forClass, match, out);
117  
    }
118  
    ret out;
119  
  }
120  
  
121  
  // same, but with fixed production
122  
  void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) {
123  
    if (prod.end()) {
124  
      if (pos.i == endPos) {
125  
        // match done! add sub-explanations.
126  
        
127  
        match = cloneList(match);
128  
        if (debug)
129  
          print("Match done! " + structure(match));
130  
        for (int i = 4; i < l(match); i++) {
131  
          L m = cloneList((L) match.get(i));
132  
          //print("m=" + structure(m));
133  
          int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
134  
          S className = cast m.get(2);
135  
          L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className);
136  
          //print(format("Recursed: * * * => ", ii, jj, className) +  structure(subExplanation));
137  
          if (empty(subExplanation)) {
138  
            // weird
139  
          } else
140  
            m.addAll(subList((L) subExplanation.get(0), 3));
141  
            
142  
          match.set(i, m);
143  
        }
144  
        
145  
        if (debug)
146  
          print("Match completed: " + structure(match));
147  
        out.add(match);
148  
      }
149  
    } else {
150  
      S p = prod.get();
151  
      
152  
      if (isBracketedID(p)) {
153  
        // get class name and possible var name
154  
        
155  
        S className = unbracket(p);
156  
        S varName = null;
157  
        if (hasSpace(className)) {
158  
          int idx = className.indexOf(' ');
159  
          varName = className.substring(idx+1).trim();
160  
          className = className.substring(0, idx).trim();
161  
        }
162  
        
163  
        // look up if class found at this place
164  
        
165  
        L<Integer> r = recog.get(pos.i).get(className);
166  
167  
        // keep parsing for every option
168  
    
169  
        for (int i : cloneList(r)) {
170  
          match.add(litlist(pos.i, i, className));
171  
          explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out);
172  
          removeLast(match);
173  
        }
174  
      } else if (!pos.end()) {
175  
        // it's a literal
176  
        S t = pos.get();
177  
        if (!matchToken(p, t))
178  
          ret;
179  
        
180  
        //match.add(litlist(pos.i, pos.i+2, p));
181  
        explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out);
182  
        //removeLast(match);
183  
      }
184  
    }
185  
  }
186  
  
187  
  S prettierAnalysis() {
188  
    new L<S> l;
189  
    new TreeSet<S> emptyClasses;
190  
    for (int i = 1; i < l(tok); i += 2) {
191  
      MultiMap<S, Integer> rr = recog.get(i);
192  
      new L<S> bla;
193  
      for (S className : rr.keys()) {
194  
        L<Integer> is = rr.get(className);
195  
        if (is.contains(i))
196  
          emptyClasses.add(className);
197  
        int n = (max(is)-i)/2;
198  
        if (n > 0)
199  
          bla.add(className + " " + n);
200  
      }
201  
      l.add(tok.get(i) + " : " + structure(bla));
202  
    }
203  
    if (!empty(emptyClasses))
204  
      l.add("  with empty classes " + structure(asList(emptyClasses)));
205  
    ret fromLines(l);
206  
  }
207  
}
208  
209  
p {
210  
  startBot("Snippet Text Bot", "#1002084");
211  
}
212  
213  
static ParseResult parse(S inputText) {
214  
  ret parse(inputText, defaultParserRules);
215  
}
216  
217  
static ParseResult jparse(S inputText) {
218  
  ret parse(inputText, javaRules);
219  
}
220  
221  
static S keptRulesID;
222  
static S keptRulesText;
223  
224  
static ParseResult parse(S inputText, S parserRules) {
225  
  ret parse(javaTok(inputText), parserRules);
226  
}
227  
228  
static synchronized ParseResult parse(L<S> inputTok, S parserRules) {
229  
  S rulesText;
230  
  if (!isSnippetID(parserRules))
231  
    rulesText = parserRules;
232  
  else {
233  
    if (keepRules && sameSnippetID(parserRules, keptRulesID))
234  
      rulesText = keptRulesText;
235  
    else {
236  
      rulesText = loadSnippetThroughBot(parserRules);
237  
      if (keepRules) {
238  
        keptRulesID = parserRules;
239  
        keptRulesText = rulesText;
240  
      }
241  
    }
242  
  }
243  
244  
  productionMap = new MultiMap;
245  
  caseSensitive = false;
246  
  for (S rule : toLinesFullTrim(rulesText)) pcall {
247  
    if (l(javaTok(rule)) == 1) continue; // just a comment
248  
    
249  
    if (debug)
250  
      printF("Processing rule: *", rule);
251  
    
252  
    if (rule.contains("=")) {  
253  
      L<S> lr = splitAtJavaToken(rule, "=");
254  
      S l, r;
255  
      if (l(lr) == 1) {
256  
        l = "";
257  
        r = lr.get(0);
258  
      } else if (l(lr) >= 2) {
259  
        l = join(" = ", lr.subList(0, l(lr)-1));
260  
        r = last(lr);
261  
      } else {
262  
        print("Weird rule: " + rule);
263  
        continue;
264  
      }
265  
      L<S> tokr = mergeBracketThingies(javaTok(r));
266  
      assertEquals(structure(tokr), 3, l(tokr));
267  
      S className = assertIdentifier(unbracket(get(tokr, 1)));
268  
      L<S> tok = javaTok(l);
269  
      tok = mergeBracketThingies(tok);
270  
      //printStructure(tok);
271  
      productionMap.put(className, tok);
272  
    } else if (match("case-sensitive", rule)) {
273  
      print("Now case sensitive.");
274  
      caseSensitive = true;
275  
    } else if (match("case-insensitive", rule)) {
276  
      caseSensitive = false;
277  
    } else
278  
      print("Weird rule: " + rule);
279  
  }
280  
  
281  
  if (debug) {
282  
    print(n(productionMap.size(), "production") + ".");
283  
    print();
284  
  }
285  
  
286  
  timing = now();
287  
  tok = inputTok;
288  
  /* HACK
289  
    tok.add("the bonk"); // add impossible token to find empty classes at the end... complicated though because l(tok) changes
290  
    tok.add("");
291  
  */
292  
  Pos pos = new Pos(tok);
293  
  parseTop(pos);
294  
  timing = now()-timing;
295  
  
296  
  int numTok = l(tok)/2;
297  
  print("Iterations required for " + numTok + " tokens: " + iterations*numTok*productionMap.size() + ", time: " + timing + " ms");
298  
  
299  
  new ParseResult res;
300  
  res.tok = tok;
301  
  res.recog = recog;
302  
  ret res;
303  
}
304  
305  
static void parseTop(Pos pos) {
306  
  // init structures
307  
  recog = new TreeMap;
308  
  for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end
309  
    recog.put(i, new MultiMap);
310  
  //print("parser: recog inited " + pos.i + " to " + l(pos.tok));
311  
  
312  
  // process dicts
313  
  for (S className : dicts.keySet()) {
314  
    Set<S> values = dicts.get(className);
315  
    for (int i = pos.i; i < l(pos.tok); i += 2) {
316  
      S t = pos.tok.get(i);
317  
      if (values.contains(t.toLowerCase()))
318  
        recog.get(i).setPut(className, i+2);
319  
    }
320  
  }
321  
  
322  
  matchSpecials();
323  
324  
  boolean anyChange;
325  
  iterations = 0;
326  
  iter(pos);
327  
}
328  
329  
static void iter(Pos pos) {
330  
  MultiMap<S, L<S>> higherProductions = withoutGroundProductions(productionMap);
331  
  
332  
  for (int i = l(pos.tok); i >= 1; i -= 2) {
333  
    Pos pos2 = new Pos(pos.tok, i);
334  
    int it = 0;
335  
    boolean anyChange = false;
336  
    do {
337  
      anyChange = false;
338  
      MultiMap<S, L<S>> mapToUse = it == 0 ? productionMap : higherProductions;
339  
      it++;
340  
      for (S className : mapToUse.keySet()) {
341  
        MultiMap<S, Integer> rr = recog.get(i);
342  
        L<Integer> recs = rr.getActual(className);
343  
        L<L<S>> prods = mapToUse.get(className);
344  
        for (L<S> prod : prods) {
345  
          int n = l(recs);
346  
          matchProd(pos2, new Pos(prod), className, recs);
347  
          anyChange = anyChange || l(recs) > n;
348  
        }
349  
        rr.clean(className);
350  
      }
351  
    } while (anyChange);
352  
  }
353  
}
354  
355  
static class Pos {
356  
  L<S> tok;
357  
  int i = 1;
358  
  
359  
  *() {}
360  
  *(L<S> *tok) {}
361  
  *(L<S> *tok, int *i) {}
362  
  
363  
  boolean end() { ret i >= l(tok)-1; }
364  
  S get() { ret tok.get(i); }
365  
  public Pos clone() { ret new Pos(tok, i); }
366  
  public boolean equals(O o) {
367  
    if (!(o instanceof Pos)) ret false;
368  
    Pos pos = cast o;
369  
    ret tok == pos.tok && i == pos.i;
370  
  }
371  
  
372  
  S rest() {
373  
    ret join(subList(tok, i));
374  
  }
375  
376  
  Pos plus(int x) { ret new Pos(tok, i + x); }
377  
}
378  
379  
static void copy(Pos a, Pos b) {
380  
  b.tok = a.tok;
381  
  b.i = a.i;
382  
}
383  
384  
static void debug(S bla, Pos pos) {
385  
  if (debug)
386  
    print(bla + " on " + quote(pos.rest()));
387  
}
388  
389  
static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
390  
  if (prod.end())
391  
    setAdd(out, pos.i);
392  
  else {
393  
    S p = prod.get();
394  
    
395  
    if (isBracketedID(p)/* && !specials.contains(p)*/) {
396  
      MultiMap<S, Integer> rr = recog.get(pos.i);
397  
      if (rr == null)
398  
        fail("parser: recog null at " + pos.i);
399  
        
400  
      // get class name
401  
      
402  
      S className = unbracket(p);
403  
      if (hasSpace(className)) {
404  
        int idx = className.indexOf(' ');
405  
        className = className.substring(0, idx).trim();
406  
      }
407  
      
408  
      L<Integer> r = rr.get(className);
409  
      
410  
      // keep parsing for every option
411  
  
412  
      for (int i : cloneList(r))
413  
        matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out);
414  
      
415  
    } else if (!pos.end()) { // it's a literal - check if at end of token stream first
416  
      
417  
      S t = pos.get();
418  
      if (!matchToken(p, t))
419  
        ret;
420  
      
421  
      matchProd(pos.plus(2), prod.plus(2), forClass, out);
422  
    }
423  
  }
424  
}
425  
426  
static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>");
427  
428  
static boolean matchToken(S p, S t) {
429  
  if (eq(p, "<quoted>")) {
430  
    if (!isQuoted(t)) ret false;
431  
  } else if (eq(p, "<int>")) {
432  
    if (!isInteger(t)) ret false;
433  
  } else if (eq(p, "<identifier>")) {
434  
    if (!isIdentifier(t)) ret false;
435  
  } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t))))
436  
    ret false; // token mismatch
437  
  ret true;
438  
}
439  
440  
static S loadSnippetThroughBot(S snippetID) {
441  
  print("Loading parser rules.");
442  
  time {
443  
    S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
444  
    new Matches m;
445  
    assertTrue(match("the text of snippet * is *", answer, m));
446  
    ret m.unq(1);
447  
  }
448  
}
449  
450  
// Non-recursing productions don't need to be repeatedly executed
451  
static MultiMap<S, L<S>> withoutGroundProductions(MultiMap<S, L<S>> productionMap) {
452  
  MultiMap<S, L<S>> newMap = new MultiMap;
453  
  
454  
  for (S className : new HashSet<S>(productionMap.keySet())) {
455  
    L<L<S>> prods = productionMap.get(className);
456  
    for (L<S> prod : prods)
457  
      if (!isGroundProduction(prod))
458  
        newMap.put(className, prod);
459  
  }
460  
  
461  
  ret newMap;
462  
}
463  
464  
static boolean isGroundProduction(L<S> prod) {
465  
  for (int i = 1; i < l(prod); i += 2) {
466  
    S t = prod.get(i);
467  
    if (isBracketedID(t) && !specials.contains(t))
468  
      ret false;
469  
  }
470  
  ret true;
471  
}
472  
473  
static void addDict(S className, Collection<S> entries) {
474  
  dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries));
475  
}
476  
477  
static void matchSpecials() {
478  
  for (int i = 1; i < l(tok); i += 2) {
479  
    MultiMap<S, Integer> rr = recog.get(i);
480  
    for (S s : specials)
481  
      if (matchToken(s, tok.get(i)))
482  
        rr.put(unbracket(s), i+2);
483  
  }
484  
}

Author comment

Began life as a copy of #1002369

download  show line numbers  debug dex  old transpilations   

Travelled to 13 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1002472
Snippet name: NL Parser, implementing names & new eval order (superseded by #1002719)
Eternal ID of this version: #1002472/1
Text MD5: 38a4871c6689b3a8fa8225be0cf65dee
Transpilation MD5: c257994d7d0333ff130c4a2475710bfc
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-02-18 19:12:44
Source code size: 14065 bytes / 484 lines
Pitched / IR pitched: No / No
Views / Downloads: 605 / 1545
Referenced in: [show references]