Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

455
LINES

< > BotCompany Repo | #1002369 // NL Parser, reworking the interface (CURRENT)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (3477L/24K/75K).

1  
// Idea: For every position, store the productions recognized to start there, then infer up to higher classes
2  
3  
!759
4  
5  
// a recognition is identified by (startPos, className, endPos)
6  
7  
8  
// key 1 = start position, key 2 = class name, value = end position
9  
static Map<Integer, MultiMap<S, Integer>> recog;
10  
static L<S> tok;
11  
12  
static MultiMap<S, L<S>> productionMap;
13  
14  
static boolean caseSensitive = false;
15  
16  
static boolean debug = false;
17  
18  
static long timing;
19  
static int iterations;
20  
21  
static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples
22  
23  
24  
static S defaultParserRules = "#1002281"; // NL
25  
static S javaRules = "#1002329";
26  
static boolean keepRules = false;
27  
28  
static class ParseResult {
29  
  L<S> tok;
30  
  Map<Integer, MultiMap<S, Integer>> recog;
31  
  
32  
  L<S> fullMatchClasses() {
33  
    new L<S> l;
34  
    MultiMap<S, Integer> rr = recog.get(1);
35  
    for (S className : rr.keys())
36  
      if (rr.get(className).contains(l(tok)))
37  
        l.add(className);
38  
    ret l;
39  
  }
40  
  
41  
  L explainLongestMatch() {
42  
    S c = findLongestMatch(1);
43  
    if (c == null) ret null;
44  
    int n = max(recog.get(1).get(c));
45  
    if (n == 1) ret null;
46  
    L l = cast explainMatch(new Pos(tok), n, c).get(0);
47  
    ret shortenExplanation(l);
48  
  }
49  
50  
  S findLongestMatch(int i) {
51  
    MultiMap<S, Integer> rr = recog.get(i);
52  
    new Map<S, Integer> map;
53  
    for (S c : rr.keys())
54  
      map.put(c, max(rr.get(c)));
55  
    ret keyWithBiggestValue(map);
56  
  }
57  
  
58  
  L explain(S className) {
59  
    ret explain(1, l(tok), className);
60  
  }
61  
  
62  
  L explainFull(S className) {
63  
    ret explainFull(1, l(tok), className);
64  
  }
65  
  
66  
  // full = not shortened (include all subclassings)
67  
  L explainFull(int fromToken, int toToken, S className) {
68  
    L<L> matches = explainMatch(new Pos(tok, fromToken), toToken, className);
69  
    ret get(matches, 0);
70  
  }
71  
  
72  
  L explain(int fromToken, int toToken, S className) {
73  
    L l = explainFull(fromToken, toToken, className);
74  
    if (debug)
75  
      print("Full explanation: " + structure(l));
76  
    ret shortenExplanation(l);
77  
  }
78  
  
79  
  // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]]
80  
  
81  
  L shortenExplanation(L m) {
82  
    if (m == null) ret null;
83  
    if (l(m) == 5) {
84  
      int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
85  
      L sub = (L) m.get(4);
86  
      int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1);
87  
      if (ii == II && jj == JJ)
88  
        ret shortenExplanation(sub); // it's a subclassing  - just drop the outer match
89  
    }
90  
    for (int i = 4; i < l(m); i++)
91  
      m.set(i, shortenExplanation((L) m.get(i)));
92  
    ret m;
93  
  }
94  
  
95  
  // assumes that there is a match (pos, class, endPos)
96  
  // and gives explanations of how it was probably matched.
97  
  // result list format: [className, prodText, [i1, i2], [i3, i4], ...]
98  
  L explainMatch(Pos pos, int endPos, S forClass) {
99  
    L<L<S>> prods = productionMap.get(forClass);
100  
    new L out;
101  
    for (L<S> prod : prods) {
102  
      L match = litlist(pos.i, endPos, forClass, join(prod));
103  
      if (debug)
104  
        print("Starting match: " + structure(match));
105  
      explainMatch2(pos, new Pos(prod), endPos, forClass, match, out);
106  
    }
107  
    ret out;
108  
  }
109  
  
110  
  // same, but with fixed production
111  
  void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) {
112  
    if (prod.end()) {
113  
      if (pos.i == endPos) {
114  
        // match done! add sub-explanations.
115  
        
116  
        match = cloneList(match);
117  
        if (debug)
118  
          print("Match done! " + structure(match));
119  
        for (int i = 4; i < l(match); i++) {
120  
          L m = cloneList((L) match.get(i));
121  
          //print("m=" + structure(m));
122  
          int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
123  
          S className = cast m.get(2);
124  
          L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className);
125  
          //print(format("Recursed: * * * => ", ii, jj, className) +  structure(subExplanation));
126  
          if (empty(subExplanation)) {
127  
            // weird
128  
          } else
129  
            m.addAll(subList((L) subExplanation.get(0), 3));
130  
            
131  
          match.set(i, m);
132  
        }
133  
        
134  
        if (debug)
135  
          print("Match completed: " + structure(match));
136  
        out.add(match);
137  
      }
138  
    } else if (pos.end())
139  
      ret;
140  
    else {
141  
      S p = prod.get();
142  
      
143  
      if (isBracketedID(p)) {
144  
        S className = unbracket(p);
145  
        L<Integer> r = recog.get(pos.i).get(className);
146  
  
147  
        // keep parsing for every option
148  
    
149  
        for (int i : cloneList(r)) {
150  
          match.add(litlist(pos.i, i, className));
151  
          explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out);
152  
          removeLast(match);
153  
        }
154  
      } else if (!pos.end()) {
155  
        // it's a literal
156  
        S t = pos.get();
157  
        if (!matchToken(p, t))
158  
          ret;
159  
        
160  
        //match.add(litlist(pos.i, pos.i+2, p));
161  
        explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out);
162  
        //removeLast(match);
163  
      }
164  
    }
165  
  }
166  
  
167  
  S prettierAnalysis() {
168  
    new L<S> l;
169  
    new TreeSet<S> emptyClasses;
170  
    for (int i = 1; i < l(tok); i += 2) {
171  
      MultiMap<S, Integer> rr = recog.get(i);
172  
      new L<S> bla;
173  
      for (S className : rr.keys()) {
174  
        L<Integer> is = rr.get(className);
175  
        if (is.contains(i))
176  
          emptyClasses.add(className);
177  
        int n = (max(is)-i)/2;
178  
        if (n > 0)
179  
          bla.add(className + " " + n);
180  
      }
181  
      l.add(tok.get(i) + " : " + structure(bla));
182  
    }
183  
    if (!empty(emptyClasses))
184  
      l.add("  with empty classes " + structure(asList(emptyClasses)));
185  
    ret fromLines(l);
186  
  }
187  
}
188  
189  
p {
190  
  startBot("Snippet Text Bot", "#1002084");
191  
}
192  
193  
static ParseResult parse(S inputText) {
194  
  ret parse(inputText, defaultParserRules);
195  
}
196  
197  
static ParseResult jparse(S inputText) {
198  
  ret parse(inputText, javaRules);
199  
}
200  
201  
static S keptRulesID;
202  
static S keptRulesText;
203  
204  
static synchronized ParseResult parse(S inputText, S parserRules) {
205  
  S rulesText;
206  
  if (!isSnippetID(parserRules))
207  
    rulesText = parserRules;
208  
  else {
209  
    if (keepRules && sameSnippetID(parserRules, keptRulesID))
210  
      rulesText = keptRulesText;
211  
    else {
212  
      rulesText = loadSnippetThroughBot(parserRules);
213  
      if (keepRules) {
214  
        keptRulesID = parserRules;
215  
        keptRulesText = rulesText;
216  
      }
217  
    }
218  
  }
219  
220  
  productionMap = new MultiMap;
221  
  caseSensitive = false;
222  
  for (S rule : toLinesFullTrim(rulesText)) pcall {
223  
    if (l(javaTok(rule)) == 1) continue; // just a comment
224  
    
225  
    if (debug)
226  
      printF("Processing rule: *", rule);
227  
    
228  
    if (rule.contains("=")) {  
229  
      L<S> lr = splitAtJavaToken(rule, "=");
230  
      S l, r;
231  
      if (l(lr) == 1) {
232  
        l = "";
233  
        r = lr.get(0);
234  
      } else if (l(lr) >= 2) {
235  
        l = join(" = ", lr.subList(0, l(lr)-1));
236  
        r = last(lr);
237  
      } else {
238  
        print("Weird rule: " + rule);
239  
        continue;
240  
      }
241  
      L<S> tokr = mergeBracketThingies(javaTok(r));
242  
      assertEquals(structure(tokr), 3, l(tokr));
243  
      S className = assertIdentifier(unbracket(get(tokr, 1)));
244  
      L<S> tok = javaTok(l);
245  
      tok = mergeBracketThingies(tok);
246  
      //printStructure(tok);
247  
      productionMap.put(className, tok);
248  
    } else if (match("case-sensitive", rule)) {
249  
      //print("Now case sensitive.");
250  
      caseSensitive = true;
251  
    } else if (match("case-insensitive", rule)) {
252  
      caseSensitive = false;
253  
    } else
254  
      print("Weird rule: " + rule);
255  
  }
256  
  
257  
  if (debug) {
258  
    print(n(productionMap.size(), "production") + ".");
259  
    print();
260  
  }
261  
  
262  
  timing = now();
263  
  tok = javaTok(inputText);
264  
  Pos pos = new Pos(tok);
265  
  parseTop(pos);
266  
  timing = now()-timing;
267  
  
268  
  int numTok = l(tok)/2;
269  
  print("Iterations required for " + numTok + " tokens: " + iterations*numTok*productionMap.size() + ", time: " + timing + " ms");
270  
  
271  
  new ParseResult res;
272  
  res.tok = tok;
273  
  res.recog = recog;
274  
  ret res;
275  
}
276  
277  
static void parseTop(Pos pos) {
278  
  // init structures
279  
  recog = new TreeMap;
280  
  for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end
281  
    recog.put(i, new MultiMap);
282  
  //print("parser: recog inited " + pos.i + " to " + l(pos.tok));
283  
  
284  
  // process dicts
285  
  for (S className : dicts.keySet()) {
286  
    Set<S> values = dicts.get(className);
287  
    for (int i = pos.i; i < l(pos.tok); i += 2) {
288  
      S t = pos.tok.get(i);
289  
      if (values.contains(t.toLowerCase()))
290  
        recog.get(i).setPut(className, i+2);
291  
    }
292  
  }
293  
  
294  
  matchSpecials();
295  
296  
  boolean anyChange;
297  
  iterations = 0;
298  
  MultiMap<S, L<S>> fullProductionMap = new MultiMap(productionMap);
299  
  while (iter(pos)) {}
300  
  
301  
  iter(pos); // XX - one more - fixes anything?
302  
  
303  
  productionMap = fullProductionMap; // restore ground productions
304  
}
305  
306  
static boolean iter(Pos pos) {
307  
  boolean anyChange = false;
308  
  ++iterations;
309  
  if (iterations == 2) removeGroundProductions();
310  
  if (debug) print("Iteration " + iterations);
311  
  for (int i = pos.i; i <= l(pos.tok); i += 2) {
312  
    Pos pos2 = new Pos(pos.tok, i);
313  
    for (S className : productionMap.keySet()) {
314  
      MultiMap<S, Integer> rr = recog.get(i);
315  
      L<Integer> recs = rr.getActual(className);
316  
      L<L<S>> prods = productionMap.get(className);
317  
      for (L<S> prod : prods) {
318  
        int n = l(recs);
319  
        matchProd(pos2, new Pos(prod), className, recs);
320  
        anyChange = anyChange || l(recs) > n;
321  
      }
322  
      rr.clean(className);
323  
    }
324  
  }
325  
  ret anyChange;
326  
}
327  
328  
static class Pos {
329  
  L<S> tok;
330  
  int i = 1;
331  
  
332  
  *() {}
333  
  *(L<S> *tok) {}
334  
  *(L<S> *tok, int *i) {}
335  
  
336  
  boolean end() { ret i >= l(tok)-1; }
337  
  S get() { ret tok.get(i); }
338  
  public Pos clone() { ret new Pos(tok, i); }
339  
  public boolean equals(O o) {
340  
    if (!(o instanceof Pos)) ret false;
341  
    Pos pos = cast o;
342  
    ret tok == pos.tok && i == pos.i;
343  
  }
344  
  
345  
  S rest() {
346  
    ret join(subList(tok, i));
347  
  }
348  
349  
  Pos plus(int x) { ret new Pos(tok, i + x); }
350  
}
351  
352  
static void copy(Pos a, Pos b) {
353  
  b.tok = a.tok;
354  
  b.i = a.i;
355  
}
356  
357  
static void debug(S bla, Pos pos) {
358  
  if (debug)
359  
    print(bla + " on " + quote(pos.rest()));
360  
}
361  
362  
static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
363  
  if (prod.end())
364  
    setAdd(out, pos.i);
365  
  else if (pos.end())
366  
    ret;
367  
  else {
368  
    S p = prod.get();
369  
    
370  
    if (isBracketedID(p)/* && !specials.contains(p)*/) {
371  
      MultiMap<S, Integer> rr = recog.get(pos.i);
372  
      if (rr == null)
373  
        fail("parser: recog null at " + pos.i);
374  
      L<Integer> r = rr.get(unbracket(p));
375  
      
376  
      // keep parsing for every option
377  
  
378  
      for (int i : cloneList(r))
379  
        matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out);
380  
      
381  
    } else {
382  
      // it's a literal
383  
      S t = pos.get();
384  
      if (!matchToken(p, t))
385  
        ret;
386  
      
387  
      matchProd(pos.plus(2), prod.plus(2), forClass, out);
388  
    }
389  
  }
390  
}
391  
392  
static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>");
393  
394  
static boolean matchToken(S p, S t) {
395  
  if (eq(p, "<quoted>")) {
396  
    if (!isQuoted(t)) ret false;
397  
  } else if (eq(p, "<int>")) {
398  
    if (!isInteger(t)) ret false;
399  
  } else if (eq(p, "<identifier>")) {
400  
    if (!isIdentifier(t)) ret false;
401  
  } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t))))
402  
    ret false; // token mismatch
403  
  ret true;
404  
}
405  
406  
static S loadSnippetThroughBot(S snippetID) {
407  
  print("Loading parser rules.");
408  
  time {
409  
    S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
410  
    new Matches m;
411  
    assertTrue(match("the text of snippet * is *", answer, m));
412  
    ret m.unq(1);
413  
  }
414  
}
415  
416  
// Non-recursing productions don't need to be repeatedly executed
417  
static void removeGroundProductions() {
418  
  int n = productionMap.size();
419  
  for (S className : new HashSet<S>(productionMap.keySet())) {
420  
    L<L<S>> prods = productionMap.getActual(className);
421  
    for (int i = 0; i < l(prods); )
422  
      if (isGroundProduction(prods.get(i))) {
423  
        if (debug)
424  
          print("Ground production: " + className + " = " + join(prods.get(i)));
425  
        prods.remove(i);
426  
      } else
427  
        ++i;
428  
    productionMap.clean(className);
429  
  }
430  
  n -= productionMap.size();
431  
  if (debug && n != 0)
432  
    print(n + " ground productions removed.");
433  
}
434  
435  
static boolean isGroundProduction(L<S> prod) {
436  
  for (int i = 1; i < l(prod); i += 2) {
437  
    S t = prod.get(i);
438  
    if (isBracketedID(t) && !specials.contains(t))
439  
      ret false;
440  
  }
441  
  ret true;
442  
}
443  
444  
static void addDict(S className, Collection<S> entries) {
445  
  dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries));
446  
}
447  
448  
static void matchSpecials() {
449  
  for (int i = 1; i < l(tok); i += 2) {
450  
    MultiMap<S, Integer> rr = recog.get(i);
451  
    for (S s : specials)
452  
      if (matchToken(s, tok.get(i)))
453  
        rr.put(unbracket(s), i+2);
454  
  }
455  
}

Author comment

Began life as a copy of #1002328

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1002369
Snippet name: NL Parser, reworking the interface (CURRENT)
Eternal ID of this version: #1002369/1
Text MD5: 3ce27e57b13470355771d3230d8662d9
Transpilation MD5: be1e90f4e22154f18df1abcb2fa3645a
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-01-31 03:44:01
Source code size: 13095 bytes / 455 lines
Pitched / IR pitched: No / No
Views / Downloads: 743 / 2461
Referenced in: [show references]