Libraryless. Click here for Pure Java version (8299L/57K/186K).
1 | // Idea: For every position, store the productions recognized to start there, then infer up to higher classes |
2 | |
3 | // Productions elements: literals (just write them), <classname>, <classname varname> |
4 | |
5 | !7 |
6 | |
7 | set flag NoAI. |
8 | |
9 | // a recognition is identified by (startPos, className, endPos) |
10 | |
11 | // key 1 = start position, key 2 = class name, value = end position |
12 | static Map<Integer, MultiMap<S, Integer>> recog; |
13 | static L<S> tok; |
14 | |
15 | static MultiMap<S, L<S>> productionMap; |
16 | |
17 | static boolean caseSensitive = false; |
18 | |
19 | static boolean debug = false; |
20 | |
21 | static long timing; |
22 | static int iterations; |
23 | |
24 | static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples |
25 | |
26 | |
27 | static S defaultParserRules = "#1002281"; // NL |
28 | static S javaRules = "#1002329"; |
29 | static boolean keepRules = false; |
30 | |
31 | static class ParseResult { |
32 | L<S> tok; |
33 | Map<Integer, MultiMap<S, Integer>> recog; |
34 | |
35 | L<S> fullMatchClasses() { |
36 | new L<S> l; |
37 | MultiMap<S, Integer> rr = recog.get(1); |
38 | for (S className : rr.keys()) |
39 | if (rr.get(className).contains(l(tok))) |
40 | l.add(className); |
41 | ret l; |
42 | } |
43 | |
44 | boolean parsesAs(S className) { |
45 | MultiMap<S, Integer> rr = recog.get(1); |
46 | ret rr.get(className).contains(l(tok)); |
47 | } |
48 | |
49 | L explainLongestMatch() { |
50 | S c = findLongestMatch(1); |
51 | if (c == null) ret null; |
52 | int n = max(recog.get(1).get(c)); |
53 | if (n == 1) ret null; |
54 | L l = explainMatch(new Pos(tok), n, c); |
55 | ret shortenExplanation(l); |
56 | } |
57 | |
58 | S findLongestMatch(int i) { |
59 | MultiMap<S, Integer> rr = recog.get(i); |
60 | new Map<S, Integer> map; |
61 | for (S c : rr.keys()) |
62 | map.put(c, max(rr.get(c))); |
63 | ret keyWithBiggestValue(map); |
64 | } |
65 | |
66 | L explain(S className) { |
67 | ret explain(1, l(tok), className); |
68 | } |
69 | |
70 | L explainFull(S className) { |
71 | ret explainFull(1, l(tok), className); |
72 | } |
73 | |
74 | // full = not shortened (include all subclassings) |
75 | L explainFull(int fromToken, int toToken, S className) { |
76 | ret explainMatch(new Pos(tok, fromToken), toToken, className); |
77 | } |
78 | |
79 | L explain(int fromToken, int toToken, S className) { |
80 | L l = explainFull(fromToken, toToken, className); |
81 | if (debug) |
82 | print("Full explanation: " + sfu(l)); |
83 | l = shortenExplanation(l); |
84 | if (debug) |
85 | print("Shortened explanation: " + sfu(l)); |
86 | ret l; |
87 | } |
88 | |
89 | // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]] |
90 | |
91 | L shortenExplanation(L m) { |
92 | if (m == null) ret null; |
93 | if (l(m) == 5) { |
94 | int ii = (Integer) m.get(0), jj = (Integer) m.get(1); |
95 | L sub = (L) m.get(4); |
96 | int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1); |
97 | |
98 | // for primitives, inner class seems to be null currently |
99 | // - in this case, leave outer match. |
100 | |
101 | S innerClass = getString(sub, 2); |
102 | if (ii == II && jj == JJ && innerClass != null) |
103 | ret shortenExplanation(sub); // it's a subclassing - just drop the outer match |
104 | } |
105 | for (int i = 4; i < l(m); i++) |
106 | m.set(i, shortenExplanation((L) m.get(i))); |
107 | ret m; |
108 | } |
109 | |
110 | // assumes that there is a match (pos, class, endPos) |
111 | // and gives explanations of how it was probably matched. |
112 | // result list format: [className, prodText, [i1, i2], [i3, i4], ...] |
113 | L explainMatch(Pos pos, int endPos, S forClass) { |
114 | L<L<S>> prods = productionMap.get(forClass); |
115 | for (L<S> prod : prods) { |
116 | L match = litlist(pos.i, endPos, forClass, join(prod)); |
117 | if (debug) |
118 | print("Starting match: " + structure(match)); |
119 | L out = explainMatch2(pos, new Pos(prod), endPos, forClass, match); |
120 | if (out != null) ret out; |
121 | } |
122 | ret null; |
123 | } |
124 | |
125 | // same, but with fixed production |
126 | L explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match) { |
127 | if (prod.end()) { |
128 | if (pos.i == endPos) { |
129 | // match done! add sub-explanations. |
130 | |
131 | match = cloneList(match); |
132 | if (debug) |
133 | print("Match done! " + structure(match)); |
134 | for (int i = 4; i < l(match); i++) { |
135 | L m = cloneList((L) match.get(i)); |
136 | //print("m=" + structure(m)); |
137 | int ii = (Integer) m.get(0), jj = (Integer) m.get(1); |
138 | S className = cast m.get(2); |
139 | L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className); |
140 | //print(format("Recursed: * * * => ", ii, jj, className) + structure(subExplanation)); |
141 | if (subExplanation == null) { |
142 | if (debug) print("weird in " + forClass); |
143 | } else |
144 | m.addAll(subList(subExplanation, 3)); |
145 | |
146 | match.set(i, m); |
147 | } |
148 | |
149 | if (debug) |
150 | print("Match completed: " + structure(match)); |
151 | ret match; |
152 | } |
153 | } else { |
154 | S p = prod.get(); |
155 | |
156 | if (isBracketedID(p)) { |
157 | // get class name and possible var name |
158 | |
159 | S className = unbracket(p); |
160 | S varName = null; |
161 | if (hasSpace(className)) { |
162 | int idx = className.indexOf(' '); |
163 | varName = className.substring(idx+1).trim(); |
164 | className = className.substring(0, idx).trim(); |
165 | } |
166 | |
167 | // look up if class found at this place |
168 | |
169 | L<Integer> r = recog.get(pos.i).get(className); |
170 | |
171 | // keep parsing for every option |
172 | |
173 | for (int i : cloneList(r)) { |
174 | match.add(litlist(pos.i, i, className)); |
175 | L out = explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match); |
176 | if (out != null) ret out; |
177 | removeLast(match); |
178 | } |
179 | } else if (!pos.end()) { |
180 | // it's a literal |
181 | S t = pos.get(); |
182 | if (!matchToken(p, t)) |
183 | ret null; |
184 | |
185 | //match.add(litlist(pos.i, pos.i+2, p)); |
186 | L out = explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match); |
187 | if (out != null) ret out; |
188 | //removeLast(match); |
189 | } |
190 | } |
191 | ret null; |
192 | } |
193 | |
194 | S prettierAnalysis() { |
195 | new L<S> l; |
196 | new TreeSet<S> emptyClasses; |
197 | for (int i = 1; i < l(tok); i += 2) { |
198 | MultiMap<S, Integer> rr = recog.get(i); |
199 | new L<S> bla; |
200 | for (S className : rr.keys()) { |
201 | L<Integer> is = rr.get(className); |
202 | if (is.contains(i)) |
203 | emptyClasses.add(className); |
204 | int n = (max(is)-i)/2; |
205 | if (n > 0) |
206 | bla.add(className + " " + n); |
207 | } |
208 | l.add(tok.get(i) + " : " + structure(bla)); |
209 | } |
210 | if (!empty(emptyClasses)) |
211 | l.add(" with empty classes " + structure(asList(emptyClasses))); |
212 | ret fromLines(l); |
213 | } |
214 | } |
215 | |
216 | p { |
217 | //startBot("Snippet Text Bot", "#1002084"); |
218 | } |
219 | |
220 | static ParseResult parse(S inputText) { |
221 | ret parse(inputText, defaultParserRules); |
222 | } |
223 | |
224 | static ParseResult jparse(S inputText) { |
225 | ret parse(inputText, javaRules); |
226 | } |
227 | |
228 | static S keptRulesID; |
229 | static S keptRulesText; |
230 | |
231 | static ParseResult parse(S inputText, S parserRules) { |
232 | ret parse(javaTok(inputText), parserRules); |
233 | } |
234 | |
235 | static synchronized ParseResult parse(L<S> inputTok, S parserRules) { |
236 | S rulesText; |
237 | if (!isSnippetID(parserRules)) |
238 | rulesText = parserRules; |
239 | else { |
240 | if (keepRules && sameSnippetID(parserRules, keptRulesID)) |
241 | rulesText = keptRulesText; |
242 | else { |
243 | rulesText = loadSnippet/*ThroughBot*/(parserRules); |
244 | if (keepRules) { |
245 | keptRulesID = parserRules; |
246 | keptRulesText = rulesText; |
247 | } |
248 | } |
249 | } |
250 | |
251 | productionMap = new MultiMap; |
252 | caseSensitive = false; |
253 | for (S rule : toLinesFullTrim(rulesText)) pcall { |
254 | if (l(javaTok(rule)) == 1) continue; // just a comment |
255 | |
256 | if (debug) |
257 | printF("Processing rule: *", rule); |
258 | |
259 | if (rule.contains("=")) { |
260 | L<S> lr = splitAtJavaToken(rule, "="); |
261 | S l, r; |
262 | if (l(lr) == 1) { |
263 | l = ""; |
264 | r = lr.get(0); |
265 | } else if (l(lr) >= 2) { |
266 | l = join(" = ", lr.subList(0, l(lr)-1)); |
267 | r = last(lr); |
268 | if (debug) |
269 | print("rule reformatted to: " + l + " .=. " + r); |
270 | } else { |
271 | print("Weird rule: " + rule); |
272 | continue; |
273 | } |
274 | L<S> tokr = mergeBracketThingies(javaTok(r)); |
275 | assertEquals(structure(tokr), 3, l(tokr)); |
276 | S className = assertIdentifier(unbracket(get(tokr, 1))); |
277 | L<S> tok = unquoteAll(javaTok(l)); |
278 | tok = mergeBracketThingies(tok); |
279 | //printStructure(tok); |
280 | productionMap.put(className, tok); |
281 | } else if (match("case-sensitive", rule)) { |
282 | print("Now case sensitive."); |
283 | caseSensitive = true; |
284 | } else if (match("case-insensitive", rule)) { |
285 | caseSensitive = false; |
286 | } else |
287 | print("Weird rule: " + rule); |
288 | } |
289 | |
290 | if (debug) { |
291 | print(n(productionMap.size(), "production") + "."); |
292 | print(); |
293 | } |
294 | |
295 | timing = now(); |
296 | tok = inputTok; |
297 | /* HACK |
298 | tok.add("the bonk"); // add impossible token to find empty classes at the end... complicated though because l(tok) changes |
299 | tok.add(""); |
300 | */ |
301 | Pos pos = new Pos(tok); |
302 | parseTop(pos); |
303 | timing = now()-timing; |
304 | |
305 | int numTok = l(tok)/2; |
306 | /*if (printStats) |
307 | print("Iterations required for " + numTok + " tokens: ?, time: " + timing + " ms");*/ |
308 | |
309 | new ParseResult res; |
310 | res.tok = tok; |
311 | res.recog = recog; |
312 | ret res; |
313 | } |
314 | |
315 | static void parseTop(Pos pos) { |
316 | // init structures |
317 | recog = new TreeMap; |
318 | for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end |
319 | recog.put(i, new MultiMap); |
320 | //print("parser: recog inited " + pos.i + " to " + l(pos.tok)); |
321 | |
322 | // process dicts |
323 | for (S className : dicts.keySet()) { |
324 | Set<S> values = dicts.get(className); |
325 | for (int i = pos.i; i < l(pos.tok); i += 2) { |
326 | S t = pos.tok.get(i); |
327 | if (values.contains(t.toLowerCase())) |
328 | recog.get(i).setPut(className, i+2); |
329 | } |
330 | } |
331 | |
332 | matchSpecials(); |
333 | |
334 | boolean anyChange; |
335 | iterations = 0; |
336 | iter(pos); |
337 | } |
338 | |
339 | static void iter(Pos pos) { |
340 | MultiMap<S, L<S>> higherProductions = withoutGroundProductions(productionMap); |
341 | |
342 | for (int i = l(pos.tok); i >= 1; i -= 2) { |
343 | Pos pos2 = new Pos(pos.tok, i); |
344 | int it = 0; |
345 | boolean anyChange = false; |
346 | do { |
347 | anyChange = false; |
348 | MultiMap<S, L<S>> mapToUse = it == 0 ? productionMap : higherProductions; |
349 | it++; |
350 | for (S className : mapToUse.keySet()) { |
351 | MultiMap<S, Integer> rr = recog.get(i); |
352 | L<Integer> recs = rr.getActual(className); |
353 | L<L<S>> prods = mapToUse.get(className); |
354 | for (L<S> prod : prods) { |
355 | int n = l(recs); |
356 | matchProd(pos2, new Pos(prod), className, recs); |
357 | anyChange = anyChange || l(recs) > n; |
358 | } |
359 | rr.clean(className); |
360 | } |
361 | } while (anyChange); |
362 | } |
363 | } |
364 | |
365 | static class Pos { |
366 | L<S> tok; |
367 | int i = 1; |
368 | |
369 | *() {} |
370 | *(L<S> *tok) {} |
371 | *(L<S> *tok, int *i) {} |
372 | |
373 | boolean end() { ret i >= l(tok)-1; } |
374 | S get() { ret tok.get(i); } |
375 | public Pos clone() { ret new Pos(tok, i); } |
376 | public boolean equals(O o) { |
377 | if (!(o instanceof Pos)) ret false; |
378 | Pos pos = cast o; |
379 | ret tok == pos.tok && i == pos.i; |
380 | } |
381 | |
382 | S rest() { |
383 | ret join(subList(tok, i)); |
384 | } |
385 | |
386 | Pos plus(int x) { ret new Pos(tok, i + x); } |
387 | } |
388 | |
389 | static void copy(Pos a, Pos b) { |
390 | b.tok = a.tok; |
391 | b.i = a.i; |
392 | } |
393 | |
394 | static void debug(S bla, Pos pos) { |
395 | if (debug) |
396 | print(bla + " on " + quote(pos.rest())); |
397 | } |
398 | |
399 | static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) { |
400 | if (prod.end()) |
401 | setAdd(out, pos.i); |
402 | else { |
403 | S p = prod.get(); |
404 | |
405 | if (isBracketedID(p)/* && !specials.contains(p)*/) { |
406 | MultiMap<S, Integer> rr = recog.get(pos.i); |
407 | if (rr == null) |
408 | fail("parser: recog null at " + pos.i); |
409 | |
410 | // get class name |
411 | |
412 | S className = unbracket(p); |
413 | if (hasSpace(className)) { |
414 | int idx = className.indexOf(' '); |
415 | className = className.substring(0, idx).trim(); |
416 | } |
417 | |
418 | L<Integer> r = rr.get(className); |
419 | |
420 | // keep parsing for every option |
421 | |
422 | for (int i : cloneList(r)) |
423 | matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); |
424 | |
425 | } else if (!pos.end()) { // it's a literal - check if at end of token stream first |
426 | |
427 | S t = pos.get(); |
428 | if (!matchToken(p, t)) |
429 | ret; |
430 | |
431 | matchProd(pos.plus(2), prod.plus(2), forClass, out); |
432 | } |
433 | } |
434 | } |
435 | |
436 | static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>", "<id>", "<extidentifier>", "ucid"); |
437 | |
438 | static boolean matchToken(S p, S t) { |
439 | if (eq(p, "<quoted>")) { |
440 | if (!isQuoted(t)) ret false; |
441 | } else if (eq(p, "<int>")) { |
442 | if (!isInteger(t)) ret false; |
443 | } else if (eqOneOf(p, "<identifier>", "<id>")) { |
444 | if (!isIdentifier(t)) ret false; |
445 | } else if (eq(p, "<ucid>")) { |
446 | if (!(isIdentifier(t) && startsWithUpperCase(t))) ret false; |
447 | } else if (eq(p, "<extidentifier>")) { |
448 | if (!isExtendedIdentifier(t)) ret false; |
449 | } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t)))) |
450 | ret false; // token mismatch |
451 | ret true; |
452 | } |
453 | |
454 | static S loadSnippetThroughBot(S snippetID) { |
455 | print("Loading parser rules."); |
456 | time { |
457 | S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID); |
458 | new Matches m; |
459 | assertTrue(match("the text of snippet * is *", answer, m)); |
460 | ret m.unq(1); |
461 | } |
462 | } |
463 | |
464 | // Non-recursing productions don't need to be repeatedly executed |
465 | static MultiMap<S, L<S>> withoutGroundProductions(MultiMap<S, L<S>> productionMap) { |
466 | MultiMap<S, L<S>> newMap = new MultiMap; |
467 | |
468 | for (S className : new HashSet<S>(productionMap.keySet())) { |
469 | L<L<S>> prods = productionMap.get(className); |
470 | for (L<S> prod : prods) |
471 | if (!isGroundProduction(prod)) |
472 | newMap.put(className, prod); |
473 | } |
474 | |
475 | ret newMap; |
476 | } |
477 | |
478 | static boolean isGroundProduction(L<S> prod) { |
479 | for (int i = 1; i < l(prod); i += 2) { |
480 | S t = prod.get(i); |
481 | if (isBracketedID(t) && !specials.contains(t)) |
482 | ret false; |
483 | } |
484 | ret true; |
485 | } |
486 | |
487 | static void addDict(S className, Collection<S> entries) { |
488 | dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries)); |
489 | } |
490 | |
491 | // TODO: do only those requested in grammar |
492 | |
493 | static void matchSpecials() { |
494 | for (int i = 1; i < l(tok); i += 2) { |
495 | MultiMap<S, Integer> rr = recog.get(i); |
496 | for (S s : specials) |
497 | if (matchToken(s, tok.get(i))) |
498 | rr.put(unbracket(s), i+2); |
499 | } |
500 | } |
501 | |
502 | static L<S> unquoteAll(L<S> tok) { |
503 | L<S> t = cloneList(tok); |
504 | for (int i = 1; i < l(t); i += 2) |
505 | if (isQuoted(t.get(i))) |
506 | t.set(i, unquote(t.get(i))); |
507 | ret t; |
508 | } |
Began life as a copy of #1002472
download show line numbers debug dex old transpilations
Travelled to 16 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1002719 |
Snippet name: | "Bottom-Up" NL Parser, faster explainMatch (LIVE EVERYWHERE) |
Eternal ID of this version: | #1002719/12 |
Text MD5: | f3bad2905ecdacbe90a91ada158fa2fe |
Transpilation MD5: | 1cf5b651a3557e7aded44fca248c0cbe |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2018-09-16 14:05:31 |
Source code size: | 14775 bytes / 508 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 845 / 3805 |
Version history: | 11 change(s) |
Referenced in: | [show references] |