Libraryless. Click here for Pure Java version (4249L/28K/89K).
1 | // Idea: For every position, store the productions recognized to start there, then infer up to higher classes |
2 | |
3 | // Productions elements: literals (just write them), <classname>, <classname varname> |
4 | |
5 | !759 |
6 | |
7 | // a recognition is identified by (startPos, className, endPos) |
8 | |
9 | // key 1 = start position, key 2 = class name, value = end position |
10 | static Map<Integer, MultiMap<S, Integer>> recog; |
11 | static L<S> tok; |
12 | |
13 | static MultiMap<S, L<S>> productionMap; |
14 | |
15 | static boolean caseSensitive = false; |
16 | |
17 | static boolean debug = false; |
18 | |
19 | static long timing; |
20 | static int iterations; |
21 | |
22 | static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples |
23 | |
24 | |
25 | static S defaultParserRules = "#1002281"; // NL |
26 | static S javaRules = "#1002329"; |
27 | static boolean keepRules = false; |
28 | |
29 | static class ParseResult { |
30 | L<S> tok; |
31 | Map<Integer, MultiMap<S, Integer>> recog; |
32 | |
33 | L<S> fullMatchClasses() { |
34 | new L<S> l; |
35 | MultiMap<S, Integer> rr = recog.get(1); |
36 | for (S className : rr.keys()) |
37 | if (rr.get(className).contains(l(tok))) |
38 | l.add(className); |
39 | ret l; |
40 | } |
41 | |
42 | boolean parsesAs(S className) { |
43 | MultiMap<S, Integer> rr = recog.get(1); |
44 | ret rr.get(className).contains(l(tok)); |
45 | } |
46 | |
47 | L explainLongestMatch() { |
48 | S c = findLongestMatch(1); |
49 | if (c == null) ret null; |
50 | int n = max(recog.get(1).get(c)); |
51 | if (n == 1) ret null; |
52 | L l = cast explainMatch(new Pos(tok), n, c).get(0); |
53 | ret shortenExplanation(l); |
54 | } |
55 | |
56 | S findLongestMatch(int i) { |
57 | MultiMap<S, Integer> rr = recog.get(i); |
58 | new Map<S, Integer> map; |
59 | for (S c : rr.keys()) |
60 | map.put(c, max(rr.get(c))); |
61 | ret keyWithBiggestValue(map); |
62 | } |
63 | |
64 | L explain(S className) { |
65 | ret explain(1, l(tok), className); |
66 | } |
67 | |
68 | L explainFull(S className) { |
69 | ret explainFull(1, l(tok), className); |
70 | } |
71 | |
72 | // full = not shortened (include all subclassings) |
73 | L explainFull(int fromToken, int toToken, S className) { |
74 | L<L> matches = explainMatch(new Pos(tok, fromToken), toToken, className); |
75 | ret get(matches, 0); |
76 | } |
77 | |
78 | L explain(int fromToken, int toToken, S className) { |
79 | L l = explainFull(fromToken, toToken, className); |
80 | if (debug) |
81 | print("Full explanation: " + structure(l)); |
82 | ret shortenExplanation(l); |
83 | } |
84 | |
85 | // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]] |
86 | |
87 | L shortenExplanation(L m) { |
88 | if (m == null) ret null; |
89 | if (l(m) == 5) { |
90 | int ii = (Integer) m.get(0), jj = (Integer) m.get(1); |
91 | L sub = (L) m.get(4); |
92 | int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1); |
93 | |
94 | // for primitives, inner class seems to be null currently |
95 | // - in this case, leave outer match. |
96 | |
97 | S innerClass = getString(sub, 2); |
98 | if (ii == II && jj == JJ && innerClass != null) |
99 | ret shortenExplanation(sub); // it's a subclassing - just drop the outer match |
100 | } |
101 | for (int i = 4; i < l(m); i++) |
102 | m.set(i, shortenExplanation((L) m.get(i))); |
103 | ret m; |
104 | } |
105 | |
106 | // assumes that there is a match (pos, class, endPos) |
107 | // and gives explanations of how it was probably matched. |
108 | // result list format: [className, prodText, [i1, i2], [i3, i4], ...] |
109 | L explainMatch(Pos pos, int endPos, S forClass) { |
110 | L<L<S>> prods = productionMap.get(forClass); |
111 | new L out; |
112 | for (L<S> prod : prods) { |
113 | L match = litlist(pos.i, endPos, forClass, join(prod)); |
114 | if (debug) |
115 | print("Starting match: " + structure(match)); |
116 | explainMatch2(pos, new Pos(prod), endPos, forClass, match, out); |
117 | } |
118 | ret out; |
119 | } |
120 | |
121 | // same, but with fixed production |
122 | void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) { |
123 | if (prod.end()) { |
124 | if (pos.i == endPos) { |
125 | // match done! add sub-explanations. |
126 | |
127 | match = cloneList(match); |
128 | if (debug) |
129 | print("Match done! " + structure(match)); |
130 | for (int i = 4; i < l(match); i++) { |
131 | L m = cloneList((L) match.get(i)); |
132 | //print("m=" + structure(m)); |
133 | int ii = (Integer) m.get(0), jj = (Integer) m.get(1); |
134 | S className = cast m.get(2); |
135 | L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className); |
136 | //print(format("Recursed: * * * => ", ii, jj, className) + structure(subExplanation)); |
137 | if (empty(subExplanation)) { |
138 | // weird |
139 | } else |
140 | m.addAll(subList((L) subExplanation.get(0), 3)); |
141 | |
142 | match.set(i, m); |
143 | } |
144 | |
145 | if (debug) |
146 | print("Match completed: " + structure(match)); |
147 | out.add(match); |
148 | } |
149 | } else { |
150 | S p = prod.get(); |
151 | |
152 | if (isBracketedID(p)) { |
153 | // get class name and possible var name |
154 | |
155 | S className = unbracket(p); |
156 | S varName = null; |
157 | if (hasSpace(className)) { |
158 | int idx = className.indexOf(' '); |
159 | varName = className.substring(idx+1).trim(); |
160 | className = className.substring(0, idx).trim(); |
161 | } |
162 | |
163 | // look up if class found at this place |
164 | |
165 | L<Integer> r = recog.get(pos.i).get(className); |
166 | |
167 | // keep parsing for every option |
168 | |
169 | for (int i : cloneList(r)) { |
170 | match.add(litlist(pos.i, i, className)); |
171 | explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out); |
172 | removeLast(match); |
173 | } |
174 | } else if (!pos.end()) { |
175 | // it's a literal |
176 | S t = pos.get(); |
177 | if (!matchToken(p, t)) |
178 | ret; |
179 | |
180 | //match.add(litlist(pos.i, pos.i+2, p)); |
181 | explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out); |
182 | //removeLast(match); |
183 | } |
184 | } |
185 | } |
186 | |
187 | S prettierAnalysis() { |
188 | new L<S> l; |
189 | new TreeSet<S> emptyClasses; |
190 | for (int i = 1; i < l(tok); i += 2) { |
191 | MultiMap<S, Integer> rr = recog.get(i); |
192 | new L<S> bla; |
193 | for (S className : rr.keys()) { |
194 | L<Integer> is = rr.get(className); |
195 | if (is.contains(i)) |
196 | emptyClasses.add(className); |
197 | int n = (max(is)-i)/2; |
198 | if (n > 0) |
199 | bla.add(className + " " + n); |
200 | } |
201 | l.add(tok.get(i) + " : " + structure(bla)); |
202 | } |
203 | if (!empty(emptyClasses)) |
204 | l.add(" with empty classes " + structure(asList(emptyClasses))); |
205 | ret fromLines(l); |
206 | } |
207 | } |
208 | |
209 | p { |
210 | startBot("Snippet Text Bot", "#1002084"); |
211 | } |
212 | |
213 | static ParseResult parse(S inputText) { |
214 | ret parse(inputText, defaultParserRules); |
215 | } |
216 | |
217 | static ParseResult jparse(S inputText) { |
218 | ret parse(inputText, javaRules); |
219 | } |
220 | |
221 | static S keptRulesID; |
222 | static S keptRulesText; |
223 | |
224 | static ParseResult parse(S inputText, S parserRules) { |
225 | ret parse(javaTok(inputText), parserRules); |
226 | } |
227 | |
228 | static synchronized ParseResult parse(L<S> inputTok, S parserRules) { |
229 | S rulesText; |
230 | if (!isSnippetID(parserRules)) |
231 | rulesText = parserRules; |
232 | else { |
233 | if (keepRules && sameSnippetID(parserRules, keptRulesID)) |
234 | rulesText = keptRulesText; |
235 | else { |
236 | rulesText = loadSnippetThroughBot(parserRules); |
237 | if (keepRules) { |
238 | keptRulesID = parserRules; |
239 | keptRulesText = rulesText; |
240 | } |
241 | } |
242 | } |
243 | |
244 | productionMap = new MultiMap; |
245 | caseSensitive = false; |
246 | for (S rule : toLinesFullTrim(rulesText)) pcall { |
247 | if (l(javaTok(rule)) == 1) continue; // just a comment |
248 | |
249 | if (debug) |
250 | printF("Processing rule: *", rule); |
251 | |
252 | if (rule.contains("=")) { |
253 | L<S> lr = splitAtJavaToken(rule, "="); |
254 | S l, r; |
255 | if (l(lr) == 1) { |
256 | l = ""; |
257 | r = lr.get(0); |
258 | } else if (l(lr) >= 2) { |
259 | l = join(" = ", lr.subList(0, l(lr)-1)); |
260 | r = last(lr); |
261 | } else { |
262 | print("Weird rule: " + rule); |
263 | continue; |
264 | } |
265 | L<S> tokr = mergeBracketThingies(javaTok(r)); |
266 | assertEquals(structure(tokr), 3, l(tokr)); |
267 | S className = assertIdentifier(unbracket(get(tokr, 1))); |
268 | L<S> tok = javaTok(l); |
269 | tok = mergeBracketThingies(tok); |
270 | //printStructure(tok); |
271 | productionMap.put(className, tok); |
272 | } else if (match("case-sensitive", rule)) { |
273 | print("Now case sensitive."); |
274 | caseSensitive = true; |
275 | } else if (match("case-insensitive", rule)) { |
276 | caseSensitive = false; |
277 | } else |
278 | print("Weird rule: " + rule); |
279 | } |
280 | |
281 | if (debug) { |
282 | print(n(productionMap.size(), "production") + "."); |
283 | print(); |
284 | } |
285 | |
286 | timing = now(); |
287 | tok = inputTok; |
288 | /* HACK |
289 | tok.add("the bonk"); // add impossible token to find empty classes at the end... complicated though because l(tok) changes |
290 | tok.add(""); |
291 | */ |
292 | Pos pos = new Pos(tok); |
293 | parseTop(pos); |
294 | timing = now()-timing; |
295 | |
296 | int numTok = l(tok)/2; |
297 | print("Iterations required for " + numTok + " tokens: " + iterations*numTok*productionMap.size() + ", time: " + timing + " ms"); |
298 | |
299 | new ParseResult res; |
300 | res.tok = tok; |
301 | res.recog = recog; |
302 | ret res; |
303 | } |
304 | |
305 | static void parseTop(Pos pos) { |
306 | // init structures |
307 | recog = new TreeMap; |
308 | for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end |
309 | recog.put(i, new MultiMap); |
310 | //print("parser: recog inited " + pos.i + " to " + l(pos.tok)); |
311 | |
312 | // process dicts |
313 | for (S className : dicts.keySet()) { |
314 | Set<S> values = dicts.get(className); |
315 | for (int i = pos.i; i < l(pos.tok); i += 2) { |
316 | S t = pos.tok.get(i); |
317 | if (values.contains(t.toLowerCase())) |
318 | recog.get(i).setPut(className, i+2); |
319 | } |
320 | } |
321 | |
322 | matchSpecials(); |
323 | |
324 | boolean anyChange; |
325 | iterations = 0; |
326 | iter(pos); |
327 | } |
328 | |
329 | static void iter(Pos pos) { |
330 | MultiMap<S, L<S>> higherProductions = withoutGroundProductions(productionMap); |
331 | |
332 | for (int i = l(pos.tok); i >= 1; i -= 2) { |
333 | Pos pos2 = new Pos(pos.tok, i); |
334 | int it = 0; |
335 | boolean anyChange = false; |
336 | do { |
337 | anyChange = false; |
338 | MultiMap<S, L<S>> mapToUse = it == 0 ? productionMap : higherProductions; |
339 | it++; |
340 | for (S className : mapToUse.keySet()) { |
341 | MultiMap<S, Integer> rr = recog.get(i); |
342 | L<Integer> recs = rr.getActual(className); |
343 | L<L<S>> prods = mapToUse.get(className); |
344 | for (L<S> prod : prods) { |
345 | int n = l(recs); |
346 | matchProd(pos2, new Pos(prod), className, recs); |
347 | anyChange = anyChange || l(recs) > n; |
348 | } |
349 | rr.clean(className); |
350 | } |
351 | } while (anyChange); |
352 | } |
353 | } |
354 | |
355 | static class Pos { |
356 | L<S> tok; |
357 | int i = 1; |
358 | |
359 | *() {} |
360 | *(L<S> *tok) {} |
361 | *(L<S> *tok, int *i) {} |
362 | |
363 | boolean end() { ret i >= l(tok)-1; } |
364 | S get() { ret tok.get(i); } |
365 | public Pos clone() { ret new Pos(tok, i); } |
366 | public boolean equals(O o) { |
367 | if (!(o instanceof Pos)) ret false; |
368 | Pos pos = cast o; |
369 | ret tok == pos.tok && i == pos.i; |
370 | } |
371 | |
372 | S rest() { |
373 | ret join(subList(tok, i)); |
374 | } |
375 | |
376 | Pos plus(int x) { ret new Pos(tok, i + x); } |
377 | } |
378 | |
379 | static void copy(Pos a, Pos b) { |
380 | b.tok = a.tok; |
381 | b.i = a.i; |
382 | } |
383 | |
384 | static void debug(S bla, Pos pos) { |
385 | if (debug) |
386 | print(bla + " on " + quote(pos.rest())); |
387 | } |
388 | |
389 | static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) { |
390 | if (prod.end()) |
391 | setAdd(out, pos.i); |
392 | else { |
393 | S p = prod.get(); |
394 | |
395 | if (isBracketedID(p)/* && !specials.contains(p)*/) { |
396 | MultiMap<S, Integer> rr = recog.get(pos.i); |
397 | if (rr == null) |
398 | fail("parser: recog null at " + pos.i); |
399 | |
400 | // get class name |
401 | |
402 | S className = unbracket(p); |
403 | if (hasSpace(className)) { |
404 | int idx = className.indexOf(' '); |
405 | className = className.substring(0, idx).trim(); |
406 | } |
407 | |
408 | L<Integer> r = rr.get(className); |
409 | |
410 | // keep parsing for every option |
411 | |
412 | for (int i : cloneList(r)) |
413 | matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); |
414 | |
415 | } else if (!pos.end()) { // it's a literal - check if at end of token stream first |
416 | |
417 | S t = pos.get(); |
418 | if (!matchToken(p, t)) |
419 | ret; |
420 | |
421 | matchProd(pos.plus(2), prod.plus(2), forClass, out); |
422 | } |
423 | } |
424 | } |
425 | |
426 | static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>"); |
427 | |
428 | static boolean matchToken(S p, S t) { |
429 | if (eq(p, "<quoted>")) { |
430 | if (!isQuoted(t)) ret false; |
431 | } else if (eq(p, "<int>")) { |
432 | if (!isInteger(t)) ret false; |
433 | } else if (eq(p, "<identifier>")) { |
434 | if (!isIdentifier(t)) ret false; |
435 | } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t)))) |
436 | ret false; // token mismatch |
437 | ret true; |
438 | } |
439 | |
440 | static S loadSnippetThroughBot(S snippetID) { |
441 | print("Loading parser rules."); |
442 | time { |
443 | S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID); |
444 | new Matches m; |
445 | assertTrue(match("the text of snippet * is *", answer, m)); |
446 | ret m.unq(1); |
447 | } |
448 | } |
449 | |
450 | // Non-recursing productions don't need to be repeatedly executed |
451 | static MultiMap<S, L<S>> withoutGroundProductions(MultiMap<S, L<S>> productionMap) { |
452 | MultiMap<S, L<S>> newMap = new MultiMap; |
453 | |
454 | for (S className : new HashSet<S>(productionMap.keySet())) { |
455 | L<L<S>> prods = productionMap.get(className); |
456 | for (L<S> prod : prods) |
457 | if (!isGroundProduction(prod)) |
458 | newMap.put(className, prod); |
459 | } |
460 | |
461 | ret newMap; |
462 | } |
463 | |
464 | static boolean isGroundProduction(L<S> prod) { |
465 | for (int i = 1; i < l(prod); i += 2) { |
466 | S t = prod.get(i); |
467 | if (isBracketedID(t) && !specials.contains(t)) |
468 | ret false; |
469 | } |
470 | ret true; |
471 | } |
472 | |
473 | static void addDict(S className, Collection<S> entries) { |
474 | dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries)); |
475 | } |
476 | |
477 | static void matchSpecials() { |
478 | for (int i = 1; i < l(tok); i += 2) { |
479 | MultiMap<S, Integer> rr = recog.get(i); |
480 | for (S s : specials) |
481 | if (matchToken(s, tok.get(i))) |
482 | rr.put(unbracket(s), i+2); |
483 | } |
484 | } |
Began life as a copy of #1002369
download show line numbers debug dex old transpilations
Travelled to 13 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1002472 |
Snippet name: | NL Parser, implementing names & new eval order (superseded by #1002719) |
Eternal ID of this version: | #1002472/1 |
Text MD5: | 38a4871c6689b3a8fa8225be0cf65dee |
Transpilation MD5: | c257994d7d0333ff130c4a2475710bfc |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-02-18 19:12:44 |
Source code size: | 14065 bytes / 484 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 683 / 1644 |
Referenced in: | [show references] |