Libraryless. Click here for Pure Java version (3477L/24K/75K).
1 | // Idea: For every position, store the productions recognized to start there, then infer up to higher classes |
2 | |
3 | !759 |
4 | |
5 | // a recognition is identified by (startPos, className, endPos) |
6 | |
7 | |
8 | // key 1 = start position, key 2 = class name, value = end position |
9 | static Map<Integer, MultiMap<S, Integer>> recog; |
10 | static L<S> tok; |
11 | |
12 | static MultiMap<S, L<S>> productionMap; |
13 | |
14 | static boolean caseSensitive = false; |
15 | |
16 | static boolean debug = false; |
17 | |
18 | static long timing; |
19 | static int iterations; |
20 | |
21 | static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples |
22 | |
23 | |
24 | static S defaultParserRules = "#1002281"; // NL |
25 | static S javaRules = "#1002329"; |
26 | static boolean keepRules = false; |
27 | |
28 | static class ParseResult { |
29 | L<S> tok; |
30 | Map<Integer, MultiMap<S, Integer>> recog; |
31 | |
32 | L<S> fullMatchClasses() { |
33 | new L<S> l; |
34 | MultiMap<S, Integer> rr = recog.get(1); |
35 | for (S className : rr.keys()) |
36 | if (rr.get(className).contains(l(tok))) |
37 | l.add(className); |
38 | ret l; |
39 | } |
40 | |
41 | L explainLongestMatch() { |
42 | S c = findLongestMatch(1); |
43 | if (c == null) ret null; |
44 | int n = max(recog.get(1).get(c)); |
45 | if (n == 1) ret null; |
46 | L l = cast explainMatch(new Pos(tok), n, c).get(0); |
47 | ret shortenExplanation(l); |
48 | } |
49 | |
50 | S findLongestMatch(int i) { |
51 | MultiMap<S, Integer> rr = recog.get(i); |
52 | new Map<S, Integer> map; |
53 | for (S c : rr.keys()) |
54 | map.put(c, max(rr.get(c))); |
55 | ret keyWithBiggestValue(map); |
56 | } |
57 | |
58 | L explain(S className) { |
59 | ret explain(1, l(tok), className); |
60 | } |
61 | |
62 | L explainFull(S className) { |
63 | ret explainFull(1, l(tok), className); |
64 | } |
65 | |
66 | // full = not shortened (include all subclassings) |
67 | L explainFull(int fromToken, int toToken, S className) { |
68 | L<L> matches = explainMatch(new Pos(tok, fromToken), toToken, className); |
69 | ret get(matches, 0); |
70 | } |
71 | |
72 | L explain(int fromToken, int toToken, S className) { |
73 | L l = explainFull(fromToken, toToken, className); |
74 | if (debug) |
75 | print("Full explanation: " + structure(l)); |
76 | ret shortenExplanation(l); |
77 | } |
78 | |
79 | // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]] |
80 | |
81 | L shortenExplanation(L m) { |
82 | if (m == null) ret null; |
83 | if (l(m) == 5) { |
84 | int ii = (Integer) m.get(0), jj = (Integer) m.get(1); |
85 | L sub = (L) m.get(4); |
86 | int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1); |
87 | if (ii == II && jj == JJ) |
88 | ret shortenExplanation(sub); // it's a subclassing - just drop the outer match |
89 | } |
90 | for (int i = 4; i < l(m); i++) |
91 | m.set(i, shortenExplanation((L) m.get(i))); |
92 | ret m; |
93 | } |
94 | |
95 | // assumes that there is a match (pos, class, endPos) |
96 | // and gives explanations of how it was probably matched. |
97 | // result list format: [className, prodText, [i1, i2], [i3, i4], ...] |
98 | L explainMatch(Pos pos, int endPos, S forClass) { |
99 | L<L<S>> prods = productionMap.get(forClass); |
100 | new L out; |
101 | for (L<S> prod : prods) { |
102 | L match = litlist(pos.i, endPos, forClass, join(prod)); |
103 | if (debug) |
104 | print("Starting match: " + structure(match)); |
105 | explainMatch2(pos, new Pos(prod), endPos, forClass, match, out); |
106 | } |
107 | ret out; |
108 | } |
109 | |
110 | // same, but with fixed production |
111 | void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) { |
112 | if (prod.end()) { |
113 | if (pos.i == endPos) { |
114 | // match done! add sub-explanations. |
115 | |
116 | match = cloneList(match); |
117 | if (debug) |
118 | print("Match done! " + structure(match)); |
119 | for (int i = 4; i < l(match); i++) { |
120 | L m = cloneList((L) match.get(i)); |
121 | //print("m=" + structure(m)); |
122 | int ii = (Integer) m.get(0), jj = (Integer) m.get(1); |
123 | S className = cast m.get(2); |
124 | L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className); |
125 | //print(format("Recursed: * * * => ", ii, jj, className) + structure(subExplanation)); |
126 | if (empty(subExplanation)) { |
127 | // weird |
128 | } else |
129 | m.addAll(subList((L) subExplanation.get(0), 3)); |
130 | |
131 | match.set(i, m); |
132 | } |
133 | |
134 | if (debug) |
135 | print("Match completed: " + structure(match)); |
136 | out.add(match); |
137 | } |
138 | } else if (pos.end()) |
139 | ret; |
140 | else { |
141 | S p = prod.get(); |
142 | |
143 | if (isBracketedID(p)) { |
144 | S className = unbracket(p); |
145 | L<Integer> r = recog.get(pos.i).get(className); |
146 | |
147 | // keep parsing for every option |
148 | |
149 | for (int i : cloneList(r)) { |
150 | match.add(litlist(pos.i, i, className)); |
151 | explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out); |
152 | removeLast(match); |
153 | } |
154 | } else if (!pos.end()) { |
155 | // it's a literal |
156 | S t = pos.get(); |
157 | if (!matchToken(p, t)) |
158 | ret; |
159 | |
160 | //match.add(litlist(pos.i, pos.i+2, p)); |
161 | explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out); |
162 | //removeLast(match); |
163 | } |
164 | } |
165 | } |
166 | |
167 | S prettierAnalysis() { |
168 | new L<S> l; |
169 | new TreeSet<S> emptyClasses; |
170 | for (int i = 1; i < l(tok); i += 2) { |
171 | MultiMap<S, Integer> rr = recog.get(i); |
172 | new L<S> bla; |
173 | for (S className : rr.keys()) { |
174 | L<Integer> is = rr.get(className); |
175 | if (is.contains(i)) |
176 | emptyClasses.add(className); |
177 | int n = (max(is)-i)/2; |
178 | if (n > 0) |
179 | bla.add(className + " " + n); |
180 | } |
181 | l.add(tok.get(i) + " : " + structure(bla)); |
182 | } |
183 | if (!empty(emptyClasses)) |
184 | l.add(" with empty classes " + structure(asList(emptyClasses))); |
185 | ret fromLines(l); |
186 | } |
187 | } |
188 | |
189 | p { |
190 | startBot("Snippet Text Bot", "#1002084"); |
191 | } |
192 | |
193 | static ParseResult parse(S inputText) { |
194 | ret parse(inputText, defaultParserRules); |
195 | } |
196 | |
197 | static ParseResult jparse(S inputText) { |
198 | ret parse(inputText, javaRules); |
199 | } |
200 | |
201 | static S keptRulesID; |
202 | static S keptRulesText; |
203 | |
204 | static synchronized ParseResult parse(S inputText, S parserRules) { |
205 | S rulesText; |
206 | if (!isSnippetID(parserRules)) |
207 | rulesText = parserRules; |
208 | else { |
209 | if (keepRules && sameSnippetID(parserRules, keptRulesID)) |
210 | rulesText = keptRulesText; |
211 | else { |
212 | rulesText = loadSnippetThroughBot(parserRules); |
213 | if (keepRules) { |
214 | keptRulesID = parserRules; |
215 | keptRulesText = rulesText; |
216 | } |
217 | } |
218 | } |
219 | |
220 | productionMap = new MultiMap; |
221 | caseSensitive = false; |
222 | for (S rule : toLinesFullTrim(rulesText)) pcall { |
223 | if (l(javaTok(rule)) == 1) continue; // just a comment |
224 | |
225 | if (debug) |
226 | printF("Processing rule: *", rule); |
227 | |
228 | if (rule.contains("=")) { |
229 | L<S> lr = splitAtJavaToken(rule, "="); |
230 | S l, r; |
231 | if (l(lr) == 1) { |
232 | l = ""; |
233 | r = lr.get(0); |
234 | } else if (l(lr) >= 2) { |
235 | l = join(" = ", lr.subList(0, l(lr)-1)); |
236 | r = last(lr); |
237 | } else { |
238 | print("Weird rule: " + rule); |
239 | continue; |
240 | } |
241 | L<S> tokr = mergeBracketThingies(javaTok(r)); |
242 | assertEquals(structure(tokr), 3, l(tokr)); |
243 | S className = assertIdentifier(unbracket(get(tokr, 1))); |
244 | L<S> tok = javaTok(l); |
245 | tok = mergeBracketThingies(tok); |
246 | //printStructure(tok); |
247 | productionMap.put(className, tok); |
248 | } else if (match("case-sensitive", rule)) { |
249 | //print("Now case sensitive."); |
250 | caseSensitive = true; |
251 | } else if (match("case-insensitive", rule)) { |
252 | caseSensitive = false; |
253 | } else |
254 | print("Weird rule: " + rule); |
255 | } |
256 | |
257 | if (debug) { |
258 | print(n(productionMap.size(), "production") + "."); |
259 | print(); |
260 | } |
261 | |
262 | timing = now(); |
263 | tok = javaTok(inputText); |
264 | Pos pos = new Pos(tok); |
265 | parseTop(pos); |
266 | timing = now()-timing; |
267 | |
268 | int numTok = l(tok)/2; |
269 | print("Iterations required for " + numTok + " tokens: " + iterations*numTok*productionMap.size() + ", time: " + timing + " ms"); |
270 | |
271 | new ParseResult res; |
272 | res.tok = tok; |
273 | res.recog = recog; |
274 | ret res; |
275 | } |
276 | |
277 | static void parseTop(Pos pos) { |
278 | // init structures |
279 | recog = new TreeMap; |
280 | for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end |
281 | recog.put(i, new MultiMap); |
282 | //print("parser: recog inited " + pos.i + " to " + l(pos.tok)); |
283 | |
284 | // process dicts |
285 | for (S className : dicts.keySet()) { |
286 | Set<S> values = dicts.get(className); |
287 | for (int i = pos.i; i < l(pos.tok); i += 2) { |
288 | S t = pos.tok.get(i); |
289 | if (values.contains(t.toLowerCase())) |
290 | recog.get(i).setPut(className, i+2); |
291 | } |
292 | } |
293 | |
294 | matchSpecials(); |
295 | |
296 | boolean anyChange; |
297 | iterations = 0; |
298 | MultiMap<S, L<S>> fullProductionMap = new MultiMap(productionMap); |
299 | while (iter(pos)) {} |
300 | |
301 | iter(pos); // XX - one more - fixes anything? |
302 | |
303 | productionMap = fullProductionMap; // restore ground productions |
304 | } |
305 | |
306 | static boolean iter(Pos pos) { |
307 | boolean anyChange = false; |
308 | ++iterations; |
309 | if (iterations == 2) removeGroundProductions(); |
310 | if (debug) print("Iteration " + iterations); |
311 | for (int i = pos.i; i <= l(pos.tok); i += 2) { |
312 | Pos pos2 = new Pos(pos.tok, i); |
313 | for (S className : productionMap.keySet()) { |
314 | MultiMap<S, Integer> rr = recog.get(i); |
315 | L<Integer> recs = rr.getActual(className); |
316 | L<L<S>> prods = productionMap.get(className); |
317 | for (L<S> prod : prods) { |
318 | int n = l(recs); |
319 | matchProd(pos2, new Pos(prod), className, recs); |
320 | anyChange = anyChange || l(recs) > n; |
321 | } |
322 | rr.clean(className); |
323 | } |
324 | } |
325 | ret anyChange; |
326 | } |
327 | |
328 | static class Pos { |
329 | L<S> tok; |
330 | int i = 1; |
331 | |
332 | *() {} |
333 | *(L<S> *tok) {} |
334 | *(L<S> *tok, int *i) {} |
335 | |
336 | boolean end() { ret i >= l(tok)-1; } |
337 | S get() { ret tok.get(i); } |
338 | public Pos clone() { ret new Pos(tok, i); } |
339 | public boolean equals(O o) { |
340 | if (!(o instanceof Pos)) ret false; |
341 | Pos pos = cast o; |
342 | ret tok == pos.tok && i == pos.i; |
343 | } |
344 | |
345 | S rest() { |
346 | ret join(subList(tok, i)); |
347 | } |
348 | |
349 | Pos plus(int x) { ret new Pos(tok, i + x); } |
350 | } |
351 | |
352 | static void copy(Pos a, Pos b) { |
353 | b.tok = a.tok; |
354 | b.i = a.i; |
355 | } |
356 | |
357 | static void debug(S bla, Pos pos) { |
358 | if (debug) |
359 | print(bla + " on " + quote(pos.rest())); |
360 | } |
361 | |
362 | static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) { |
363 | if (prod.end()) |
364 | setAdd(out, pos.i); |
365 | else if (pos.end()) |
366 | ret; |
367 | else { |
368 | S p = prod.get(); |
369 | |
370 | if (isBracketedID(p)/* && !specials.contains(p)*/) { |
371 | MultiMap<S, Integer> rr = recog.get(pos.i); |
372 | if (rr == null) |
373 | fail("parser: recog null at " + pos.i); |
374 | L<Integer> r = rr.get(unbracket(p)); |
375 | |
376 | // keep parsing for every option |
377 | |
378 | for (int i : cloneList(r)) |
379 | matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); |
380 | |
381 | } else { |
382 | // it's a literal |
383 | S t = pos.get(); |
384 | if (!matchToken(p, t)) |
385 | ret; |
386 | |
387 | matchProd(pos.plus(2), prod.plus(2), forClass, out); |
388 | } |
389 | } |
390 | } |
391 | |
392 | static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>"); |
393 | |
394 | static boolean matchToken(S p, S t) { |
395 | if (eq(p, "<quoted>")) { |
396 | if (!isQuoted(t)) ret false; |
397 | } else if (eq(p, "<int>")) { |
398 | if (!isInteger(t)) ret false; |
399 | } else if (eq(p, "<identifier>")) { |
400 | if (!isIdentifier(t)) ret false; |
401 | } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t)))) |
402 | ret false; // token mismatch |
403 | ret true; |
404 | } |
405 | |
406 | static S loadSnippetThroughBot(S snippetID) { |
407 | print("Loading parser rules."); |
408 | time { |
409 | S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID); |
410 | new Matches m; |
411 | assertTrue(match("the text of snippet * is *", answer, m)); |
412 | ret m.unq(1); |
413 | } |
414 | } |
415 | |
416 | // Non-recursing productions don't need to be repeatedly executed |
417 | static void removeGroundProductions() { |
418 | int n = productionMap.size(); |
419 | for (S className : new HashSet<S>(productionMap.keySet())) { |
420 | L<L<S>> prods = productionMap.getActual(className); |
421 | for (int i = 0; i < l(prods); ) |
422 | if (isGroundProduction(prods.get(i))) { |
423 | if (debug) |
424 | print("Ground production: " + className + " = " + join(prods.get(i))); |
425 | prods.remove(i); |
426 | } else |
427 | ++i; |
428 | productionMap.clean(className); |
429 | } |
430 | n -= productionMap.size(); |
431 | if (debug && n != 0) |
432 | print(n + " ground productions removed."); |
433 | } |
434 | |
435 | static boolean isGroundProduction(L<S> prod) { |
436 | for (int i = 1; i < l(prod); i += 2) { |
437 | S t = prod.get(i); |
438 | if (isBracketedID(t) && !specials.contains(t)) |
439 | ret false; |
440 | } |
441 | ret true; |
442 | } |
443 | |
444 | static void addDict(S className, Collection<S> entries) { |
445 | dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries)); |
446 | } |
447 | |
448 | static void matchSpecials() { |
449 | for (int i = 1; i < l(tok); i += 2) { |
450 | MultiMap<S, Integer> rr = recog.get(i); |
451 | for (S s : specials) |
452 | if (matchToken(s, tok.get(i))) |
453 | rr.put(unbracket(s), i+2); |
454 | } |
455 | } |
Began life as a copy of #1002328
download show line numbers debug dex old transpilations
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1002369 |
Snippet name: | NL Parser, reworking the interface (CURRENT) |
Eternal ID of this version: | #1002369/1 |
Text MD5: | 3ce27e57b13470355771d3230d8662d9 |
Transpilation MD5: | be1e90f4e22154f18df1abcb2fa3645a |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-01-31 03:44:01 |
Source code size: | 13095 bytes / 455 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 816 / 2557 |
Referenced in: | [show references] |