Libraryless. Click here for Pure Java version (3162L/21K/69K).
1 | // Idea: For every position, store the productions recognized to start there, then infer up to higher classes |
2 | |
3 | !759 |
4 | |
5 | // a recognition is identified by (startPos, className, endPos) |
6 | |
7 | |
8 | // key 1 = start position, key 2 = class name, value = end position |
9 | static Map<Integer, MultiMap<S, Integer>> recog; |
10 | static L<S> tok; |
11 | |
12 | static MultiMap<S, L<S>> productionMap; |
13 | |
14 | static boolean debug = false; |
15 | |
16 | static long timing; |
17 | static int iterations; |
18 | |
19 | static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples |
20 | |
21 | |
22 | static S defaultParserRules = "#1002281"; |
23 | static S defaultMainProd = "line"; |
24 | |
25 | p { |
26 | startBot("Snippet Text Bot", "#1002084"); |
27 | } |
28 | |
29 | static synchronized S parse(S inputText) { |
30 | ret parse(inputText, defaultParserRules, defaultMainProd); |
31 | } |
32 | |
33 | static synchronized S parse(S inputText, S parserRules, S mainProd) { |
34 | S rulesText = loadSnippetThroughBot(parserRules); |
35 | |
36 | productionMap = new MultiMap; |
37 | for (S rule : toLinesFullTrim(rulesText)) pcall { |
38 | //printF("Processing rule: *", rule); |
39 | L<S> lr = splitAtJavaToken(rule, "="); |
40 | S l, r; |
41 | if (l(lr) == 1) { |
42 | l = ""; |
43 | r = lr.get(0); |
44 | } else if (l(lr) == 2) { |
45 | l = lr.get(0); |
46 | r = lr.get(1); |
47 | } else { |
48 | print("Weird rule: " + rule); |
49 | continue; |
50 | } |
51 | L<S> tokr = javaTok(r); |
52 | assertEquals(structure(tokr), 3, l(tokr)); |
53 | S className = assertIdentifier(get(tokr, 1)); |
54 | L<S> tok = javaTok(l); |
55 | tok = mergeBracketThingies(tok); |
56 | //printStructure(tok); |
57 | productionMap.put(className, tok); |
58 | } |
59 | |
60 | print(n(productionMap.size(), "production") + "."); |
61 | print(); |
62 | |
63 | timing = now(); |
64 | tok = javaTok(inputText); |
65 | Pos pos = new Pos(tok); |
66 | L<Integer> x = parseTop(pos, mainProd); |
67 | S result; |
68 | timing = now()-timing; |
69 | |
70 | int numTok = l(tok)/2; |
71 | print("Iterations required for " + numTok + " tokens: " + iterations*numTok + ", time: " + timing + " ms"); |
72 | |
73 | if (x.contains(l(tok))) |
74 | result = "parsed"; |
75 | else if (!empty(x)) |
76 | result = "beginning matches"; |
77 | else |
78 | ret "not parsed"; |
79 | L out = explainMatch(new Pos(tok), x.get(0), mainProd); |
80 | ret result + ", detailed analysis: " + structure(out); |
81 | } |
82 | |
83 | static L<Integer> parseTop(Pos pos, S mainProd) { |
84 | // init structures |
85 | recog = new TreeMap; |
86 | for (int i = pos.i; i < l(pos.tok); i += 2) |
87 | recog.put(i, new MultiMap); |
88 | print("parser: recog inited " + pos.i + " to " + l(pos.tok)); |
89 | |
90 | // process dicts |
91 | for (S className : dicts.keySet()) { |
92 | Set<S> values = dicts.get(className); |
93 | for (int i = pos.i; i < l(pos.tok); i += 2) { |
94 | S t = pos.tok.get(i); |
95 | if (values.contains(t.toLowerCase())) |
96 | recog.get(i).setPut(className, i+2); |
97 | } |
98 | } |
99 | |
100 | boolean anyChange; |
101 | iterations = 0; |
102 | MultiMap<S, L<S>> fullProductionMap = new MultiMap(productionMap); |
103 | do { |
104 | anyChange = false; |
105 | ++iterations; |
106 | if (iterations == 2) removeGroundProductions(); |
107 | for (int i = pos.i; i < l(pos.tok); i += 2) { |
108 | Pos pos2 = new Pos(pos.tok, i); |
109 | for (S className : productionMap.keySet()) { |
110 | MultiMap<S, Integer> rr = recog.get(i); |
111 | L<Integer> recs = rr.getActual(className); |
112 | L<L<S>> prods = productionMap.get(className); |
113 | for (L<S> prod : prods) { |
114 | int n = l(recs); |
115 | matchProd(pos2, new Pos(prod), className, recs); |
116 | anyChange = anyChange || l(recs) > n; |
117 | } |
118 | rr.clean(className); |
119 | } |
120 | } |
121 | } while (anyChange); |
122 | productionMap = fullProductionMap; // restore ground productions |
123 | |
124 | ret recog.get(pos.i).get(mainProd); |
125 | } |
126 | |
127 | static class Pos { |
128 | L<S> tok; |
129 | int i = 1; |
130 | |
131 | *() {} |
132 | *(L<S> *tok) {} |
133 | *(L<S> *tok, int *i) {} |
134 | |
135 | boolean end() { ret i >= l(tok)-1; } |
136 | S get() { ret tok.get(i); } |
137 | public Pos clone() { ret new Pos(tok, i); } |
138 | public boolean equals(O o) { |
139 | if (!(o instanceof Pos)) ret false; |
140 | Pos pos = cast o; |
141 | ret tok == pos.tok && i == pos.i; |
142 | } |
143 | |
144 | S rest() { |
145 | ret join(subList(tok, i)); |
146 | } |
147 | |
148 | Pos plus(int x) { ret new Pos(tok, i + x); } |
149 | } |
150 | |
151 | static void copy(Pos a, Pos b) { |
152 | b.tok = a.tok; |
153 | b.i = a.i; |
154 | } |
155 | |
156 | static void debug(S bla, Pos pos) { |
157 | if (debug) |
158 | print(bla + " on " + quote(pos.rest())); |
159 | } |
160 | |
161 | static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) { |
162 | if (prod.end()) |
163 | setAdd(out, pos.i); |
164 | else if (pos.end()) |
165 | ret; |
166 | else { |
167 | S p = prod.get(); |
168 | |
169 | if (isBracketedID(p) && !specials.contains(p)) { |
170 | MultiMap<S, Integer> rr = recog.get(pos.i); |
171 | if (rr == null) |
172 | fail("parser: recog null at " + pos.i); |
173 | L<Integer> r = rr.get(unbracket(p)); |
174 | |
175 | // keep parsing for every option |
176 | |
177 | for (int i : cloneList(r)) |
178 | matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); |
179 | |
180 | } else { |
181 | // it's a literal |
182 | S t = pos.get(); |
183 | if (!matchToken(p, t)) |
184 | ret; |
185 | |
186 | matchProd(pos.plus(2), prod.plus(2), forClass, out); |
187 | } |
188 | } |
189 | } |
190 | |
191 | static L<S> specials = litlist("<quoted>", "<int>", "<identifier>"); |
192 | |
193 | static boolean matchToken(S p, S t) { |
194 | if (eq(p, "<quoted>")) { |
195 | if (!isQuoted(t)) ret false; |
196 | } else if (eq(p, "<int>")) { |
197 | if (!isInteger(t)) ret false; |
198 | } else if (eq(p, "<identifier>")) { |
199 | if (!isIdentifier(t)) ret false; |
200 | } else if (!(eq(p, "*") || eqic(p, t))) |
201 | ret false; // token mismatch |
202 | ret true; |
203 | } |
204 | |
205 | // assumes that there is a match (pos, class, endPos) |
206 | // and gives explanations of how it was probably matched |
207 | static L explainMatch(Pos pos, int endPos, S forClass) { |
208 | L<L<S>> prods = productionMap.get(forClass); |
209 | new L out; |
210 | for (L<S> prod : prods) |
211 | explainMatch2(pos, new Pos(prod), endPos, forClass, litlist(forClass, join(prod)), out); |
212 | ret out; |
213 | } |
214 | |
215 | // same, but with fixed production |
216 | static void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) { |
217 | if (prod.end()) { |
218 | if (pos.i == endPos) |
219 | out.add(cloneList(match)); |
220 | } else if (pos.end()) |
221 | ret; |
222 | else { |
223 | S p = prod.get(); |
224 | |
225 | if (isBracketedID(p) && neq(p, "<quoted>")) { |
226 | S className = unbracket(p); |
227 | L<Integer> r = recog.get(pos.i).get(className); |
228 | |
229 | // keep parsing for every option |
230 | |
231 | for (int i : cloneList(r)) { |
232 | match.add(litlist(pos.i, i)); |
233 | explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out); |
234 | removeLast(match); |
235 | } |
236 | } else { |
237 | // it's a literal |
238 | S t = pos.get(); |
239 | if (!matchToken(p, t)) |
240 | ret; |
241 | |
242 | //match.add(litlist(pos.i, p, pos.i+2)); |
243 | explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out); |
244 | removeLast(match); |
245 | } |
246 | } |
247 | } |
248 | |
249 | static S loadSnippetThroughBot(S snippetID) { |
250 | S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID); |
251 | new Matches m; |
252 | assertTrue(match("the text of snippet * is *", answer, m)); |
253 | ret m.unq(1); |
254 | } |
255 | |
256 | static S prettierAnalysis() { |
257 | new L<S> l; |
258 | for (int i = 1; i < l(tok); i += 2) { |
259 | MultiMap<S, Integer> rr = recog.get(i); |
260 | new L<S> bla; |
261 | for (S prod : rr.keys()) { |
262 | L<Integer> is = rr.get(prod); |
263 | bla.add(prod + " " + (max(is)-i)/2); |
264 | } |
265 | l.add(tok.get(i) + " : " + structure(bla)); |
266 | } ret fromLines(l); |
267 | } |
268 | |
269 | // Non-recursing productions don't need to be repeatedly executed |
270 | static void removeGroundProductions() { |
271 | int n = productionMap.size(); |
272 | for (S className : new HashSet<S>(productionMap.keySet())) { |
273 | L<L<S>> prods = productionMap.getActual(className); |
274 | for (int i = 0; i < l(prods); ) |
275 | if (isGroundProduction(prods.get(i))) { |
276 | //print("Ground production: " + className + " = " + join(prods.get(i))); |
277 | prods.remove(i); |
278 | } else |
279 | ++i; |
280 | productionMap.clean(className); |
281 | } |
282 | n -= productionMap.size(); |
283 | if (n != 0) |
284 | print(n + " ground productions removed."); |
285 | } |
286 | |
287 | static boolean isGroundProduction(L<S> prod) { |
288 | for (int i = 1; i < l(prod); i += 2) { |
289 | S t = prod.get(i); |
290 | if (isBracketedID(t) && !specials.contains(unbracket(t))) |
291 | ret false; |
292 | } |
293 | ret true; |
294 | } |
295 | |
296 | static void addDict(S className, Collection<S> entries) { |
297 | dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries)); |
298 | } |
Began life as a copy of #1002307
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1002328 |
Snippet name: | NL Parser, implementing "empty" classes & ground class optimisation |
Eternal ID of this version: | #1002328/1 |
Text MD5: | aa5353cac7328b453bcc61018ba8b672 |
Transpilation MD5: | 00ec6e09b91e996a60ca772a06365be6 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-01-07 17:35:18 |
Source code size: | 8380 bytes / 298 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 796 / 1111 |
Referenced in: | [show references] |