Libraryless. Click here for Pure Java version (3162L/21K/69K).
1 | // Idea: For every position, store the productions recognized to start there, then infer up to higher classes |
2 | |
3 | !759 |
4 | |
5 | // a recognition is identified by (startPos, className, endPos) |
6 | |
7 | |
8 | // key 1 = start position, key 2 = class name, value = end position |
9 | static Map<Integer, MultiMap<S, Integer>> recog; |
10 | static L<S> tok; |
11 | |
12 | static MultiMap<S, L<S>> productionMap; |
13 | |
14 | static boolean debug = false; |
15 | |
16 | static long timing; |
17 | static int iterations; |
18 | |
19 | static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples |
20 | |
21 | |
22 | static S defaultParserRules = "#1002281"; |
23 | static S defaultMainProd = "line"; |
24 | |
25 | p {
|
26 | startBot("Snippet Text Bot", "#1002084");
|
27 | } |
28 | |
29 | static synchronized S parse(S inputText) {
|
30 | ret parse(inputText, defaultParserRules, defaultMainProd); |
31 | } |
32 | |
33 | static synchronized S parse(S inputText, S parserRules, S mainProd) {
|
34 | S rulesText = loadSnippetThroughBot(parserRules); |
35 | |
36 | productionMap = new MultiMap; |
37 | for (S rule : toLinesFullTrim(rulesText)) pcall {
|
38 | //printF("Processing rule: *", rule);
|
39 | L<S> lr = splitAtJavaToken(rule, "="); |
40 | S l, r; |
41 | if (l(lr) == 1) {
|
42 | l = ""; |
43 | r = lr.get(0); |
44 | } else if (l(lr) == 2) {
|
45 | l = lr.get(0); |
46 | r = lr.get(1); |
47 | } else {
|
48 | print("Weird rule: " + rule);
|
49 | continue; |
50 | } |
51 | L<S> tokr = javaTok(r); |
52 | assertEquals(structure(tokr), 3, l(tokr)); |
53 | S className = assertIdentifier(get(tokr, 1)); |
54 | L<S> tok = javaTok(l); |
55 | tok = mergeBracketThingies(tok); |
56 | //printStructure(tok); |
57 | productionMap.put(className, tok); |
58 | } |
59 | |
60 | print(n(productionMap.size(), "production") + "."); |
61 | print(); |
62 | |
63 | timing = now(); |
64 | tok = javaTok(inputText); |
65 | Pos pos = new Pos(tok); |
66 | L<Integer> x = parseTop(pos, mainProd); |
67 | S result; |
68 | timing = now()-timing; |
69 | |
70 | int numTok = l(tok)/2; |
71 | print("Iterations required for " + numTok + " tokens: " + iterations*numTok + ", time: " + timing + " ms");
|
72 | |
73 | if (x.contains(l(tok))) |
74 | result = "parsed"; |
75 | else if (!empty(x)) |
76 | result = "beginning matches"; |
77 | else |
78 | ret "not parsed"; |
79 | L out = explainMatch(new Pos(tok), x.get(0), mainProd); |
80 | ret result + ", detailed analysis: " + structure(out); |
81 | } |
82 | |
83 | static L<Integer> parseTop(Pos pos, S mainProd) {
|
84 | // init structures |
85 | recog = new TreeMap; |
86 | for (int i = pos.i; i < l(pos.tok); i += 2) |
87 | recog.put(i, new MultiMap); |
88 | print("parser: recog inited " + pos.i + " to " + l(pos.tok));
|
89 | |
90 | // process dicts |
91 | for (S className : dicts.keySet()) {
|
92 | Set<S> values = dicts.get(className); |
93 | for (int i = pos.i; i < l(pos.tok); i += 2) {
|
94 | S t = pos.tok.get(i); |
95 | if (values.contains(t.toLowerCase())) |
96 | recog.get(i).setPut(className, i+2); |
97 | } |
98 | } |
99 | |
100 | boolean anyChange; |
101 | iterations = 0; |
102 | MultiMap<S, L<S>> fullProductionMap = new MultiMap(productionMap); |
103 | do {
|
104 | anyChange = false; |
105 | ++iterations; |
106 | if (iterations == 2) removeGroundProductions(); |
107 | for (int i = pos.i; i < l(pos.tok); i += 2) {
|
108 | Pos pos2 = new Pos(pos.tok, i); |
109 | for (S className : productionMap.keySet()) {
|
110 | MultiMap<S, Integer> rr = recog.get(i); |
111 | L<Integer> recs = rr.getActual(className); |
112 | L<L<S>> prods = productionMap.get(className); |
113 | for (L<S> prod : prods) {
|
114 | int n = l(recs); |
115 | matchProd(pos2, new Pos(prod), className, recs); |
116 | anyChange = anyChange || l(recs) > n; |
117 | } |
118 | rr.clean(className); |
119 | } |
120 | } |
121 | } while (anyChange); |
122 | productionMap = fullProductionMap; // restore ground productions |
123 | |
124 | ret recog.get(pos.i).get(mainProd); |
125 | } |
126 | |
127 | static class Pos {
|
128 | L<S> tok; |
129 | int i = 1; |
130 | |
131 | *() {}
|
132 | *(L<S> *tok) {}
|
133 | *(L<S> *tok, int *i) {}
|
134 | |
135 | boolean end() { ret i >= l(tok)-1; }
|
136 | S get() { ret tok.get(i); }
|
137 | public Pos clone() { ret new Pos(tok, i); }
|
138 | public boolean equals(O o) {
|
139 | if (!(o instanceof Pos)) ret false; |
140 | Pos pos = cast o; |
141 | ret tok == pos.tok && i == pos.i; |
142 | } |
143 | |
144 | S rest() {
|
145 | ret join(subList(tok, i)); |
146 | } |
147 | |
148 | Pos plus(int x) { ret new Pos(tok, i + x); }
|
149 | } |
150 | |
151 | static void copy(Pos a, Pos b) {
|
152 | b.tok = a.tok; |
153 | b.i = a.i; |
154 | } |
155 | |
156 | static void debug(S bla, Pos pos) {
|
157 | if (debug) |
158 | print(bla + " on " + quote(pos.rest())); |
159 | } |
160 | |
161 | static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
|
162 | if (prod.end()) |
163 | setAdd(out, pos.i); |
164 | else if (pos.end()) |
165 | ret; |
166 | else {
|
167 | S p = prod.get(); |
168 | |
169 | if (isBracketedID(p) && !specials.contains(p)) {
|
170 | MultiMap<S, Integer> rr = recog.get(pos.i); |
171 | if (rr == null) |
172 | fail("parser: recog null at " + pos.i);
|
173 | L<Integer> r = rr.get(unbracket(p)); |
174 | |
175 | // keep parsing for every option |
176 | |
177 | for (int i : cloneList(r)) |
178 | matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); |
179 | |
180 | } else {
|
181 | // it's a literal |
182 | S t = pos.get(); |
183 | if (!matchToken(p, t)) |
184 | ret; |
185 | |
186 | matchProd(pos.plus(2), prod.plus(2), forClass, out); |
187 | } |
188 | } |
189 | } |
190 | |
191 | static L<S> specials = litlist("<quoted>", "<int>", "<identifier>");
|
192 | |
193 | static boolean matchToken(S p, S t) {
|
194 | if (eq(p, "<quoted>")) {
|
195 | if (!isQuoted(t)) ret false; |
196 | } else if (eq(p, "<int>")) {
|
197 | if (!isInteger(t)) ret false; |
198 | } else if (eq(p, "<identifier>")) {
|
199 | if (!isIdentifier(t)) ret false; |
200 | } else if (!(eq(p, "*") || eqic(p, t))) |
201 | ret false; // token mismatch |
202 | ret true; |
203 | } |
204 | |
205 | // assumes that there is a match (pos, class, endPos) |
206 | // and gives explanations of how it was probably matched |
207 | static L explainMatch(Pos pos, int endPos, S forClass) {
|
208 | L<L<S>> prods = productionMap.get(forClass); |
209 | new L out; |
210 | for (L<S> prod : prods) |
211 | explainMatch2(pos, new Pos(prod), endPos, forClass, litlist(forClass, join(prod)), out); |
212 | ret out; |
213 | } |
214 | |
215 | // same, but with fixed production |
216 | static void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) {
|
217 | if (prod.end()) {
|
218 | if (pos.i == endPos) |
219 | out.add(cloneList(match)); |
220 | } else if (pos.end()) |
221 | ret; |
222 | else {
|
223 | S p = prod.get(); |
224 | |
225 | if (isBracketedID(p) && neq(p, "<quoted>")) {
|
226 | S className = unbracket(p); |
227 | L<Integer> r = recog.get(pos.i).get(className); |
228 | |
229 | // keep parsing for every option |
230 | |
231 | for (int i : cloneList(r)) {
|
232 | match.add(litlist(pos.i, i)); |
233 | explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out); |
234 | removeLast(match); |
235 | } |
236 | } else {
|
237 | // it's a literal |
238 | S t = pos.get(); |
239 | if (!matchToken(p, t)) |
240 | ret; |
241 | |
242 | //match.add(litlist(pos.i, p, pos.i+2)); |
243 | explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out); |
244 | removeLast(match); |
245 | } |
246 | } |
247 | } |
248 | |
249 | static S loadSnippetThroughBot(S snippetID) {
|
250 | S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
|
251 | new Matches m; |
252 | assertTrue(match("the text of snippet * is *", answer, m));
|
253 | ret m.unq(1); |
254 | } |
255 | |
256 | static S prettierAnalysis() {
|
257 | new L<S> l; |
258 | for (int i = 1; i < l(tok); i += 2) {
|
259 | MultiMap<S, Integer> rr = recog.get(i); |
260 | new L<S> bla; |
261 | for (S prod : rr.keys()) {
|
262 | L<Integer> is = rr.get(prod); |
263 | bla.add(prod + " " + (max(is)-i)/2); |
264 | } |
265 | l.add(tok.get(i) + " : " + structure(bla)); |
266 | } ret fromLines(l); |
267 | } |
268 | |
269 | // Non-recursing productions don't need to be repeatedly executed |
270 | static void removeGroundProductions() {
|
271 | int n = productionMap.size(); |
272 | for (S className : new HashSet<S>(productionMap.keySet())) {
|
273 | L<L<S>> prods = productionMap.getActual(className); |
274 | for (int i = 0; i < l(prods); ) |
275 | if (isGroundProduction(prods.get(i))) {
|
276 | //print("Ground production: " + className + " = " + join(prods.get(i)));
|
277 | prods.remove(i); |
278 | } else |
279 | ++i; |
280 | productionMap.clean(className); |
281 | } |
282 | n -= productionMap.size(); |
283 | if (n != 0) |
284 | print(n + " ground productions removed."); |
285 | } |
286 | |
287 | static boolean isGroundProduction(L<S> prod) {
|
288 | for (int i = 1; i < l(prod); i += 2) {
|
289 | S t = prod.get(i); |
290 | if (isBracketedID(t) && !specials.contains(unbracket(t))) |
291 | ret false; |
292 | } |
293 | ret true; |
294 | } |
295 | |
296 | static void addDict(S className, Collection<S> entries) {
|
297 | dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries)); |
298 | } |
Began life as a copy of #1002307
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1002328 |
| Snippet name: | NL Parser, implementing "empty" classes & ground class optimisation |
| Eternal ID of this version: | #1002328/1 |
| Text MD5: | aa5353cac7328b453bcc61018ba8b672 |
| Transpilation MD5: | 00ec6e09b91e996a60ca772a06365be6 |
| Author: | stefan |
| Category: | javax |
| Type: | JavaX source code |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2016-01-07 17:35:18 |
| Source code size: | 8380 bytes / 298 lines |
| Pitched / IR pitched: | No / Yes |
| Views / Downloads: | 1105 / 1454 |
| Referenced in: | [show references] |