Libraryless. Click here for Pure Java version (3112L/21K/68K).
1 | // Idea: For every position, store the productions recognized to start there, then infer up to higher classes |
2 | |
3 | !752 |
4 | |
5 | // a recognition is identified by (startPos, className, endPos) |
6 | |
7 | |
8 | // key 1 = start position, key 2 = class name, value = end position |
9 | static Map<Integer, MultiMap<S, Integer>> recog; |
10 | static L<S> tok; |
11 | |
12 | static MultiMap<S, L<S>> productionMap; |
13 | |
14 | static boolean debug = false; |
15 | |
16 | static long timing; |
17 | |
18 | static Set<S> adjectives; // list of all adjectives - filled from outside |
19 | |
20 | |
21 | static S defaultParserRules = "#1002281"; |
22 | static S defaultMainProd = "line"; |
23 | |
24 | p { |
25 | startBot("Snippet Text Bot", "#1002084"); |
26 | } |
27 | |
28 | static synchronized S parse(S inputText) { |
29 | ret parse(inputText, defaultParserRules, defaultMainProd); |
30 | } |
31 | |
32 | static synchronized S parse(S inputText, S parserRules, S mainProd) { |
33 | S rulesText = loadSnippetThroughBot(parserRules); |
34 | |
35 | productionMap = new MultiMap; |
36 | for (S rule : toLinesFullTrim(rulesText)) pcall { |
37 | //printF("Processing rule: *", rule); |
38 | L<S> lr = splitAtJavaToken(rule, "="); |
39 | if (l(lr) != 2) { |
40 | print("Weird rule: " + rule); |
41 | continue; |
42 | } |
43 | S l = lr.get(0), r = lr.get(1); |
44 | L<S> tokr = javaTok(r); |
45 | assertEquals(structure(tokr), 3, l(tokr)); |
46 | S className = assertIdentifier(get(tokr, 1)); |
47 | L<S> tok = javaTok(l); |
48 | tok = mergeBracketThingies(tok); |
49 | //printStructure(tok); |
50 | productionMap.put(className, tok); |
51 | } |
52 | |
53 | print(n(productionMap.size(), "production") + "."); |
54 | print(); |
55 | |
56 | timing = now(); |
57 | tok = javaTok(inputText); |
58 | Pos pos = new Pos(tok); |
59 | L<Integer> x = parseTop(pos, mainProd); |
60 | S result; |
61 | timing = now()-timing; |
62 | if (x.contains(l(tok))) |
63 | result = "parsed"; |
64 | else if (!empty(x)) |
65 | result = "beginning matches"; |
66 | else |
67 | ret "not parsed"; |
68 | L out = explainMatch(new Pos(tok), x.get(0), mainProd); |
69 | ret result + ", detailed analysis: " + structure(out); |
70 | } |
71 | |
72 | static L<Integer> parseTop(Pos pos, S mainProd) { |
73 | // init structures |
74 | recog = new TreeMap; |
75 | for (int i = pos.i; i < l(pos.tok); i += 2) |
76 | recog.put(i, new MultiMap); |
77 | print("parser: recog inited " + pos.i + " to " + l(pos.tok)); |
78 | |
79 | // adjectives |
80 | if (adjectives != null) |
81 | for (int i = pos.i; i < l(pos.tok); i += 2) { |
82 | S t = pos.tok.get(i); |
83 | if (adjectives.contains(t.toLowerCase())) |
84 | recog.get(i).setPut("adjective", i+2); |
85 | } |
86 | |
87 | |
88 | boolean anyChange; |
89 | do { |
90 | anyChange = false; |
91 | for (int i = pos.i; i < l(pos.tok); i += 2) { |
92 | Pos pos2 = new Pos(pos.tok, i); |
93 | for (S className : productionMap.keySet()) { |
94 | MultiMap<S, Integer> rr = recog.get(i); |
95 | L<Integer> recs = rr.getActual(className); |
96 | L<L<S>> prods = productionMap.get(className); |
97 | for (L<S> prod : prods) { |
98 | int n = l(recs); |
99 | matchProd(pos2, new Pos(prod), className, recs); |
100 | anyChange = anyChange || l(recs) > n; |
101 | } |
102 | rr.clean(className); |
103 | } |
104 | } |
105 | } while (anyChange); |
106 | |
107 | ret recog.get(pos.i).get(mainProd); |
108 | } |
109 | |
// A cursor into a token list: the list itself plus the index of the current
// token. Code tokens are stepped over in increments of 2 by the callers.
static class Pos {
  L<S> tok;
  int i = 1; // default start index

  // JavaX constructor shorthand (starred parameters are stored in the
  // same-named fields)
  *() {}
  *(L<S> *tok) {}
  *(L<S> *tok, int *i) {}

  // true when there is no token left at or after index i
  boolean end() { ret i >= l(tok)-1; }

  // the token at the current index
  S get() { ret tok.get(i); }

  public Pos clone() { ret new Pos(tok, i); }

  // Two positions are equal when they point at the same index of the very
  // same list object (identity comparison on tok).
  public boolean equals(O o) {
    if (!(o instanceof Pos)) ret false;
    Pos pos = cast o;
    ret tok == pos.tok && i == pos.i;
  }

  // Kept consistent with equals: identity hash of the list combined with i.
  public int hashCode() {
    ret 31 * System.identityHashCode(tok) + i;
  }

  // all tokens from the current index on, joined into one string
  S rest() {
    ret join(subList(tok, i));
  }

  // a new position x slots further on in the same list
  Pos plus(int x) { ret new Pos(tok, i + x); }
}
133 | |
134 | static void copy(Pos a, Pos b) { |
135 | b.tok = a.tok; |
136 | b.i = a.i; |
137 | } |
138 | |
139 | static void debug(S bla, Pos pos) { |
140 | if (debug) |
141 | print(bla + " on " + quote(pos.rest())); |
142 | } |
143 | |
144 | static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) { |
145 | if (prod.end()) |
146 | setAdd(out, pos.i); |
147 | else if (pos.end()) |
148 | ret; |
149 | else { |
150 | S p = prod.get(); |
151 | |
152 | if (isBracketedID(p) && !specials.contains(p)) { |
153 | MultiMap<S, Integer> rr = recog.get(pos.i); |
154 | if (rr == null) |
155 | fail("parser: recog null at " + pos.i); |
156 | L<Integer> r = rr.get(unbracket(p)); |
157 | |
158 | // keep parsing for every option |
159 | |
160 | for (int i : cloneList(r)) |
161 | matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); |
162 | |
163 | } else { |
164 | // it's a literal |
165 | S t = pos.get(); |
166 | if (!matchToken(p, t)) |
167 | ret; |
168 | |
169 | matchProd(pos.plus(2), prod.plus(2), forClass, out); |
170 | } |
171 | } |
172 | } |
173 | |
174 | static L<S> specials = litlist("<quoted>", "<int>", "<identifier>"); |
175 | |
176 | static boolean matchToken(S p, S t) { |
177 | if (eq(p, "<quoted>")) { |
178 | if (!isQuoted(t)) ret false; |
179 | } else if (eq(p, "<int>")) { |
180 | if (!isInteger(t)) ret false; |
181 | } else if (eq(p, "<identifier>")) { |
182 | if (!isIdentifier(t)) ret false; |
183 | } else if (!(eq(p, "*") || eqic(p, t))) |
184 | ret false; // token mismatch |
185 | ret true; |
186 | } |
187 | |
188 | // assumes that there is a match (pos, class, endPos) |
189 | // and gives explanations of how it was probably matched |
190 | static L explainMatch(Pos pos, int endPos, S forClass) { |
191 | L<L<S>> prods = productionMap.get(forClass); |
192 | new L out; |
193 | for (L<S> prod : prods) |
194 | explainMatch2(pos, new Pos(prod), endPos, forClass, litlist(forClass, join(prod)), out); |
195 | ret out; |
196 | } |
197 | |
198 | // same, but with fixed production |
199 | static void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) { |
200 | if (prod.end()) { |
201 | if (pos.i == endPos) |
202 | out.add(cloneList(match)); |
203 | } else if (pos.end()) |
204 | ret; |
205 | else { |
206 | S p = prod.get(); |
207 | |
208 | if (isBracketedID(p) && neq(p, "<quoted>")) { |
209 | S className = unbracket(p); |
210 | L<Integer> r = recog.get(pos.i).get(className); |
211 | |
212 | // keep parsing for every option |
213 | |
214 | for (int i : cloneList(r)) { |
215 | match.add(litlist(pos.i, i)); |
216 | explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out); |
217 | removeLast(match); |
218 | } |
219 | } else { |
220 | // it's a literal |
221 | S t = pos.get(); |
222 | if (!matchToken(p, t)) |
223 | ret; |
224 | |
225 | //match.add(litlist(pos.i, p, pos.i+2)); |
226 | explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out); |
227 | removeLast(match); |
228 | } |
229 | } |
230 | } |
231 | |
232 | static S loadSnippetThroughBot(S snippetID) { |
233 | S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID); |
234 | new Matches m; |
235 | assertTrue(match("the text of snippet * is *", answer, m)); |
236 | ret m.unq(1); |
237 | } |
238 | |
239 | static S prettierAnalysis() { |
240 | new L<S> l; |
241 | for (int i = 1; i < l(tok); i += 2) { |
242 | MultiMap<S, Integer> rr = recog.get(i); |
243 | new L<S> bla; |
244 | for (S prod : rr.keys()) { |
245 | L<Integer> is = rr.get(prod); |
246 | bla.add(prod + " " + (max(is)-i)/2); |
247 | } |
248 | l.add(tok.get(i) + " : " + structure(bla)); |
249 | } ret fromLines(l); |
250 | } |
Began life as a copy of #1002306
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1002307 |
Snippet name: | NL Parser, hotwirable |
Eternal ID of this version: | #1002307/1 |
Text MD5: | fb022bd644deed5964a7f11d615ae4e2 |
Transpilation MD5: | c952e47f2f1ad33239c1a85663d4d687 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-01-05 17:52:41 |
Source code size: | 6813 bytes / 250 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 715 / 815 |
Referenced in: | [show references] |