Libraryless. Click here for Pure Java version (3162L/21K/69K).
// Idea: For every position, store the productions recognized to start there, then infer up to higher classes !759 // a recognition is identified by (startPos, className, endPos) // key 1 = start position, key 2 = class name, value = end position static Map<Integer, MultiMap<S, Integer>> recog; static L<S> tok; static MultiMap<S, L<S>> productionMap; static boolean debug = false; static long timing; static int iterations; static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples static S defaultParserRules = "#1002281"; static S defaultMainProd = "line"; p { startBot("Snippet Text Bot", "#1002084"); } static synchronized S parse(S inputText) { ret parse(inputText, defaultParserRules, defaultMainProd); } static synchronized S parse(S inputText, S parserRules, S mainProd) { S rulesText = loadSnippetThroughBot(parserRules); productionMap = new MultiMap; for (S rule : toLinesFullTrim(rulesText)) pcall { //printF("Processing rule: *", rule); L<S> lr = splitAtJavaToken(rule, "="); S l, r; if (l(lr) == 1) { l = ""; r = lr.get(0); } else if (l(lr) == 2) { l = lr.get(0); r = lr.get(1); } else { print("Weird rule: " + rule); continue; } L<S> tokr = javaTok(r); assertEquals(structure(tokr), 3, l(tokr)); S className = assertIdentifier(get(tokr, 1)); L<S> tok = javaTok(l); tok = mergeBracketThingies(tok); //printStructure(tok); productionMap.put(className, tok); } print(n(productionMap.size(), "production") + "."); print(); timing = now(); tok = javaTok(inputText); Pos pos = new Pos(tok); L<Integer> x = parseTop(pos, mainProd); S result; timing = now()-timing; int numTok = l(tok)/2; print("Iterations required for " + numTok + " tokens: " + iterations*numTok + ", time: " + timing + " ms"); if (x.contains(l(tok))) result = "parsed"; else if (!empty(x)) result = "beginning matches"; else ret "not parsed"; L out = explainMatch(new Pos(tok), x.get(0), mainProd); ret result + ", detailed analysis: " + structure(out); } static L<Integer> parseTop(Pos pos, S mainProd) { // init structures recog = new TreeMap; for (int i = pos.i; i < l(pos.tok); i += 2) recog.put(i, new MultiMap); print("parser: recog inited " + pos.i + " to " + l(pos.tok)); // process dicts for (S className : dicts.keySet()) { Set<S> values = dicts.get(className); for (int i = pos.i; i < l(pos.tok); i += 2) { S t = pos.tok.get(i); if (values.contains(t.toLowerCase())) recog.get(i).setPut(className, i+2); } } boolean anyChange; iterations = 0; MultiMap<S, L<S>> fullProductionMap = new MultiMap(productionMap); do { anyChange = false; ++iterations; if (iterations == 2) removeGroundProductions(); for (int i = pos.i; i < l(pos.tok); i += 2) { Pos pos2 = new Pos(pos.tok, i); for (S className : productionMap.keySet()) { MultiMap<S, Integer> rr = recog.get(i); L<Integer> recs = rr.getActual(className); L<L<S>> prods = productionMap.get(className); for (L<S> prod : prods) { int n = l(recs); matchProd(pos2, new Pos(prod), className, recs); anyChange = anyChange || l(recs) > n; } rr.clean(className); } } } while (anyChange); productionMap = fullProductionMap; // restore ground productions ret recog.get(pos.i).get(mainProd); } static class Pos { L<S> tok; int i = 1; *() {} *(L<S> *tok) {} *(L<S> *tok, int *i) {} boolean end() { ret i >= l(tok)-1; } S get() { ret tok.get(i); } public Pos clone() { ret new Pos(tok, i); } public boolean equals(O o) { if (!(o instanceof Pos)) ret false; Pos pos = cast o; ret tok == pos.tok && i == pos.i; } S rest() { ret join(subList(tok, i)); } Pos plus(int x) { ret new Pos(tok, i + x); } } static void copy(Pos a, Pos b) { b.tok = a.tok; b.i = a.i; } static void debug(S bla, Pos pos) { if (debug) print(bla + " on " + quote(pos.rest())); } static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) { if (prod.end()) setAdd(out, pos.i); else if (pos.end()) ret; else { S p = prod.get(); if (isBracketedID(p) && !specials.contains(p)) { MultiMap<S, Integer> rr = recog.get(pos.i); if (rr == null) fail("parser: recog null at " + pos.i); L<Integer> r = rr.get(unbracket(p)); // keep parsing for every option for (int i : cloneList(r)) matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); } else { // it's a literal S t = pos.get(); if (!matchToken(p, t)) ret; matchProd(pos.plus(2), prod.plus(2), forClass, out); } } } static L<S> specials = litlist("<quoted>", "<int>", "<identifier>"); static boolean matchToken(S p, S t) { if (eq(p, "<quoted>")) { if (!isQuoted(t)) ret false; } else if (eq(p, "<int>")) { if (!isInteger(t)) ret false; } else if (eq(p, "<identifier>")) { if (!isIdentifier(t)) ret false; } else if (!(eq(p, "*") || eqic(p, t))) ret false; // token mismatch ret true; } // assumes that there is a match (pos, class, endPos) // and gives explanations of how it was probably matched static L explainMatch(Pos pos, int endPos, S forClass) { L<L<S>> prods = productionMap.get(forClass); new L out; for (L<S> prod : prods) explainMatch2(pos, new Pos(prod), endPos, forClass, litlist(forClass, join(prod)), out); ret out; } // same, but with fixed production static void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) { if (prod.end()) { if (pos.i == endPos) out.add(cloneList(match)); } else if (pos.end()) ret; else { S p = prod.get(); if (isBracketedID(p) && neq(p, "<quoted>")) { S className = unbracket(p); L<Integer> r = recog.get(pos.i).get(className); // keep parsing for every option for (int i : cloneList(r)) { match.add(litlist(pos.i, i)); explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out); removeLast(match); } } else { // it's a literal S t = pos.get(); if (!matchToken(p, t)) ret; //match.add(litlist(pos.i, p, pos.i+2)); explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out); removeLast(match); } } } static S loadSnippetThroughBot(S snippetID) { S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID); new Matches m; assertTrue(match("the text of snippet * is *", answer, m)); ret m.unq(1); } static S prettierAnalysis() { new L<S> l; for (int i = 1; i < l(tok); i += 2) { MultiMap<S, Integer> rr = recog.get(i); new L<S> bla; for (S prod : rr.keys()) { L<Integer> is = rr.get(prod); bla.add(prod + " " + (max(is)-i)/2); } l.add(tok.get(i) + " : " + structure(bla)); } ret fromLines(l); } // Non-recursing productions don't need to be repeatedly executed static void removeGroundProductions() { int n = productionMap.size(); for (S className : new HashSet<S>(productionMap.keySet())) { L<L<S>> prods = productionMap.getActual(className); for (int i = 0; i < l(prods); ) if (isGroundProduction(prods.get(i))) { //print("Ground production: " + className + " = " + join(prods.get(i))); prods.remove(i); } else ++i; productionMap.clean(className); } n -= productionMap.size(); if (n != 0) print(n + " ground productions removed."); } static boolean isGroundProduction(L<S> prod) { for (int i = 1; i < l(prod); i += 2) { S t = prod.get(i); if (isBracketedID(t) && !specials.contains(unbracket(t))) ret false; } ret true; } static void addDict(S className, Collection<S> entries) { dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries)); }
Began life as a copy of #1002307
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1002328 |
Snippet name: | NL Parser, implementing "empty" classes & ground class optimisation |
Eternal ID of this version: | #1002328/1 |
Text MD5: | aa5353cac7328b453bcc61018ba8b672 |
Transpilation MD5: | 00ec6e09b91e996a60ca772a06365be6 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-01-07 17:35:18 |
Source code size: | 8380 bytes / 298 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 795 / 1110 |
Referenced in: | [show references] |