Libraryless. Click here for Pure Java version (4249L/28K/89K).
// Idea: For every position, store the productions recognized to start there, then infer up to higher classes // Productions elements: literals (just write them), <classname>, <classname varname> !759 // a recognition is identified by (startPos, className, endPos) // key 1 = start position, key 2 = class name, value = end position static Map<Integer, MultiMap<S, Integer>> recog; static L<S> tok; static MultiMap<S, L<S>> productionMap; static boolean caseSensitive = false; static boolean debug = false; static long timing; static int iterations; static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples static S defaultParserRules = "#1002281"; // NL static S javaRules = "#1002329"; static boolean keepRules = false; static class ParseResult { L<S> tok; Map<Integer, MultiMap<S, Integer>> recog; L<S> fullMatchClasses() { new L<S> l; MultiMap<S, Integer> rr = recog.get(1); for (S className : rr.keys()) if (rr.get(className).contains(l(tok))) l.add(className); ret l; } boolean parsesAs(S className) { MultiMap<S, Integer> rr = recog.get(1); ret rr.get(className).contains(l(tok)); } L explainLongestMatch() { S c = findLongestMatch(1); if (c == null) ret null; int n = max(recog.get(1).get(c)); if (n == 1) ret null; L l = cast explainMatch(new Pos(tok), n, c).get(0); ret shortenExplanation(l); } S findLongestMatch(int i) { MultiMap<S, Integer> rr = recog.get(i); new Map<S, Integer> map; for (S c : rr.keys()) map.put(c, max(rr.get(c))); ret keyWithBiggestValue(map); } L explain(S className) { ret explain(1, l(tok), className); } L explainFull(S className) { ret explainFull(1, l(tok), className); } // full = not shortened (include all subclassings) L explainFull(int fromToken, int toToken, S className) { L<L> matches = explainMatch(new Pos(tok, fromToken), toToken, className); ret get(matches, 0); } L explain(int fromToken, int toToken, S className) { L l = explainFull(fromToken, toToken, className); if (debug) print("Full explanation: " + structure(l)); ret shortenExplanation(l); } // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]] L shortenExplanation(L m) { if (m == null) ret null; if (l(m) == 5) { int ii = (Integer) m.get(0), jj = (Integer) m.get(1); L sub = (L) m.get(4); int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1); // for primitives, inner class seems to be null currently // - in this case, leave outer match. S innerClass = getString(sub, 2); if (ii == II && jj == JJ && innerClass != null) ret shortenExplanation(sub); // it's a subclassing - just drop the outer match } for (int i = 4; i < l(m); i++) m.set(i, shortenExplanation((L) m.get(i))); ret m; } // assumes that there is a match (pos, class, endPos) // and gives explanations of how it was probably matched. // result list format: [className, prodText, [i1, i2], [i3, i4], ...] L explainMatch(Pos pos, int endPos, S forClass) { L<L<S>> prods = productionMap.get(forClass); new L out; for (L<S> prod : prods) { L match = litlist(pos.i, endPos, forClass, join(prod)); if (debug) print("Starting match: " + structure(match)); explainMatch2(pos, new Pos(prod), endPos, forClass, match, out); } ret out; } // same, but with fixed production void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) { if (prod.end()) { if (pos.i == endPos) { // match done! add sub-explanations. match = cloneList(match); if (debug) print("Match done! " + structure(match)); for (int i = 4; i < l(match); i++) { L m = cloneList((L) match.get(i)); //print("m=" + structure(m)); int ii = (Integer) m.get(0), jj = (Integer) m.get(1); S className = cast m.get(2); L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className); //print(format("Recursed: * * * => ", ii, jj, className) + structure(subExplanation)); if (empty(subExplanation)) { // weird } else m.addAll(subList((L) subExplanation.get(0), 3)); match.set(i, m); } if (debug) print("Match completed: " + structure(match)); out.add(match); } } else { S p = prod.get(); if (isBracketedID(p)) { // get class name and possible var name S className = unbracket(p); S varName = null; if (hasSpace(className)) { int idx = className.indexOf(' '); varName = className.substring(idx+1).trim(); className = className.substring(0, idx).trim(); } // look up if class found at this place L<Integer> r = recog.get(pos.i).get(className); // keep parsing for every option for (int i : cloneList(r)) { match.add(litlist(pos.i, i, className)); explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out); removeLast(match); } } else if (!pos.end()) { // it's a literal S t = pos.get(); if (!matchToken(p, t)) ret; //match.add(litlist(pos.i, pos.i+2, p)); explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out); //removeLast(match); } } } S prettierAnalysis() { new L<S> l; new TreeSet<S> emptyClasses; for (int i = 1; i < l(tok); i += 2) { MultiMap<S, Integer> rr = recog.get(i); new L<S> bla; for (S className : rr.keys()) { L<Integer> is = rr.get(className); if (is.contains(i)) emptyClasses.add(className); int n = (max(is)-i)/2; if (n > 0) bla.add(className + " " + n); } l.add(tok.get(i) + " : " + structure(bla)); } if (!empty(emptyClasses)) l.add(" with empty classes " + structure(asList(emptyClasses))); ret fromLines(l); } } p { startBot("Snippet Text Bot", "#1002084"); } static ParseResult parse(S inputText) { ret parse(inputText, defaultParserRules); } static ParseResult jparse(S inputText) { ret parse(inputText, javaRules); } static S keptRulesID; static S keptRulesText; static ParseResult parse(S inputText, S parserRules) { ret parse(javaTok(inputText), parserRules); } static synchronized ParseResult parse(L<S> inputTok, S parserRules) { S rulesText; if (!isSnippetID(parserRules)) rulesText = parserRules; else { if (keepRules && sameSnippetID(parserRules, keptRulesID)) rulesText = keptRulesText; else { rulesText = loadSnippetThroughBot(parserRules); if (keepRules) { keptRulesID = parserRules; keptRulesText = rulesText; } } } productionMap = new MultiMap; caseSensitive = false; for (S rule : toLinesFullTrim(rulesText)) pcall { if (l(javaTok(rule)) == 1) continue; // just a comment if (debug) printF("Processing rule: *", rule); if (rule.contains("=")) { L<S> lr = splitAtJavaToken(rule, "="); S l, r; if (l(lr) == 1) { l = ""; r = lr.get(0); } else if (l(lr) >= 2) { l = join(" = ", lr.subList(0, l(lr)-1)); r = last(lr); } else { print("Weird rule: " + rule); continue; } L<S> tokr = mergeBracketThingies(javaTok(r)); assertEquals(structure(tokr), 3, l(tokr)); S className = assertIdentifier(unbracket(get(tokr, 1))); L<S> tok = javaTok(l); tok = mergeBracketThingies(tok); //printStructure(tok); productionMap.put(className, tok); } else if (match("case-sensitive", rule)) { print("Now case sensitive."); caseSensitive = true; } else if (match("case-insensitive", rule)) { caseSensitive = false; } else print("Weird rule: " + rule); } if (debug) { print(n(productionMap.size(), "production") + "."); print(); } timing = now(); tok = inputTok; /* HACK tok.add("the bonk"); // add impossible token to find empty classes at the end... complicated though because l(tok) changes tok.add(""); */ Pos pos = new Pos(tok); parseTop(pos); timing = now()-timing; int numTok = l(tok)/2; print("Iterations required for " + numTok + " tokens: " + iterations*numTok*productionMap.size() + ", time: " + timing + " ms"); new ParseResult res; res.tok = tok; res.recog = recog; ret res; } static void parseTop(Pos pos) { // init structures recog = new TreeMap; for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end recog.put(i, new MultiMap); //print("parser: recog inited " + pos.i + " to " + l(pos.tok)); // process dicts for (S className : dicts.keySet()) { Set<S> values = dicts.get(className); for (int i = pos.i; i < l(pos.tok); i += 2) { S t = pos.tok.get(i); if (values.contains(t.toLowerCase())) recog.get(i).setPut(className, i+2); } } matchSpecials(); boolean anyChange; iterations = 0; iter(pos); } static void iter(Pos pos) { MultiMap<S, L<S>> higherProductions = withoutGroundProductions(productionMap); for (int i = l(pos.tok); i >= 1; i -= 2) { Pos pos2 = new Pos(pos.tok, i); int it = 0; boolean anyChange = false; do { anyChange = false; MultiMap<S, L<S>> mapToUse = it == 0 ? productionMap : higherProductions; it++; for (S className : mapToUse.keySet()) { MultiMap<S, Integer> rr = recog.get(i); L<Integer> recs = rr.getActual(className); L<L<S>> prods = mapToUse.get(className); for (L<S> prod : prods) { int n = l(recs); matchProd(pos2, new Pos(prod), className, recs); anyChange = anyChange || l(recs) > n; } rr.clean(className); } } while (anyChange); } } static class Pos { L<S> tok; int i = 1; *() {} *(L<S> *tok) {} *(L<S> *tok, int *i) {} boolean end() { ret i >= l(tok)-1; } S get() { ret tok.get(i); } public Pos clone() { ret new Pos(tok, i); } public boolean equals(O o) { if (!(o instanceof Pos)) ret false; Pos pos = cast o; ret tok == pos.tok && i == pos.i; } S rest() { ret join(subList(tok, i)); } Pos plus(int x) { ret new Pos(tok, i + x); } } static void copy(Pos a, Pos b) { b.tok = a.tok; b.i = a.i; } static void debug(S bla, Pos pos) { if (debug) print(bla + " on " + quote(pos.rest())); } static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) { if (prod.end()) setAdd(out, pos.i); else { S p = prod.get(); if (isBracketedID(p)/* && !specials.contains(p)*/) { MultiMap<S, Integer> rr = recog.get(pos.i); if (rr == null) fail("parser: recog null at " + pos.i); // get class name S className = unbracket(p); if (hasSpace(className)) { int idx = className.indexOf(' '); className = className.substring(0, idx).trim(); } L<Integer> r = rr.get(className); // keep parsing for every option for (int i : cloneList(r)) matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); } else if (!pos.end()) { // it's a literal - check if at end of token stream first S t = pos.get(); if (!matchToken(p, t)) ret; matchProd(pos.plus(2), prod.plus(2), forClass, out); } } } static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>"); static boolean matchToken(S p, S t) { if (eq(p, "<quoted>")) { if (!isQuoted(t)) ret false; } else if (eq(p, "<int>")) { if (!isInteger(t)) ret false; } else if (eq(p, "<identifier>")) { if (!isIdentifier(t)) ret false; } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t)))) ret false; // token mismatch ret true; } static S loadSnippetThroughBot(S snippetID) { print("Loading parser rules."); time { S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID); new Matches m; assertTrue(match("the text of snippet * is *", answer, m)); ret m.unq(1); } } // Non-recursing productions don't need to be repeatedly executed static MultiMap<S, L<S>> withoutGroundProductions(MultiMap<S, L<S>> productionMap) { MultiMap<S, L<S>> newMap = new MultiMap; for (S className : new HashSet<S>(productionMap.keySet())) { L<L<S>> prods = productionMap.get(className); for (L<S> prod : prods) if (!isGroundProduction(prod)) newMap.put(className, prod); } ret newMap; } static boolean isGroundProduction(L<S> prod) { for (int i = 1; i < l(prod); i += 2) { S t = prod.get(i); if (isBracketedID(t) && !specials.contains(t)) ret false; } ret true; } static void addDict(S className, Collection<S> entries) { dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries)); } static void matchSpecials() { for (int i = 1; i < l(tok); i += 2) { MultiMap<S, Integer> rr = recog.get(i); for (S s : specials) if (matchToken(s, tok.get(i))) rr.put(unbracket(s), i+2); } }
Began life as a copy of #1002369
download show line numbers debug dex old transpilations
Travelled to 13 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1002472 |
Snippet name: | NL Parser, implementing names & new eval order (superseded by #1002719) |
Eternal ID of this version: | #1002472/1 |
Text MD5: | 38a4871c6689b3a8fa8225be0cf65dee |
Transpilation MD5: | c257994d7d0333ff130c4a2475710bfc |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-02-18 19:12:44 |
Source code size: | 14065 bytes / 484 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 682 / 1642 |
Referenced in: | [show references] |