// Idea: For every position, store the productions recognized to start there, then infer up to higher classes // Productions elements: literals (just write them), , !7 set flag NoAI. // a recognition is identified by (startPos, className, endPos) // key 1 = start position, key 2 = class name, value = end position static Map> recog; static L tok; static MultiMap> productionMap; static boolean caseSensitive = false; static boolean debug = false; static long timing; static int iterations; static Map> dicts = new TreeMap; // dictionaries - class name to lowercase examples static S defaultParserRules = "#1002281"; // NL static S javaRules = "#1002329"; static boolean keepRules = false; static class ParseResult { L tok; Map> recog; L fullMatchClasses() { new L l; MultiMap rr = recog.get(1); for (S className : rr.keys()) if (rr.get(className).contains(l(tok))) l.add(className); ret l; } boolean parsesAs(S className) { MultiMap rr = recog.get(1); ret rr.get(className).contains(l(tok)); } L explainLongestMatch() { S c = findLongestMatch(1); if (c == null) ret null; int n = max(recog.get(1).get(c)); if (n == 1) ret null; L l = explainMatch(new Pos(tok), n, c); ret shortenExplanation(l); } S findLongestMatch(int i) { MultiMap rr = recog.get(i); new Map map; for (S c : rr.keys()) map.put(c, max(rr.get(c))); ret keyWithBiggestValue(map); } L explain(S className) { ret explain(1, l(tok), className); } L explainFull(S className) { ret explainFull(1, l(tok), className); } // full = not shortened (include all subclassings) L explainFull(int fromToken, int toToken, S className) { ret explainMatch(new Pos(tok, fromToken), toToken, className); } L explain(int fromToken, int toToken, S className) { L l = explainFull(fromToken, toToken, className); if (debug) print("Full explanation: " + sfu(l)); l = shortenExplanation(l); if (debug) print("Shortened explanation: " + sfu(l)); ret l; } // Example: [[1, 11, "exp", "", [1, 11, "exp1", 11, "exp1", " + ", ...]]] L shortenExplanation(L m) { if (m == null) ret null; if (l(m) == 5) { int ii = (Integer) m.get(0), jj = (Integer) m.get(1); L sub = (L) m.get(4); int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1); // for primitives, inner class seems to be null currently // - in this case, leave outer match. S innerClass = getString(sub, 2); if (ii == II && jj == JJ && innerClass != null) ret shortenExplanation(sub); // it's a subclassing - just drop the outer match } for (int i = 4; i < l(m); i++) m.set(i, shortenExplanation((L) m.get(i))); ret m; } // assumes that there is a match (pos, class, endPos) // and gives explanations of how it was probably matched. // result list format: [className, prodText, [i1, i2], [i3, i4], ...] L explainMatch(Pos pos, int endPos, S forClass) { L> prods = productionMap.get(forClass); for (L prod : prods) { L match = litlist(pos.i, endPos, forClass, join(prod)); if (debug) print("Starting match: " + structure(match)); L out = explainMatch2(pos, new Pos(prod), endPos, forClass, match); if (out != null) ret out; } ret null; } // same, but with fixed production L explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match) { if (prod.end()) { if (pos.i == endPos) { // match done! add sub-explanations. match = cloneList(match); if (debug) print("Match done! " + structure(match)); for (int i = 4; i < l(match); i++) { L m = cloneList((L) match.get(i)); //print("m=" + structure(m)); int ii = (Integer) m.get(0), jj = (Integer) m.get(1); S className = cast m.get(2); L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className); //print(format("Recursed: * * * => ", ii, jj, className) + structure(subExplanation)); if (subExplanation == null) { if (debug) print("weird in " + forClass); } else m.addAll(subList(subExplanation, 3)); match.set(i, m); } if (debug) print("Match completed: " + structure(match)); ret match; } } else { S p = prod.get(); if (isBracketedID(p)) { // get class name and possible var name S className = unbracket(p); S varName = null; if (hasSpace(className)) { int idx = className.indexOf(' '); varName = className.substring(idx+1).trim(); className = className.substring(0, idx).trim(); } // look up if class found at this place L r = recog.get(pos.i).get(className); // keep parsing for every option for (int i : cloneList(r)) { match.add(litlist(pos.i, i, className)); L out = explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match); if (out != null) ret out; removeLast(match); } } else if (!pos.end()) { // it's a literal S t = pos.get(); if (!matchToken(p, t)) ret null; //match.add(litlist(pos.i, pos.i+2, p)); L out = explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match); if (out != null) ret out; //removeLast(match); } } ret null; } S prettierAnalysis() { new L l; new TreeSet emptyClasses; for (int i = 1; i < l(tok); i += 2) { MultiMap rr = recog.get(i); new L bla; for (S className : rr.keys()) { L is = rr.get(className); if (is.contains(i)) emptyClasses.add(className); int n = (max(is)-i)/2; if (n > 0) bla.add(className + " " + n); } l.add(tok.get(i) + " : " + structure(bla)); } if (!empty(emptyClasses)) l.add(" with empty classes " + structure(asList(emptyClasses))); ret fromLines(l); } } p { //startBot("Snippet Text Bot", "#1002084"); } static ParseResult parse(S inputText) { ret parse(inputText, defaultParserRules); } static ParseResult jparse(S inputText) { ret parse(inputText, javaRules); } static S keptRulesID; static S keptRulesText; static ParseResult parse(S inputText, S parserRules) { ret parse(javaTok(inputText), parserRules); } static synchronized ParseResult parse(L inputTok, S parserRules) { S rulesText; if (!isSnippetID(parserRules)) rulesText = parserRules; else { if (keepRules && sameSnippetID(parserRules, keptRulesID)) rulesText = keptRulesText; else { rulesText = loadSnippet/*ThroughBot*/(parserRules); if (keepRules) { keptRulesID = parserRules; keptRulesText = rulesText; } } } productionMap = new MultiMap; caseSensitive = false; for (S rule : toLinesFullTrim(rulesText)) pcall { if (l(javaTok(rule)) == 1) continue; // just a comment if (debug) printF("Processing rule: *", rule); if (rule.contains("=")) { L lr = splitAtJavaToken(rule, "="); S l, r; if (l(lr) == 1) { l = ""; r = lr.get(0); } else if (l(lr) >= 2) { l = join(" = ", lr.subList(0, l(lr)-1)); r = last(lr); if (debug) print("rule reformatted to: " + l + " .=. " + r); } else { print("Weird rule: " + rule); continue; } L tokr = mergeBracketThingies(javaTok(r)); assertEquals(structure(tokr), 3, l(tokr)); S className = assertIdentifier(unbracket(get(tokr, 1))); L tok = unquoteAll(javaTok(l)); tok = mergeBracketThingies(tok); //printStructure(tok); productionMap.put(className, tok); } else if (match("case-sensitive", rule)) { print("Now case sensitive."); caseSensitive = true; } else if (match("case-insensitive", rule)) { caseSensitive = false; } else print("Weird rule: " + rule); } if (debug) { print(n(productionMap.size(), "production") + "."); print(); } timing = now(); tok = inputTok; /* HACK tok.add("the bonk"); // add impossible token to find empty classes at the end... complicated though because l(tok) changes tok.add(""); */ Pos pos = new Pos(tok); parseTop(pos); timing = now()-timing; int numTok = l(tok)/2; /*if (printStats) print("Iterations required for " + numTok + " tokens: ?, time: " + timing + " ms");*/ new ParseResult res; res.tok = tok; res.recog = recog; ret res; } static void parseTop(Pos pos) { // init structures recog = new TreeMap; for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end recog.put(i, new MultiMap); //print("parser: recog inited " + pos.i + " to " + l(pos.tok)); // process dicts for (S className : dicts.keySet()) { Set values = dicts.get(className); for (int i = pos.i; i < l(pos.tok); i += 2) { S t = pos.tok.get(i); if (values.contains(t.toLowerCase())) recog.get(i).setPut(className, i+2); } } matchSpecials(); boolean anyChange; iterations = 0; iter(pos); } static void iter(Pos pos) { MultiMap> higherProductions = withoutGroundProductions(productionMap); for (int i = l(pos.tok); i >= 1; i -= 2) { Pos pos2 = new Pos(pos.tok, i); int it = 0; boolean anyChange = false; do { anyChange = false; MultiMap> mapToUse = it == 0 ? productionMap : higherProductions; it++; for (S className : mapToUse.keySet()) { MultiMap rr = recog.get(i); L recs = rr.getActual(className); L> prods = mapToUse.get(className); for (L prod : prods) { int n = l(recs); matchProd(pos2, new Pos(prod), className, recs); anyChange = anyChange || l(recs) > n; } rr.clean(className); } } while (anyChange); } } static class Pos { L tok; int i = 1; *() {} *(L *tok) {} *(L *tok, int *i) {} boolean end() { ret i >= l(tok)-1; } S get() { ret tok.get(i); } public Pos clone() { ret new Pos(tok, i); } public boolean equals(O o) { if (!(o instanceof Pos)) ret false; Pos pos = cast o; ret tok == pos.tok && i == pos.i; } S rest() { ret join(subList(tok, i)); } Pos plus(int x) { ret new Pos(tok, i + x); } } static void copy(Pos a, Pos b) { b.tok = a.tok; b.i = a.i; } static void debug(S bla, Pos pos) { if (debug) print(bla + " on " + quote(pos.rest())); } static void matchProd(Pos pos, Pos prod, S forClass, L out) { if (prod.end()) setAdd(out, pos.i); else { S p = prod.get(); if (isBracketedID(p)/* && !specials.contains(p)*/) { MultiMap rr = recog.get(pos.i); if (rr == null) fail("parser: recog null at " + pos.i); // get class name S className = unbracket(p); if (hasSpace(className)) { int idx = className.indexOf(' '); className = className.substring(0, idx).trim(); } L r = rr.get(className); // keep parsing for every option for (int i : cloneList(r)) matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out); } else if (!pos.end()) { // it's a literal - check if at end of token stream first S t = pos.get(); if (!matchToken(p, t)) ret; matchProd(pos.plus(2), prod.plus(2), forClass, out); } } } static L specials = litlist("", "", "", "", "", "", "ucid"); static boolean matchToken(S p, S t) { if (eq(p, "")) { if (!isQuoted(t)) ret false; } else if (eq(p, "")) { if (!isInteger(t)) ret false; } else if (eqOneOf(p, "", "")) { if (!isIdentifier(t)) ret false; } else if (eq(p, "")) { if (!(isIdentifier(t) && startsWithUpperCase(t))) ret false; } else if (eq(p, "")) { if (!isExtendedIdentifier(t)) ret false; } else if (!(eq(p, "") || (caseSensitive ? eq(p, t) : eqic(p, t)))) ret false; // token mismatch ret true; } static S loadSnippetThroughBot(S snippetID) { print("Loading parser rules."); time { S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID); new Matches m; assertTrue(match("the text of snippet * is *", answer, m)); ret m.unq(1); } } // Non-recursing productions don't need to be repeatedly executed static MultiMap> withoutGroundProductions(MultiMap> productionMap) { MultiMap> newMap = new MultiMap; for (S className : new HashSet(productionMap.keySet())) { L> prods = productionMap.get(className); for (L prod : prods) if (!isGroundProduction(prod)) newMap.put(className, prod); } ret newMap; } static boolean isGroundProduction(L prod) { for (int i = 1; i < l(prod); i += 2) { S t = prod.get(i); if (isBracketedID(t) && !specials.contains(t)) ret false; } ret true; } static void addDict(S className, Collection entries) { dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries)); } // TODO: do only those requested in grammar static void matchSpecials() { for (int i = 1; i < l(tok); i += 2) { MultiMap rr = recog.get(i); for (S s : specials) if (matchToken(s, tok.get(i))) rr.put(unbracket(s), i+2); } } static L unquoteAll(L tok) { L t = cloneList(tok); for (int i = 1; i < l(t); i += 2) if (isQuoted(t.get(i))) t.set(i, unquote(t.get(i))); ret t; }