"Bottom-Up" NL Parser, faster explainMatch (LIVE EVERYWHERE) [1002719]

// Idea: For every position, store the productions recognized to start there, then infer up to higher classes

// Productions elements: literals (just write them), <classname>, <classname varname>

!7

set flag NoAI.

// a recognition is identified by (startPos, className, endPos)

// key 1 = start position, key 2 = class name, value = end position
static Map<Integer, MultiMap<S, Integer>> recog;
static L<S> tok;

static MultiMap<S, L<S>> productionMap;

static boolean caseSensitive = false;

static boolean debug = false;

static long timing;
static int iterations;

static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples


static S defaultParserRules = "#1002281"; // NL
static S javaRules = "#1002329";
static boolean keepRules = false;

static class ParseResult {
  L<S> tok;
  Map<Integer, MultiMap<S, Integer>> recog;
  
  L<S> fullMatchClasses() {
    new L<S> l;
    MultiMap<S, Integer> rr = recog.get(1);
    for (S className : rr.keys())
      if (rr.get(className).contains(l(tok)))
        l.add(className);
    ret l;
  }
  
  boolean parsesAs(S className) {
    MultiMap<S, Integer> rr = recog.get(1);
    ret rr.get(className).contains(l(tok));
  }
  
  L explainLongestMatch() {
    S c = findLongestMatch(1);
    if (c == null) ret null;
    int n = max(recog.get(1).get(c));
    if (n == 1) ret null;
    L l = explainMatch(new Pos(tok), n, c);
    ret shortenExplanation(l);
  }

  S findLongestMatch(int i) {
    MultiMap<S, Integer> rr = recog.get(i);
    new Map<S, Integer> map;
    for (S c : rr.keys())
      map.put(c, max(rr.get(c)));
    ret keyWithBiggestValue(map);
  }
  
  L explain(S className) {
    ret explain(1, l(tok), className);
  }
  
  L explainFull(S className) {
    ret explainFull(1, l(tok), className);
  }
  
  // full = not shortened (include all subclassings)
  L explainFull(int fromToken, int toToken, S className) {
    ret explainMatch(new Pos(tok, fromToken), toToken, className);
  }
  
  L explain(int fromToken, int toToken, S className) {
    L l = explainFull(fromToken, toToken, className);
    if (debug)
      print("Full explanation: " + sfu(l));
    l = shortenExplanation(l);
    if (debug)
      print("Shortened explanation: " + sfu(l));
    ret l;
  }
  
  // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]]
  
  L shortenExplanation(L m) {
    if (m == null) ret null;
    if (l(m) == 5) {
      int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
      L sub = (L) m.get(4);
      int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1);
      
      // for primitives, inner class seems to be null currently
      // - in this case, leave outer match.
      
      S innerClass = getString(sub, 2);
      if (ii == II && jj == JJ && innerClass != null)
        ret shortenExplanation(sub); // it's a subclassing  - just drop the outer match
    }
    for (int i = 4; i < l(m); i++)
      m.set(i, shortenExplanation((L) m.get(i)));
    ret m;
  }
  
  // assumes that there is a match (pos, class, endPos)
  // and gives explanations of how it was probably matched.
  // result list format: [className, prodText, [i1, i2], [i3, i4], ...]
  L explainMatch(Pos pos, int endPos, S forClass) {
    L<L<S>> prods = productionMap.get(forClass);
    for (L<S> prod : prods) {
      L match = litlist(pos.i, endPos, forClass, join(prod));
      if (debug)
        print("Starting match: " + structure(match));
      L out = explainMatch2(pos, new Pos(prod), endPos, forClass, match);
      if (out != null) ret out;
    }
    ret null;
  }
  
  // same, but with fixed production
  L explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match) {
    if (prod.end()) {
      if (pos.i == endPos) {
        // match done! add sub-explanations.
        
        match = cloneList(match);
        if (debug)
          print("Match done! " + structure(match));
        for (int i = 4; i < l(match); i++) {
          L m = cloneList((L) match.get(i));
          //print("m=" + structure(m));
          int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
          S className = cast m.get(2);
          L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className);
          //print(format("Recursed: * * * => ", ii, jj, className) +  structure(subExplanation));
          if (subExplanation == null) {
            if (debug) print("weird in " + forClass);
          } else
            m.addAll(subList(subExplanation, 3));
            
          match.set(i, m);
        }
        
        if (debug)
          print("Match completed: " + structure(match));
        ret match;
      }
    } else {
      S p = prod.get();
      
      if (isBracketedID(p)) {
        // get class name and possible var name
        
        S className = unbracket(p);
        S varName = null;
        if (hasSpace(className)) {
          int idx = className.indexOf(' ');
          varName = className.substring(idx+1).trim();
          className = className.substring(0, idx).trim();
        }
        
        // look up if class found at this place
        
        L<Integer> r = recog.get(pos.i).get(className);

        // keep parsing for every option
    
        for (int i : cloneList(r)) {
          match.add(litlist(pos.i, i, className));
          L out = explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match);
          if (out != null) ret out;
          removeLast(match);
        }
      } else if (!pos.end()) {
        // it's a literal
        S t = pos.get();
        if (!matchToken(p, t))
          ret null;
        
        //match.add(litlist(pos.i, pos.i+2, p));
        L out = explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match);
        if (out != null) ret out;
        //removeLast(match);
      }
    }
    ret null;
  }
  
  S prettierAnalysis() {
    new L<S> l;
    new TreeSet<S> emptyClasses;
    for (int i = 1; i < l(tok); i += 2) {
      MultiMap<S, Integer> rr = recog.get(i);
      new L<S> bla;
      for (S className : rr.keys()) {
        L<Integer> is = rr.get(className);
        if (is.contains(i))
          emptyClasses.add(className);
        int n = (max(is)-i)/2;
        if (n > 0)
          bla.add(className + " " + n);
      }
      l.add(tok.get(i) + " : " + structure(bla));
    }
    if (!empty(emptyClasses))
      l.add("  with empty classes " + structure(asList(emptyClasses)));
    ret fromLines(l);
  }
}

p {
  //startBot("Snippet Text Bot", "#1002084");
}

static ParseResult parse(S inputText) {
  ret parse(inputText, defaultParserRules);
}

static ParseResult jparse(S inputText) {
  ret parse(inputText, javaRules);
}

static S keptRulesID;
static S keptRulesText;

static ParseResult parse(S inputText, S parserRules) {
  ret parse(javaTok(inputText), parserRules);
}

static synchronized ParseResult parse(L<S> inputTok, S parserRules) {
  S rulesText;
  if (!isSnippetID(parserRules))
    rulesText = parserRules;
  else {
    if (keepRules && sameSnippetID(parserRules, keptRulesID))
      rulesText = keptRulesText;
    else {
      rulesText = loadSnippet/*ThroughBot*/(parserRules);
      if (keepRules) {
        keptRulesID = parserRules;
        keptRulesText = rulesText;
      }
    }
  }

  productionMap = new MultiMap;
  caseSensitive = false;
  for (S rule : toLinesFullTrim(rulesText)) pcall {
    if (l(javaTok(rule)) == 1) continue; // just a comment
    
    if (debug)
      printF("Processing rule: *", rule);
    
    if (rule.contains("=")) {  
      L<S> lr = splitAtJavaToken(rule, "=");
      S l, r;
      if (l(lr) == 1) {
        l = "";
        r = lr.get(0);
      } else if (l(lr) >= 2) {
        l = join(" = ", lr.subList(0, l(lr)-1));
        r = last(lr);
        if (debug)
          print("rule reformatted to: " + l + " .=. " + r);
      } else {
        print("Weird rule: " + rule);
        continue;
      }
      L<S> tokr = mergeBracketThingies(javaTok(r));
      assertEquals(structure(tokr), 3, l(tokr));
      S className = assertIdentifier(unbracket(get(tokr, 1)));
      L<S> tok = unquoteAll(javaTok(l));
      tok = mergeBracketThingies(tok);
      //printStructure(tok);
      productionMap.put(className, tok);
    } else if (match("case-sensitive", rule)) {
      print("Now case sensitive.");
      caseSensitive = true;
    } else if (match("case-insensitive", rule)) {
      caseSensitive = false;
    } else
      print("Weird rule: " + rule);
  }
  
  if (debug) {
    print(n(productionMap.size(), "production") + ".");
    print();
  }
  
  timing = now();
  tok = inputTok;
  /* HACK
    tok.add("the bonk"); // add impossible token to find empty classes at the end... complicated though because l(tok) changes
    tok.add("");
  */
  Pos pos = new Pos(tok);
  parseTop(pos);
  timing = now()-timing;
  
  int numTok = l(tok)/2;
  /*if (printStats)
    print("Iterations required for " + numTok + " tokens: ?, time: " + timing + " ms");*/
  
  new ParseResult res;
  res.tok = tok;
  res.recog = recog;
  ret res;
}

static void parseTop(Pos pos) {
  // init structures
  recog = new TreeMap;
  for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end
    recog.put(i, new MultiMap);
  //print("parser: recog inited " + pos.i + " to " + l(pos.tok));
  
  // process dicts
  for (S className : dicts.keySet()) {
    Set<S> values = dicts.get(className);
    for (int i = pos.i; i < l(pos.tok); i += 2) {
      S t = pos.tok.get(i);
      if (values.contains(t.toLowerCase()))
        recog.get(i).setPut(className, i+2);
    }
  }
  
  matchSpecials();

  boolean anyChange;
  iterations = 0;
  iter(pos);
}

static void iter(Pos pos) {
  MultiMap<S, L<S>> higherProductions = withoutGroundProductions(productionMap);
  
  for (int i = l(pos.tok); i >= 1; i -= 2) {
    Pos pos2 = new Pos(pos.tok, i);
    int it = 0;
    boolean anyChange = false;
    do {
      anyChange = false;
      MultiMap<S, L<S>> mapToUse = it == 0 ? productionMap : higherProductions;
      it++;
      for (S className : mapToUse.keySet()) {
        MultiMap<S, Integer> rr = recog.get(i);
        L<Integer> recs = rr.getActual(className);
        L<L<S>> prods = mapToUse.get(className);
        for (L<S> prod : prods) {
          int n = l(recs);
          matchProd(pos2, new Pos(prod), className, recs);
          anyChange = anyChange || l(recs) > n;
        }
        rr.clean(className);
      }
    } while (anyChange);
  }
}

static class Pos {
  L<S> tok;
  int i = 1;
  
  *() {}
  *(L<S> *tok) {}
  *(L<S> *tok, int *i) {}
  
  boolean end() { ret i >= l(tok)-1; }
  S get() { ret tok.get(i); }
  public Pos clone() { ret new Pos(tok, i); }
  public boolean equals(O o) {
    if (!(o instanceof Pos)) ret false;
    Pos pos = cast o;
    ret tok == pos.tok && i == pos.i;
  }
  
  S rest() {
    ret join(subList(tok, i));
  }

  Pos plus(int x) { ret new Pos(tok, i + x); }
}

static void copy(Pos a, Pos b) {
  b.tok = a.tok;
  b.i = a.i;
}

static void debug(S bla, Pos pos) {
  if (debug)
    print(bla + " on " + quote(pos.rest()));
}

static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
  if (prod.end())
    setAdd(out, pos.i);
  else {
    S p = prod.get();
    
    if (isBracketedID(p)/* && !specials.contains(p)*/) {
      MultiMap<S, Integer> rr = recog.get(pos.i);
      if (rr == null)
        fail("parser: recog null at " + pos.i);
        
      // get class name
      
      S className = unbracket(p);
      if (hasSpace(className)) {
        int idx = className.indexOf(' ');
        className = className.substring(0, idx).trim();
      }
      
      L<Integer> r = rr.get(className);
      
      // keep parsing for every option
  
      for (int i : cloneList(r))
        matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out);
      
    } else if (!pos.end()) { // it's a literal - check if at end of token stream first
      
      S t = pos.get();
      if (!matchToken(p, t))
        ret;
      
      matchProd(pos.plus(2), prod.plus(2), forClass, out);
    }
  }
}

static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>", "<id>", "<extidentifier>", "ucid");

static boolean matchToken(S p, S t) {
  if (eq(p, "<quoted>")) {
    if (!isQuoted(t)) ret false;
  } else if (eq(p, "<int>")) {
    if (!isInteger(t)) ret false;
  } else if (eqOneOf(p, "<identifier>", "<id>")) {
    if (!isIdentifier(t)) ret false;
  } else if (eq(p, "<ucid>")) {
    if (!(isIdentifier(t) && startsWithUpperCase(t))) ret false;
  } else if (eq(p, "<extidentifier>")) {
    if (!isExtendedIdentifier(t)) ret false;
  } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t))))
    ret false; // token mismatch
  ret true;
}

static S loadSnippetThroughBot(S snippetID) {
  print("Loading parser rules.");
  time {
    S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
    new Matches m;
    assertTrue(match("the text of snippet * is *", answer, m));
    ret m.unq(1);
  }
}

// Non-recursing productions don't need to be repeatedly executed
static MultiMap<S, L<S>> withoutGroundProductions(MultiMap<S, L<S>> productionMap) {
  MultiMap<S, L<S>> newMap = new MultiMap;
  
  for (S className : new HashSet<S>(productionMap.keySet())) {
    L<L<S>> prods = productionMap.get(className);
    for (L<S> prod : prods)
      if (!isGroundProduction(prod))
        newMap.put(className, prod);
  }
  
  ret newMap;
}

static boolean isGroundProduction(L<S> prod) {
  for (int i = 1; i < l(prod); i += 2) {
    S t = prod.get(i);
    if (isBracketedID(t) && !specials.contains(t))
      ret false;
  }
  ret true;
}

static void addDict(S className, Collection<S> entries) {
  dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries));
}

// TODO: do only those requested in grammar

static void matchSpecials() {
  for (int i = 1; i < l(tok); i += 2) {
    MultiMap<S, Integer> rr = recog.get(i);
    for (S s : specials)
      if (matchToken(s, tok.get(i)))
        rr.put(unbracket(s), i+2);
  }
}

static L<S> unquoteAll(L<S> tok) {
  L<S> t = cloneList(tok);
  for (int i = 1; i < l(t); i += 2)
    if (isQuoted(t.get(i)))
      t.set(i, unquote(t.get(i)));
  ret t;
}
Travelled to 16 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
Snippet ID:	#1002719
Snippet name:	"Bottom-Up" NL Parser, faster explainMatch (LIVE EVERYWHERE)
Eternal ID of this version:	#1002719/12
Text MD5:	f3bad2905ecdacbe90a91ada158fa2fe
Transpilation MD5:	1cf5b651a3557e7aded44fca248c0cbe
Author:	stefan
Category:	javax
Type:	JavaX source code
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2018-09-16 14:05:31
Source code size:	14775 bytes / 508 lines
Pitched / IR pitched:	No / No
Views / Downloads:	917 / 3899
Version history:	11 change(s)
Referenced in:	[show references]
< > BotCompany Repo | #1002719 // "Bottom-Up" NL Parser, faster explainMatch (LIVE EVERYWHERE)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Author comment