Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

455
LINES

< > BotCompany Repo | #1002369 // NL Parser, reworking the interface (CURRENT)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (3477L/24K/75K).

// Idea: For every position, store the productions recognized to start there, then infer up to higher classes

!759

// a recognition is identified by (startPos, className, endPos)


// key 1 = start position, key 2 = class name, value = end position
static Map<Integer, MultiMap<S, Integer>> recog;
static L<S> tok;

static MultiMap<S, L<S>> productionMap;

static boolean caseSensitive = false;

static boolean debug = false;

static long timing;
static int iterations;

static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples


static S defaultParserRules = "#1002281"; // NL
static S javaRules = "#1002329";
static boolean keepRules = false;

static class ParseResult {
  L<S> tok;
  Map<Integer, MultiMap<S, Integer>> recog;
  
  L<S> fullMatchClasses() {
    new L<S> l;
    MultiMap<S, Integer> rr = recog.get(1);
    for (S className : rr.keys())
      if (rr.get(className).contains(l(tok)))
        l.add(className);
    ret l;
  }
  
  L explainLongestMatch() {
    S c = findLongestMatch(1);
    if (c == null) ret null;
    int n = max(recog.get(1).get(c));
    if (n == 1) ret null;
    L l = cast explainMatch(new Pos(tok), n, c).get(0);
    ret shortenExplanation(l);
  }

  S findLongestMatch(int i) {
    MultiMap<S, Integer> rr = recog.get(i);
    new Map<S, Integer> map;
    for (S c : rr.keys())
      map.put(c, max(rr.get(c)));
    ret keyWithBiggestValue(map);
  }
  
  L explain(S className) {
    ret explain(1, l(tok), className);
  }
  
  L explainFull(S className) {
    ret explainFull(1, l(tok), className);
  }
  
  // full = not shortened (include all subclassings)
  L explainFull(int fromToken, int toToken, S className) {
    L<L> matches = explainMatch(new Pos(tok, fromToken), toToken, className);
    ret get(matches, 0);
  }
  
  L explain(int fromToken, int toToken, S className) {
    L l = explainFull(fromToken, toToken, className);
    if (debug)
      print("Full explanation: " + structure(l));
    ret shortenExplanation(l);
  }
  
  // Example: [[1, 11, "exp", "<exp1>", [1, 11, "exp1", 11, "exp1", "<exp1> + <exp2>", ...]]]
  
  L shortenExplanation(L m) {
    if (m == null) ret null;
    if (l(m) == 5) {
      int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
      L sub = (L) m.get(4);
      int II = (Integer) sub.get(0), JJ = (Integer) sub.get(1);
      if (ii == II && jj == JJ)
        ret shortenExplanation(sub); // it's a subclassing  - just drop the outer match
    }
    for (int i = 4; i < l(m); i++)
      m.set(i, shortenExplanation((L) m.get(i)));
    ret m;
  }
  
  // assumes that there is a match (pos, class, endPos)
  // and gives explanations of how it was probably matched.
  // result list format: [className, prodText, [i1, i2], [i3, i4], ...]
  L explainMatch(Pos pos, int endPos, S forClass) {
    L<L<S>> prods = productionMap.get(forClass);
    new L out;
    for (L<S> prod : prods) {
      L match = litlist(pos.i, endPos, forClass, join(prod));
      if (debug)
        print("Starting match: " + structure(match));
      explainMatch2(pos, new Pos(prod), endPos, forClass, match, out);
    }
    ret out;
  }
  
  // same, but with fixed production
  void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) {
    if (prod.end()) {
      if (pos.i == endPos) {
        // match done! add sub-explanations.
        
        match = cloneList(match);
        if (debug)
          print("Match done! " + structure(match));
        for (int i = 4; i < l(match); i++) {
          L m = cloneList((L) match.get(i));
          //print("m=" + structure(m));
          int ii = (Integer) m.get(0), jj = (Integer) m.get(1);
          S className = cast m.get(2);
          L subExplanation = explainMatch(new Pos(pos.tok, ii), jj, className);
          //print(format("Recursed: * * * => ", ii, jj, className) +  structure(subExplanation));
          if (empty(subExplanation)) {
            // weird
          } else
            m.addAll(subList((L) subExplanation.get(0), 3));
            
          match.set(i, m);
        }
        
        if (debug)
          print("Match completed: " + structure(match));
        out.add(match);
      }
    } else if (pos.end())
      ret;
    else {
      S p = prod.get();
      
      if (isBracketedID(p)) {
        S className = unbracket(p);
        L<Integer> r = recog.get(pos.i).get(className);
  
        // keep parsing for every option
    
        for (int i : cloneList(r)) {
          match.add(litlist(pos.i, i, className));
          explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out);
          removeLast(match);
        }
      } else if (!pos.end()) {
        // it's a literal
        S t = pos.get();
        if (!matchToken(p, t))
          ret;
        
        //match.add(litlist(pos.i, pos.i+2, p));
        explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out);
        //removeLast(match);
      }
    }
  }
  
  S prettierAnalysis() {
    new L<S> l;
    new TreeSet<S> emptyClasses;
    for (int i = 1; i < l(tok); i += 2) {
      MultiMap<S, Integer> rr = recog.get(i);
      new L<S> bla;
      for (S className : rr.keys()) {
        L<Integer> is = rr.get(className);
        if (is.contains(i))
          emptyClasses.add(className);
        int n = (max(is)-i)/2;
        if (n > 0)
          bla.add(className + " " + n);
      }
      l.add(tok.get(i) + " : " + structure(bla));
    }
    if (!empty(emptyClasses))
      l.add("  with empty classes " + structure(asList(emptyClasses)));
    ret fromLines(l);
  }
}

p {
  startBot("Snippet Text Bot", "#1002084");
}

static ParseResult parse(S inputText) {
  ret parse(inputText, defaultParserRules);
}

static ParseResult jparse(S inputText) {
  ret parse(inputText, javaRules);
}

static S keptRulesID;
static S keptRulesText;

static synchronized ParseResult parse(S inputText, S parserRules) {
  S rulesText;
  if (!isSnippetID(parserRules))
    rulesText = parserRules;
  else {
    if (keepRules && sameSnippetID(parserRules, keptRulesID))
      rulesText = keptRulesText;
    else {
      rulesText = loadSnippetThroughBot(parserRules);
      if (keepRules) {
        keptRulesID = parserRules;
        keptRulesText = rulesText;
      }
    }
  }

  productionMap = new MultiMap;
  caseSensitive = false;
  for (S rule : toLinesFullTrim(rulesText)) pcall {
    if (l(javaTok(rule)) == 1) continue; // just a comment
    
    if (debug)
      printF("Processing rule: *", rule);
    
    if (rule.contains("=")) {  
      L<S> lr = splitAtJavaToken(rule, "=");
      S l, r;
      if (l(lr) == 1) {
        l = "";
        r = lr.get(0);
      } else if (l(lr) >= 2) {
        l = join(" = ", lr.subList(0, l(lr)-1));
        r = last(lr);
      } else {
        print("Weird rule: " + rule);
        continue;
      }
      L<S> tokr = mergeBracketThingies(javaTok(r));
      assertEquals(structure(tokr), 3, l(tokr));
      S className = assertIdentifier(unbracket(get(tokr, 1)));
      L<S> tok = javaTok(l);
      tok = mergeBracketThingies(tok);
      //printStructure(tok);
      productionMap.put(className, tok);
    } else if (match("case-sensitive", rule)) {
      //print("Now case sensitive.");
      caseSensitive = true;
    } else if (match("case-insensitive", rule)) {
      caseSensitive = false;
    } else
      print("Weird rule: " + rule);
  }
  
  if (debug) {
    print(n(productionMap.size(), "production") + ".");
    print();
  }
  
  timing = now();
  tok = javaTok(inputText);
  Pos pos = new Pos(tok);
  parseTop(pos);
  timing = now()-timing;
  
  int numTok = l(tok)/2;
  print("Iterations required for " + numTok + " tokens: " + iterations*numTok*productionMap.size() + ", time: " + timing + " ms");
  
  new ParseResult res;
  res.tok = tok;
  res.recog = recog;
  ret res;
}

static void parseTop(Pos pos) {
  // init structures
  recog = new TreeMap;
  for (int i = pos.i; i <= l(pos.tok); i += 2) // contains one more element for empty classes at the end
    recog.put(i, new MultiMap);
  //print("parser: recog inited " + pos.i + " to " + l(pos.tok));
  
  // process dicts
  for (S className : dicts.keySet()) {
    Set<S> values = dicts.get(className);
    for (int i = pos.i; i < l(pos.tok); i += 2) {
      S t = pos.tok.get(i);
      if (values.contains(t.toLowerCase()))
        recog.get(i).setPut(className, i+2);
    }
  }
  
  matchSpecials();

  boolean anyChange;
  iterations = 0;
  MultiMap<S, L<S>> fullProductionMap = new MultiMap(productionMap);
  while (iter(pos)) {}
  
  iter(pos); // XX - one more - fixes anything?
  
  productionMap = fullProductionMap; // restore ground productions
}

static boolean iter(Pos pos) {
  boolean anyChange = false;
  ++iterations;
  if (iterations == 2) removeGroundProductions();
  if (debug) print("Iteration " + iterations);
  for (int i = pos.i; i <= l(pos.tok); i += 2) {
    Pos pos2 = new Pos(pos.tok, i);
    for (S className : productionMap.keySet()) {
      MultiMap<S, Integer> rr = recog.get(i);
      L<Integer> recs = rr.getActual(className);
      L<L<S>> prods = productionMap.get(className);
      for (L<S> prod : prods) {
        int n = l(recs);
        matchProd(pos2, new Pos(prod), className, recs);
        anyChange = anyChange || l(recs) > n;
      }
      rr.clean(className);
    }
  }
  ret anyChange;
}

static class Pos {
  L<S> tok;
  int i = 1;
  
  *() {}
  *(L<S> *tok) {}
  *(L<S> *tok, int *i) {}
  
  boolean end() { ret i >= l(tok)-1; }
  S get() { ret tok.get(i); }
  public Pos clone() { ret new Pos(tok, i); }
  public boolean equals(O o) {
    if (!(o instanceof Pos)) ret false;
    Pos pos = cast o;
    ret tok == pos.tok && i == pos.i;
  }
  
  S rest() {
    ret join(subList(tok, i));
  }

  Pos plus(int x) { ret new Pos(tok, i + x); }
}

static void copy(Pos a, Pos b) {
  b.tok = a.tok;
  b.i = a.i;
}

static void debug(S bla, Pos pos) {
  if (debug)
    print(bla + " on " + quote(pos.rest()));
}

static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
  if (prod.end())
    setAdd(out, pos.i);
  else if (pos.end())
    ret;
  else {
    S p = prod.get();
    
    if (isBracketedID(p)/* && !specials.contains(p)*/) {
      MultiMap<S, Integer> rr = recog.get(pos.i);
      if (rr == null)
        fail("parser: recog null at " + pos.i);
      L<Integer> r = rr.get(unbracket(p));
      
      // keep parsing for every option
  
      for (int i : cloneList(r))
        matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out);
      
    } else {
      // it's a literal
      S t = pos.get();
      if (!matchToken(p, t))
        ret;
      
      matchProd(pos.plus(2), prod.plus(2), forClass, out);
    }
  }
}

static L<S> specials = litlist("<any>", "<quoted>", "<int>", "<identifier>");

static boolean matchToken(S p, S t) {
  if (eq(p, "<quoted>")) {
    if (!isQuoted(t)) ret false;
  } else if (eq(p, "<int>")) {
    if (!isInteger(t)) ret false;
  } else if (eq(p, "<identifier>")) {
    if (!isIdentifier(t)) ret false;
  } else if (!(eq(p, "<any>") || (caseSensitive ? eq(p, t) : eqic(p, t))))
    ret false; // token mismatch
  ret true;
}

static S loadSnippetThroughBot(S snippetID) {
  print("Loading parser rules.");
  time {
    S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
    new Matches m;
    assertTrue(match("the text of snippet * is *", answer, m));
    ret m.unq(1);
  }
}

// Non-recursing productions don't need to be repeatedly executed
static void removeGroundProductions() {
  int n = productionMap.size();
  for (S className : new HashSet<S>(productionMap.keySet())) {
    L<L<S>> prods = productionMap.getActual(className);
    for (int i = 0; i < l(prods); )
      if (isGroundProduction(prods.get(i))) {
        if (debug)
          print("Ground production: " + className + " = " + join(prods.get(i)));
        prods.remove(i);
      } else
        ++i;
    productionMap.clean(className);
  }
  n -= productionMap.size();
  if (debug && n != 0)
    print(n + " ground productions removed.");
}

static boolean isGroundProduction(L<S> prod) {
  for (int i = 1; i < l(prod); i += 2) {
    S t = prod.get(i);
    if (isBracketedID(t) && !specials.contains(t))
      ret false;
  }
  ret true;
}

static void addDict(S className, Collection<S> entries) {
  dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries));
}

static void matchSpecials() {
  for (int i = 1; i < l(tok); i += 2) {
    MultiMap<S, Integer> rr = recog.get(i);
    for (S s : specials)
      if (matchToken(s, tok.get(i)))
        rr.put(unbracket(s), i+2);
  }
}

Author comment

Began life as a copy of #1002328

download  show line numbers  debug dex  old transpilations   

Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1002369
Snippet name: NL Parser, reworking the interface (CURRENT)
Eternal ID of this version: #1002369/1
Text MD5: 3ce27e57b13470355771d3230d8662d9
Transpilation MD5: be1e90f4e22154f18df1abcb2fa3645a
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-01-31 03:44:01
Source code size: 13095 bytes / 455 lines
Pitched / IR pitched: No / No
Views / Downloads: 739 / 2454
Referenced in: [show references]