Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

298
LINES

< > BotCompany Repo | #1002328 // NL Parser, implementing "empty" classes & ground class optimisation

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (3162L/21K/69K).

// Idea: For every position, store the productions recognized to start there, then infer up to higher classes

!759

// a recognition is identified by (startPos, className, endPos)


// key 1 = start position, key 2 = class name, value = end position
static Map<Integer, MultiMap<S, Integer>> recog;
static L<S> tok;

static MultiMap<S, L<S>> productionMap;

static boolean debug = false;

static long timing;
static int iterations;

static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples


static S defaultParserRules = "#1002281";
static S defaultMainProd = "line";

p {
  startBot("Snippet Text Bot", "#1002084");
}

static synchronized S parse(S inputText) {
  ret parse(inputText, defaultParserRules, defaultMainProd);
}

static synchronized S parse(S inputText, S parserRules, S mainProd) {
  S rulesText = loadSnippetThroughBot(parserRules);

  productionMap = new MultiMap;
  for (S rule : toLinesFullTrim(rulesText)) pcall {
    //printF("Processing rule: *", rule);
    L<S> lr = splitAtJavaToken(rule, "=");
    S l, r;
    if (l(lr) == 1) {
      l = "";
      r = lr.get(0);
    } else if (l(lr) == 2) {
      l = lr.get(0);
      r = lr.get(1);
    } else {
      print("Weird rule: " + rule);
      continue;
    }
    L<S> tokr = javaTok(r);
    assertEquals(structure(tokr), 3, l(tokr));
    S className = assertIdentifier(get(tokr, 1));
    L<S> tok = javaTok(l);
    tok = mergeBracketThingies(tok);
    //printStructure(tok);
    productionMap.put(className, tok);
  }
  
  print(n(productionMap.size(), "production") + ".");
  print();
  
  timing = now();
  tok = javaTok(inputText);
  Pos pos = new Pos(tok);
  L<Integer> x = parseTop(pos, mainProd);
  S result;
  timing = now()-timing;
  
  int numTok = l(tok)/2;
  print("Iterations required for " + numTok + " tokens: " + iterations*numTok + ", time: " + timing + " ms");
  
  if (x.contains(l(tok)))
    result = "parsed";
  else if (!empty(x))
    result = "beginning matches";
  else
    ret "not parsed";
  L out = explainMatch(new Pos(tok), x.get(0), mainProd);
  ret result + ", detailed analysis: " + structure(out);
}

static L<Integer> parseTop(Pos pos, S mainProd) {
  // init structures
  recog = new TreeMap;
  for (int i = pos.i; i < l(pos.tok); i += 2)
    recog.put(i, new MultiMap);
  print("parser: recog inited " + pos.i + " to " + l(pos.tok));
  
  // process dicts
  for (S className : dicts.keySet()) {
    Set<S> values = dicts.get(className);
    for (int i = pos.i; i < l(pos.tok); i += 2) {
      S t = pos.tok.get(i);
      if (values.contains(t.toLowerCase()))
        recog.get(i).setPut(className, i+2);
    }
  }

  boolean anyChange;
  iterations = 0;
  MultiMap<S, L<S>> fullProductionMap = new MultiMap(productionMap);
  do {
    anyChange = false;
    ++iterations;
    if (iterations == 2) removeGroundProductions();
    for (int i = pos.i; i < l(pos.tok); i += 2) {
      Pos pos2 = new Pos(pos.tok, i);
      for (S className : productionMap.keySet()) {
        MultiMap<S, Integer> rr = recog.get(i);
        L<Integer> recs = rr.getActual(className);
        L<L<S>> prods = productionMap.get(className);
        for (L<S> prod : prods) {
          int n = l(recs);
          matchProd(pos2, new Pos(prod), className, recs);
          anyChange = anyChange || l(recs) > n;
        }
        rr.clean(className);
      }
    }
  } while (anyChange);
  productionMap = fullProductionMap; // restore ground productions
  
  ret recog.get(pos.i).get(mainProd);
}

static class Pos {
  L<S> tok;
  int i = 1;
  
  *() {}
  *(L<S> *tok) {}
  *(L<S> *tok, int *i) {}
  
  boolean end() { ret i >= l(tok)-1; }
  S get() { ret tok.get(i); }
  public Pos clone() { ret new Pos(tok, i); }
  public boolean equals(O o) {
    if (!(o instanceof Pos)) ret false;
    Pos pos = cast o;
    ret tok == pos.tok && i == pos.i;
  }
  
  S rest() {
    ret join(subList(tok, i));
  }

  Pos plus(int x) { ret new Pos(tok, i + x); }
}

static void copy(Pos a, Pos b) {
  b.tok = a.tok;
  b.i = a.i;
}

static void debug(S bla, Pos pos) {
  if (debug)
    print(bla + " on " + quote(pos.rest()));
}

static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
  if (prod.end())
    setAdd(out, pos.i);
  else if (pos.end())
    ret;
  else {
    S p = prod.get();
    
    if (isBracketedID(p) && !specials.contains(p)) {
      MultiMap<S, Integer> rr = recog.get(pos.i);
      if (rr == null)
        fail("parser: recog null at " + pos.i);
      L<Integer> r = rr.get(unbracket(p));
      
      // keep parsing for every option
  
      for (int i : cloneList(r))
        matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out);
      
    } else {
      // it's a literal
      S t = pos.get();
      if (!matchToken(p, t))
        ret;
      
      matchProd(pos.plus(2), prod.plus(2), forClass, out);
    }
  }
}

static L<S> specials = litlist("<quoted>", "<int>", "<identifier>");

static boolean matchToken(S p, S t) {
  if (eq(p, "<quoted>")) {
    if (!isQuoted(t)) ret false;
  } else if (eq(p, "<int>")) {
    if (!isInteger(t)) ret false;
  } else if (eq(p, "<identifier>")) {
    if (!isIdentifier(t)) ret false;
  } else if (!(eq(p, "*") || eqic(p, t)))
    ret false; // token mismatch
  ret true;
}

// assumes that there is a match (pos, class, endPos)
// and gives explanations of how it was probably matched
static L explainMatch(Pos pos, int endPos, S forClass) {
  L<L<S>> prods = productionMap.get(forClass);
  new L out;
  for (L<S> prod : prods)
    explainMatch2(pos, new Pos(prod), endPos, forClass, litlist(forClass, join(prod)), out);
  ret out;
}

// same, but with fixed production
static void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) {
  if (prod.end()) {
    if (pos.i == endPos)
      out.add(cloneList(match));
  } else if (pos.end())
    ret;
  else {
    S p = prod.get();
    
    if (isBracketedID(p) && neq(p, "<quoted>")) {
      S className = unbracket(p);
      L<Integer> r = recog.get(pos.i).get(className);

      // keep parsing for every option
  
      for (int i : cloneList(r)) {
        match.add(litlist(pos.i, i));
        explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out);
        removeLast(match);
      }
    } else {
      // it's a literal
      S t = pos.get();
      if (!matchToken(p, t))
        ret;
      
      //match.add(litlist(pos.i, p, pos.i+2));
      explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out);
      removeLast(match);
    }
  }
}

static S loadSnippetThroughBot(S snippetID) {
  S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
  new Matches m;
  assertTrue(match("the text of snippet * is *", answer, m));
  ret m.unq(1);
}

static S prettierAnalysis() {
  new L<S> l;
  for (int i = 1; i < l(tok); i += 2) {
    MultiMap<S, Integer> rr = recog.get(i);
    new L<S> bla;
    for (S prod : rr.keys()) {
      L<Integer> is = rr.get(prod);
      bla.add(prod + " " + (max(is)-i)/2);
    }
    l.add(tok.get(i) + " : " + structure(bla));
  } ret fromLines(l);
}

// Non-recursing productions don't need to be repeatedly executed
static void removeGroundProductions() {
  int n = productionMap.size();
  for (S className : new HashSet<S>(productionMap.keySet())) {
    L<L<S>> prods = productionMap.getActual(className);
    for (int i = 0; i < l(prods); )
      if (isGroundProduction(prods.get(i))) {
        //print("Ground production: " + className + " = " + join(prods.get(i)));
        prods.remove(i);
      } else
        ++i;
    productionMap.clean(className);
  }
  n -= productionMap.size();
  if (n != 0)
    print(n + " ground productions removed.");
}

static boolean isGroundProduction(L<S> prod) {
  for (int i = 1; i < l(prod); i += 2) {
    S t = prod.get(i);
    if (isBracketedID(t) && !specials.contains(unbracket(t)))
      ret false;
  }
  ret true;
}

static void addDict(S className, Collection<S> entries) {
  dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries));
}

Author comment

Began life as a copy of #1002307

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1002328
Snippet name: NL Parser, implementing "empty" classes & ground class optimisation
Eternal ID of this version: #1002328/1
Text MD5: aa5353cac7328b453bcc61018ba8b672
Transpilation MD5: 00ec6e09b91e996a60ca772a06365be6
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-01-07 17:35:18
Source code size: 8380 bytes / 298 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 795 / 1110
Referenced in: [show references]