Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

298
LINES

< > BotCompany Repo | #1002328 // NL Parser, implementing "empty" classes & ground class optimisation

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (3162L/21K/69K).

1  
// Idea: For every position, store the productions recognized to start there, then infer up to higher classes
2  
3  
!759
4  
5  
// a recognition is identified by (startPos, className, endPos)
6  
7  
8  
// key 1 = start position, key 2 = class name, value = end position
9  
static Map<Integer, MultiMap<S, Integer>> recog;
10  
static L<S> tok;
11  
12  
static MultiMap<S, L<S>> productionMap;
13  
14  
static boolean debug = false;
15  
16  
static long timing;
17  
static int iterations;
18  
19  
static Map<S, Set<S>> dicts = new TreeMap; // dictionaries - class name to lowercase examples
20  
21  
22  
static S defaultParserRules = "#1002281";
23  
static S defaultMainProd = "line";
24  
25  
p {
26  
  startBot("Snippet Text Bot", "#1002084");
27  
}
28  
29  
static synchronized S parse(S inputText) {
30  
  ret parse(inputText, defaultParserRules, defaultMainProd);
31  
}
32  
33  
static synchronized S parse(S inputText, S parserRules, S mainProd) {
34  
  S rulesText = loadSnippetThroughBot(parserRules);
35  
36  
  productionMap = new MultiMap;
37  
  for (S rule : toLinesFullTrim(rulesText)) pcall {
38  
    //printF("Processing rule: *", rule);
39  
    L<S> lr = splitAtJavaToken(rule, "=");
40  
    S l, r;
41  
    if (l(lr) == 1) {
42  
      l = "";
43  
      r = lr.get(0);
44  
    } else if (l(lr) == 2) {
45  
      l = lr.get(0);
46  
      r = lr.get(1);
47  
    } else {
48  
      print("Weird rule: " + rule);
49  
      continue;
50  
    }
51  
    L<S> tokr = javaTok(r);
52  
    assertEquals(structure(tokr), 3, l(tokr));
53  
    S className = assertIdentifier(get(tokr, 1));
54  
    L<S> tok = javaTok(l);
55  
    tok = mergeBracketThingies(tok);
56  
    //printStructure(tok);
57  
    productionMap.put(className, tok);
58  
  }
59  
  
60  
  print(n(productionMap.size(), "production") + ".");
61  
  print();
62  
  
63  
  timing = now();
64  
  tok = javaTok(inputText);
65  
  Pos pos = new Pos(tok);
66  
  L<Integer> x = parseTop(pos, mainProd);
67  
  S result;
68  
  timing = now()-timing;
69  
  
70  
  int numTok = l(tok)/2;
71  
  print("Iterations required for " + numTok + " tokens: " + iterations*numTok + ", time: " + timing + " ms");
72  
  
73  
  if (x.contains(l(tok)))
74  
    result = "parsed";
75  
  else if (!empty(x))
76  
    result = "beginning matches";
77  
  else
78  
    ret "not parsed";
79  
  L out = explainMatch(new Pos(tok), x.get(0), mainProd);
80  
  ret result + ", detailed analysis: " + structure(out);
81  
}
82  
83  
static L<Integer> parseTop(Pos pos, S mainProd) {
84  
  // init structures
85  
  recog = new TreeMap;
86  
  for (int i = pos.i; i < l(pos.tok); i += 2)
87  
    recog.put(i, new MultiMap);
88  
  print("parser: recog inited " + pos.i + " to " + l(pos.tok));
89  
  
90  
  // process dicts
91  
  for (S className : dicts.keySet()) {
92  
    Set<S> values = dicts.get(className);
93  
    for (int i = pos.i; i < l(pos.tok); i += 2) {
94  
      S t = pos.tok.get(i);
95  
      if (values.contains(t.toLowerCase()))
96  
        recog.get(i).setPut(className, i+2);
97  
    }
98  
  }
99  
100  
  boolean anyChange;
101  
  iterations = 0;
102  
  MultiMap<S, L<S>> fullProductionMap = new MultiMap(productionMap);
103  
  do {
104  
    anyChange = false;
105  
    ++iterations;
106  
    if (iterations == 2) removeGroundProductions();
107  
    for (int i = pos.i; i < l(pos.tok); i += 2) {
108  
      Pos pos2 = new Pos(pos.tok, i);
109  
      for (S className : productionMap.keySet()) {
110  
        MultiMap<S, Integer> rr = recog.get(i);
111  
        L<Integer> recs = rr.getActual(className);
112  
        L<L<S>> prods = productionMap.get(className);
113  
        for (L<S> prod : prods) {
114  
          int n = l(recs);
115  
          matchProd(pos2, new Pos(prod), className, recs);
116  
          anyChange = anyChange || l(recs) > n;
117  
        }
118  
        rr.clean(className);
119  
      }
120  
    }
121  
  } while (anyChange);
122  
  productionMap = fullProductionMap; // restore ground productions
123  
  
124  
  ret recog.get(pos.i).get(mainProd);
125  
}
126  
127  
static class Pos {
128  
  L<S> tok;
129  
  int i = 1;
130  
  
131  
  *() {}
132  
  *(L<S> *tok) {}
133  
  *(L<S> *tok, int *i) {}
134  
  
135  
  boolean end() { ret i >= l(tok)-1; }
136  
  S get() { ret tok.get(i); }
137  
  public Pos clone() { ret new Pos(tok, i); }
138  
  public boolean equals(O o) {
139  
    if (!(o instanceof Pos)) ret false;
140  
    Pos pos = cast o;
141  
    ret tok == pos.tok && i == pos.i;
142  
  }
143  
  
144  
  S rest() {
145  
    ret join(subList(tok, i));
146  
  }
147  
148  
  Pos plus(int x) { ret new Pos(tok, i + x); }
149  
}
150  
151  
static void copy(Pos a, Pos b) {
152  
  b.tok = a.tok;
153  
  b.i = a.i;
154  
}
155  
156  
static void debug(S bla, Pos pos) {
157  
  if (debug)
158  
    print(bla + " on " + quote(pos.rest()));
159  
}
160  
161  
static void matchProd(Pos pos, Pos prod, S forClass, L<Integer> out) {
162  
  if (prod.end())
163  
    setAdd(out, pos.i);
164  
  else if (pos.end())
165  
    ret;
166  
  else {
167  
    S p = prod.get();
168  
    
169  
    if (isBracketedID(p) && !specials.contains(p)) {
170  
      MultiMap<S, Integer> rr = recog.get(pos.i);
171  
      if (rr == null)
172  
        fail("parser: recog null at " + pos.i);
173  
      L<Integer> r = rr.get(unbracket(p));
174  
      
175  
      // keep parsing for every option
176  
  
177  
      for (int i : cloneList(r))
178  
        matchProd(new Pos(pos.tok, i), prod.plus(2), forClass, out);
179  
      
180  
    } else {
181  
      // it's a literal
182  
      S t = pos.get();
183  
      if (!matchToken(p, t))
184  
        ret;
185  
      
186  
      matchProd(pos.plus(2), prod.plus(2), forClass, out);
187  
    }
188  
  }
189  
}
190  
191  
static L<S> specials = litlist("<quoted>", "<int>", "<identifier>");
192  
193  
static boolean matchToken(S p, S t) {
194  
  if (eq(p, "<quoted>")) {
195  
    if (!isQuoted(t)) ret false;
196  
  } else if (eq(p, "<int>")) {
197  
    if (!isInteger(t)) ret false;
198  
  } else if (eq(p, "<identifier>")) {
199  
    if (!isIdentifier(t)) ret false;
200  
  } else if (!(eq(p, "*") || eqic(p, t)))
201  
    ret false; // token mismatch
202  
  ret true;
203  
}
204  
205  
// assumes that there is a match (pos, class, endPos)
206  
// and gives explanations of how it was probably matched
207  
static L explainMatch(Pos pos, int endPos, S forClass) {
208  
  L<L<S>> prods = productionMap.get(forClass);
209  
  new L out;
210  
  for (L<S> prod : prods)
211  
    explainMatch2(pos, new Pos(prod), endPos, forClass, litlist(forClass, join(prod)), out);
212  
  ret out;
213  
}
214  
215  
// same, but with fixed production
216  
static void explainMatch2(Pos pos, Pos prod, int endPos, S forClass, L match, L out) {
217  
  if (prod.end()) {
218  
    if (pos.i == endPos)
219  
      out.add(cloneList(match));
220  
  } else if (pos.end())
221  
    ret;
222  
  else {
223  
    S p = prod.get();
224  
    
225  
    if (isBracketedID(p) && neq(p, "<quoted>")) {
226  
      S className = unbracket(p);
227  
      L<Integer> r = recog.get(pos.i).get(className);
228  
229  
      // keep parsing for every option
230  
  
231  
      for (int i : cloneList(r)) {
232  
        match.add(litlist(pos.i, i));
233  
        explainMatch2(new Pos(pos.tok, i), prod.plus(2), endPos, forClass, match, out);
234  
        removeLast(match);
235  
      }
236  
    } else {
237  
      // it's a literal
238  
      S t = pos.get();
239  
      if (!matchToken(p, t))
240  
        ret;
241  
      
242  
      //match.add(litlist(pos.i, p, pos.i+2));
243  
      explainMatch2(pos.plus(2), prod.plus(2), endPos, forClass, match, out);
244  
      removeLast(match);
245  
    }
246  
  }
247  
}
248  
249  
static S loadSnippetThroughBot(S snippetID) {
250  
  S answer = sendToLocalBot_cached("Snippet Text Bot", "what is the text of snippet *", snippetID);
251  
  new Matches m;
252  
  assertTrue(match("the text of snippet * is *", answer, m));
253  
  ret m.unq(1);
254  
}
255  
256  
static S prettierAnalysis() {
257  
  new L<S> l;
258  
  for (int i = 1; i < l(tok); i += 2) {
259  
    MultiMap<S, Integer> rr = recog.get(i);
260  
    new L<S> bla;
261  
    for (S prod : rr.keys()) {
262  
      L<Integer> is = rr.get(prod);
263  
      bla.add(prod + " " + (max(is)-i)/2);
264  
    }
265  
    l.add(tok.get(i) + " : " + structure(bla));
266  
  } ret fromLines(l);
267  
}
268  
269  
// Non-recursing productions don't need to be repeatedly executed
270  
static void removeGroundProductions() {
271  
  int n = productionMap.size();
272  
  for (S className : new HashSet<S>(productionMap.keySet())) {
273  
    L<L<S>> prods = productionMap.getActual(className);
274  
    for (int i = 0; i < l(prods); )
275  
      if (isGroundProduction(prods.get(i))) {
276  
        //print("Ground production: " + className + " = " + join(prods.get(i)));
277  
        prods.remove(i);
278  
      } else
279  
        ++i;
280  
    productionMap.clean(className);
281  
  }
282  
  n -= productionMap.size();
283  
  if (n != 0)
284  
    print(n + " ground productions removed.");
285  
}
286  
287  
static boolean isGroundProduction(L<S> prod) {
288  
  for (int i = 1; i < l(prod); i += 2) {
289  
    S t = prod.get(i);
290  
    if (isBracketedID(t) && !specials.contains(unbracket(t)))
291  
      ret false;
292  
  }
293  
  ret true;
294  
}
295  
296  
static void addDict(S className, Collection<S> entries) {
297  
  dicts.put(className, entries instanceof Set ? (Set) entries : new TreeSet(entries));
298  
}

Author comment

Began life as a copy of #1002307

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1002328
Snippet name: NL Parser, implementing "empty" classes & ground class optimisation
Eternal ID of this version: #1002328/1
Text MD5: aa5353cac7328b453bcc61018ba8b672
Transpilation MD5: 00ec6e09b91e996a60ca772a06365be6
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-01-07 17:35:18
Source code size: 8380 bytes / 298 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 733 / 1024
Referenced in: [show references]