Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

478
LINES

< > BotCompany Repo | #1001033 // Token prediction, multiple predictors (v4, including patterns, developing)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (2971L/23K/69K).

1  
!747
2  
!actionListener {
3  
4  
m {
5  
  static S corpusID = "#1001034";
6  
  static int numSnippets = 100;
7  
  static boolean showGUI = true;
8  
  static int maxCharsGUI = 500000;
9  
  static boolean allTokens = true;
10  
  
11  
  static Collector collector;
12  
  static L<F> files;
13  
  static Map<F, Set<int>> predicted;
14  
  
15  
  // a file to learn from
16  
  static class F {
17  
    String id, name;
18  
    L<S> tok;
19  
  }
20  
  
21  
  // a predictor
22  
  static abstract class P {
23  
    int seen;
24  
    S file;
25  
    
26  
    abstract S read(S file, L<S> tok);
27  
    abstract P derive(); // clone & reset counter for actual use
28  
    abstract P clear();
29  
    
30  
    void prepare(S file) {
31  
      if (!eq(file, this.file)) {
32  
        seen = 0;
33  
        this.file = file;
34  
      }
35  
    }
36  
  }
37  
38  
  static class Chain extends P {
39  
    new L<P> list;
40  
    
41  
    *() {}
42  
    *(L<P> *list) {}
43  
    *(P... a) { list = asList(a); }
44  
    
45  
    void add(P p) { list.add(p); }
46  
    
47  
    S read(S file, L<S> tok) {
48  
      for (P p : list) {
49  
        S s = p.read(file, tok);
50  
        if (s != null) return s;
51  
      }
52  
      return null;
53  
    }
54  
    
55  
    P derive() {
56  
      new Chain c;
57  
      for (P p : list)
58  
        c.add(p.derive());
59  
      return c;
60  
    }
61  
    
62  
    P clear() {
63  
      new Chain c;
64  
      for (P p : list)
65  
        c.add(p.clear());
66  
      return c;
67  
    }
68  
  }
69  
    
70  
  static class Tuples extends P {
71  
    Map<L<S>,S> map = new HashMap<L<S>,S>();
72  
    int n;
73  
74  
    *(int *n) {
75  
    }
76  
    
77  
    S read(S file, L<S> tok) {
78  
      prepare(file);
79  
      
80  
      while (tok.size() > seen) {
81  
        ++seen;
82  
        if (seen > n)
83  
          map.put(new ArrayList<S>(tok.subList(seen-n-1, seen-1)), tok.get(seen-1));
84  
      }
85  
      
86  
      if (tok.size() >= n)
87  
        return map.get(new ArrayList<S>(tok.subList(tok.size()-n, tok.size())));
88  
        
89  
      return null;
90  
    }
91  
    
92  
    P derive() {
93  
      Tuples t = new Tuples(n);
94  
      t.map = new DerivedHashMap<L<S>,S>(map);
95  
      return t;
96  
    }
97  
    
98  
    P clear() {
99  
      return new Tuples(n);
100  
    }
101  
  }
102  
  
103  
  static Map<S, S> makeMapPrefix(L<S> tok1, L<S> tok2) {
104  
    if (tok1.size() < tok2.size()) return null;
105  
    
106  
    new Map<S, S> map;
107  
    for (int i = 1; i < tok2.size(); i += 2) {
108  
      S t1 = tok1.get(i), t2 = tok2.get(i);
109  
      if (!t1.equals(t2)) {
110  
        S v = map.get(t1);
111  
        if (v == null)
112  
          map.put(t1, t2);
113  
        else if (!v.equals(t2))
114  
          return null; // match fail
115  
      }
116  
    }
117  
    
118  
    // match succeeds
119  
    return map;
120  
  }
121  
  
122  
  // TODO: code tokens only
123  
  static class Patterns extends P {
124  
    Map<L<S>,S> map = new HashMap<L<S>,S>();
125  
    int n;
126  
127  
    *(int *n) {
128  
    }
129  
    
130  
    S read(S file, L<S> tok) {
131  
      prepare(file);
132  
      
133  
      while (tok.size() > seen) {
134  
        ++seen;
135  
        if (seen > n)
136  
          put(new ArrayList<S>(tok.subList(seen-n-1, seen-1)), tok.get(seen-1));
137  
      }
138  
      
139  
      if (tok.size() >= n) {
140  
        L<S> l = new ArrayList<S>(tok.subList(tok.size()-n, tok.size()));
141  
        for (L<S> pl : map.keySet()) {
142  
          S pr = map.get(pl);
143  
          print("pl: " + structure(pl) + ", l: " + structure(l));
144  
          Map<S, S> m = makeMapPrefix(pl, l);
145  
          if (m != null) {
146  
            S result = m.get(pr);
147  
            print("map: " + structure(m) + ", result: " + quote(result));
148  
            ret result;
149  
          }
150  
        }
151  
      }
152  
        
153  
      return null;
154  
    }
155  
    
156  
    void put(L<S> l, S r) {
157  
      if (isPattern(l, r)) {
158  
        new L<S> l2;
159  
        l2.addAll(l);
160  
        l2.add(r);
161  
        print("pattern: " + structure(l2));
162  
        map.put(l, r);
163  
      }
164  
    }
165  
    
166  
    boolean isPattern(L<S> l, S r) {
167  
      /*new Set<S> set;
168  
      set.addAll(l);
169  
      set.add(r);
170  
      return set.size() < l.size()+1;*/
171  
      return l.contains(r) && interestingToken(r);
172  
    }
173  
     
174  
    boolean interestingToken(S r) {
175  
      //return !r.trim().equals("");
176  
      for (int i = 0; i < r.length(); i++)
177  
        if (Character.isLetter(r.charAt(i)))
178  
          ret true;
179  
      ret false;
180  
    }
181  
    
182  
    P derive() {
183  
      Patterns t = new Patterns(n);
184  
      t.map = new DerivedHashMap<L<S>,S>(map);
185  
      return t;
186  
    }
187  
    
188  
    P clear() {
189  
      return new Patterns(n);
190  
    }
191  
  }
192  
  
193  
  !include #1001027 // DerivedHashMap
194  
  
195  
  static class Node {
196  
    String token;
197  
    float count;
198  
    new L<Node> next;
199  
    
200  
    *() {} // for clone method
201  
    
202  
    *(S *token) {}
203  
    
204  
    Node find(S token) {
205  
      for (Node n : next)
206  
        if (n.token.equals(token))
207  
          ret n;
208  
      ret null;
209  
    }
210  
    
211  
    Node bestNext() {
212  
      float bestCount = 0f;
213  
      Node best = null;
214  
      for (Node n : next)
215  
        if (best == null || n.count > best.count) {
216  
          best = n;
217  
          bestCount = n.count;
218  
        }
219  
      ret best;
220  
    }
221  
  }
222  
  
223  
  static class StartTree extends P {
224  
    Node tree = new Node("");
225  
    Node node;
226  
    boolean nonmod;
227  
    
228  
    S read(S file, L<S> tok) {
229  
      if (!eq(file, this.file)) {
230  
        seen = 0;
231  
        this.file = file;
232  
        node = tree;
233  
      }
234  
      
235  
      if (!nonmod) while (tok.size() > seen) {
236  
        S t = tok.get(seen++);
237  
        Node child = node.find(t);
238  
        if (child == null)
239  
          node.next.add(child = new Node(t));
240  
        child.count++;
241  
        node = child;
242  
      }
243  
      
244  
      Node n = node.bestNext();
245  
      ret n != null ? n.token : null;
246  
    }
247  
    
248  
    // it's a hack - derived predictor doesn't learn
249  
    P derive() {
250  
      //return (P) main.clone(this);
251  
      new StartTree p;
252  
      p.nonmod = true;
253  
      p.tree = tree;
254  
      return p;
255  
    }
256  
    
257  
    P clear() {
258  
      return new StartTree;
259  
    }
260  
  }
261  
  
262  
  p {
263  
    files = makeCorpus();
264  
    print("Files in corpus: " + files.size());
265  
    
266  
    print("Learning...");
267  
    collector = new Collector;
268  
    //test(new Tuples(1));
269  
    /*test(new StartTree);
270  
    test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1), new StartTree));*/
271  
    //test(new Patterns(6));
272  
    test(new Patterns(9));
273  
274  
    print("Learning done.");
275  
    printVMSize();
276  
    if (collector.winner != null && showGUI)
277  
      window();
278  
  }
279  
  
280  
  static int points = 0, total = 0;
281  
  
282  
  // train & evaluate a predictor
283  
  static void test(P p) {
284  
    int lastPercent = 0;
285  
    predicted = new HashMap;
286  
    points = 0;
287  
    total = 0;
288  
    for (int ii = 0; ii < files.size(); ii++) {
289  
      F f = files.get(ii);
290  
      
291  
      testFile(p, f);
292  
      
293  
      int percent = roundUpTo(10, (int) (ii*100L/files.size()));
294  
      if (percent > lastPercent) {
295  
        print("Learning " + percent + "% done.");
296  
        lastPercent = percent;
297  
      }
298  
    }
299  
    double score = points*100.0/total;
300  
    collector.add(p, score);
301  
  }
302  
  
303  
  static void testFile(P p, F f) {
304  
    new TreeSet<int> pred;
305  
    new L<S> history;
306  
    for (int i = allTokens ? 0 : 1; i < f.tok.size(); i += allTokens ? 1 : 2) {
307  
      S t = f.tok.get(i);
308  
      S x = p.read(f.name, history);
309  
      boolean correct = t.equals(x);
310  
      total += t.length();
311  
      if (correct) {
312  
        pred.add(i);
313  
        points += t.length();
314  
      }
315  
      history.add(t);
316  
    }
317  
    predicted.put(f, pred);
318  
  }
319  
  
320  
  !include #1000989 // SnippetDB
321  
  
322  
  static L<F> makeCorpus() ctex {
323  
    S name = getSnippetTitle(corpusID);
324  
    S s = loadSnippet(corpusID);
325  
    if (s.length() != 0)
326  
      return makeCorpus_single(s);
327  
    else if (name.toLowerCase().indexOf(".zip") >= 0)
328  
      return makeCorpus_zip();
329  
    else
330  
      return makeCorpus_mysqldump();
331  
  }
332  
  
333  
  static L<F> makeCorpus_single(S text) ctex {
334  
    new L<F> files;
335  
    new F f;
336  
    f.id = corpusID;
337  
    f.name = getSnippetTitle(corpusID);
338  
    f.tok = internAll(javaTok(text));
339  
    files.add(f);
340  
    return files;
341  
  }
342  
  
343  
  static L<F> makeCorpus_zip() ctex {
344  
    new L<F> files;
345  
    ZipFile zipFile = new ZipFile(loadLibrary(corpusID));
346  
    Enumeration entries = zipFile.entries();
347  
348  
    while (entries.hasMoreElements() && files.size() < numSnippets) {
349  
      ZipEntry entry = (ZipEntry) entries.nextElement(); 
350  
      if (entry.isDirectory()) continue;
351  
      //System.out.println("File found: " + entry.getName());
352  
353  
      InputStream fin = zipFile.getInputStream(entry);
354  
      // TODO: try to skip binary files?
355  
      
356  
      InputStreamReader reader = new InputStreamReader(fin, "UTF-8");
357  
      new StringBuilder builder;
358  
      BufferedReader bufferedReader = new BufferedReader(reader);
359  
      String line;
360  
      while ((line = bufferedReader.readLine()) != null)
361  
        builder.append(line).append('\n');
362  
      fin.close();
363  
      S text = builder.toString();
364  
      
365  
      new F f;
366  
      f.name = entry.getName();
367  
      f.tok = internAll(javaTok(text));
368  
      files.add(f);
369  
    }
370  
    
371  
    zipFile.close();
372  
    return files;
373  
  }
374  
  
375  
  static L<F> makeCorpus_mysqldump() {
376  
    new L<F> files;
377  
    SnippetDB db = new SnippetDB(corpusID);
378  
    List<List<S>> rows = db.rowsOrderedBy("sn_created");
379  
    for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) {
380  
      new F f;
381  
      f.id = db.getField(rows.get(i), "sn_id");
382  
      f.name = db.getField(rows.get(i), "sn_title");
383  
      S text = db.getField(rows.get(i), "sn_text");
384  
      f.tok = internAll(javaTok(text));
385  
      files.add(f);
386  
      ++i;
387  
    }
388  
    return files;
389  
  }
390  
391  
  static class Collector {
392  
    P winner;
393  
    double bestScore = -1;
394  
    Map<F, Set<int>> predicted;
395  
396  
    void add(P p, double score) {
397  
      if (winner == null || score > bestScore) {
398  
        winner = p;
399  
        bestScore = score;
400  
        //S name = shorten(structure(p), 100);
401  
        S name = p.getClass().getName();
402  
        print("New best score: " + formatDouble(score, 2) + "% (" + name + ")");
403  
        predicted = main.predicted;
404  
      }
405  
    }
406  
  }
407  
  
408  
  static void window() {
409  
    //final P p = collector.winner.clear();
410  
411  
    JFrame jf = new JFrame("Predicted = green");
412  
    Container cp = jf.getContentPane();
413  
414  
    final JButton btnNext = new JButton("Next");
415  
    
416  
    final JTextPane pane = new JTextPane();
417  
    //pane.setFont(loadFont("#1000993", 24));
418  
    
419  
    JScrollPane scrollPane = new JScrollPane(pane);
420  
    cp.add(scrollPane, BorderLayout.CENTER);
421  
    
422  
    class X {
423  
      int ii;
424  
      
425  
      void y() ctex {
426  
        ii = ii == 0 ? files.size()-1 : ii-1;
427  
        F f = files.get(ii);
428  
        //testFile(p, f);
429  
        Set<int> pred = collector.predicted.get(f);
430  
        
431  
        StyledDocument doc = new DefaultStyledDocument();
432  
433  
        L<S> tok = f.tok;
434  
        int i = tok.size(), len = 0;
435  
        while (len <= maxCharsGUI && i > 0) {
436  
          --i;
437  
          len += tok.get(i).length();
438  
        }
439  
        
440  
        for (; i < tok.size(); i++) {
441  
          if (tok.get(i).length() == 0) continue;
442  
          boolean green = pred.contains(i);
443  
          SimpleAttributeSet set = new SimpleAttributeSet();
444  
          StyleConstants.setForeground(set, green ? Color.green : Color.gray);
445  
          doc.insertString(doc.getLength(), tok.get(i), set);
446  
        }
447  
        
448  
        pane.setDocument(doc);
449  
        double score = getScore(pred, tok);
450  
        btnNext.setText(f.name + " (" + (ii+1) + "/" + files.size() + ") - " + (int) score + " %");
451  
      }
452  
    }
453  
    final new X x;
454  
    
455  
    btnNext.addActionListener(actionListener {
456  
      x.y();
457  
    });
458  
    cp.add(btnNext, BorderLayout.NORTH);
459  
460  
    x.y();
461  
    
462  
    jf.setBounds(100, 100, 600, 600);
463  
    jf.setVisible(true);    
464  
  }
465  
  
466  
  !include #1001032 // clone function
467  
  
468  
  static double getScore(Set<int> pred, L<S> tok) {
469  
    int total = 0, score = 0;
470  
    for (int i = 0; i < tok.size(); i++) {
471  
      int n = tok.get(i).length();
472  
      total += n;
473  
      if (pred.contains(i))
474  
        score += n;
475  
    }
476  
    ret score*100.0/total;
477  
  }
478  
}

Author comment

Began life as a copy of #1001028

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1001033
Snippet name: Token prediction, multiple predictors (v4, including patterns, developing)
Eternal ID of this version: #1001033/1
Text MD5: 85cea85358eb275d7c83f6d0c67117fa
Transpilation MD5: a5d078ee9518058f0828e33869770292
Author: stefan
Category:
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-09-16 20:59:04
Source code size: 11899 bytes / 478 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 690 / 752
Referenced in: [show references]