Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

419
LINES

< > BotCompany Repo | #1001037 // Token prediction, multiple predictors (v5, developing)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (2911L/22K/67K).

1  
!747
2  
!actionListener {
3  
4  
m {
5  
  // ID of the snippet used as corpus (handed to getSnippetTitle/loadSnippet/loadLibrary/SnippetDB)
  static S corpusID = 
6  
    //"#1001034"; // one small snippet
7  
    "#1001006"; // snippets DB
8  
  static int numSnippets = 100; // upper bound on the number of files loaded from a zip or DB corpus
9  
  static boolean showGUI = true; // open the result window after learning (only if there is a winner)
10  
  static int maxCharsGUI = 500000; // window() walks back from the end of a file until this many characters are exceeded
11  
  static boolean allTokens = true; // false: testFile only predicts every second token, starting at index 1
12  
  
13  
  static Collector collector; // remembers the best-scoring predictor
14  
  static L<F> files; // the corpus, filled by makeCorpus()
15  
  static Map<F, Set<int>> predicted; // per file: token indices predicted correctly in the current test() run
16  
  
17  
  // a file to learn from
18  
  static class F {
19  
    String id, name;
20  
    L<S> tok;
21  
  }
22  
  
23  
  // a predictor
24  
  static abstract class P {
25  
    int seen;
26  
    S file;
27  
28  
    // basic function - predict next token    
29  
    abstract S read(S file, L<S> tok);
30  
    
31  
    // advanced function - predict rest of token starting with t
32  
    S complete(S file, L<S> tok, S t) { return null; }
33  
    
34  
    abstract P derive(); // clone with trained data
35  
    abstract P clear(); // clone without trained data
36  
    
37  
    void prepare(S file) {
38  
      if (!eq(file, this.file)) {
39  
        seen = 0;
40  
        this.file = file;
41  
      }
42  
    }
43  
  }
44  
45  
  // Asks a list of predictors in order and returns the first non-null answer.
  static class Chain extends P {
    new L<P> list;

    *() {}

    // uses the given list directly (no copy)
    *(L<P> *list) {}

    // Copy into a growable list: if asList hands back a fixed-size view of the
    // array, a later add() would otherwise fail.
    *(P... a) { list = new ArrayList<P>(asList(a)); }

    // appends a predictor with lowest priority
    void add(P p) { list.add(p); }

    // Returns the answer of the first member that predicts something, or null
    // if none does. Members after the answering one are not called this time.
    S read(S file, L<S> tok) {
      for (P p : list) {
        S s = p.read(file, tok);
        if (s != null) return s;
      }
      return null;
    }

    // new chain made of the derived versions of all members
    P derive() {
      new Chain c;
      for (P p : list)
        c.add(p.derive());
      return c;
    }

    // new chain made of the cleared versions of all members
    P clear() {
      new Chain c;
      for (P p : list)
        c.add(p.clear());
      return c;
    }
  }
76  
    
77  
  // Predicts the next token by exact lookup of the n tokens preceding it.
  static class Tuples extends P {
    Map<L<S>,S> map = new HashMap<L<S>,S>(); // n preceding tokens -> token that followed them last
    int n; // context length

    *(int *n) {
    }

    S read(S file, L<S> tok) {
      prepare(file);

      // learn every token not consumed yet; `seen` is the index of the token being learned
      for (; seen < tok.size(); seen++) {
        if (seen >= n) {
          L<S> context = new ArrayList<S>(tok.subList(seen-n, seen));
          map.put(context, tok.get(seen));
        }
      }

      // not enough history for a full context
      int size = tok.size();
      if (size < n)
        return null;

      L<S> tail = new ArrayList<S>(tok.subList(size-n, size));
      return map.get(tail);
    }

    // same context length, table wrapped in a DerivedHashMap over the current one
    P derive() {
      Tuples copy = new Tuples(n);
      copy.map = new DerivedHashMap<L<S>,S>(map);
      return copy;
    }

    // same context length, empty table
    P clear() {
      return new Tuples(n);
    }
  }
109  
  
110  
  static Map<S, S> makeMapPrefix(L<S> tok1, L<S> tok2) {
111  
    if (tok1.size() < tok2.size()) return null;
112  
    
113  
    new Map<S, S> map;
114  
    for (int i = 1; i < tok2.size(); i += 2) {
115  
      S t1 = tok1.get(i), t2 = tok2.get(i);
116  
      if (!t1.equals(t2)) {
117  
        S v = map.get(t1);
118  
        if (v == null)
119  
          map.put(t1, t2);
120  
        else if (!v.equals(t2))
121  
          return null; // match fail
122  
      }
123  
    }
124  
    
125  
    // match succeeds
126  
    return map;
127  
  }
128  
  
129  
  !include #1001041 // Pattern
130  
  
131  
  !include #1001027 // DerivedHashMap
132  
  
133  
  !include #1001036 // LastWordToLower
134  
  
135  
  static class Node {
136  
    String token;
137  
    float count;
138  
    new L<Node> next;
139  
    
140  
    *() {} // for clone method
141  
    
142  
    *(S *token) {}
143  
    
144  
    Node find(S token) {
145  
      for (Node n : next)
146  
        if (n.token.equals(token))
147  
          ret n;
148  
      ret null;
149  
    }
150  
    
151  
    Node bestNext() {
152  
      float bestCount = 0f;
153  
      Node best = null;
154  
      for (Node n : next)
155  
        if (best == null || n.count > best.count) {
156  
          best = n;
157  
          bestCount = n.count;
158  
        }
159  
      ret best;
160  
    }
161  
  }
162  
  
163  
  // Learns token sequences from the start of each file as a tree and predicts
  // the most frequent continuation at the current tree position.
  static class StartTree extends P {
    Node tree = new Node(""); // root of the tree
    Node node;                // current position within the tree
    boolean nonmod;           // true: never learn, never move (set by derive)

    S read(S file, L<S> tok) {
      // Restart at the root for every new file. The extra null check covers a
      // first call whose file key equals the initial this.file (null); without
      // it `node` would still be null below.
      if (node == null || !eq(file, this.file)) {
        seen = 0;
        this.file = file;
        node = tree;
      }

      // learn all tokens not consumed yet, descending (and growing) the tree
      if (!nonmod) while (tok.size() > seen) {
        S t = tok.get(seen++);
        Node child = node.find(t);
        if (child == null)
          node.next.add(child = new Node(t));
        child.count++;
        node = child;
      }

      Node n = node.bestNext();
      ret n != null ? n.token : null;
    }

    // it's a hack - derived predictor doesn't learn
    // (it shares this predictor's tree and stays at the root)
    P derive() {
      new StartTree p;
      p.nonmod = true;
      p.tree = tree;
      return p;
    }

    // fresh predictor with an empty tree
    P clear() {
      return new StartTree;
    }
  }
201  
  
202  
  p {
203  
    files = makeCorpus();
204  
    print("Files in corpus: " + files.size());
205  
    
206  
    print("Learning...");
207  
    collector = new Collector;
208  
    test(new Chain(new Tuples(8), new Tuples(6), new Tuples(4), new Tuples(2), new Tuples(1), new StartTree));
209  
    
210  
    //test(new Patterns(6));
211  
    //test(new Chain(new Patterns(9), new LastWordToLower));
212  
    test(new Chain(new Patterns(9), new Patterns(7), new Patterns(5), new LastWordToLower));
213  
214  
    print("Learning done.");
215  
    printVMSize();
216  
    if (collector.winner != null && showGUI)
217  
      window();
218  
  }
219  
  
220  
  static int points = 0, total = 0; // characters of correctly predicted tokens / characters of all tested tokens (reset by test, updated by testFile)
221  
  
222  
  // train & evaluate a predictor
223  
  // Runs p over every corpus file (p learns while it is being tested), then
  // hands the percentage of correctly predicted characters to the collector.
  // Resets the global predicted/points/total before starting.
  static void test(P p) {
    int lastPercent = 0;
    predicted = new HashMap;
    points = 0;
    total = 0;
    for (int ii = 0; ii < files.size(); ii++) {
      testFile(p, files.get(ii));

      // progress report; presumably roundUpTo snaps to the next multiple of 10
      int percent = roundUpTo(10, (int) (ii*100L/files.size()));
      if (percent > lastPercent) {
        print("Learning " + percent + "% done.");
        lastPercent = percent;
      }
    }

    // Nothing was tested (empty corpus or only empty tokens): report 0 rather
    // than NaN, which would also block every later candidate in the collector.
    double score = total == 0 ? 0 : points*100.0/total;
    collector.add(p, score);
  }
242  
  
243  
  // Feeds file f to predictor p token by token. Before each token is revealed,
  // p is asked to predict it; hits are recorded by token index and counted in
  // characters (points/total). The hit set is stored in `predicted` under f.
  static void testFile(P p, F f) {
    new TreeSet<int> pred;
    new L<S> history;
    int first = allTokens ? 0 : 1;
    int step = allTokens ? 1 : 2;
    for (int i = first; i < f.tok.size(); i += step) {
      S actual = f.tok.get(i);
      S guess = p.read(f.name, history);
      int len = actual.length();
      total += len;
      if (actual.equals(guess)) {
        points += len;
        pred.add(i);
      }
      history.add(actual);
    }
    p.read(f.name, history); // let p see the last token too; its answer is not needed
    predicted.put(f, pred);
  }
260  
  
261  
  !include #1000989 // SnippetDB
262  
  
263  
  static L<F> makeCorpus() ctex {
264  
    S name = getSnippetTitle(corpusID);
265  
    S s = loadSnippet(corpusID);
266  
    if (s.length() != 0)
267  
      return makeCorpus_single(s);
268  
    else if (name.toLowerCase().indexOf(".zip") >= 0)
269  
      return makeCorpus_zip();
270  
    else
271  
      return makeCorpus_mysqldump();
272  
  }
273  
  
274  
  // Corpus consisting of exactly one file: the given text, labelled with the
  // ID and title of the corpus snippet.
  static L<F> makeCorpus_single(S text) ctex {
    new F single;
    single.id = corpusID;
    single.name = getSnippetTitle(corpusID);
    single.tok = internAll(javaTok(text));

    new L<F> corpus;
    corpus.add(single);
    ret corpus;
  }
283  
  
284  
  // Loads up to numSnippets non-directory entries of the zip library corpusID.
  // Each entry is read as UTF-8 text line by line (every line re-terminated
  // with '\n') and tokenized. The zip file and each entry stream are closed
  // even when reading fails.
  static L<F> makeCorpus_zip() ctex {
    new L<F> files;
    ZipFile zipFile = new ZipFile(loadLibrary(corpusID));
    try {
      Enumeration entries = zipFile.entries();

      while (entries.hasMoreElements() && files.size() < numSnippets) {
        ZipEntry entry = (ZipEntry) entries.nextElement();
        if (entry.isDirectory()) continue;

        // TODO: try to skip binary files?
        new StringBuilder builder;
        InputStream fin = zipFile.getInputStream(entry);
        try {
          BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fin, "UTF-8"));
          String line;
          while ((line = bufferedReader.readLine()) != null)
            builder.append(line).append('\n');
        } finally {
          fin.close();
        }

        new F f;
        f.name = entry.getName();
        f.tok = internAll(javaTok(builder.toString()));
        files.add(f);
      }
    } finally {
      zipFile.close();
    }
    return files;
  }
315  
  
316  
  // Loads the last numSnippets rows (ordered by sn_created) of the snippet
  // database corpusID; each row becomes one file.
  static L<F> makeCorpus_mysqldump() {
    new L<F> files;
    SnippetDB db = new SnippetDB(corpusID);
    List<List<S>> rows = db.rowsOrderedBy("sn_created");
    // The loop header alone advances i; an extra increment in the body used
    // to skip every other row.
    for (int i = Math.max(0, rows.size()-numSnippets); i < rows.size(); i++) {
      List<S> row = rows.get(i);
      new F f;
      f.id = db.getField(row, "sn_id");
      f.name = db.getField(row, "sn_title");
      S text = db.getField(row, "sn_text");
      f.tok = internAll(javaTok(text));
      files.add(f);
    }
    return files;
  }
331  
332  
  static class Collector {
333  
    P winner;
334  
    double bestScore = -1;
335  
    Map<F, Set<int>> predicted;
336  
337  
    void add(P p, double score) {
338  
      if (winner == null || score > bestScore) {
339  
        winner = p;
340  
        bestScore = score;
341  
        //S name = shorten(structure(p), 100);
342  
        S name = p.getClass().getName();
343  
        print("New best score: " + formatDouble(score, 2) + "% (" + name + ")");
344  
        predicted = main.predicted;
345  
      }
346  
    }
347  
  }
348  
  
349  
  // Opens a frame showing one corpus file at a time. Tokens the winning
  // predictor got right are green, all others gray. The button on top shows
  // file name, position and score, and steps to another file when clicked.
  static void window() {
    // nothing to show for an empty corpus (y() would ask for files.get(-1))
    if (files.isEmpty()) return;

    JFrame jf = new JFrame("Predicted = green");
    Container cp = jf.getContentPane();

    final JButton btnNext = new JButton("Next");

    final JTextPane pane = new JTextPane();
    JScrollPane scrollPane = new JScrollPane(pane);
    cp.add(scrollPane, BorderLayout.CENTER);

    class X {
      int ii; // index of the file currently shown

      // steps one file backwards (wrapping around to the last one) and renders it
      void y() ctex {
        ii = ii == 0 ? files.size()-1 : ii-1;
        F f = files.get(ii);
        Set<int> pred = collector.predicted.get(f);
        L<S> tok = f.tok;

        // walk back from the end until more than maxCharsGUI characters are
        // covered; only that tail of the file is displayed
        int i = tok.size(), len = 0;
        while (len <= maxCharsGUI && i > 0) {
          --i;
          len += tok.get(i).length();
        }

        StyledDocument doc = new DefaultStyledDocument();
        for (; i < tok.size(); i++) {
          S t = tok.get(i);
          if (t.length() == 0) continue;
          boolean green = pred.contains(i);
          SimpleAttributeSet set = new SimpleAttributeSet();
          StyleConstants.setForeground(set, green ? Color.green : Color.gray);
          doc.insertString(doc.getLength(), t, set);
        }

        pane.setDocument(doc);
        double score = getScore(pred, tok);
        btnNext.setText(f.name + " (" + (ii+1) + "/" + files.size() + ") - " + (int) score + " %");
      }
    }
    final new X x;

    btnNext.addActionListener(actionListener {
      x.y();
    });
    cp.add(btnNext, BorderLayout.NORTH);

    x.y();

    jf.setBounds(100, 100, 600, 600);
    jf.setVisible(true);
  }
406  
  
407  
  !include #1001032 // clone function
408  
  
409  
  // Percentage of characters in tok that belong to tokens whose index is
  // contained in pred.
  static double getScore(Set<int> pred, L<S> tok) {
    int allChars = 0, hitChars = 0;
    int idx = 0;
    for (S t : tok) {
      int len = t.length();
      allChars += len;
      if (pred.contains(idx))
        hitChars += len;
      idx++;
    }
    ret hitChars*100.0/allChars;
  }
419  
}

Author comment

Began life as a copy of #1001033

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1001037
Snippet name: Token prediction, multiple predictors (v5, developing)
Eternal ID of this version: #1001037/1
Text MD5: b286849fecdd14a17f356c50a3b92756
Transpilation MD5: d3e70f9fff633fe7154db05fbb2e2d3e
Author: stefan
Category:
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-09-16 21:44:38
Source code size: 10641 bytes / 419 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 629 / 639
Referenced in: [show references]