Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

235
LINES

< > BotCompany Repo | #1001025 // Token prediction, multiple predictors (new architecture, developing)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (2373L/19K/56K).

1  
!747
2  
3  
m {
4  
  // Snippet ID of the corpus; the makeCorpus* loaders pass it to
  // getSnippetTitle, loadLibrary and SnippetDB.
  static S corpusID = "#1001010";
5  
  // Upper bound for the row loop in makeCorpus_mysqldump.
  static int numSnippets = 3000;
6  
  // Only referenced from the commented-out GUI code in the main method.
  static boolean showGUI = true;
7  
  // Not referenced by any code visible in this snippet.
  static int maxCharsGUI = 500000;
8  
  // true: test() scores every token; false: only tokens at odd indices (1, 3, 5, ...).
  static boolean allTokens = true;
9  
  
10  
  // Created in the main method; test() reports each predictor's score to it.
  static Collector collector;
11  
  // The loaded corpus (assigned in the main method from makeCorpus()).
  static L<F> files;
12  
  // Read by Collector.add when a new winner is recorded. The statements that
  // would fill it are commented out, so nothing visible here assigns it.
  static Set<int> predicted;
13  
  
14  
  // a file to learn from
15  
  static class F {
16  
    String id, name;
17  
    L<S> tok;
18  
  }
19  
  
20  
  // a predictor
21  
  static abstract class P {
22  
    abstract S read(S file, L<S> tok);
23  
    abstract P derive(); // clone & reset counter for actual use
24  
  }
25  
26  
  /**
   * Predictor that asks a list of sub-predictors in order and answers with
   * the first non-null prediction.
   */
  static class Chain extends P {
    new L<P> list;

    /** Creates an empty chain. */
    *() {}

    /** Uses the given list directly; no copy is made. */
    *(L<P> list) { this.list = list; }

    /** Creates a chain from the given predictors, in argument order. */
    *(P... a) { list = asList(a); }

    /** Appends a predictor to the end of the chain. */
    void add(P p) { list.add(p); }

    /**
     * Queries the sub-predictors front to back and stops at the first one
     * that returns a prediction. Predictors behind that one are not called
     * for this invocation.
     *
     * @return the first non-null prediction, or null if every predictor passed
     */
    S read(S file, L<S> tok) {
      S result = null;
      for (int i = 0; result == null && i < list.size(); i++)
        result = list.get(i).read(file, tok);
      return result;
    }

    /** Builds a new chain holding the derive()d version of every member. */
    P derive() {
      Chain copy = new Chain();
      for (P member : list)
        copy.add(member.derive());
      return copy;
    }
  }
50  
    
51  
  /**
   * Predictor backed by a lookup table: the key is a run of n consecutive
   * tokens, the value is the token that followed that run. When the same
   * run is learned again, the newer follower replaces the older one.
   */
  static class Tuples extends P {
    Map<L<S>,S> map = new HashMap<L<S>,S>();
    int n;     // context length (number of tokens in a key)
    int seen;  // how many tokens of the current file have been learned
    S file;    // identifier of the file that 'seen' refers to

    *(int n) {
      this.n = n;
    }

    /**
     * Learns all not-yet-seen tokens of the history, then looks up the last
     * n tokens of the history.
     *
     * @param file identifier of the current document; a change resets 'seen'
     * @param tok  tokens of the document seen so far
     * @return the recorded follower of the last n tokens, or null if the
     *         history is shorter than n or the context is unknown
     */
    S read(S file, L<S> tok) {
      if (!eq(file, this.file)) {
        // New document: start learning from its first token again.
        this.file = file;
        seen = 0;
      }

      int size = tok.size();
      while (seen < size) {
        seen++;
        if (seen <= n) continue; // token has fewer than n predecessors

        int target = seen - 1;   // index of the token being learned
        L<S> context = new ArrayList<S>(tok.subList(target - n, target));
        map.put(context, tok.get(target));
      }

      if (size < n) return null;

      L<S> tail = new ArrayList<S>(tok.subList(size - n, size));
      return map.get(tail);
    }

    // slow...
    /** Copies the whole table into a fresh Tuples; seen stays 0, which is ok. */
    P oldDerive() {
      Tuples copy = new Tuples(n);
      copy.map.putAll(map);
      return copy;
    }

    // fast!
    /**
     * Creates a fresh Tuples whose table is a DerivedHashMap layered over
     * this instance's table instead of a copy of it.
     */
    P derive() {
      Tuples derived = new Tuples(n);
      derived.map = new DerivedHashMap<L<S>,S>(map);
      return derived;
    }
  }
92  
  
93  
  /**
   * Map layered over a base map: writes go to a private 'additions' map,
   * reads consult 'additions' first and fall back to the base map.
   * Only get() and put() are supported; entrySet() fails, and so does any
   * inherited AbstractMap method that relies on it.
   */
  static class DerivedHashMap<A, B> extends AbstractMap<A, B> {
    Map<A, B> base;
    new HashMap<A, B> additions;

    *(Map<A, B> base) { this.base = base; }

    /** Returns the value from 'additions' if it is non-null, else the base map's value. */
    public B get(Object key) {
      B own = additions.get(key);
      return own == null ? base.get(key) : own;
    }

    /** Stores into 'additions' only and returns what 'additions' held before. */
    public B put(A key, B value) {
      return additions.put(key, value);
    }

    /** Not supported. */
    public Set<Map.Entry<A,B>> entrySet() {
      throw fail();
    }
  }
113  
  
114  
  // TODO: Put NewX back in
115  
  
116  
  p {
117  
    files = makeCorpus();
118  
    print("Files in corpus: " + files.size());
119  
    
120  
    print("Learning...");
121  
    collector = new Collector;
122  
    test(new Tuples(1));
123  
    //test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1)));
124  
125  
    print("Learning done.");
126  
    /*if (collector.winner != null && showGUI) {
127  
      predicted = collector.predicted;
128  
      showColoredText();
129  
    }*/
130  
  }
131  
  
132  
  // train & evaluate a predictor
133  
  static void test(P p) {
134  
    //predicted = new TreeSet<int>();
135  
    int points = 0, total = 0, lastPercent = 0;
136  
    for (int ii = 0; ii < files.size(); ii++) {
137  
      F f = files.get(ii);
138  
      
139  
      new L<S> history;
140  
      for (int i = allTokens ? 0 : 1; i < f.tok.size(); i += allTokens ? 1 : 2) {
141  
        S t = f.tok.get(i);
142  
        S x = p.read(f.name, history);
143  
        boolean correct = t.equals(x);
144  
        total += t.length();
145  
        if (correct) {
146  
          //predicted.add(i);
147  
          points += t.length();
148  
        }
149  
        history.add(t);
150  
      }
151  
      
152  
      int percent = roundUpTo(10, (int) (ii*100L/files.size()));
153  
      if (percent > lastPercent) {
154  
        print("Learning " + percent + "% done.");
155  
        lastPercent = percent;
156  
      }
157  
    }
158  
    double score = points*100.0/total;
159  
    collector.add(p, score);
160  
  }
161  
  
162  
  !include #1000989 // SnippetDB
163  
  
164  
  static L<F> makeCorpus() {
165  
    S name = getSnippetTitle(corpusID);
166  
    if (name.toLowerCase().indexOf(".zip") >= 0)
167  
      return makeCorpus_zip();
168  
    else
169  
      return makeCorpus_mysqldump();
170  
  }
171  
  
172  
  /**
   * Builds the corpus from a zip archive obtained via loadLibrary(corpusID).
   *
   * Every zip entry is decoded as UTF-8 text, read line by line (each line
   * is terminated with '\n' in the resulting text), tokenized with javaTok
   * and added as one F named after the entry.
   *
   * The archive and each entry stream are closed in finally blocks so they
   * are released even when reading or tokenizing throws.
   *
   * @return one F per zip entry, in archive order
   */
  static L<F> makeCorpus_zip() ctex {
    new L<F> files;
    ZipFile zipFile = new ZipFile(loadLibrary(corpusID));
    try {
      Enumeration entries = zipFile.entries();

      while (entries.hasMoreElements()) {
        ZipEntry entry = (ZipEntry) entries.nextElement();
        //System.out.println("File found: " + entry.getName());

        // TODO: try to skip binary files?
        S text;
        InputStream fin = zipFile.getInputStream(entry);
        try {
          InputStreamReader reader = new InputStreamReader(fin, "UTF-8");
          BufferedReader bufferedReader = new BufferedReader(reader);
          StringBuilder builder = new StringBuilder();
          String line;
          while ((line = bufferedReader.readLine()) != null)
            builder.append(line).append('\n');
          text = builder.toString();
        } finally {
          fin.close();
        }

        F f = new F();
        f.name = entry.getName();
        f.tok = internAll(javaTok(text));
        files.add(f);
      }
    } finally {
      zipFile.close();
    }
    return files;
  }
202  
  
203  
  /**
   * Builds the corpus from a SnippetDB opened on corpusID.
   *
   * Takes the first min(rows, numSnippets) rows returned by
   * rowsOrderedBy("sn_created") and turns each into an F with id
   * ("sn_id"), name ("sn_title") and the javaTok tokens of "sn_text".
   *
   * The loop index is advanced only by the for header, so every row up to
   * the limit is loaded. An extra increment inside the body would skip
   * every second row.
   *
   * @return one F per loaded row, in the order delivered by the database
   */
  static L<F> makeCorpus_mysqldump() {
    new L<F> files;
    SnippetDB db = new SnippetDB(corpusID);
    List<List<S>> rows = db.rowsOrderedBy("sn_created");
    int limit = Math.min(rows.size(), numSnippets);
    for (int i = 0; i < limit; i++) {
      List<S> row = rows.get(i);
      F f = new F();
      f.id = db.getField(row, "sn_id");
      f.name = db.getField(row, "sn_title");
      S text = db.getField(row, "sn_text");
      f.tok = internAll(javaTok(text));
      files.add(f);
    }
    return files;
  }
218  
219  
  static class Collector {
220  
    P winner;
221  
    double bestScore = -1;
222  
    Set<int> predicted;
223  
224  
    void add(P p, double score) {
225  
      if (winner == null || score > bestScore) {
226  
        winner = p;
227  
        bestScore = score;
228  
        //S name = shorten(structure(p), 100);
229  
        S name = p.getClass().getName();
230  
        print("New best score: " + formatDouble(score, 2) + "% (" + name + ")");
231  
        this.predicted = main.predicted;
232  
      }
233  
    }
234  
  }
235  
}

Author comment

Began life as a copy of #1001011

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1001025
Snippet name: Token prediction, multiple predictors (new architecture, developing)
Eternal ID of this version: #1001025/1
Text MD5: 04645fde994b63b0cbf0290e16d1047b
Transpilation MD5: 5dd35d6420b39e05bcc4e9c613a729e0
Author: stefan
Category:
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-09-16 14:20:59
Source code size: 5941 bytes / 235 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 661 / 647
Referenced in: [show references]