Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

262
LINES

< > BotCompany Repo | #1001011 // Token prediction, multiple predictors (adding zip support)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (2382L/19K/57K).

1  
!747
2  
3  
abstract class P {
4  
  abstract S read(L<S> tok);
5  
  abstract P derive(); // clone & reset counter for actual use
6  
}
7  
8  
m {
9  
  static S corpusID = "#1001010"; // snippet ID of the corpus; passed to getSnippetTitle, loadLibrary and SnippetDB
10  
  static int numSnippets = 3000; // upper bound for the row loop in makeCorpus_mysqldump
11  
  static boolean showGUI = true; // when true, the main program calls showColoredText() after learning
12  
  static int maxCharsGUI = 500000; // character budget used by showColoredText() to pick how much of the corpus tail to show
13  
  static boolean allTokens = true; // true: test() visits every token; false: only odd indices (1, 3, 5, ...)
14  
  
15  
  static Collector collector; // keeps the best-scoring predictor; created in the main program
16  
  static L<S> tok; // the corpus tokens, set from makeCorpusJavaTok()
17  
  static Set<int> predicted; // indices into tok that were predicted correctly; reset by every test() call
18  
  
19  
  /**
   * Predictor that asks a list of predictors in order and answers with
   * the first prediction that is not null.
   */
  static class Chain extends P {
    // the chained predictors, asked front to back
    new L<P> list;

    *() {}
    *(L<P> *list) {}
    *(P... a) { list = asList(a); }

    /** Appends a predictor to the end of the chain. */
    void add(P p) { list.add(p); }

    /**
     * Returns the first non-null prediction in list order, or null if
     * every predictor returned null. Predictors after the first hit
     * are not called.
     */
    S read(L<S> tok) {
      S prediction = null;
      for (int i = 0; i < list.size() && prediction == null; i++)
        prediction = list.get(i).read(tok);
      return prediction;
    }

    /** Returns a new Chain holding the derive() result of every member, in order. */
    P derive() {
      Chain copy = new Chain();
      for (int i = 0; i < list.size(); i++)
        copy.add(list.get(i).derive());
      return copy;
    }
  }
43  
    
44  
  /**
   * Predictor based on runs of n consecutive tokens: for every run it
   * has seen, it remembers the token that followed (a later occurrence
   * of the same run overwrites the earlier entry).
   */
  static class Tuples extends P {
    // run of n tokens -> token that followed it
    Map<L<S>,S> map = new HashMap<L<S>,S>();
    // n = length of the runs; seen = how many tokens of the list have been learned from
    int n, seen;

    *(int *n) {
    }

    /**
     * Learns from all tokens added since the previous call, then looks
     * up the last n tokens. Returns the remembered follower, or null if
     * there is none or fewer than n tokens are available.
     * NOTE(review): seen is only a count, so this presumably expects the
     * same, only-growing list on every call - confirm with callers.
     */
    S read(L<S> tok) {
      int size = tok.size();
      while (seen < size) {
        int target = seen++; // index of the token being learned
        if (target >= n)
          map.put(new ArrayList<S>(tok.subList(target-n, target)), tok.get(target));
      }

      if (size < n)
        return null;
      return map.get(new ArrayList<S>(tok.subList(size-n, size)));
    }

    // slow... (copies the whole map)
    P oldDerive() {
      Tuples copy = new Tuples(n);
      copy.map.putAll(map);
      // copy.seen == 0 which is ok
      return copy;
    }

    // fast! (layers a DerivedHashMap over the existing map instead of copying it)
    P derive() {
      Tuples derived = new Tuples(n);
      derived.map = new DerivedHashMap<L<S>,S>(map);
      return derived;
    }
  }
79  
  
80  
  /**
   * A map layered over another map: lookups consult the entries put
   * into this object first and fall back to the base map; put() only
   * ever writes to this object's own entries, never to the base map.
   */
  static class DerivedHashMap<A, B> extends AbstractMap<A, B> {
    // the underlying map; this class only reads from it
    Map<A, B> base;
    // entries added through put()
    new HashMap<A, B> additions;

    *(Map<A, B> *base) {}

    /** Returns the added value for key if it is non-null, otherwise the base map's value. */
    public B get(Object key) {
      B b = additions.get(key);
      if (b != null) return b;
      return base.get(key);
    }

    /**
     * Stores the mapping in the additions. The return value is the
     * previous value among the additions only; a value that exists just
     * in the base map is not reported.
     */
    public B put(A key, B value) {
      return additions.put(key, value);
    }

    /**
     * Returns a read-only snapshot of the merged contents (base entries
     * overlaid with the additions). This used to throw, which broke every
     * inherited AbstractMap operation (size, containsKey, toString, equals,
     * copying this map with putAll, ...). The snapshot is rebuilt on each
     * call, so it costs time proportional to the size of both maps.
     */
    public Set<Map.Entry<A,B>> entrySet() {
      Map<A, B> merged = new HashMap<A, B>(base);
      for (Map.Entry<A, B> e : additions.entrySet())
        if (e.getValue() != null) // same rule as get(): a null addition does not hide the base value
          merged.put(e.getKey(), e.getValue());
      return Collections.unmodifiableMap(merged).entrySet();
    }
  }
100  
  
101  
  // TODO: Put NewX back in
102  
  
103  
  p {
104  
    tok = makeCorpusJavaTok();
105  
    print("Tokens in corpus: " + tok.size());
106  
    
107  
    print("Learning...");
108  
    collector = new Collector;
109  
    /*test(new Tuples(1));
110  
    test(new Tuples(2));
111  
    test(new Tuples(3));
112  
    test(new Tuples(4));
113  
    test(new Chain(new Tuples(2), new Tuples(1)));*/
114  
    test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1)));
115  
116  
    print("Learning done.");
117  
    if (collector.winner != null && showGUI) {
118  
      predicted = collector.predicted;
119  
      showColoredText();
120  
    }
121  
  }
122  
  
123  
  // test a predictor
124  
  /**
   * Scores predictor p against the corpus in tok. For each visited token
   * the predictor is asked for its guess given the previously visited
   * tokens; a correct guess earns the token's length in points. The score
   * handed to the collector is points as a percentage of all visited
   * characters. Indices of correct guesses end up in the static
   * predicted set, which is replaced at the start of every call.
   */
  static void test(P p) {
    predicted = new TreeSet<int>();
    new L<S> history;
    int size = tok.size();
    int first = allTokens ? 0 : 1, step = allTokens ? 1 : 2;
    int hitChars = 0, allChars = 0, reported = 0;

    for (int i = first; i < size; i += step) {
      S actual = tok.get(i);
      S guess = p.read(history);
      allChars += actual.length();
      if (actual.equals(guess)) {
        predicted.add(i);
        hitChars += actual.length();
      }
      history.add(actual);

      // progress output, one line per newly reached value of roundUpTo(10, ...)
      int percent = roundUpTo(10, (int) (i*100L/size));
      if (percent > reported) {
        print("Learning " + percent + "% done.");
        reported = percent;
      }
    }

    collector.add(p, hitChars*100.0/allChars);
  }
147  
  
148  
  /**
   * Opens a window showing the tail of the corpus, with tokens whose
   * index is in predicted drawn green and all others gray. The tail
   * starts at the last token from which the accumulated length first
   * exceeds maxCharsGUI (or at token 0 if the corpus is shorter).
   */
  static void showColoredText() ctex {
    // walk backwards from the end until the character budget is used up
    int start = tok.size();
    for (int chars = 0; chars <= maxCharsGUI && start > 0; )
      chars += tok.get(--start).length();

    JTextPane pane = new JTextPane();
    //pane.setFont(loadFont("#1000993", 24));
    Document doc = pane.getStyledDocument();

    for (int i = start; i < tok.size(); i++) {
      S t = tok.get(i);
      if (t.length() != 0) { // empty tokens add nothing to the document
        SimpleAttributeSet attrs = new SimpleAttributeSet();
        StyleConstants.setForeground(attrs, predicted.contains(i) ? Color.green : Color.gray);
        doc.insertString(doc.getLength(), t, attrs);
      }
    }

    JFrame jf = new JFrame("Predicted = green");
    jf.getContentPane().add(new JScrollPane(pane), BorderLayout.CENTER);
    jf.setBounds(100, 100, 600, 600);
    jf.setVisible(true);
  }
176  
  
177  
  !include #1000989 // SnippetDB
178  
  
179  
  static L<S> makeCorpusJavaTok() {
180  
    S name = getSnippetTitle(corpusID);
181  
    if (name.toLowerCase().indexOf(".zip") >= 0)
182  
      return makeCorpus_zip();
183  
    else
184  
      return makeCorpus_mysqldump();
185  
  }
186  
  
187  
  /**
   * Builds the corpus from the zip file loaded for corpusID. Every entry
   * is read as UTF-8 text, given a "== File: name" header and tokenized
   * with javaTok; the tokens of all entries are concatenated and
   * interned.
   * The zip file and each entry stream are closed even when reading or
   * tokenizing throws.
   */
  static L<S> makeCorpus_zip() ctex {
    ZipFile zipFile = new ZipFile(loadLibrary(corpusID));
    try {
      Enumeration<? extends ZipEntry> entries = zipFile.entries();
      new L<S> tok;

      while (entries.hasMoreElements()) {
        ZipEntry entry = entries.nextElement();
        //System.out.println("File found: " + entry.getName());

        // TODO: try to skip binary files?

        // read the entry line by line; every line ends up terminated by '\n'
        new StringBuilder builder;
        InputStream fin = zipFile.getInputStream(entry);
        try {
          BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fin, "UTF-8"));
          String line;
          while ((line = bufferedReader.readLine()) != null)
            builder.append(line).append('\n');
        } finally {
          fin.close();
        }

        new StringBuilder buf;
        buf.append("\n== File: " + entry.getName());
        buf.append("\n==\n");
        buf.append(builder.toString()).append("\n");

        // Drop the last token collected so far before appending the next file.
        // NOTE(review): presumably javaTok output starts and ends with a
        // whitespace token, so this avoids two of them in a row - confirm.
        if (tok.size() != 0) tok.remove(tok.size()-1);
        tok.addAll(javaTok(buf.toString()));
      }

      return internAll(tok);
    } finally {
      zipFile.close();
    }
  }
218  
  
219  
  /**
   * Builds the corpus from the SnippetDB opened for corpusID. The first
   * numSnippets rows (fewer if the table is smaller), in "sn_created"
   * order, each get an ID/title header and are tokenized with javaTok;
   * the tokens of all rows are concatenated and interned.
   * The loop index is advanced only by the for header. An extra increment
   * in the body used to skip every second row, so at most half of
   * numSnippets rows were actually read.
   */
  static L<S> makeCorpus_mysqldump() {
    SnippetDB db = new SnippetDB(corpusID);
    List<List<S>> rows = db.rowsOrderedBy("sn_created");
    int count = Math.min(rows.size(), numSnippets);
    new L<S> tok;
    for (int i = 0; i < count; i++) {
      List<S> row = rows.get(i);
      S id = db.getField(row, "sn_id");
      S title = db.getField(row, "sn_title");
      S text = db.getField(row, "sn_text");

      new StringBuilder buf;
      buf.append("\n== ID: " + id);
      buf.append("\n== Title: " + title);
      buf.append("\n==\n");
      buf.append(text).append("\n");

      // Drop the last token collected so far before appending the next snippet.
      // NOTE(review): presumably javaTok output starts and ends with a
      // whitespace token, so this avoids two of them in a row - confirm.
      if (tok.size() != 0) tok.remove(tok.size()-1);
      tok.addAll(javaTok(buf.toString()));
    }
    return internAll(tok);
  }
238  
  
239  
  /** Returns a new list holding the interned version of every string in tok, in the same order. */
  static L<S> internAll(L<S> tok) {
    new L<S> interned;
    for (Iterator<S> it = tok.iterator(); it.hasNext(); )
      interned.add(it.next().intern());
    return interned;
  }
245  
  
246  
  static class Collector {
247  
    P winner;
248  
    double bestScore = -1;
249  
    Set<int> predicted;
250  
251  
    void add(P p, double score) {
252  
      if (winner == null || score > bestScore) {
253  
        winner = p;
254  
        bestScore = score;
255  
        //S name = shorten(structure(p), 100);
256  
        S name = p.getClass().getName();
257  
        print("New best score: " + formatDouble(score, 2) + "% (" + name + ")");
258  
        this.predicted = main.predicted;
259  
      }
260  
    }
261  
  }
262  
}

Author comment

Began life as a copy of #1001000

download  show line numbers  debug dex  old transpilations   

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1001011
Snippet name: Token prediction, multiple predictors (adding zip support)
Eternal ID of this version: #1001011/1
Text MD5: 942f3ed24c4432b998f1e22ebdd4e9fe
Transpilation MD5: 78b9d92dea6ca60c5e4296d797492666
Author: stefan
Category:
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2015-09-16 00:58:02
Source code size: 7026 bytes / 262 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 560 / 871
Referenced in: [show references]