Libraryless. Click here for Pure Java version (2971L/23K/69K).
1 | !747 |
2 | !actionListener { |
3 | |
4 | m { |
5 | static S corpusID = "#1001034"; |
6 | static int numSnippets = 100; |
7 | static boolean showGUI = true; |
8 | static int maxCharsGUI = 500000; |
9 | static boolean allTokens = true; |
10 | |
11 | static Collector collector; |
12 | static L<F> files; |
13 | static Map<F, Set<int>> predicted; |
14 | |
15 | // a file to learn from |
16 | static class F { |
17 | String id, name; |
18 | L<S> tok; |
19 | } |
20 | |
21 | // a predictor |
22 | static abstract class P { |
23 | int seen; |
24 | S file; |
25 | |
26 | abstract S read(S file, L<S> tok); |
27 | abstract P derive(); // clone & reset counter for actual use |
28 | abstract P clear(); |
29 | |
30 | void prepare(S file) { |
31 | if (!eq(file, this.file)) { |
32 | seen = 0; |
33 | this.file = file; |
34 | } |
35 | } |
36 | } |
37 | |
38 | static class Chain extends P { |
39 | new L<P> list; |
40 | |
41 | *() {} |
42 | *(L<P> *list) {} |
43 | *(P... a) { list = asList(a); } |
44 | |
45 | void add(P p) { list.add(p); } |
46 | |
47 | S read(S file, L<S> tok) { |
48 | for (P p : list) { |
49 | S s = p.read(file, tok); |
50 | if (s != null) return s; |
51 | } |
52 | return null; |
53 | } |
54 | |
55 | P derive() { |
56 | new Chain c; |
57 | for (P p : list) |
58 | c.add(p.derive()); |
59 | return c; |
60 | } |
61 | |
62 | P clear() { |
63 | new Chain c; |
64 | for (P p : list) |
65 | c.add(p.clear()); |
66 | return c; |
67 | } |
68 | } |
69 | |
70 | static class Tuples extends P { |
71 | Map<L<S>,S> map = new HashMap<L<S>,S>(); |
72 | int n; |
73 | |
74 | *(int *n) { |
75 | } |
76 | |
77 | S read(S file, L<S> tok) { |
78 | prepare(file); |
79 | |
80 | while (tok.size() > seen) { |
81 | ++seen; |
82 | if (seen > n) |
83 | map.put(new ArrayList<S>(tok.subList(seen-n-1, seen-1)), tok.get(seen-1)); |
84 | } |
85 | |
86 | if (tok.size() >= n) |
87 | return map.get(new ArrayList<S>(tok.subList(tok.size()-n, tok.size()))); |
88 | |
89 | return null; |
90 | } |
91 | |
92 | P derive() { |
93 | Tuples t = new Tuples(n); |
94 | t.map = new DerivedHashMap<L<S>,S>(map); |
95 | return t; |
96 | } |
97 | |
98 | P clear() { |
99 | return new Tuples(n); |
100 | } |
101 | } |
102 | |
103 | static Map<S, S> makeMapPrefix(L<S> tok1, L<S> tok2) { |
104 | if (tok1.size() < tok2.size()) return null; |
105 | |
106 | new Map<S, S> map; |
107 | for (int i = 1; i < tok2.size(); i += 2) { |
108 | S t1 = tok1.get(i), t2 = tok2.get(i); |
109 | if (!t1.equals(t2)) { |
110 | S v = map.get(t1); |
111 | if (v == null) |
112 | map.put(t1, t2); |
113 | else if (!v.equals(t2)) |
114 | return null; // match fail |
115 | } |
116 | } |
117 | |
118 | // match succeeds |
119 | return map; |
120 | } |
121 | |
122 | // TODO: code tokens only |
123 | static class Patterns extends P { |
124 | Map<L<S>,S> map = new HashMap<L<S>,S>(); |
125 | int n; |
126 | |
127 | *(int *n) { |
128 | } |
129 | |
130 | S read(S file, L<S> tok) { |
131 | prepare(file); |
132 | |
133 | while (tok.size() > seen) { |
134 | ++seen; |
135 | if (seen > n) |
136 | put(new ArrayList<S>(tok.subList(seen-n-1, seen-1)), tok.get(seen-1)); |
137 | } |
138 | |
139 | if (tok.size() >= n) { |
140 | L<S> l = new ArrayList<S>(tok.subList(tok.size()-n, tok.size())); |
141 | for (L<S> pl : map.keySet()) { |
142 | S pr = map.get(pl); |
143 | print("pl: " + structure(pl) + ", l: " + structure(l)); |
144 | Map<S, S> m = makeMapPrefix(pl, l); |
145 | if (m != null) { |
146 | S result = m.get(pr); |
147 | print("map: " + structure(m) + ", result: " + quote(result)); |
148 | ret result; |
149 | } |
150 | } |
151 | } |
152 | |
153 | return null; |
154 | } |
155 | |
156 | void put(L<S> l, S r) { |
157 | if (isPattern(l, r)) { |
158 | new L<S> l2; |
159 | l2.addAll(l); |
160 | l2.add(r); |
161 | print("pattern: " + structure(l2)); |
162 | map.put(l, r); |
163 | } |
164 | } |
165 | |
166 | boolean isPattern(L<S> l, S r) { |
167 | /*new Set<S> set; |
168 | set.addAll(l); |
169 | set.add(r); |
170 | return set.size() < l.size()+1;*/ |
171 | return l.contains(r) && interestingToken(r); |
172 | } |
173 | |
174 | boolean interestingToken(S r) { |
175 | //return !r.trim().equals(""); |
176 | for (int i = 0; i < r.length(); i++) |
177 | if (Character.isLetter(r.charAt(i))) |
178 | ret true; |
179 | ret false; |
180 | } |
181 | |
182 | P derive() { |
183 | Patterns t = new Patterns(n); |
184 | t.map = new DerivedHashMap<L<S>,S>(map); |
185 | return t; |
186 | } |
187 | |
188 | P clear() { |
189 | return new Patterns(n); |
190 | } |
191 | } |
192 | |
193 | !include #1001027 // DerivedHashMap |
194 | |
195 | static class Node { |
196 | String token; |
197 | float count; |
198 | new L<Node> next; |
199 | |
200 | *() {} // for clone method |
201 | |
202 | *(S *token) {} |
203 | |
204 | Node find(S token) { |
205 | for (Node n : next) |
206 | if (n.token.equals(token)) |
207 | ret n; |
208 | ret null; |
209 | } |
210 | |
211 | Node bestNext() { |
212 | float bestCount = 0f; |
213 | Node best = null; |
214 | for (Node n : next) |
215 | if (best == null || n.count > best.count) { |
216 | best = n; |
217 | bestCount = n.count; |
218 | } |
219 | ret best; |
220 | } |
221 | } |
222 | |
223 | static class StartTree extends P { |
224 | Node tree = new Node(""); |
225 | Node node; |
226 | boolean nonmod; |
227 | |
228 | S read(S file, L<S> tok) { |
229 | if (!eq(file, this.file)) { |
230 | seen = 0; |
231 | this.file = file; |
232 | node = tree; |
233 | } |
234 | |
235 | if (!nonmod) while (tok.size() > seen) { |
236 | S t = tok.get(seen++); |
237 | Node child = node.find(t); |
238 | if (child == null) |
239 | node.next.add(child = new Node(t)); |
240 | child.count++; |
241 | node = child; |
242 | } |
243 | |
244 | Node n = node.bestNext(); |
245 | ret n != null ? n.token : null; |
246 | } |
247 | |
248 | // it's a hack - derived predictor doesn't learn |
249 | P derive() { |
250 | //return (P) main.clone(this); |
251 | new StartTree p; |
252 | p.nonmod = true; |
253 | p.tree = tree; |
254 | return p; |
255 | } |
256 | |
257 | P clear() { |
258 | return new StartTree; |
259 | } |
260 | } |
261 | |
262 | p { |
263 | files = makeCorpus(); |
264 | print("Files in corpus: " + files.size()); |
265 | |
266 | print("Learning..."); |
267 | collector = new Collector; |
268 | //test(new Tuples(1)); |
269 | /*test(new StartTree); |
270 | test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1), new StartTree));*/ |
271 | //test(new Patterns(6)); |
272 | test(new Patterns(9)); |
273 | |
274 | print("Learning done."); |
275 | printVMSize(); |
276 | if (collector.winner != null && showGUI) |
277 | window(); |
278 | } |
279 | |
280 | static int points = 0, total = 0; |
281 | |
282 | // train & evaluate a predictor |
283 | static void test(P p) { |
284 | int lastPercent = 0; |
285 | predicted = new HashMap; |
286 | points = 0; |
287 | total = 0; |
288 | for (int ii = 0; ii < files.size(); ii++) { |
289 | F f = files.get(ii); |
290 | |
291 | testFile(p, f); |
292 | |
293 | int percent = roundUpTo(10, (int) (ii*100L/files.size())); |
294 | if (percent > lastPercent) { |
295 | print("Learning " + percent + "% done."); |
296 | lastPercent = percent; |
297 | } |
298 | } |
299 | double score = points*100.0/total; |
300 | collector.add(p, score); |
301 | } |
302 | |
303 | static void testFile(P p, F f) { |
304 | new TreeSet<int> pred; |
305 | new L<S> history; |
306 | for (int i = allTokens ? 0 : 1; i < f.tok.size(); i += allTokens ? 1 : 2) { |
307 | S t = f.tok.get(i); |
308 | S x = p.read(f.name, history); |
309 | boolean correct = t.equals(x); |
310 | total += t.length(); |
311 | if (correct) { |
312 | pred.add(i); |
313 | points += t.length(); |
314 | } |
315 | history.add(t); |
316 | } |
317 | predicted.put(f, pred); |
318 | } |
319 | |
320 | !include #1000989 // SnippetDB |
321 | |
322 | static L<F> makeCorpus() ctex { |
323 | S name = getSnippetTitle(corpusID); |
324 | S s = loadSnippet(corpusID); |
325 | if (s.length() != 0) |
326 | return makeCorpus_single(s); |
327 | else if (name.toLowerCase().indexOf(".zip") >= 0) |
328 | return makeCorpus_zip(); |
329 | else |
330 | return makeCorpus_mysqldump(); |
331 | } |
332 | |
333 | static L<F> makeCorpus_single(S text) ctex { |
334 | new L<F> files; |
335 | new F f; |
336 | f.id = corpusID; |
337 | f.name = getSnippetTitle(corpusID); |
338 | f.tok = internAll(javaTok(text)); |
339 | files.add(f); |
340 | return files; |
341 | } |
342 | |
343 | static L<F> makeCorpus_zip() ctex { |
344 | new L<F> files; |
345 | ZipFile zipFile = new ZipFile(loadLibrary(corpusID)); |
346 | Enumeration entries = zipFile.entries(); |
347 | |
348 | while (entries.hasMoreElements() && files.size() < numSnippets) { |
349 | ZipEntry entry = (ZipEntry) entries.nextElement(); |
350 | if (entry.isDirectory()) continue; |
351 | //System.out.println("File found: " + entry.getName()); |
352 | |
353 | InputStream fin = zipFile.getInputStream(entry); |
354 | // TODO: try to skip binary files? |
355 | |
356 | InputStreamReader reader = new InputStreamReader(fin, "UTF-8"); |
357 | new StringBuilder builder; |
358 | BufferedReader bufferedReader = new BufferedReader(reader); |
359 | String line; |
360 | while ((line = bufferedReader.readLine()) != null) |
361 | builder.append(line).append('\n'); |
362 | fin.close(); |
363 | S text = builder.toString(); |
364 | |
365 | new F f; |
366 | f.name = entry.getName(); |
367 | f.tok = internAll(javaTok(text)); |
368 | files.add(f); |
369 | } |
370 | |
371 | zipFile.close(); |
372 | return files; |
373 | } |
374 | |
375 | static L<F> makeCorpus_mysqldump() { |
376 | new L<F> files; |
377 | SnippetDB db = new SnippetDB(corpusID); |
378 | List<List<S>> rows = db.rowsOrderedBy("sn_created"); |
379 | for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) { |
380 | new F f; |
381 | f.id = db.getField(rows.get(i), "sn_id"); |
382 | f.name = db.getField(rows.get(i), "sn_title"); |
383 | S text = db.getField(rows.get(i), "sn_text"); |
384 | f.tok = internAll(javaTok(text)); |
385 | files.add(f); |
386 | ++i; |
387 | } |
388 | return files; |
389 | } |
390 | |
391 | static class Collector { |
392 | P winner; |
393 | double bestScore = -1; |
394 | Map<F, Set<int>> predicted; |
395 | |
396 | void add(P p, double score) { |
397 | if (winner == null || score > bestScore) { |
398 | winner = p; |
399 | bestScore = score; |
400 | //S name = shorten(structure(p), 100); |
401 | S name = p.getClass().getName(); |
402 | print("New best score: " + formatDouble(score, 2) + "% (" + name + ")"); |
403 | predicted = main.predicted; |
404 | } |
405 | } |
406 | } |
407 | |
408 | static void window() { |
409 | //final P p = collector.winner.clear(); |
410 | |
411 | JFrame jf = new JFrame("Predicted = green"); |
412 | Container cp = jf.getContentPane(); |
413 | |
414 | final JButton btnNext = new JButton("Next"); |
415 | |
416 | final JTextPane pane = new JTextPane(); |
417 | //pane.setFont(loadFont("#1000993", 24)); |
418 | |
419 | JScrollPane scrollPane = new JScrollPane(pane); |
420 | cp.add(scrollPane, BorderLayout.CENTER); |
421 | |
422 | class X { |
423 | int ii; |
424 | |
425 | void y() ctex { |
426 | ii = ii == 0 ? files.size()-1 : ii-1; |
427 | F f = files.get(ii); |
428 | //testFile(p, f); |
429 | Set<int> pred = collector.predicted.get(f); |
430 | |
431 | StyledDocument doc = new DefaultStyledDocument(); |
432 | |
433 | L<S> tok = f.tok; |
434 | int i = tok.size(), len = 0; |
435 | while (len <= maxCharsGUI && i > 0) { |
436 | --i; |
437 | len += tok.get(i).length(); |
438 | } |
439 | |
440 | for (; i < tok.size(); i++) { |
441 | if (tok.get(i).length() == 0) continue; |
442 | boolean green = pred.contains(i); |
443 | SimpleAttributeSet set = new SimpleAttributeSet(); |
444 | StyleConstants.setForeground(set, green ? Color.green : Color.gray); |
445 | doc.insertString(doc.getLength(), tok.get(i), set); |
446 | } |
447 | |
448 | pane.setDocument(doc); |
449 | double score = getScore(pred, tok); |
450 | btnNext.setText(f.name + " (" + (ii+1) + "/" + files.size() + ") - " + (int) score + " %"); |
451 | } |
452 | } |
453 | final new X x; |
454 | |
455 | btnNext.addActionListener(actionListener { |
456 | x.y(); |
457 | }); |
458 | cp.add(btnNext, BorderLayout.NORTH); |
459 | |
460 | x.y(); |
461 | |
462 | jf.setBounds(100, 100, 600, 600); |
463 | jf.setVisible(true); |
464 | } |
465 | |
466 | !include #1001032 // clone function |
467 | |
468 | static double getScore(Set<int> pred, L<S> tok) { |
469 | int total = 0, score = 0; |
470 | for (int i = 0; i < tok.size(); i++) { |
471 | int n = tok.get(i).length(); |
472 | total += n; |
473 | if (pred.contains(i)) |
474 | score += n; |
475 | } |
476 | ret score*100.0/total; |
477 | } |
478 | } |
Began life as a copy of #1001028
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1001033 |
Snippet name: | Token prediction, multiple predictors (v4, including patterns, developing) |
Eternal ID of this version: | #1001033/1 |
Text MD5: | 85cea85358eb275d7c83f6d0c67117fa |
Transpilation MD5: | a5d078ee9518058f0828e33869770292 |
Author: | stefan |
Category: | |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-09-16 20:59:04 |
Source code size: | 11899 bytes / 478 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 690 / 752 |
Referenced in: | [show references] |