Libraryless. Click here for Pure Java version (2911L/22K/67K).
1 | !747 |
2 | !actionListener { |
3 | |
4 | m { |
// Snippet that holds the corpus. "#1001006" is the snippets DB;
// "#1001034" (one small snippet) is an alternative for quick runs.
static S corpusID = "#1001006";

// Upper bound on the number of corpus files the zip and DB loaders produce.
static int numSnippets = 100;

// Whether the result window is opened once learning has finished.
static boolean showGUI = true;

// Rough limit on the number of characters rendered per file in the window.
static int maxCharsGUI = 500000;

// true: every token is predicted and scored; false: only every second token, starting at index 1.
static boolean allTokens = true;

// Keeps the best-scoring predictor seen so far.
static Collector collector;

// The loaded corpus.
static L<F> files;

// Per file: indices of the tokens predicted correctly during the current test() run.
static Map<F, Set<int>> predicted;
16 | |
// One corpus file - the unit predictors learn from and are scored on.
static class F {
  String id;   // snippet ID (the zip loader leaves this unset)
  String name; // title or zip entry name; passed to predictors as the file key
  L<S> tok;    // token list of the file's text
}
22 | |
// Base class of all predictors.
static abstract class P {
  int seen; // how many tokens of the current file have been consumed so far
  S file;   // key of the file currently being read

  // Basic function: predict the token following tok.
  // A null result means "no prediction".
  abstract S read(S file, L<S> tok);

  // Advanced function: predict the rest of a token starting with t.
  // The default implementation never predicts anything.
  S complete(S file, L<S> tok, S t) {
    return null;
  }

  abstract P derive(); // clone with trained data
  abstract P clear();  // clone without trained data

  // Restarts the read position when a different file key is passed in.
  void prepare(S file) {
    if (eq(file, this.file)) return;
    this.file = file;
    seen = 0;
  }
}
44 | |
// Predictor that asks its members in order and answers with the first
// non-null prediction any of them gives.
static class Chain extends P {
  new L<P> list;

  *() {}
  *(L<P> *list) {}
  *(P... a) { list = asList(a); }

  // Appends another member to the end of the chain.
  void add(P p) { list.add(p); }

  // Members after the first one that answers are not consulted for this call.
  S read(S file, L<S> tok) {
    for (int i = 0; i < list.size(); i++) {
      S prediction = list.get(i).read(file, tok);
      if (prediction != null)
        return prediction;
    }
    return null;
  }

  // New chain made of the derived versions of all members.
  P derive() {
    Chain copy = new Chain();
    for (P member : list)
      copy.add(member.derive());
    return copy;
  }

  // New chain made of the cleared versions of all members.
  P clear() {
    Chain copy = new Chain();
    for (P member : list)
      copy.add(member.clear());
    return copy;
  }
}
76 | |
// Predictor that remembers, for every run of n consecutive tokens it has
// read, which token came right after it (later sightings overwrite earlier ones).
static class Tuples extends P {
  Map<L<S>,S> map = new HashMap<L<S>,S>();
  int n; // length of the token run used as lookup key

  *(int *n) {}

  S read(S file, L<S> tok) {
    prepare(file);

    // Learn from tokens not consumed yet: once n predecessors exist,
    // store "the n tokens before position seen" -> "token at position seen".
    for (; seen < tok.size(); seen++) {
      if (seen >= n)
        map.put(new ArrayList<S>(tok.subList(seen-n, seen)), tok.get(seen));
    }

    // Predict from the last n tokens, if there are that many.
    int size = tok.size();
    if (size < n) return null;
    return map.get(new ArrayList<S>(tok.subList(size-n, size)));
  }

  // Same n, with a DerivedHashMap layered over this predictor's map.
  P derive() {
    Tuples derived = new Tuples(n);
    derived.map = new DerivedHashMap<L<S>,S>(map);
    return derived;
  }

  // Same n, nothing learned.
  P clear() {
    return new Tuples(n);
  }
}
109 | |
// Tries to build a consistent substitution map from tokens of tok1 to the
// tokens at the same positions in tok2, looking only at odd indices and only
// at the first tok2.size() positions.
// Returns null if tok1 is shorter than tok2 or if one tok1 token would have
// to map to two different tok2 tokens; otherwise returns the map (which only
// holds entries for positions where the two tokens differ).
static Map<S, S> makeMapPrefix(L<S> tok1, L<S> tok2) {
  if (tok1.size() < tok2.size()) return null;

  new Map<S, S> map;
  for (int i = 1; i < tok2.size(); i += 2) {
    S from = tok1.get(i), to = tok2.get(i);
    if (from.equals(to)) continue; // identical tokens need no mapping

    S known = map.get(from);
    if (known == null)
      map.put(from, to);
    else if (!known.equals(to))
      return null; // match fail: conflicting substitution
  }

  // match succeeds
  return map;
}
128 | |
129 | !include #1001041 // Pattern |
130 | |
131 | !include #1001027 // DerivedHashMap |
132 | |
133 | !include #1001036 // LastWordToLower |
134 | |
// Node of a token tree: one token, how often it was reached, and its successors.
static class Node {
  String token;
  float count;
  new L<Node> next;

  *() {} // for clone method

  *(S *token) {}

  // Returns the direct child carrying the given token, or null if there is none.
  Node find(S token) {
    for (Node n : next)
      if (n.token.equals(token))
        ret n;
    ret null;
  }

  // Returns the child with the highest count (the earliest one on ties),
  // or null if this node has no children.
  Node bestNext() {
    Node best = null;
    for (Node n : next)
      if (best == null || n.count > best.count)
        best = n;
    ret best;
  }
}
162 | |
// Predictor backed by a tree of the token sequences files start with.
// Each file is walked from the root; every step bumps the count of the child
// reached, and the prediction is the most frequently taken child of the
// current node.
static class StartTree extends P {
  Node tree = new Node(""); // root of the tree
  Node node;                // current position for the file being read
  boolean nonmod;           // true: never learn (set on derived instances)

  S read(S file, L<S> tok) {
    // Start over at the root for a new file. The node == null test covers the
    // very first call: without it, a first call whose file key compares equal
    // to the initial (null) this.file would leave node unset and fail below.
    if (node == null || !eq(file, this.file)) {
      seen = 0;
      this.file = file;
      node = tree;
    }

    // Consume tokens not seen yet, creating and counting tree nodes on the way.
    if (!nonmod) while (tok.size() > seen) {
      S t = tok.get(seen++);
      Node child = node.find(t);
      if (child == null)
        node.next.add(child = new Node(t));
      child.count++;
      node = child;
    }

    Node n = node.bestNext();
    ret n != null ? n.token : null;
  }

  // it's a hack - derived predictor doesn't learn.
  // It shares this predictor's tree and, because it never consumes tokens,
  // always predicts from the root node.
  P derive() {
    //return (P) main.clone(this);
    new StartTree p;
    p.nonmod = true;
    p.tree = tree;
    return p;
  }

  P clear() {
    return new StartTree;
  }
}
201 | |
// Program entry: load the corpus, train and score the candidate predictors,
// then show the winner's results in a window if enabled.
p {
  files = makeCorpus();
  print("Files in corpus: " + files.size());

  print("Learning...");
  collector = new Collector;

  // Candidate 1: tuple predictors from long to short context, start tree as last resort.
  P tupleChain = new Chain(new Tuples(8), new Tuples(6), new Tuples(4), new Tuples(2), new Tuples(1), new StartTree);
  test(tupleChain);

  // Candidate 2: pattern predictors plus LastWordToLower.
  // Earlier experiments: new Patterns(6); new Chain(new Patterns(9), new LastWordToLower)
  P patternChain = new Chain(new Patterns(9), new Patterns(7), new Patterns(5), new LastWordToLower);
  test(patternChain);

  print("Learning done.");
  printVMSize();

  if (showGUI && collector.winner != null)
    window();
}
219 | |
// Score counters of the current test() run, measured in characters:
// points = characters of correctly predicted tokens, total = characters of all tested tokens.
static int points = 0, total = 0;

// Train & evaluate a predictor: feeds every corpus file through p, prints
// progress, and reports the resulting score (percentage of characters that
// belong to correctly predicted tokens) to the collector.
// Resets the static predicted/points/total state first.
static void test(P p) {
  int lastPercent = 0;
  predicted = new HashMap;
  points = 0;
  total = 0;
  for (int ii = 0; ii < files.size(); ii++) {
    F f = files.get(ii);

    testFile(p, f);

    // ii+1 files are finished at this point, so progress ends at 100%.
    // roundUpTo presumably rounds up to a multiple of 10 - helper not visible here.
    int percent = roundUpTo(10, (int) ((ii+1)*100L/files.size()));
    if (percent > lastPercent) {
      print("Learning " + percent + "% done.");
      lastPercent = percent;
    }
  }
  // Nothing tested (empty corpus or only empty tokens): report 0 instead of NaN.
  double score = total == 0 ? 0 : points*100.0/total;
  collector.add(p, score);
}
242 | |
// Runs predictor p over one file: before each tested token is revealed, p is
// asked to predict it from the tokens revealed so far. Adds the token lengths
// to the static total/points counters and records the indices of correctly
// predicted tokens in the static predicted map.
static void testFile(P p, F f) {
  new TreeSet<int> pred;
  new L<S> history;

  // allTokens: visit every token; otherwise only every second one, starting at index 1.
  int first = allTokens ? 0 : 1;
  int step = allTokens ? 1 : 2;

  for (int i = first; i < f.tok.size(); i += step) {
    S actual = f.tok.get(i);
    S guess = p.read(f.name, history);
    int weight = actual.length();

    total += weight;
    if (actual.equals(guess)) {
      pred.add(i);
      points += weight;
    }
    history.add(actual);
  }

  p.read(f.name, history); // feed last token, ignore output
  predicted.put(f, pred);
}
260 | |
261 | !include #1000989 // SnippetDB |
262 | |
// Loads the corpus identified by corpusID, picking the loader by what the
// snippet looks like: non-empty snippet text -> single-file corpus; otherwise
// a title containing ".zip" (case-insensitive) -> zip archive; otherwise a
// snippet database dump.
static L<F> makeCorpus() ctex {
  S name = getSnippetTitle(corpusID);
  S s = loadSnippet(corpusID);

  if (s.length() != 0)
    return makeCorpus_single(s);

  boolean isZip = name.toLowerCase().indexOf(".zip") >= 0;
  return isZip ? makeCorpus_zip() : makeCorpus_mysqldump();
}
273 | |
// Builds a one-file corpus from the given text; ID and name are taken from
// the corpusID snippet.
static L<F> makeCorpus_single(S text) ctex {
  new F f;
  f.id = corpusID;
  f.name = getSnippetTitle(corpusID);
  f.tok = internAll(javaTok(text));

  new L<F> files;
  files.add(f);
  return files;
}
283 | |
// Builds the corpus from the zip archive loaded for corpusID: every
// non-directory entry becomes one file (read as UTF-8, line ends normalized
// to '\n'), until numSnippets files have been collected.
// The archive and each entry stream are closed even if reading or tokenizing fails.
static L<F> makeCorpus_zip() ctex {
  new L<F> files;
  ZipFile zipFile = new ZipFile(loadLibrary(corpusID));
  try {
    Enumeration entries = zipFile.entries();

    while (entries.hasMoreElements() && files.size() < numSnippets) {
      ZipEntry entry = (ZipEntry) entries.nextElement();
      if (entry.isDirectory()) continue;
      //System.out.println("File found: " + entry.getName());

      // TODO: try to skip binary files?
      new StringBuilder builder;
      InputStream fin = zipFile.getInputStream(entry);
      try {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fin, "UTF-8"));
        String line;
        while ((line = bufferedReader.readLine()) != null)
          builder.append(line).append('\n');
      } finally {
        fin.close();
      }
      S text = builder.toString();

      new F f;
      f.name = entry.getName();
      f.tok = internAll(javaTok(text));
      files.add(f);
    }
  } finally {
    zipFile.close();
  }
  return files;
}
315 | |
// Builds the corpus from the snippet database behind corpusID: the last
// numSnippets rows in "sn_created" order, one file per row.
static L<F> makeCorpus_mysqldump() {
  new L<F> files;
  SnippetDB db = new SnippetDB(corpusID);
  List<List<S>> rows = db.rowsOrderedBy("sn_created");
  // The loop header alone advances i; a second increment in the body used to
  // skip every other row.
  for (int i = Math.max(0, rows.size()-numSnippets); i < rows.size(); i++) {
    List<S> row = rows.get(i);
    new F f;
    f.id = db.getField(row, "sn_id");
    f.name = db.getField(row, "sn_title");
    S text = db.getField(row, "sn_text");
    f.tok = internAll(javaTok(text));
    files.add(f);
  }
  return files;
}
331 | |
// Remembers the best-scoring predictor together with its per-file prediction results.
static class Collector {
  P winner;
  double bestScore = -1;
  Map<F, Set<int>> predicted; // snapshot of main.predicted taken when the winner was set

  // Registers a tested predictor; it becomes the winner if there is none yet
  // or if its score is strictly higher than the best so far.
  void add(P p, double score) {
    boolean better = winner == null || score > bestScore;
    if (!better) return;

    winner = p;
    bestScore = score;
    //S name = shorten(structure(p), 100);
    S name = p.getClass().getName();
    print("New best score: " + formatDouble(score, 2) + "% (" + name + ")");
    predicted = main.predicted;
  }
}
348 | |
// Opens a window showing one corpus file at a time, with the tokens the
// winning predictor got right in green and all others in gray. The button at
// the top steps to another file and shows the file's name, position and score.
static void window() {
  //final P p = collector.winner.clear();

  JFrame jf = new JFrame("Predicted = green");
  Container cp = jf.getContentPane();

  final JButton btnNext = new JButton("Next");

  final JTextPane pane = new JTextPane();
  //pane.setFont(loadFont("#1000993", 24));

  JScrollPane scrollPane = new JScrollPane(pane);
  cp.add(scrollPane, BorderLayout.CENTER);

  class X {
    int ii; // index of the file currently shown

    // Steps to the previous file (wrapping around to the last one) and renders it.
    void y() ctex {
      // Empty corpus: nothing to render, and files.get(-1) below would throw.
      if (files.isEmpty()) return;

      ii = ii == 0 ? files.size()-1 : ii-1;
      F f = files.get(ii);
      //testFile(p, f);
      Set<int> pred = collector.predicted.get(f);

      StyledDocument doc = new DefaultStyledDocument();

      // Only the tail of long files is shown: walk back from the end until
      // more than maxCharsGUI characters are covered or the start is reached.
      L<S> tok = f.tok;
      int i = tok.size(), len = 0;
      while (len <= maxCharsGUI && i > 0) {
        --i;
        len += tok.get(i).length();
      }

      for (; i < tok.size(); i++) {
        if (tok.get(i).length() == 0) continue;
        boolean green = pred.contains(i);
        SimpleAttributeSet set = new SimpleAttributeSet();
        StyleConstants.setForeground(set, green ? Color.green : Color.gray);
        doc.insertString(doc.getLength(), tok.get(i), set);
      }

      pane.setDocument(doc);
      // The score covers the whole file, not just the part displayed.
      double score = getScore(pred, tok);
      btnNext.setText(f.name + " (" + (ii+1) + "/" + files.size() + ") - " + (int) score + " %");
    }
  }
  final new X x;

  btnNext.addActionListener(actionListener {
    x.y();
  });
  cp.add(btnNext, BorderLayout.NORTH);

  x.y();

  jf.setBounds(100, 100, 600, 600);
  jf.setVisible(true);
}
406 | |
407 | !include #1001032 // clone function |
408 | |
// Percentage of a file's characters that belong to tokens whose index is in
// pred. All tokens of tok count towards the denominator.
static double getScore(Set<int> pred, L<S> tok) {
  int allChars = 0, hitChars = 0;
  int index = 0;
  for (S t : tok) {
    int n = t.length();
    allChars += n;
    if (pred.contains(index))
      hitChars += n;
    index++;
  }
  ret hitChars*100.0/allChars;
}
419 | } |
Began life as a copy of #1001033
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1001037 |
Snippet name: | Token prediction, multiple predictors (v5, developing) |
Eternal ID of this version: | #1001037/1 |
Text MD5: | b286849fecdd14a17f356c50a3b92756 |
Transpilation MD5: | d3e70f9fff633fe7154db05fbb2e2d3e |
Author: | stefan |
Category: | |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-09-16 21:44:38 |
Source code size: | 10641 bytes / 419 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 695 / 707 |
Referenced in: | [show references] |