Libraryless. Click here for Pure Java version (2373L/19K/56K).
1 | !747 |
2 | |
3 | m { |
4 | static S corpusID = "#1001010"; |
5 | static int numSnippets = 3000; |
6 | static boolean showGUI = true; |
7 | static int maxCharsGUI = 500000; |
8 | static boolean allTokens = true; |
9 | |
10 | static Collector collector; |
11 | static L<F> files; |
12 | static Set<int> predicted; |
13 | |
14 | // a file to learn from |
15 | static class F { |
16 | String id, name; |
17 | L<S> tok; |
18 | } |
19 | |
20 | // a predictor |
21 | static abstract class P { |
22 | abstract S read(S file, L<S> tok); |
23 | abstract P derive(); // clone & reset counter for actual use |
24 | } |
25 | |
26 | static class Chain extends P { |
27 | new L<P> list; |
28 | |
29 | *() {} |
30 | *(L<P> *list) {} |
31 | *(P... a) { list = asList(a); } |
32 | |
33 | void add(P p) { list.add(p); } |
34 | |
35 | S read(S file, L<S> tok) { |
36 | for (P p : list) { |
37 | S s = p.read(file, tok); |
38 | if (s != null) return s; |
39 | } |
40 | return null; |
41 | } |
42 | |
43 | P derive() { |
44 | new Chain c; |
45 | for (P p : list) |
46 | c.add(p.derive()); |
47 | return c; |
48 | } |
49 | } |
50 | |
51 | static class Tuples extends P { |
52 | Map<L<S>,S> map = new HashMap<L<S>,S>(); |
53 | int n, seen; |
54 | S file; |
55 | |
56 | *(int *n) { |
57 | } |
58 | |
59 | S read(S file, L<S> tok) { |
60 | if (!eq(file, this.file)) { |
61 | seen = 0; |
62 | this.file = file; |
63 | } |
64 | |
65 | while (tok.size() > seen) { |
66 | ++seen; |
67 | if (seen > n) |
68 | map.put(new ArrayList<S>(tok.subList(seen-n-1, seen-1)), tok.get(seen-1)); |
69 | } |
70 | |
71 | if (tok.size() >= n) |
72 | return map.get(new ArrayList<S>(tok.subList(tok.size()-n, tok.size()))); |
73 | |
74 | return null; |
75 | } |
76 | |
77 | // slow... |
78 | P oldDerive() { |
79 | Tuples t = new Tuples(n); |
80 | t.map.putAll(map); |
81 | // t.seen == 0 which is ok |
82 | return t; |
83 | } |
84 | |
85 | // fast! |
86 | P derive() { |
87 | Tuples t = new Tuples(n); |
88 | t.map = new DerivedHashMap<L<S>,S>(map); |
89 | return t; |
90 | } |
91 | } |
92 | |
93 | static class DerivedHashMap<A, B> extends AbstractMap<A, B> { |
94 | Map<A, B> base; |
95 | new HashMap<A, B> additions; |
96 | |
97 | *(Map<A, B> *base) {} |
98 | |
99 | public B get(Object key) { |
100 | B b = additions.get(key); |
101 | if (b != null) return b; |
102 | return base.get(key); |
103 | } |
104 | |
105 | public B put(A key, B value) { |
106 | return additions.put(key, value); |
107 | } |
108 | |
109 | public Set<Map.Entry<A,B>> entrySet() { |
110 | throw fail(); |
111 | } |
112 | } |
113 | |
114 | // TODO: Put NewX back in |
115 | |
116 | p { |
117 | files = makeCorpus(); |
118 | print("Files in corpus: " + files.size()); |
119 | |
120 | print("Learning..."); |
121 | collector = new Collector; |
122 | test(new Tuples(1)); |
123 | //test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1))); |
124 | |
125 | print("Learning done."); |
126 | /*if (collector.winner != null && showGUI) { |
127 | predicted = collector.predicted; |
128 | showColoredText(); |
129 | }*/ |
130 | } |
131 | |
132 | // train & evaluate a predictor |
133 | static void test(P p) { |
134 | //predicted = new TreeSet<int>(); |
135 | int points = 0, total = 0, lastPercent = 0; |
136 | for (int ii = 0; ii < files.size(); ii++) { |
137 | F f = files.get(ii); |
138 | |
139 | new L<S> history; |
140 | for (int i = allTokens ? 0 : 1; i < f.tok.size(); i += allTokens ? 1 : 2) { |
141 | S t = f.tok.get(i); |
142 | S x = p.read(f.name, history); |
143 | boolean correct = t.equals(x); |
144 | total += t.length(); |
145 | if (correct) { |
146 | //predicted.add(i); |
147 | points += t.length(); |
148 | } |
149 | history.add(t); |
150 | } |
151 | |
152 | int percent = roundUpTo(10, (int) (ii*100L/files.size())); |
153 | if (percent > lastPercent) { |
154 | print("Learning " + percent + "% done."); |
155 | lastPercent = percent; |
156 | } |
157 | } |
158 | double score = points*100.0/total; |
159 | collector.add(p, score); |
160 | } |
161 | |
162 | !include #1000989 // SnippetDB |
163 | |
164 | static L<F> makeCorpus() { |
165 | S name = getSnippetTitle(corpusID); |
166 | if (name.toLowerCase().indexOf(".zip") >= 0) |
167 | return makeCorpus_zip(); |
168 | else |
169 | return makeCorpus_mysqldump(); |
170 | } |
171 | |
172 | static L<F> makeCorpus_zip() ctex { |
173 | new L<F> files; |
174 | ZipFile zipFile = new ZipFile(loadLibrary(corpusID)); |
175 | Enumeration entries = zipFile.entries(); |
176 | |
177 | while (entries.hasMoreElements()) { |
178 | ZipEntry entry = (ZipEntry) entries.nextElement(); |
179 | //System.out.println("File found: " + entry.getName()); |
180 | |
181 | InputStream fin = zipFile.getInputStream(entry); |
182 | // TODO: try to skip binary files? |
183 | |
184 | InputStreamReader reader = new InputStreamReader(fin, "UTF-8"); |
185 | new StringBuilder builder; |
186 | BufferedReader bufferedReader = new BufferedReader(reader); |
187 | String line; |
188 | while ((line = bufferedReader.readLine()) != null) |
189 | builder.append(line).append('\n'); |
190 | fin.close(); |
191 | S text = builder.toString(); |
192 | |
193 | new F f; |
194 | f.name = entry.getName(); |
195 | f.tok = internAll(javaTok(text)); |
196 | files.add(f); |
197 | } |
198 | |
199 | zipFile.close(); |
200 | return files; |
201 | } |
202 | |
203 | static L<F> makeCorpus_mysqldump() { |
204 | new L<F> files; |
205 | SnippetDB db = new SnippetDB(corpusID); |
206 | List<List<S>> rows = db.rowsOrderedBy("sn_created"); |
207 | for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) { |
208 | new F f; |
209 | f.id = db.getField(rows.get(i), "sn_id"); |
210 | f.name = db.getField(rows.get(i), "sn_title"); |
211 | S text = db.getField(rows.get(i), "sn_text"); |
212 | f.tok = internAll(javaTok(text)); |
213 | files.add(f); |
214 | ++i; |
215 | } |
216 | return files; |
217 | } |
218 | |
219 | static class Collector { |
220 | P winner; |
221 | double bestScore = -1; |
222 | Set<int> predicted; |
223 | |
224 | void add(P p, double score) { |
225 | if (winner == null || score > bestScore) { |
226 | winner = p; |
227 | bestScore = score; |
228 | //S name = shorten(structure(p), 100); |
229 | S name = p.getClass().getName(); |
230 | print("New best score: " + formatDouble(score, 2) + "% (" + name + ")"); |
231 | this.predicted = main.predicted; |
232 | } |
233 | } |
234 | } |
235 | } |
Began life as a copy of #1001011
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1001025 |
Snippet name: | Token prediction, multiple predictors (new architecture, developing) |
Eternal ID of this version: | #1001025/1 |
Text MD5: | 04645fde994b63b0cbf0290e16d1047b |
Transpilation MD5: | 5dd35d6420b39e05bcc4e9c613a729e0 |
Author: | stefan |
Category: | |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-09-16 14:20:59 |
Source code size: | 5941 bytes / 235 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 783 / 807 |
Referenced in: | [show references] |