Libraryless. Click here for Pure Java version (2382L/19K/57K).
1 | !747 |
2 | |
3 | abstract class P { |
4 | abstract S read(L<S> tok); |
5 | abstract P derive(); // clone & reset counter for actual use |
6 | } |
7 | |
8 | m { |
9 | static S corpusID = "#1001010"; |
10 | static int numSnippets = 3000; |
11 | static boolean showGUI = true; |
12 | static int maxCharsGUI = 500000; |
13 | static boolean allTokens = true; |
14 | |
15 | static Collector collector; |
16 | static L<S> tok; |
17 | static Set<int> predicted; |
18 | |
/**
 * A predictor made of other predictors. They are asked in list order and the
 * first non-null answer wins; predictors after the one that answered are not
 * consulted for that call.
 */
static class Chain extends P {
  new L<P> list;

  // empty chain; fill it with add()
  *() {}

  // uses the given list as-is (no copy is made)
  *(L<P> list) {
    this.list = list;
  }

  *(P... a) {
    list = asList(a);
  }

  void add(P p) {
    list.add(p);
  }

  S read(L<S> tok) {
    S answer = null;
    for (P predictor : list) {
      answer = predictor.read(tok);
      if (answer != null)
        break; // first predictor with an opinion decides
    }
    return answer;
  }

  // Builds a new chain out of the derived versions of all members.
  P derive() {
    Chain copy = new Chain();
    for (P predictor : list)
      copy.add(predictor.derive());
    return copy;
  }
}
43 | |
/**
 * Predicts a token from the n tokens directly before it.
 *
 * While reading, every window of n consecutive tokens is stored together with
 * the token that followed it; a later occurrence of the same window overwrites
 * the earlier entry.
 */
static class Tuples extends P {
  // window of n tokens -> token that followed it
  Map<L<S>,S> map = new HashMap<L<S>,S>();

  // window length
  int n;

  // how many tokens of the history have been learned from already
  int seen;

  *(int n) {
    this.n = n;
  }

  S read(L<S> tok) {
    int size = tok.size();

    // Catch up on everything that was appended since the last call.
    while (seen < size) {
      int idx = seen++;
      if (idx >= n) // need n predecessors to form a window
        map.put(new ArrayList<S>(tok.subList(idx-n, idx)), tok.get(idx));
    }

    // Too little history to form a window: no guess.
    if (size < n)
      return null;

    L<S> lastWindow = new ArrayList<S>(tok.subList(size-n, size));
    return map.get(lastWindow);
  }

  // slow... (copies the whole map)
  P oldDerive() {
    Tuples copy = new Tuples(n);
    copy.map.putAll(map);
    // copy.seen == 0 which is ok
    return copy;
  }

  // fast! (layers a DerivedHashMap over this map instead of copying it)
  P derive() {
    Tuples child = new Tuples(n);
    child.map = new DerivedHashMap<L<S>,S>(map);
    return child;
  }
}
79 | |
/**
 * A map layered over another map: lookups try the local additions first and
 * then the base map; writes only ever go to the additions, so the base map is
 * never modified through this class.
 *
 * Only get, containsKey and put are supported. Everything AbstractMap derives
 * from entrySet() (size, iteration, remove, equals, ...) throws.
 *
 * Limitation: a null value stored through put() does not hide a base entry,
 * because get() treats a null from the additions as "not present".
 */
static class DerivedHashMap<A, B> extends AbstractMap<A, B> {
  // the underlying map; only read, never written
  Map<A, B> base;

  // entries written through this map
  new HashMap<A, B> additions;

  *(Map<A, B> *base) {}

  public B get(Object key) {
    B b = additions.get(key);
    if (b != null) return b;
    return base.get(key);
  }

  // AbstractMap's version goes through entrySet() and would always throw.
  public boolean containsKey(Object key) {
    return additions.containsKey(key) || base.containsKey(key);
  }

  /**
   * Stores the mapping in the additions and returns the value previously
   * visible through this map (which may come from the base map), as the
   * Map.put contract requires.
   */
  public B put(A key, B value) {
    B previous = get(key);
    additions.put(key, value);
    return previous;
  }

  public Set<Map.Entry<A,B>> entrySet() {
    throw new UnsupportedOperationException("DerivedHashMap does not support entrySet()");
  }
}
100 | |
101 | // TODO: Put NewX back in |
102 | |
103 | p { |
104 | tok = makeCorpusJavaTok(); |
105 | print("Tokens in corpus: " + tok.size()); |
106 | |
107 | print("Learning..."); |
108 | collector = new Collector; |
109 | /*test(new Tuples(1)); |
110 | test(new Tuples(2)); |
111 | test(new Tuples(3)); |
112 | test(new Tuples(4)); |
113 | test(new Chain(new Tuples(2), new Tuples(1)));*/ |
114 | test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1))); |
115 | |
116 | print("Learning done."); |
117 | if (collector.winner != null && showGUI) { |
118 | predicted = collector.predicted; |
119 | showColoredText(); |
120 | } |
121 | } |
122 | |
123 | // test a predictor |
124 | static void test(P p) { |
125 | predicted = new TreeSet<int>(); |
126 | int points = 0, total = 0, lastPercent = 0; |
127 | new L<S> history; |
128 | for (int i = allTokens ? 0 : 1; i < tok.size(); i += allTokens ? 1 : 2) { |
129 | S t = tok.get(i); |
130 | S x = p.read(history); |
131 | boolean correct = t.equals(x); |
132 | total += t.length(); |
133 | if (correct) { |
134 | predicted.add(i); |
135 | points += t.length(); |
136 | } |
137 | history.add(t); |
138 | int percent = roundUpTo(10, (int) (i*100L/tok.size())); |
139 | if (percent > lastPercent) { |
140 | print("Learning " + percent + "% done."); |
141 | lastPercent = percent; |
142 | } |
143 | } |
144 | double score = points*100.0/total; |
145 | collector.add(p, score); |
146 | } |
147 | |
148 | static void showColoredText() ctex { |
149 | JFrame jf = new JFrame("Predicted = green"); |
150 | Container cp = jf.getContentPane(); |
151 | |
152 | JTextPane pane = new JTextPane(); |
153 | //pane.setFont(loadFont("#1000993", 24)); |
154 | Document doc = pane.getStyledDocument(); |
155 | |
156 | int i = tok.size(), len = 0; |
157 | while (len <= maxCharsGUI && i > 0) { |
158 | --i; |
159 | len += tok.get(i).length(); |
160 | } |
161 | |
162 | for (; i < tok.size(); i++) { |
163 | if (tok.get(i).length() == 0) continue; |
164 | boolean green = predicted.contains(i); |
165 | SimpleAttributeSet set = new SimpleAttributeSet(); |
166 | StyleConstants.setForeground(set, green ? Color.green : Color.gray); |
167 | doc.insertString(doc.getLength(), tok.get(i), set); |
168 | } |
169 | |
170 | JScrollPane scrollPane = new JScrollPane(pane); |
171 | cp.add(scrollPane, BorderLayout.CENTER); |
172 | |
173 | jf.setBounds(100, 100, 600, 600); |
174 | jf.setVisible(true); |
175 | } |
176 | |
177 | !include #1000989 // SnippetDB |
178 | |
179 | static L<S> makeCorpusJavaTok() { |
180 | S name = getSnippetTitle(corpusID); |
181 | if (name.toLowerCase().indexOf(".zip") >= 0) |
182 | return makeCorpus_zip(); |
183 | else |
184 | return makeCorpus_mysqldump(); |
185 | } |
186 | |
187 | static L<S> makeCorpus_zip() ctex { |
188 | ZipFile zipFile = new ZipFile(loadLibrary(corpusID)); |
189 | Enumeration entries = zipFile.entries(); |
190 | new L<S> tok; |
191 | |
192 | while (entries.hasMoreElements()) { |
193 | ZipEntry entry = (ZipEntry) entries.nextElement(); |
194 | //System.out.println("File found: " + entry.getName()); |
195 | |
196 | InputStream fin = zipFile.getInputStream(entry); |
197 | // TODO: try to skip binary files? |
198 | |
199 | InputStreamReader reader = new InputStreamReader(fin, "UTF-8"); |
200 | new StringBuilder builder; |
201 | BufferedReader bufferedReader = new BufferedReader(reader); |
202 | String line; |
203 | while ((line = bufferedReader.readLine()) != null) |
204 | builder.append(line).append('\n'); |
205 | fin.close(); |
206 | |
207 | new StringBuilder buf; |
208 | buf.append("\n== File: " + entry.getName()); |
209 | buf.append("\n==\n"); |
210 | buf.append(builder.toString()).append("\n"); |
211 | if (tok.size() != 0) tok.remove(tok.size()-1); |
212 | tok.addAll(javaTok(buf.toString())); |
213 | } |
214 | |
215 | zipFile.close(); |
216 | return internAll(tok); |
217 | } |
218 | |
219 | static L<S> makeCorpus_mysqldump() { |
220 | SnippetDB db = new SnippetDB(corpusID); |
221 | List<List<S>> rows = db.rowsOrderedBy("sn_created"); |
222 | new L<S> tok; |
223 | for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) { |
224 | new StringBuilder buf; |
225 | S id = db.getField(rows.get(i), "sn_id"); |
226 | S title = db.getField(rows.get(i), "sn_title"); |
227 | S text = db.getField(rows.get(i), "sn_text"); |
228 | buf.append("\n== ID: " + id); |
229 | buf.append("\n== Title: " + title); |
230 | buf.append("\n==\n"); |
231 | buf.append(text).append("\n"); |
232 | if (tok.size() != 0) tok.remove(tok.size()-1); |
233 | tok.addAll(javaTok(buf.toString())); |
234 | ++i; |
235 | } |
236 | return internAll(tok); |
237 | } |
238 | |
239 | static L<S> internAll(L<S> tok) { |
240 | new L<S> l; |
241 | for (S t : tok) |
242 | l.add(t.intern()); |
243 | return l; |
244 | } |
245 | |
246 | static class Collector { |
247 | P winner; |
248 | double bestScore = -1; |
249 | Set<int> predicted; |
250 | |
251 | void add(P p, double score) { |
252 | if (winner == null || score > bestScore) { |
253 | winner = p; |
254 | bestScore = score; |
255 | //S name = shorten(structure(p), 100); |
256 | S name = p.getClass().getName(); |
257 | print("New best score: " + formatDouble(score, 2) + "% (" + name + ")"); |
258 | this.predicted = main.predicted; |
259 | } |
260 | } |
261 | } |
262 | } |
Began life as a copy of #1001000
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1001011 |
Snippet name: | Token prediction, multiple predictors (adding zip support) |
Eternal ID of this version: | #1001011/1 |
Text MD5: | 942f3ed24c4432b998f1e22ebdd4e9fe |
Transpilation MD5: | 78b9d92dea6ca60c5e4296d797492666 |
Author: | stefan |
Category: | |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-09-16 00:58:02 |
Source code size: | 7026 bytes / 262 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 627 / 957 |
Referenced in: | [show references] |