Libraryless. Click here for Pure Java version (2382L/19K/57K).
1 | !747 |
2 | |
3 | abstract class P {
|
4 | abstract S read(L<S> tok); |
5 | abstract P derive(); // clone & reset counter for actual use |
6 | } |
7 | |
8 | m {
|
9 | static S corpusID = "#1001010"; |
10 | static int numSnippets = 3000; |
11 | static boolean showGUI = true; |
12 | static int maxCharsGUI = 500000; |
13 | static boolean allTokens = true; |
14 | |
15 | static Collector collector; |
16 | static L<S> tok; |
17 | static Set<int> predicted; |
18 | |
/**
 * Predictor that asks a list of sub-predictors in order and returns the
 * first non-null answer.
 */
static class Chain extends P {
  // Sub-predictors, consulted front to back.
  new L<P> list;

  *() {}

  *(L<P> *list) {}

  // NOTE(review): if asList yields a fixed-size list, add() will fail on
  // chains built through this constructor - confirm against asList.
  *(P... a) { list = asList(a); }

  void add(P p) { list.add(p); }

  /**
   * Queries the sub-predictors in list order. Stops at the first one that
   * returns a non-null token; later sub-predictors are not called this round.
   */
  S read(L<S> tok) {
    for (int i = 0; i < list.size(); i++) {
      S guess = list.get(i).read(tok);
      if (guess != null)
        return guess;
    }
    return null;
  }

  /** Builds a new Chain holding the derive()d version of every sub-predictor. */
  P derive() {
    Chain copy = new Chain();
    for (int i = 0; i < list.size(); i++)
      copy.add(list.get(i).derive());
    return copy;
  }
}
43 | |
/**
 * N-gram predictor: remembers, for every run of n consecutive tokens it has
 * seen, the token that followed that run most recently.
 */
static class Tuples extends P {
  // n-token context -> token that followed it (later sightings overwrite earlier ones)
  Map<L<S>,S> map = new HashMap<L<S>,S>();
  // n: context length; seen: how many tokens of the input have been learned already
  int n, seen;

  *(int *n) {
  }

  /**
   * First learns from all tokens appended since the previous call, then
   * looks up the last n tokens.
   *
   * @return the remembered follower of the last n tokens, or null when
   *         fewer than n tokens exist or the context is unknown
   */
  S read(L<S> tok) {
    // Catch up: 'end' is the number of tokens consumed after this step.
    for (int end = seen + 1; end <= tok.size(); end++) {
      seen = end;
      if (end > n) {
        L<S> context = new ArrayList<S>(tok.subList(end-n-1, end-1));
        map.put(context, tok.get(end-1));
      }
    }

    int size = tok.size();
    if (size < n)
      return null;
    return map.get(new ArrayList<S>(tok.subList(size-n, size)));
  }

  // slow...
  // Full copy of the learned map into a new instance.
  P oldDerive() {
    Tuples copy = new Tuples(n);
    copy.map.putAll(map);
    // copy.seen == 0 which is ok
    return copy;
  }

  // fast!
  // New instance whose map is a DerivedHashMap layered over this one's map.
  P derive() {
    Tuples copy = new Tuples(n);
    copy.map = new DerivedHashMap<L<S>,S>(map);
    return copy;
  }
}
79 | |
/**
 * Overlay map: lookups consult the local additions first and fall through
 * to the base map; writes go to the additions only, so this class never
 * modifies the base map.
 *
 * Only get, containsKey and put are supported. entrySet() throws, and so
 * does every inherited AbstractMap operation that is implemented on top of
 * it (size, iteration, remove, toString, ...).
 */
static class DerivedHashMap<A, B> extends AbstractMap<A, B> {
  // Underlying map; only read from here.
  Map<A, B> base;
  // Entries written through this view; they shadow entries of base.
  new HashMap<A, B> additions;

  *(Map<A, B> *base) {}

  /** Returns the value from the additions if present (non-null), else from base. */
  public B get(Object key) {
    B b = additions.get(key);
    if (b != null) return b;
    return base.get(key);
  }

  /**
   * Answered directly from the two maps; the inherited implementation
   * would go through entrySet(), which always throws here.
   */
  public boolean containsKey(Object key) {
    return additions.containsKey(key) || base.containsKey(key);
  }

  /**
   * Stores the mapping in the additions.
   *
   * @return the value previously visible for key through this view
   *         (from the additions, otherwise from base), or null if none
   */
  public B put(A key, B value) {
    B old = additions.put(key, value);
    // No earlier overlay entry: the previously visible value, if any, came from base.
    return old != null ? old : base.get(key);
  }

  /** Not supported by this view. */
  public Set<Map.Entry<A,B>> entrySet() {
    throw fail();
  }
}
100 | |
101 | // TODO: Put NewX back in |
102 | |
103 | p {
|
104 | tok = makeCorpusJavaTok(); |
105 | print("Tokens in corpus: " + tok.size());
|
106 | |
107 | print("Learning...");
|
108 | collector = new Collector; |
109 | /*test(new Tuples(1)); |
110 | test(new Tuples(2)); |
111 | test(new Tuples(3)); |
112 | test(new Tuples(4)); |
113 | test(new Chain(new Tuples(2), new Tuples(1)));*/ |
114 | test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1))); |
115 | |
116 | print("Learning done.");
|
117 | if (collector.winner != null && showGUI) {
|
118 | predicted = collector.predicted; |
119 | showColoredText(); |
120 | } |
121 | } |
122 | |
123 | // test a predictor |
124 | static void test(P p) {
|
125 | predicted = new TreeSet<int>(); |
126 | int points = 0, total = 0, lastPercent = 0; |
127 | new L<S> history; |
128 | for (int i = allTokens ? 0 : 1; i < tok.size(); i += allTokens ? 1 : 2) {
|
129 | S t = tok.get(i); |
130 | S x = p.read(history); |
131 | boolean correct = t.equals(x); |
132 | total += t.length(); |
133 | if (correct) {
|
134 | predicted.add(i); |
135 | points += t.length(); |
136 | } |
137 | history.add(t); |
138 | int percent = roundUpTo(10, (int) (i*100L/tok.size())); |
139 | if (percent > lastPercent) {
|
140 | print("Learning " + percent + "% done.");
|
141 | lastPercent = percent; |
142 | } |
143 | } |
144 | double score = points*100.0/total; |
145 | collector.add(p, score); |
146 | } |
147 | |
148 | static void showColoredText() ctex {
|
149 | JFrame jf = new JFrame("Predicted = green");
|
150 | Container cp = jf.getContentPane(); |
151 | |
152 | JTextPane pane = new JTextPane(); |
153 | //pane.setFont(loadFont("#1000993", 24));
|
154 | Document doc = pane.getStyledDocument(); |
155 | |
156 | int i = tok.size(), len = 0; |
157 | while (len <= maxCharsGUI && i > 0) {
|
158 | --i; |
159 | len += tok.get(i).length(); |
160 | } |
161 | |
162 | for (; i < tok.size(); i++) {
|
163 | if (tok.get(i).length() == 0) continue; |
164 | boolean green = predicted.contains(i); |
165 | SimpleAttributeSet set = new SimpleAttributeSet(); |
166 | StyleConstants.setForeground(set, green ? Color.green : Color.gray); |
167 | doc.insertString(doc.getLength(), tok.get(i), set); |
168 | } |
169 | |
170 | JScrollPane scrollPane = new JScrollPane(pane); |
171 | cp.add(scrollPane, BorderLayout.CENTER); |
172 | |
173 | jf.setBounds(100, 100, 600, 600); |
174 | jf.setVisible(true); |
175 | } |
176 | |
177 | !include #1000989 // SnippetDB |
178 | |
179 | static L<S> makeCorpusJavaTok() {
|
180 | S name = getSnippetTitle(corpusID); |
181 | if (name.toLowerCase().indexOf(".zip") >= 0)
|
182 | return makeCorpus_zip(); |
183 | else |
184 | return makeCorpus_mysqldump(); |
185 | } |
186 | |
187 | static L<S> makeCorpus_zip() ctex {
|
188 | ZipFile zipFile = new ZipFile(loadLibrary(corpusID)); |
189 | Enumeration entries = zipFile.entries(); |
190 | new L<S> tok; |
191 | |
192 | while (entries.hasMoreElements()) {
|
193 | ZipEntry entry = (ZipEntry) entries.nextElement(); |
194 | //System.out.println("File found: " + entry.getName());
|
195 | |
196 | InputStream fin = zipFile.getInputStream(entry); |
197 | // TODO: try to skip binary files? |
198 | |
199 | InputStreamReader reader = new InputStreamReader(fin, "UTF-8"); |
200 | new StringBuilder builder; |
201 | BufferedReader bufferedReader = new BufferedReader(reader); |
202 | String line; |
203 | while ((line = bufferedReader.readLine()) != null) |
204 | builder.append(line).append('\n');
|
205 | fin.close(); |
206 | |
207 | new StringBuilder buf; |
208 | buf.append("\n== File: " + entry.getName());
|
209 | buf.append("\n==\n");
|
210 | buf.append(builder.toString()).append("\n");
|
211 | if (tok.size() != 0) tok.remove(tok.size()-1); |
212 | tok.addAll(javaTok(buf.toString())); |
213 | } |
214 | |
215 | zipFile.close(); |
216 | return internAll(tok); |
217 | } |
218 | |
219 | static L<S> makeCorpus_mysqldump() {
|
220 | SnippetDB db = new SnippetDB(corpusID); |
221 | List<List<S>> rows = db.rowsOrderedBy("sn_created");
|
222 | new L<S> tok; |
223 | for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) {
|
224 | new StringBuilder buf; |
225 | S id = db.getField(rows.get(i), "sn_id"); |
226 | S title = db.getField(rows.get(i), "sn_title"); |
227 | S text = db.getField(rows.get(i), "sn_text"); |
228 | buf.append("\n== ID: " + id);
|
229 | buf.append("\n== Title: " + title);
|
230 | buf.append("\n==\n");
|
231 | buf.append(text).append("\n");
|
232 | if (tok.size() != 0) tok.remove(tok.size()-1); |
233 | tok.addAll(javaTok(buf.toString())); |
234 | ++i; |
235 | } |
236 | return internAll(tok); |
237 | } |
238 | |
239 | static L<S> internAll(L<S> tok) {
|
240 | new L<S> l; |
241 | for (S t : tok) |
242 | l.add(t.intern()); |
243 | return l; |
244 | } |
245 | |
246 | static class Collector {
|
247 | P winner; |
248 | double bestScore = -1; |
249 | Set<int> predicted; |
250 | |
251 | void add(P p, double score) {
|
252 | if (winner == null || score > bestScore) {
|
253 | winner = p; |
254 | bestScore = score; |
255 | //S name = shorten(structure(p), 100); |
256 | S name = p.getClass().getName(); |
257 | print("New best score: " + formatDouble(score, 2) + "% (" + name + ")");
|
258 | this.predicted = main.predicted; |
259 | } |
260 | } |
261 | } |
262 | } |
Began life as a copy of #1001000
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1001011 |
| Snippet name: | Token prediction, multiple predictors (adding zip support) |
| Eternal ID of this version: | #1001011/1 |
| Text MD5: | 942f3ed24c4432b998f1e22ebdd4e9fe |
| Transpilation MD5: | 78b9d92dea6ca60c5e4296d797492666 |
| Author: | stefan |
| Category: | |
| Type: | JavaX source code |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2015-09-16 00:58:02 |
| Source code size: | 7026 bytes / 262 lines |
| Pitched / IR pitched: | No / Yes |
| Views / Downloads: | 884 / 1266 |
| Referenced in: | [show references] |