Libraryless. Click here for Pure Java version (2289L/18K/55K).
1 | !747 |
2 | |
3 | abstract class P { |
4 | abstract S read(L<S> tok); |
5 | abstract P derive(); // clone & reset counter for actual use |
6 | } |
7 | |
8 | m { |
9 | static S corpusID = "#1001006"; |
10 | static int numSnippets = 3000; |
11 | static boolean showGUI = true; |
12 | static int maxCharsGUI = 500000; |
13 | |
14 | static Collector collector; |
15 | static L<S> tok; |
16 | static Set<int> predicted; |
17 | |
/**
 * A predictor made of several predictors that are consulted in order.
 * The first non-null answer wins.
 */
static class Chain extends P {
  new L<P> list;

  *() {}
  *(L<P> *list) {}

  // Copy into a fresh ArrayList so that add() keeps working even if
  // asList returns a fixed-size view of the varargs array.
  *(P... a) { list = new ArrayList<P>(asList(a)); }

  /** Appends another predictor; it is consulted after all existing ones. */
  void add(P p) { list.add(p); }

  /**
   * Asks each predictor in turn. Every predictor before the first one that
   * answers is still called, so it gets to see the current token list.
   * Returns null when no predictor has a proposal.
   */
  S read(L<S> tok) {
    for (P p : list) {
      S s = p.read(tok);
      if (s != null) return s;
    }
    return null;
  }

  /** Returns a new Chain holding the derived version of each member, in the same order. */
  P derive() {
    new Chain c;
    for (P p : list)
      c.add(p.derive());
    return c;
  }
}
42 | |
/**
 * N-gram predictor: remembers, for every run of n consecutive tokens it has
 * seen, the token that followed that run most recently.
 */
static class Tuples extends P {
  Map<L<S>,S> map = new HashMap<L<S>,S>();
  int n, seen; // n = context length; seen = how many tokens of the list were already learned

  *(int *n) {
  }

  /**
   * First learns from all tokens appended since the previous call, then looks
   * up the last n tokens. Returns null if fewer than n tokens exist or the
   * context is unknown.
   */
  S read(L<S> tok) {
    // Catch up on tokens not processed yet.
    while (seen < tok.size()) {
      int pos = seen++;          // index of the token being learned
      if (pos >= n) {            // need n predecessors to form a key
        L<S> context = new ArrayList<S>(tok.subList(pos-n, pos));
        map.put(context, tok.get(pos));
      }
    }

    int end = tok.size();
    if (end < n)
      return null;

    L<S> key = new ArrayList<S>(tok.subList(end-n, end));
    return map.get(key);
  }

  // Slow variant: copies the whole map into the new instance.
  P oldDerive() {
    Tuples copy = new Tuples(n);
    copy.map.putAll(map);
    // copy.seen stays 0, which is ok
    return copy;
  }

  // Fast variant: the new instance layers a DerivedHashMap over this map
  // instead of copying it.
  P derive() {
    Tuples derived = new Tuples(n);
    derived.map = new DerivedHashMap<L<S>,S>(map);
    return derived;
  }
}
78 | |
/**
 * A map layered over a base map. Writes go into a private overlay, so the
 * base map is never modified. Reads check the overlay first and fall back to
 * the base map.
 *
 * Only get and put are implemented. entrySet always throws, so the
 * java.util.AbstractMap operations built on entrySet are unusable here.
 */
static class DerivedHashMap<A, B> extends AbstractMap<A, B> {
  Map<A, B> base;
  new HashMap<A, B> additions;

  *(Map<A, B> *base) {}

  /** Overlay value if there is a non-null one, otherwise whatever the base map holds. */
  public B get(Object key) {
    B own = additions.get(key);
    return own != null ? own : base.get(key);
  }

  /** Stores into the overlay only; the return value is the overlay's previous entry. */
  public B put(A key, B value) {
    return additions.put(key, value);
  }

  /** Not supported. */
  public Set<Map.Entry<A,B>> entrySet() {
    throw fail();
  }
}
99 | |
100 | // TODO: Put NewX back in |
101 | |
102 | p { |
103 | tok = makeCorpusJavaTok(numSnippets); |
104 | print("Tokens in corpus: " + tok.size()); |
105 | |
106 | print("Learning..."); |
107 | collector = new Collector; |
108 | /*test(new Tuples(1)); |
109 | test(new Tuples(2)); |
110 | test(new Tuples(3)); |
111 | test(new Tuples(4)); |
112 | test(new Chain(new Tuples(2), new Tuples(1)));*/ |
113 | test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1))); |
114 | |
115 | print("Learning done."); |
116 | if (collector.winner != null && showGUI) { |
117 | predicted = collector.predicted; |
118 | showColoredText(); |
119 | } |
120 | } |
121 | |
122 | // test a predictor |
123 | static void test(P p) { |
124 | predicted = new TreeSet<int>(); |
125 | int points = 0, total = 0, lastPercent = 0; |
126 | new L<S> history; |
127 | for (int i = 1; i < tok.size(); i += 2) { |
128 | S t = tok.get(i); |
129 | S x = p.read(history); |
130 | boolean correct = t.equals(x); |
131 | total += t.length(); |
132 | if (correct) { |
133 | predicted.add(i); |
134 | points += t.length(); |
135 | } |
136 | history.add(t); |
137 | int percent = roundUpTo(10, (int) (i*100L/tok.size())); |
138 | if (percent > lastPercent) { |
139 | print("Learning " + percent + "% done."); |
140 | lastPercent = percent; |
141 | } |
142 | } |
143 | double score = points*100.0/total; |
144 | collector.add(p, score); |
145 | } |
146 | |
147 | static void showColoredText() ctex { |
148 | JFrame jf = new JFrame("Predicted = green"); |
149 | Container cp = jf.getContentPane(); |
150 | |
151 | JTextPane pane = new JTextPane(); |
152 | //pane.setFont(loadFont("#1000993", 24)); |
153 | Document doc = pane.getStyledDocument(); |
154 | |
155 | int i = tok.size(), len = 0; |
156 | while (len <= maxCharsGUI && i > 0) { |
157 | --i; |
158 | len += tok.get(i).length(); |
159 | } |
160 | |
161 | for (; i < tok.size(); i++) { |
162 | if (tok.get(i).length() == 0) continue; |
163 | boolean green = predicted.contains(i); |
164 | SimpleAttributeSet set = new SimpleAttributeSet(); |
165 | StyleConstants.setForeground(set, green ? Color.green : Color.gray); |
166 | doc.insertString(doc.getLength(), tok.get(i), set); |
167 | } |
168 | |
169 | JScrollPane scrollPane = new JScrollPane(pane); |
170 | cp.add(scrollPane, BorderLayout.CENTER); |
171 | |
172 | jf.setBounds(100, 100, 600, 600); |
173 | jf.setVisible(true); |
174 | } |
175 | |
176 | !include #1000989 // SnippetDB |
177 | |
178 | static L<S> makeCorpusJavaTok(int numSnippets) { |
179 | SnippetDB db = new SnippetDB(corpusID); |
180 | List<List<S>> rows = db.rowsOrderedBy("sn_created"); |
181 | new L<S> tok; |
182 | for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) { |
183 | new StringBuilder buf; |
184 | S id = db.getField(rows.get(i), "sn_id"); |
185 | S title = db.getField(rows.get(i), "sn_title"); |
186 | S text = db.getField(rows.get(i), "sn_text"); |
187 | buf.append("\n== ID: " + id); |
188 | buf.append("\n== Title: " + title); |
189 | buf.append("\n==\n"); |
190 | buf.append(text).append("\n"); |
191 | if (tok.size() != 0) tok.remove(tok.size()-1); |
192 | tok.addAll(javaTok(buf.toString())); |
193 | ++i; |
194 | } |
195 | return internAll(tok); |
196 | } |
197 | |
198 | static L<S> internAll(L<S> tok) { |
199 | new L<S> l; |
200 | for (S t : tok) |
201 | l.add(t.intern()); |
202 | return l; |
203 | } |
204 | |
205 | static class Collector { |
206 | P winner; |
207 | double bestScore = -1; |
208 | Set<int> predicted; |
209 | |
210 | void add(P p, double score) { |
211 | if (winner == null || score > bestScore) { |
212 | winner = p; |
213 | bestScore = score; |
214 | //S name = shorten(structure(p), 100); |
215 | S name = p.getClass().getName(); |
216 | print("New best score: " + formatDouble(score, 2) + "% (" + name + ")"); |
217 | this.predicted = main.predicted; |
218 | } |
219 | } |
220 | } |
221 | } |
Began life as a copy of #1000995
download show line numbers debug dex old transpilations
Travelled to 16 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, jtubtzbbkimh, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1001000 |
Snippet name: | Token prediction, multiple predictors (improving architecture) |
Eternal ID of this version: | #1001000/1 |
Text MD5: | 69216550a151d4f135d39f4547654aec |
Transpilation MD5: | 5fd815aa086d3473101aa689540506e8 |
Author: | stefan |
Category: | |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-09-15 20:55:52 |
Source code size: | 5669 bytes / 221 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 860 / 948 |
Referenced in: | [show references] |