Libraryless. Click here for Pure Java version (2382L/19K/57K).
!747 abstract class P { abstract S read(L<S> tok); abstract P derive(); // clone & reset counter for actual use } m { static S corpusID = "#1001010"; static int numSnippets = 3000; static boolean showGUI = true; static int maxCharsGUI = 500000; static boolean allTokens = true; static Collector collector; static L<S> tok; static Set<int> predicted; static class Chain extends P { new L<P> list; *() {} *(L<P> *list) {} *(P... a) { list = asList(a); } void add(P p) { list.add(p); } S read(L<S> tok) { for (P p : list) { S s = p.read(tok); if (s != null) return s; } return null; } P derive() { new Chain c; for (P p : list) c.add(p.derive()); return c; } } static class Tuples extends P { Map<L<S>,S> map = new HashMap<L<S>,S>(); int n, seen; *(int *n) { } S read(L<S> tok) { while (tok.size() > seen) { ++seen; if (seen > n) map.put(new ArrayList<S>(tok.subList(seen-n-1, seen-1)), tok.get(seen-1)); } if (tok.size() >= n) return map.get(new ArrayList<S>(tok.subList(tok.size()-n, tok.size()))); return null; } // slow... P oldDerive() { Tuples t = new Tuples(n); t.map.putAll(map); // t.seen == 0 which is ok return t; } // fast! P derive() { Tuples t = new Tuples(n); t.map = new DerivedHashMap<L<S>,S>(map); return t; } } static class DerivedHashMap<A, B> extends AbstractMap<A, B> { Map<A, B> base; new HashMap<A, B> additions; *(Map<A, B> *base) {} public B get(Object key) { B b = additions.get(key); if (b != null) return b; return base.get(key); } public B put(A key, B value) { return additions.put(key, value); } public Set<Map.Entry<A,B>> entrySet() { throw fail(); } } // TODO: Put NewX back in p { tok = makeCorpusJavaTok(); print("Tokens in corpus: " + tok.size()); print("Learning..."); collector = new Collector; /*test(new Tuples(1)); test(new Tuples(2)); test(new Tuples(3)); test(new Tuples(4)); test(new Chain(new Tuples(2), new Tuples(1)));*/ test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1))); print("Learning done."); if (collector.winner != null && showGUI) { predicted = collector.predicted; showColoredText(); } } // test a predictor static void test(P p) { predicted = new TreeSet<int>(); int points = 0, total = 0, lastPercent = 0; new L<S> history; for (int i = allTokens ? 0 : 1; i < tok.size(); i += allTokens ? 1 : 2) { S t = tok.get(i); S x = p.read(history); boolean correct = t.equals(x); total += t.length(); if (correct) { predicted.add(i); points += t.length(); } history.add(t); int percent = roundUpTo(10, (int) (i*100L/tok.size())); if (percent > lastPercent) { print("Learning " + percent + "% done."); lastPercent = percent; } } double score = points*100.0/total; collector.add(p, score); } static void showColoredText() ctex { JFrame jf = new JFrame("Predicted = green"); Container cp = jf.getContentPane(); JTextPane pane = new JTextPane(); //pane.setFont(loadFont("#1000993", 24)); Document doc = pane.getStyledDocument(); int i = tok.size(), len = 0; while (len <= maxCharsGUI && i > 0) { --i; len += tok.get(i).length(); } for (; i < tok.size(); i++) { if (tok.get(i).length() == 0) continue; boolean green = predicted.contains(i); SimpleAttributeSet set = new SimpleAttributeSet(); StyleConstants.setForeground(set, green ? Color.green : Color.gray); doc.insertString(doc.getLength(), tok.get(i), set); } JScrollPane scrollPane = new JScrollPane(pane); cp.add(scrollPane, BorderLayout.CENTER); jf.setBounds(100, 100, 600, 600); jf.setVisible(true); } !include #1000989 // SnippetDB static L<S> makeCorpusJavaTok() { S name = getSnippetTitle(corpusID); if (name.toLowerCase().indexOf(".zip") >= 0) return makeCorpus_zip(); else return makeCorpus_mysqldump(); } static L<S> makeCorpus_zip() ctex { ZipFile zipFile = new ZipFile(loadLibrary(corpusID)); Enumeration entries = zipFile.entries(); new L<S> tok; while (entries.hasMoreElements()) { ZipEntry entry = (ZipEntry) entries.nextElement(); //System.out.println("File found: " + entry.getName()); InputStream fin = zipFile.getInputStream(entry); // TODO: try to skip binary files? InputStreamReader reader = new InputStreamReader(fin, "UTF-8"); new StringBuilder builder; BufferedReader bufferedReader = new BufferedReader(reader); String line; while ((line = bufferedReader.readLine()) != null) builder.append(line).append('\n'); fin.close(); new StringBuilder buf; buf.append("\n== File: " + entry.getName()); buf.append("\n==\n"); buf.append(builder.toString()).append("\n"); if (tok.size() != 0) tok.remove(tok.size()-1); tok.addAll(javaTok(buf.toString())); } zipFile.close(); return internAll(tok); } static L<S> makeCorpus_mysqldump() { SnippetDB db = new SnippetDB(corpusID); List<List<S>> rows = db.rowsOrderedBy("sn_created"); new L<S> tok; for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) { new StringBuilder buf; S id = db.getField(rows.get(i), "sn_id"); S title = db.getField(rows.get(i), "sn_title"); S text = db.getField(rows.get(i), "sn_text"); buf.append("\n== ID: " + id); buf.append("\n== Title: " + title); buf.append("\n==\n"); buf.append(text).append("\n"); if (tok.size() != 0) tok.remove(tok.size()-1); tok.addAll(javaTok(buf.toString())); ++i; } return internAll(tok); } static L<S> internAll(L<S> tok) { new L<S> l; for (S t : tok) l.add(t.intern()); return l; } static class Collector { P winner; double bestScore = -1; Set<int> predicted; void add(P p, double score) { if (winner == null || score > bestScore) { winner = p; bestScore = score; //S name = shorten(structure(p), 100); S name = p.getClass().getName(); print("New best score: " + formatDouble(score, 2) + "% (" + name + ")"); this.predicted = main.predicted; } } } }
Began life as a copy of #1001000
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, teubizvjbppd, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1001011 |
Snippet name: | Token prediction, multiple predictors (adding zip support) |
Eternal ID of this version: | #1001011/1 |
Text MD5: | 942f3ed24c4432b998f1e22ebdd4e9fe |
Transpilation MD5: | 78b9d92dea6ca60c5e4296d797492666 |
Author: | stefan |
Category: | |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2015-09-16 00:58:02 |
Source code size: | 7026 bytes / 262 lines |
Pitched / IR pitched: | No / Yes |
Views / Downloads: | 626 / 955 |
Referenced in: | [show references] |