!747 m { static S corpusID = "#1001010"; static int numSnippets = 3000; static boolean showGUI = true; static int maxCharsGUI = 500000; static boolean allTokens = true; static Collector collector; static L files; static Set predicted; // a file to learn from static class F { String id, name; L tok; } // a predictor static abstract class P { abstract S read(S file, L tok); abstract P derive(); // clone & reset counter for actual use } static class Chain extends P { new L

list; *() {} *(L

*list) {} *(P... a) { list = asList(a); } void add(P p) { list.add(p); } S read(S file, L tok) { for (P p : list) { S s = p.read(file, tok); if (s != null) return s; } return null; } P derive() { new Chain c; for (P p : list) c.add(p.derive()); return c; } } static class Tuples extends P { Map,S> map = new HashMap,S>(); int n, seen; S file; *(int *n) { } S read(S file, L tok) { if (!eq(file, this.file)) { seen = 0; this.file = file; } while (tok.size() > seen) { ++seen; if (seen > n) map.put(new ArrayList(tok.subList(seen-n-1, seen-1)), tok.get(seen-1)); } if (tok.size() >= n) return map.get(new ArrayList(tok.subList(tok.size()-n, tok.size()))); return null; } // slow... P oldDerive() { Tuples t = new Tuples(n); t.map.putAll(map); // t.seen == 0 which is ok return t; } // fast! P derive() { Tuples t = new Tuples(n); t.map = new DerivedHashMap,S>(map); return t; } } static class DerivedHashMap extends AbstractMap { Map base; new HashMap additions; *(Map *base) {} public B get(Object key) { B b = additions.get(key); if (b != null) return b; return base.get(key); } public B put(A key, B value) { return additions.put(key, value); } public Set> entrySet() { throw fail(); } } // TODO: Put NewX back in p { files = makeCorpus(); print("Files in corpus: " + files.size()); print("Learning..."); collector = new Collector; test(new Tuples(1)); //test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1))); print("Learning done."); /*if (collector.winner != null && showGUI) { predicted = collector.predicted; showColoredText(); }*/ } // train & evaluate a predictor static void test(P p) { //predicted = new TreeSet(); int points = 0, total = 0, lastPercent = 0; for (int ii = 0; ii < files.size(); ii++) { F f = files.get(ii); new L history; for (int i = allTokens ? 0 : 1; i < f.tok.size(); i += allTokens ? 1 : 2) { S t = f.tok.get(i); S x = p.read(f.name, history); boolean correct = t.equals(x); total += t.length(); if (correct) { //predicted.add(i); points += t.length(); } history.add(t); } int percent = roundUpTo(10, (int) (ii*100L/files.size())); if (percent > lastPercent) { print("Learning " + percent + "% done."); lastPercent = percent; } } double score = points*100.0/total; collector.add(p, score); } !include #1000989 // SnippetDB static L makeCorpus() { S name = getSnippetTitle(corpusID); if (name.toLowerCase().indexOf(".zip") >= 0) return makeCorpus_zip(); else return makeCorpus_mysqldump(); } static L makeCorpus_zip() ctex { new L files; ZipFile zipFile = new ZipFile(loadLibrary(corpusID)); Enumeration entries = zipFile.entries(); while (entries.hasMoreElements()) { ZipEntry entry = (ZipEntry) entries.nextElement(); //System.out.println("File found: " + entry.getName()); InputStream fin = zipFile.getInputStream(entry); // TODO: try to skip binary files? InputStreamReader reader = new InputStreamReader(fin, "UTF-8"); new StringBuilder builder; BufferedReader bufferedReader = new BufferedReader(reader); String line; while ((line = bufferedReader.readLine()) != null) builder.append(line).append('\n'); fin.close(); S text = builder.toString(); new F f; f.name = entry.getName(); f.tok = internAll(javaTok(text)); files.add(f); } zipFile.close(); return files; } static L makeCorpus_mysqldump() { new L files; SnippetDB db = new SnippetDB(corpusID); List> rows = db.rowsOrderedBy("sn_created"); for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) { new F f; f.id = db.getField(rows.get(i), "sn_id"); f.name = db.getField(rows.get(i), "sn_title"); S text = db.getField(rows.get(i), "sn_text"); f.tok = internAll(javaTok(text)); files.add(f); ++i; } return files; } static class Collector { P winner; double bestScore = -1; Set predicted; void add(P p, double score) { if (winner == null || score > bestScore) { winner = p; bestScore = score; //S name = shorten(structure(p), 100); S name = p.getClass().getName(); print("New best score: " + formatDouble(score, 2) + "% (" + name + ")"); this.predicted = main.predicted; } } } }