list; *() {} *(L
*list) {}
// Builds a chain from the given predictors, in argument order.
// NOTE(review): the list comes from the asList helper (not shown here); whether
// it supports later add() calls depends on that helper - confirm.
*(P... a) {
  list = asList(a);
}
// Appends another predictor to the end of the chain.
void add(P p) {
  list.add(p);
}
// Asks each chained predictor in order and returns the first non-null answer.
// Returns null when every predictor answers null (or the chain is empty).
S read(S file, L tok) {
  S found = null;
  for (P p : list) {
    found = p.read(file, tok);
    if (found != null)
      break; // first predictor with an answer wins; later ones are not consulted
  }
  return found;
}
P derive() {
new Chain c;
for (P p : list)
c.add(p.derive());
return c;
}
P clear() {
new Chain c;
for (P p : list)
c.add(p.clear());
return c;
}
}
static class Tuples extends P {
Map tok) {
prepare(file);
while (tok.size() > seen) {
++seen;
if (seen > n)
map.put(new ArrayList(tok.subList(seen-n-1, seen-1)), tok.get(seen-1));
}
if (tok.size() >= n)
return map.get(new ArrayList(tok.subList(tok.size()-n, tok.size())));
return null;
}
P derive() {
Tuples t = new Tuples(n);
t.map = new DerivedHashMap makeMapPrefix(L tok1, L tok2) {
if (tok1.size() < tok2.size()) return null;
new Map map;
for (int i = 1; i < tok2.size(); i += 2) {
S t1 = tok1.get(i), t2 = tok2.get(i);
if (!t1.equals(t2)) {
S v = map.get(t1);
if (v == null)
map.put(t1, t2);
else if (!v.equals(t2))
return null; // match fail
}
}
// match succeeds
return map;
}
// TODO: code tokens only
static class Patterns extends P {
Map tok) {
prepare(file);
while (tok.size() > seen) {
++seen;
if (seen > n)
put(new ArrayList(tok.subList(seen-n-1, seen-1)), tok.get(seen-1));
}
if (tok.size() >= n) {
L l = new ArrayList(tok.subList(tok.size()-n, tok.size()));
for (L pl : map.keySet()) {
S pr = map.get(pl);
print("pl: " + structure(pl) + ", l: " + structure(l));
Map m = makeMapPrefix(pl, l);
if (m != null) {
S result = m.get(pr);
print("map: " + structure(m) + ", result: " + quote(result));
ret result;
}
}
}
return null;
}
// Stores the mapping l -> r in the pattern map, but only when isPattern(l, r)
// accepts it. Accepted entries are also printed for debugging.
void put(L l, S r) {
  if (!isPattern(l, r))
    return; // not a pattern: nothing is stored, nothing is printed

  // Debug output shows the tokens of l followed by r as one list.
  new L shown;
  shown.addAll(l);
  shown.add(r);
  print("pattern: " + structure(shown));

  map.put(l, r);
}
// True when r already occurs in l and r is an "interesting" token
// (see interestingToken).
// An earlier, disabled variant accepted any repetition among the tokens of l and r:
/*new Set set;
set.addAll(l);
set.add(r);
return set.size() < l.size()+1;*/
boolean isPattern(L l, S r) {
  if (!l.contains(r))
    return false;
  return interestingToken(r);
}
boolean interestingToken(S r) {
//return !r.trim().equals("");
for (int i = 0; i < r.length(); i++)
if (Character.isLetter(r.charAt(i)))
ret true;
ret false;
}
P derive() {
Patterns t = new Patterns(n);
t.map = new DerivedHashMap tok) {
if (!eq(file, this.file)) {
seen = 0;
this.file = file;
node = tree;
}
if (!nonmod) while (tok.size() > seen) {
S t = tok.get(seen++);
Node child = node.find(t);
if (child == null)
node.next.add(child = new Node(t));
child.count++;
node = child;
}
Node n = node.bestNext();
ret n != null ? n.token : null;
}
// it's a hack - derived predictor doesn't learn
P derive() {
//return (P) main.clone(this);
new StartTree p;
p.nonmod = true;
p.tree = tree;
return p;
}
P clear() {
return new StartTree;
}
}
// JavaX "p { ... }" block - presumably the program's main entry point (confirm
// against the JavaX translator). Builds the corpus, trains and scores the
// selected predictor, then optionally opens the GUI.
p {
files = makeCorpus();
print("Files in corpus: " + files.size());
print("Learning...");
// test() reports each predictor and its score to this collector.
collector = new Collector;
// Alternative predictor configurations, currently disabled:
//test(new Tuples(1));
/*test(new StartTree);
test(new Chain(new Tuples(4), new Tuples(3), new Tuples(2), new Tuples(1), new StartTree));*/
//test(new Patterns(6));
test(new Patterns(9));
print("Learning done.");
printVMSize();
// The window is only opened when the collector holds a winner and showGUI is set.
if (collector.winner != null && showGUI)
window();
}
static int points = 0, total = 0;
// train & evaluate a predictor
static void test(P p) {
int lastPercent = 0;
predicted = new HashMap;
points = 0;
total = 0;
for (int ii = 0; ii < files.size(); ii++) {
F f = files.get(ii);
testFile(p, f);
int percent = roundUpTo(10, (int) (ii*100L/files.size()));
if (percent > lastPercent) {
print("Learning " + percent + "% done.");
lastPercent = percent;
}
}
double score = points*100.0/total;
collector.add(p, score);
}
static void testFile(P p, F f) {
new TreeSet history;
for (int i = allTokens ? 0 : 1; i < f.tok.size(); i += allTokens ? 1 : 2) {
S t = f.tok.get(i);
S x = p.read(f.name, history);
boolean correct = t.equals(x);
total += t.length();
if (correct) {
pred.add(i);
points += t.length();
}
history.add(t);
}
predicted.put(f, pred);
}
!include #1000989 // SnippetDB
static L> rows = db.rowsOrderedBy("sn_created");
for (int i = 0; i < Math.min(rows.size(), numSnippets); i++) {
new F f;
f.id = db.getField(rows.get(i), "sn_id");
f.name = db.getField(rows.get(i), "sn_title");
S text = db.getField(rows.get(i), "sn_text");
f.tok = internAll(javaTok(text));
files.add(f);
++i;
}
return files;
}
static class Collector {
P winner;
double bestScore = -1;
Map
tok = f.tok;
int i = tok.size(), len = 0;
while (len <= maxCharsGUI && i > 0) {
--i;
len += tok.get(i).length();
}
for (; i < tok.size(); i++) {
if (tok.get(i).length() == 0) continue;
boolean green = pred.contains(i);
SimpleAttributeSet set = new SimpleAttributeSet();
StyleConstants.setForeground(set, green ? Color.green : Color.gray);
doc.insertString(doc.getLength(), tok.get(i), set);
}
pane.setDocument(doc);
double score = getScore(pred, tok);
btnNext.setText(f.name + " (" + (ii+1) + "/" + files.size() + ") - " + (int) score + " %");
}
}
final new X x;
btnNext.addActionListener(actionListener {
x.y();
});
cp.add(btnNext, BorderLayout.NORTH);
x.y();
jf.setBounds(100, 100, 600, 600);
jf.setVisible(true);
}
!include #1001032 // clone function
static double getScore(Set tok) {
int total = 0, score = 0;
for (int i = 0; i < tok.size(); i++) {
int n = tok.get(i).length();
total += n;
if (pred.contains(i))
score += n;
}
ret score*100.0/total;
}
}