import javax.imageio.*;
import java.awt.image.*;
import java.awt.event.*;
import java.awt.*;
import java.security.NoSuchAlgorithmException;
import java.security.MessageDigest;
import java.lang.reflect.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.*;
import java.util.concurrent.*;
import java.util.regex.*;
import java.util.List;
import java.util.zip.*;
import java.util.*;

/**
 * Loads an HTML snippet, tokenizes it coarsely into text/tag tokens and runs a
 * tiny command script against it (currently only "find table").
 *
 * Token lists produced by {@link #javaTok} and {@link #htmlcoarsetok} alternate:
 * even indices hold "filler" (whitespace / text), odd indices hold the actual
 * tokens (code tokens / tags). Both always have an odd number of elements.
 */
public class main {
  static List<String> tok; // list of tokens in HTML document
  static List<String> table; // list of tokens in table
  static List<List<String>> rows; // for every row, list of tokens in row
  static List<List<String>> data; // for every row, for every cell, inner data

  private static final Pattern INTEGER_PATTERN = Pattern.compile("\\-?\\d+");
  private static final Pattern CHARSET_PATTERN =
    Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

  /**
   * Entry point. Loads the snippet given as first argument (default "#3000025"),
   * tokenizes it and executes the built-in script.
   */
  public static void main(String[] args) throws Exception {
    String htmlID = "#3000025";
    if (args.length != 0) htmlID = args[0];

    String script = "\n find table\n ";

    String html = loadSnippet(htmlID);
    tok = htmlcoarsetok(html);

    for (String cmd : splitScript(script)) {
      List<String> c = javaTok(cmd);
      if (c.size() == 1) continue; // empty command (only filler)
      print("cmd: " + structure(c));

      if (cmdMatch("find table", c) != null) {
        findTable();
        if (table == null) fail("No table");
      }
    }
  }

  /**
   * Splits a script into commands at newlines. Only whitespace tokens (even
   * indices of the javaTok result) can end a command, so a newline inside a
   * quoted or [[...]] string stays part of its command.
   *
   * @return one entry per line; entries may be empty
   */
  static List<String> splitScript(String script) {
    List<String> tokens = javaTok(script);
    List<String> result = new ArrayList<String>();
    StringBuilder current = new StringBuilder();
    for (int i = 0; i < tokens.size(); i++) {
      String t = tokens.get(i);
      boolean isWhitespace = (i % 2) == 0;
      if (isWhitespace && t.indexOf('\n') >= 0) {
        // the whitespace token containing the line break is dropped entirely
        result.add(current.toString());
        current.setLength(0);
      } else {
        current.append(t);
      }
    }
    result.add(current.toString());
    return result;
  }

  /** Tokenizes pat with javaTok and matches it against cmd via match2. */
  static String[] cmdMatch(String pat, List<String> cmd) {
    return match2(javaTok(pat), cmd);
  }

  /**
   * Finds the first table tag that has a closing /table tag after it, stores
   * the token range (including the text tokens around it) in {@link #table}
   * and extracts its rows. Leaves {@link #table} untouched if nothing is found.
   */
  static void findTable() {
    print("Finding table.");
    for (int i = 1; i < tok.size(); i += 2) {
      if (!isTag(tok.get(i), "table")) continue;
      for (int j = i + 2; j < tok.size(); j += 2) {
        if (isTag(tok.get(j), "/table")) {
          print("Table found!");
          // clamp in case the token list does not end with a text token
          table = tok.subList(i - 1, Math.min(j + 2, tok.size()));
          findRows();
          return;
        }
      }
    }
  }

  /**
   * Fills {@link #rows} and {@link #data} from {@link #table}: every tr ... /tr
   * range becomes a row, and getData extracts its cell contents.
   */
  static void findRows() {
    rows = new ArrayList<List<String>>();
    data = new ArrayList<List<String>>();
    int rowStart = 0; // 0 = no open row (tags live at odd indices, so never 0)
    for (int i = 1; i < table.size(); i += 2) {
      String t = table.get(i);
      if (isTag(t, "tr")) {
        rowStart = i;
      } else if (isTag(t, "/tr") && rowStart != 0) {
        List<String> row = table.subList(rowStart - 1, Math.min(i + 2, table.size()));
        rows.add(row);
        data.add(getData(row));
        rowStart = 0; // a stray second /tr must not emit the row again
      }
    }
    print(rows.size() + " row(s)");
    if (!data.isEmpty() && !data.get(0).isEmpty())
      print("Top left cell: " + data.get(0).get(0));
  }

  /**
   * Checks whether token is an HTML tag with the given name (case-insensitive).
   * Pass "/td" etc. for closing tags. The name must be followed by whitespace,
   * '>' or '/', so "tr" does not match a track tag.
   */
  static boolean isTag(String token, String tag) {
    int nameEnd = tag.length() + 1;
    if (token.length() <= nameEnd || token.charAt(0) != '<') return false;
    if (!token.regionMatches(true, 1, tag, 0, tag.length())) return false;
    char next = token.charAt(nameEnd);
    return next == '>' || next == '/' || Character.isWhitespace(next);
  }

  /**
   * Extracts the inner text of every td/th cell in a row.
   *
   * @param row tokens of one table row (tags at odd indices)
   * @return the joined tokens between each opening and closing cell tag
   */
  static List<String> getData(List<String> row) {
    int colStart = 0; // 0 = no open cell
    List<String> cols = new ArrayList<String>();
    for (int i = 1; i < row.size(); i += 2) {
      String t = row.get(i);
      if (isTag(t, "td") || isTag(t, "th")) {
        colStart = i;
      } else if ((isTag(t, "/td") || isTag(t, "/th")) && colStart != 0) {
        cols.add(join(row.subList(colStart + 1, i)));
        colStart = 0; // a stray second closing tag must not emit the cell again
      }
    }
    return cols;
  }

  /**
   * Matches a token list against a pattern token list. "*" in the pattern
   * matches exactly one token; at most one "..." is expanded to as many "*"
   * as needed to make the sizes equal.
   *
   * NOTE(review): javaTok emits '.' as single-character tokens, so a pattern
   * produced by javaTok never contains a "..." token.
   *
   * @return the tokens matched by wildcards, or null if there is no match
   */
  static String[] match2(List<String> pat, List<String> tok) {
    // standard case (no ...)
    int i = pat.indexOf("...");
    if (i < 0) return match2_match(pat, tok);

    pat = new ArrayList<String>(pat); // We're modifying it, so copy first
    pat.set(i, "*");
    while (pat.size() < tok.size()) {
      pat.add(i, "*");
      pat.add(i + 1, ""); // filler token; its content doesn't matter
    }
    return match2_match(pat, tok);
  }

  /** Matches two equally long token lists, comparing only the odd indices. */
  static String[] match2_match(List<String> pat, List<String> tok) {
    if (pat.size() != tok.size()) return null;
    List<String> result = new ArrayList<String>();
    for (int i = 1; i < pat.size(); i += 2) {
      String p = pat.get(i), t = tok.get(i);
      if ("*".equals(p))
        result.add(t);
      else if (!p.equals(t))
        return null;
    }
    return result.toArray(new String[result.size()]);
  }

  /**
   * Simple Java-like tokenizer. The result alternates whitespace (even
   * indices, includes comments, may be empty) and code tokens (odd indices),
   * and always has an odd size. Recognizes quoted strings with backslash
   * escapes, identifiers, integer literals and [[...]] multi-line strings;
   * everything else becomes a one-character token.
   */
  // maybe incomplete, might want to add floating point numbers
  static List<String> javaTok(String s) {
    List<String> tok = new ArrayList<String>();
    int l = s.length();

    int i = 0;
    while (i < l) {
      int j = i;
      char c;
      String cc;

      // scan for whitespace (comments count as whitespace)
      while (j < l) {
        c = s.charAt(j);
        cc = s.substring(j, Math.min(j + 2, l));
        if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
          ++j;
        } else if (cc.equals("/*")) {
          do ++j; while (j < l && !s.substring(j, Math.min(j + 2, l)).equals("*/"));
          j = Math.min(j + 2, l);
        } else if (cc.equals("//")) {
          do ++j; while (j < l && "\r\n".indexOf(s.charAt(j)) < 0);
        } else {
          break;
        }
      }

      tok.add(s.substring(i, j));
      i = j;
      if (i >= l) break;
      c = s.charAt(i);
      cc = s.substring(i, Math.min(i + 2, l));

      // scan for non-whitespace
      if (c == '\'' || c == '"') {
        char opener = c;
        ++j;
        while (j < l) {
          if (s.charAt(j) == opener) {
            ++j;
            break;
          } else if (s.charAt(j) == '\\' && j + 1 < l) {
            j += 2; // skip escaped character
          } else {
            ++j;
          }
        }
      } else if (Character.isJavaIdentifierStart(c)) {
        do ++j; while (j < l && Character.isJavaIdentifierPart(s.charAt(j)));
      } else if (Character.isDigit(c)) {
        do ++j; while (j < l && Character.isDigit(s.charAt(j)));
      } else if (cc.equals("[[")) {
        do ++j; while (j + 1 < l && !s.substring(j, j + 2).equals("]]"));
        j = Math.min(j + 2, l);
      } else {
        ++j;
      }

      tok.add(s.substring(i, j));
      i = j;
    }

    if ((tok.size() % 2) == 0) tok.add("");
    return tok;
  }

  /**
   * Coarse HTML tokenizer. Even indices hold text (may be empty, includes
   * HTML comments), odd indices hold tags from '<' to the next '>'. The
   * result always has an odd size so callers can address the text token
   * after the last tag.
   *
   * NOTE(review): the comment-skipping branch was unreadable in the reviewed
   * text and has been reconstructed as "skip from the comment opener to the
   * comment terminator" - confirm against the upstream version.
   */
  // TODO: process CDATA?
  static List<String> htmlcoarsetok(String s) {
    List<String> tok = new ArrayList<String>();
    int l = s.length();

    int i = 0;
    while (i < l) {
      int j = i;

      // scan for non-tags
      while (j < l) {
        if (s.charAt(j) != '<') {
          ++j; // regular character
        } else if (s.startsWith("<!--", j)) {
          // HTML comment: stays part of the text token
          int end = s.indexOf("-->", j + 4);
          j = end < 0 ? l : end + 3;
        } else {
          break; // it's a tag
        }
      }

      tok.add(s.substring(i, j));
      i = j;
      if (i >= l) break;

      // scan for tag; s.charAt(i) is '<' here
      ++j;
      while (j < l && s.charAt(j) != '>') ++j; // TODO: strings?
      if (j < l) ++j;

      tok.add(s.substring(i, j));
      i = j;
    }

    // always end with a text token (same convention as javaTok)
    if ((tok.size() % 2) == 0) tok.add("");
    return tok;
  }

  static void print() {
    System.out.println();
  }

  static void print(Object o) {
    System.out.println(o);
  }

  static void print(long i) {
    System.out.println(i);
  }

  static boolean preferCached = false;

  /** Loads a snippet by ID ("#123", "123" or a tinybrain.de URL). */
  public static String loadSnippet(String snippetID) throws IOException {
    return loadSnippet(parseSnippetID(snippetID), preferCached);
  }

  public static String loadSnippet(String snippetID, boolean preferCached) throws IOException {
    return loadSnippet(parseSnippetID(snippetID), preferCached);
  }

  /** @throws NumberFormatException if the shortened ID is not a number */
  public static long parseSnippetID(String snippetID) {
    return Long.parseLong(shortenSnippetID(snippetID));
  }

  /** Strips a leading "#" and a leading "http://tinybrain.de/" prefix. */
  private static String shortenSnippetID(String snippetID) {
    if (snippetID.startsWith("#"))
      snippetID = snippetID.substring(1);
    String httpBlaBla = "http://tinybrain.de/";
    if (snippetID.startsWith(httpBlaBla))
      snippetID = snippetID.substring(httpBlaBla.length());
    return snippetID;
  }

  public static boolean isSnippetID(String snippetID) {
    snippetID = shortenSnippetID(snippetID);
    return isInteger(snippetID) && Long.parseLong(snippetID) != 0;
  }

  /** True if s consists of an optional minus sign followed by digits only. */
  public static boolean isInteger(String s) {
    return INTEGER_PATTERN.matcher(s).matches();
  }

  /**
   * Loads a snippet, optionally from the disk cache, otherwise from the
   * server; a freshly downloaded snippet is written to the cache on a
   * best-effort basis.
   *
   * @throws IOException if the snippet cannot be downloaded
   */
  public static String loadSnippet(long snippetID, boolean preferCached) throws IOException {
    if (preferCached) {
      initSnippetCache();
      String text = DiskSnippetCache_get(snippetID);
      if (text != null)
        return text;
    }

    String text;
    try {
      URL url = new URL("http://tinybrain.de:8080/getraw.php?id=" + snippetID);
      text = loadPage(url);
    } catch (FileNotFoundException e) {
      throw new IOException("Snippet #" + snippetID + " not found or not public", e);
    }

    try {
      initSnippetCache();
      DiskSnippetCache_put(snippetID, text);
    } catch (IOException e) {
      // caching is optional; report and carry on
      System.err.println("Minor warning: Couldn't save snippet to cache (" + DiskSnippetCache_getDir() + ")");
    }

    return text;
  }

  static File DiskSnippetCache_dir;

  public static void initDiskSnippetCache(File dir) {
    DiskSnippetCache_dir = dir;
    dir.mkdirs();
  }

  /** @return the cached snippet text, or null if there is no cache file */
  public static synchronized String DiskSnippetCache_get(long snippetID) throws IOException {
    return loadTextFile(DiskSnippetCache_getFile(snippetID).getPath(), null);
  }

  private static File DiskSnippetCache_getFile(long snippetID) {
    return new File(DiskSnippetCache_dir, "" + snippetID);
  }

  public static synchronized void DiskSnippetCache_put(long snippetID, String snippet) throws IOException {
    saveTextFile(DiskSnippetCache_getFile(snippetID).getPath(), snippet);
  }

  public static File DiskSnippetCache_getDir() {
    return DiskSnippetCache_dir;
  }

  /** Sets up the default cache directory under user.home unless already set. */
  public static void initSnippetCache() {
    if (DiskSnippetCache_dir == null)
      initDiskSnippetCache(new File(System.getProperty("user.home"), ".tinybrain/snippet-cache"));
  }

  /** Concatenates strings, inserting glue between consecutive elements. */
  public static String join(String glue, Iterable<String> strings) {
    StringBuilder buf = new StringBuilder();
    Iterator<String> i = strings.iterator();
    if (i.hasNext()) {
      buf.append(i.next());
      while (i.hasNext())
        buf.append(glue).append(i.next());
    }
    return buf.toString();
  }

  public static String join(String glue, String[] strings) {
    return join(glue, Arrays.asList(strings));
  }

  public static String join(Iterable<String> strings) {
    return join("", strings);
  }

  public static String join(String[] strings) {
    return join("", strings);
  }

  /** Always throws; never actually returns a value. */
  static RuntimeException fail() {
    throw new RuntimeException("fail");
  }

  /** Always throws a RuntimeException carrying msg; never actually returns. */
  static RuntimeException fail(String msg) {
    throw new RuntimeException(msg);
  }

  static String structure(Object o) {
    return structure(o, 0);
  }

  /**
   * Renders an object as a compact, human-readable string: collections, maps
   * and arrays as "{...}", strings quoted, java./javax. objects via
   * String.valueOf, everything else as ClassName(field=value, ...) using its
   * declared non-static fields.
   *
   * @param stringSizeLimit maximum length for strings before they are
   *                        shortened; 0 means no limit
   */
  static String structure(Object o, int stringSizeLimit) {
    if (o == null) return "null";
    String name = o.getClass().getName();

    StringBuilder buf = new StringBuilder();

    if (o instanceof Collection) {
      for (Object x : (Collection<?>) o) {
        if (buf.length() != 0) buf.append(", ");
        buf.append(structure(x, stringSizeLimit));
      }
      return "{" + buf + "}";
    }

    if (o instanceof Map) {
      for (Map.Entry<?, ?> e : ((Map<?, ?>) o).entrySet()) {
        if (buf.length() != 0) buf.append(", ");
        buf.append(structure(e.getKey(), stringSizeLimit));
        buf.append("=");
        buf.append(structure(e.getValue(), stringSizeLimit));
      }
      return "{" + buf + "}";
    }

    if (o.getClass().isArray()) {
      int n = Array.getLength(o);
      for (int i = 0; i < n; i++) {
        if (buf.length() != 0) buf.append(", ");
        buf.append(structure(Array.get(o, i), stringSizeLimit));
      }
      return "{" + buf + "}";
    }

    if (o instanceof String)
      return quote(stringSizeLimit != 0 ? shorten((String) o, stringSizeLimit) : (String) o);

    // Need more cases? This should cover all library classes...
    if (name.startsWith("java.") || name.startsWith("javax."))
      return String.valueOf(o);

    String shortName = name.replaceAll("^main\\$", "");

    // TODO: go to superclasses too
    Field[] fields = o.getClass().getDeclaredFields();
    int numFields = 0;
    String fieldName = "";
    for (Field field : fields) {
      if ((field.getModifiers() & Modifier.STATIC) != 0)
        continue;
      Object value;
      try {
        value = field.get(o);
      } catch (Exception e) {
        value = "?"; // inaccessible field: show a placeholder instead of failing
      }

      fieldName = field.getName();

      // put special cases here...

      if (value != null) {
        if (buf.length() != 0) buf.append(", ");
        buf.append(fieldName + "=" + structure(value, stringSizeLimit));
      }
      ++numFields;
    }
    String b = buf.toString();
    // drop field name if only one (plain prefix check: field names may contain '$')
    String prefix = fieldName + "=";
    if (numFields == 1 && b.startsWith(prefix))
      b = b.substring(prefix.length());
    String s = shortName;
    if (buf.length() != 0)
      s += "(" + b + ")";
    return s;
  }

  /**
   * Writes contents to fileName as UTF-8, safely: the text goes to a temp
   * file first, which is then renamed over the target.
   *
   * @throws IOException if writing, deleting the old file or renaming fails
   */
  public static void saveTextFile(String fileName, String contents) throws IOException {
    File file = new File(fileName);
    File parentFile = file.getParentFile();
    if (parentFile != null)
      parentFile.mkdirs();
    String tempFileName = fileName + "_temp";

    // A plain Writer is used (not PrintWriter) so write errors surface as
    // IOException before the temp file replaces the real one.
    Writer writer = new OutputStreamWriter(new FileOutputStream(tempFileName), "UTF-8");
    try {
      writer.write(String.valueOf(contents)); // null is written as "null", as before
    } finally {
      writer.close();
    }

    if (file.exists() && !file.delete())
      throw new IOException("Can't delete " + fileName);

    if (!new File(tempFileName).renameTo(file))
      throw new IOException("Can't rename " + tempFileName + " to " + fileName);
  }

  /**
   * Reads a UTF-8 text file.
   *
   * @return the file contents, or defaultContents if the file does not exist
   */
  public static String loadTextFile(String fileName, String defaultContents) throws IOException {
    if (!new File(fileName).exists())
      return defaultContents;

    return loadTextFile(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
  }

  /**
   * Reads all lines from reader and closes it. Lines are joined with '\n';
   * there is no trailing newline in the result.
   */
  public static String loadTextFile(Reader reader) throws IOException {
    StringBuilder builder = new StringBuilder();
    try {
      BufferedReader bufferedReader = new BufferedReader(reader);
      String line;
      while ((line = bufferedReader.readLine()) != null)
        builder.append(line).append('\n');
    } finally {
      reader.close();
    }
    return builder.length() == 0 ? "" : builder.substring(0, builder.length() - 1);
  }

  /** Wraps s in double quotes, escaping backslash, quote, CR and LF. */
  public static String quote(String s) {
    if (s == null) return "null";
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
  }

  /** Loads a page; "http://" is prepended if url has no scheme. */
  public static String loadPage(String url) throws IOException {
    if (url.indexOf("://") < 0)
      url = "http://" + url;
    return loadPage(new URL(url));
  }

  public static String loadPage(URL url) throws IOException {
    System.out.println("Loading: " + url.toExternalForm());
    URLConnection con = url.openConnection();
    return loadPage(con, url);
  }

  /**
   * Reads the whole response body of con as text, decoded with the charset
   * announced in its Content-Type header (ISO-8859-1 if none is given).
   *
   * @param url only used for the error message
   * @throws IOException if there is no content type or reading fails
   */
  public static String loadPage(URLConnection con, URL url) throws IOException {
    String contentType = con.getContentType();
    if (contentType == null)
      throw new IOException("Page could not be read: " + url);
    String charset = loadPage_guessCharset(contentType);

    InputStream in = con.getInputStream();
    try {
      Reader r = new InputStreamReader(in, charset);
      StringBuilder buf = new StringBuilder();
      char[] chunk = new char[8192];
      int n;
      while ((n = r.read(chunk)) >= 0)
        buf.append(chunk, 0, n);
      return buf.toString();
    } finally {
      in.close(); // also covers the case where the charset is unsupported
    }
  }

  /**
   * Extracts the charset parameter from a Content-Type header value, for any
   * media type, case-insensitively and with optional quotes.
   *
   * @return the charset name, or "ISO-8859-1" if the header names none
   */
  static String loadPage_guessCharset(String contentType) {
    Matcher m = CHARSET_PATTERN.matcher(contentType);
    /* If Content-Type doesn't carry a charset, choose default and hope for the best. */
    return m.find() ? m.group(1) : "ISO-8859-1";
  }

  /** Cuts s to at most max characters, appending "..." if it was cut. */
  static String shorten(String s, int max) {
    return s.length() <= max ? s : s.substring(0, max) + "...";
  }
}