import javax.imageio.*;
import java.awt.image.*;
import java.awt.event.*;
import java.awt.*;
import java.security.NoSuchAlgorithmException;
import java.security.MessageDigest;
import java.lang.reflect.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.event.*;
import javax.swing.*;
import java.util.concurrent.*;
import java.util.regex.*;
import java.util.List;
import java.util.zip.*;
import java.util.*;

/**
 * Loads the snippet list page, locates its first select element and prints
 * the snippet types found in the option entries.
 *
 * All helpers are static and package-private so they can be reused directly.
 */
public class main {

  /**
   * Finds the first select element in an HTML document and splits it into
   * option rows. Results are exposed through the fields below; {@code select},
   * {@code rows} and {@code options} stay {@code null} when no complete
   * select element is found.
   */
  static class SelectFinder {
    List<String> tok;          // list of tokens in HTML document
    List<String> select;       // list of tokens in select
    List<List<String>> rows;   // for every row, list of tokens
    List<String[]> options;    // for every option, value plus content

    /** Tokenizes {@code html} and searches it for a select element. */
    void go(String html) {
      tok = htmlcoarsetok(html);
      findSelect();
    }

    /**
     * Scans the tag tokens (odd indices) for an opening select tag that is
     * followed by a closing one, stores the enclosed token range (including
     * the text token on either side) and parses its rows.
     */
    void findSelect() {
      print("Finding select.");
      for (int i = 1; i < tok.size(); i += 2) {
        if (!isTag(tok.get(i), "select"))
          continue;
        for (int j = i + 2; j < tok.size(); j += 2) {
          if (isTag(tok.get(j), "/select")) {
            print("Select found!");
            select = tok.subList(i - 1, j + 2);
            findRows();
            return;
          }
        }
      }
    }

    /**
     * Splits {@code select} into one token list per option. An option ends at
     * its closing tag, at the next opening option tag, or (for the last
     * option) right before the closing select tag.
     */
    void findRows() {
      rows = new ArrayList<List<String>>();
      options = new ArrayList<String[]>();
      int rowStart = 0; // index of the currently open option tag, 0 = none
      for (int i = 1; i < select.size(); i += 2) {
        String t = select.get(i);
        if (isTag(t, "option")) {
          // a new option implicitly closes the previous one
          if (rowStart != 0)
            rows.add(select.subList(rowStart - 1, i));
          rowStart = i;
        } else if (isTag(t, "/option") && rowStart != 0) {
          rows.add(select.subList(rowStart - 1, i + 2));
          rowStart = 0;
        }
      }

      // Unclosed option at the end: it runs up to (but excluding) the closing
      // select tag when there is one, otherwise to the end of the token list.
      if (rowStart != 0) {
        int end = select.size();
        int lastTag = end - 2;
        if (lastTag > rowStart && isTag(select.get(lastTag), "/select"))
          end = lastTag;
        rows.add(select.subList(rowStart - 1, end));
      }

      for (List<String> row : rows)
        options.add(getData(row));
      print(rows.size() + " row(s)");
    }

    /**
     * Tests whether {@code token} is the tag named {@code tag} (pass
     * {@code "/name"} for closing tags). The comparison ignores case; the tag
     * name must be followed by {@code >} or a whitespace character.
     */
    boolean isTag(String token, String tag) {
      int nameEnd = tag.length() + 1; // length of "<" + tag
      if (token.length() <= nameEnd
          || !token.regionMatches(true, 0, "<" + tag, 0, nameEnd))
        return false;
      char next = token.charAt(nameEnd);
      return next == '>' || Character.isWhitespace(next);
    }

    /**
     * Extracts value and content of one option row.
     *
     * @param row tokens of the row, text tokens at even and tags at odd indices
     * @return {@code {value, content}}; the value falls back to the content
     *         when the option has no value attribute
     */
    String[] getData(List<String> row) {
      int colStart = 0, colEnd = 0;
      for (int i = 1; i < row.size(); i += 2) {
        String t = row.get(i);
        if (isTag(t, "option"))
          colStart = i;
        else if (isTag(t, "/option"))
          colEnd = i;
      }

      Map<String, String> map = htmlgetparams(row.get(colStart));
      String value = map.get("value");
      if (colEnd == 0)
        colEnd = row.size(); // no closing tag: content runs to end of row
      String data = join(row.subList(colStart + 1, colEnd));
      if (value == null)
        value = data;
      return new String[] {value, data};
    }
  } // SelectFinder

  public static void main(String[] args) throws Exception {
    String s = loadPage("http://tinybrain.de:8080/tb/snippets.php?action=list");
    SelectFinder f = new SelectFinder();
    f.go(s);
    if (f.select == null)
      error("No <select> element found in page");
    String tag = f.select.get(1);
    print(tag);
    String name = htmlgetparams(tag).get("name");
    if (!"type".equals(name))
      error("huch: " + tag);
    print(structure(f.options));
    for (String[] o : f.options) {
      if (o[0].equals(""))
        continue;
      // strip a leading "[...]" prefix from the label
      name = o[1].replaceAll(".*\\]", "").trim();
      int id = parseInt(o[0]);
      print("Type found: " + "[" + id + "] " + name);
    }
  }

  /**
   * Gets the name/value map from an HTML tag.
   *
   * The tag is tokenized with {@link #javaTok}; every {@code name = value}
   * triple that is followed by at least one more token is recorded, with the
   * value unquoted and entity-decoded. Names are case-sensitive.
   */
  static Map<String, String> htmlgetparams(String tag) {
    List<String> tok = javaTok(tag);
    Map<String, String> map = new TreeMap<String, String>();
    for (int i = 1; i + 6 < tok.size(); i += 2) {
      if (tok.get(i + 2).equals("=")) {
        map.put(tok.get(i), htmlunquote(tok.get(i + 4)));
        i += 4; // skip "=" and the value
      }
    }
    return map;
  }

  /** Always throws a RuntimeException with the message "fail". */
  static RuntimeException error() {
    throw new RuntimeException("fail");
  }

  /** Always throws a RuntimeException carrying {@code msg}. */
  static RuntimeException error(String msg) {
    throw new RuntimeException(msg);
  }

  static int parseInt(String s) {
    return Integer.parseInt(s);
  }

  /**
   * Splits HTML into alternating text and tag tokens.
   *
   * Even indices hold text (possibly empty, HTML comments included), odd
   * indices hold tags from {@code <} through the next {@code >}. The result
   * always has an odd number of elements.
   */
  // TODO: process CDATA?
  static List<String> htmlcoarsetok(String s) {
    List<String> tok = new ArrayList<String>();
    int l = s.length();

    int i = 0;
    while (i < l) {
      int j = i;

      // scan for non-tags
      while (j < l) {
        if (s.charAt(j) != '<') {
          // regular character
          ++j;
        } else if (s.startsWith("<!--", j)) {
          // HTML comment: stays part of the text token
          int close = s.indexOf("-->", j + 4);
          j = close < 0 ? l : close + 3;
        } else {
          // it's a tag
          break;
        }
      }

      tok.add(s.substring(i, j));
      i = j;
      if (i >= l) break;

      // scan for tags; s.charAt(i) is '<' here
      ++j;
      while (j < l && s.charAt(j) != '>') ++j; // TODO: strings?
      if (j < l) ++j;

      tok.add(s.substring(i, j));
      i = j;
    }

    if ((tok.size() % 2) == 0) tok.add("");
    return tok;
  }

  static void print() {
    System.out.println();
  }

  static void print(Object o) {
    System.out.println(o);
  }

  static void print(long i) {
    System.out.println(i);
  }

  /**
   * Loads a page given a URL string. A leading {@code tb/} is resolved against
   * tinybrain.de:8080 and a missing scheme defaults to http.
   */
  public static String loadPage(String url) throws IOException {
    if (url.startsWith("tb/"))
      url = "tinybrain.de:8080/" + url;
    if (url.indexOf("://") < 0)
      url = "http://" + url;
    return loadPage(new URL(url));
  }

  public static String loadPage(URL url) throws IOException {
    System.out.println("Loading: " + url.toExternalForm());
    URLConnection con = url.openConnection();
    return loadPage(con, url);
  }

  /**
   * Reads the whole response body of {@code con} as text, using the charset
   * guessed from its Content-Type header.
   *
   * @throws IOException if the connection reports no content type, the
   *         charset is unsupported, or reading fails
   */
  public static String loadPage(URLConnection con, URL url) throws IOException {
    String contentType = con.getContentType();
    if (contentType == null)
      throw new IOException("Page could not be read: " + url);
    String charset = loadPage_guessCharset(contentType);

    InputStream in = con.getInputStream();
    try {
      Reader r = new InputStreamReader(in, charset);
      StringBuilder buf = new StringBuilder();
      char[] chunk = new char[8192];
      int n;
      while ((n = r.read(chunk)) >= 0)
        buf.append(chunk, 0, n);
      return buf.toString();
    } finally {
      // closing the underlying stream also covers a failed reader construction
      in.close();
    }
  }

  private static final Pattern loadPage_charsetPattern =
    Pattern.compile("text/html;\\s*charset=([^\\s]+)\\s*", Pattern.CASE_INSENSITIVE);

  /**
   * Extracts the charset from a {@code text/html; charset=...} content type.
   * If Content-Type doesn't match this pre-conception, chooses the default
   * (ISO-8859-1) and hopes for the best.
   */
  static String loadPage_guessCharset(String contentType) {
    Matcher m = loadPage_charsetPattern.matcher(contentType);
    return m.matches() ? m.group(1) : "ISO-8859-1";
  }

  /** Concatenates {@code strings}, placing {@code glue} between elements. */
  public static String join(String glue, Iterable<String> strings) {
    StringBuilder buf = new StringBuilder();
    Iterator<String> i = strings.iterator();
    if (i.hasNext()) {
      buf.append(i.next());
      while (i.hasNext())
        buf.append(glue).append(i.next());
    }
    return buf.toString();
  }

  public static String join(String glue, String[] strings) {
    return join(glue, Arrays.asList(strings));
  }

  public static String join(Iterable<String> strings) {
    return join("", strings);
  }

  public static String join(String[] strings) {
    return join("", strings);
  }

  static String structure(Object o) {
    return structure(o, 0);
  }

  /**
   * Renders an object graph as a compact, human-readable string.
   *
   * Collections, maps and arrays become {@code {...}} lists, strings are
   * quoted, other java./javax. objects use {@code String.valueOf}, and any
   * other object is shown as its class name plus its non-null instance fields.
   * Cyclic structures are not detected.
   *
   * @param stringSizeLimit maximum length of rendered strings, 0 = unlimited
   */
  static String structure(Object o, int stringSizeLimit) {
    if (o == null) return "null";
    String name = o.getClass().getName();

    StringBuilder buf = new StringBuilder();

    if (o instanceof Collection) {
      for (Object x : (Collection<?>) o) {
        if (buf.length() != 0) buf.append(", ");
        buf.append(structure(x, stringSizeLimit));
      }
      return "{" + buf + "}";
    }

    if (o instanceof Map) {
      for (Map.Entry<?, ?> e : ((Map<?, ?>) o).entrySet()) {
        if (buf.length() != 0) buf.append(", ");
        buf.append(structure(e.getKey(), stringSizeLimit));
        buf.append("=");
        buf.append(structure(e.getValue(), stringSizeLimit));
      }
      return "{" + buf + "}";
    }

    if (o.getClass().isArray()) {
      int n = Array.getLength(o);
      for (int i = 0; i < n; i++) {
        if (buf.length() != 0) buf.append(", ");
        buf.append(structure(Array.get(o, i), stringSizeLimit));
      }
      return "{" + buf + "}";
    }

    if (o instanceof String)
      return quote(stringSizeLimit != 0 ? shorten((String) o, stringSizeLimit) : (String) o);

    // Need more cases? This should cover all library classes...
    if (name.startsWith("java.") || name.startsWith("javax."))
      return String.valueOf(o);

    String shortName = name.replaceAll("^main\\$", "");

    // TODO: go to superclasses too
    Field[] fields = o.getClass().getDeclaredFields();
    int numFields = 0;
    String fieldName = "";
    for (Field field : fields) {
      if (Modifier.isStatic(field.getModifiers()))
        continue;
      Object value;
      try {
        value = field.get(o);
      } catch (Exception inaccessible) {
        // best effort: show a placeholder for fields we may not read
        value = "?";
      }

      fieldName = field.getName();

      // put special cases here...

      if (value != null) {
        if (buf.length() != 0) buf.append(", ");
        buf.append(fieldName + "=" + structure(value, stringSizeLimit));
      }
      ++numFields;
    }
    String b = buf.toString();

    // drop field name if only one (plain prefix test, no regex needed)
    String prefix = fieldName + "=";
    if (numFields == 1 && b.startsWith(prefix))
      b = b.substring(prefix.length());

    String s = shortName;
    if (buf.length() != 0)
      s += "(" + b + ")";
    return s;
  }

  /**
   * Strips one pair of matching single or double quotes around {@code s}
   * (if present) and decodes HTML entities in the remainder.
   */
  static String htmlunquote(String s) {
    boolean single = s.startsWith("'") && s.endsWith("'");
    boolean dbl = s.startsWith("\"") && s.endsWith("\"");
    if ((single || dbl) && s.length() >= 2)
      s = s.substring(1, s.length() - 1);
    return htmldecode(s);
  }

  /**
   * Java-like tokenizer (replacement for class JavaTok).
   *
   * Returns alternating whitespace/comment tokens (even indices) and code
   * tokens (odd indices); the list always has odd size. Code tokens are quoted
   * strings, identifiers, digit runs, double-bracket blocks or single
   * characters.
   */
  // maybe incomplete, might want to add floating point numbers
  // todo also: extended multi-line strings
  static List<String> javaTok(String s) {
    List<String> tok = new ArrayList<String>();
    int l = s.length();

    int i = 0;
    while (i < l) {
      int j = i;
      char c;

      // scan for whitespace (including block and line comments)
      while (j < l) {
        c = s.charAt(j);
        if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
          ++j;
        } else if (s.startsWith("/*", j)) {
          do ++j; while (j < l && !s.startsWith("*/", j));
          j = Math.min(j + 2, l);
        } else if (s.startsWith("//", j)) {
          do ++j; while (j < l && "\r\n".indexOf(s.charAt(j)) < 0);
        } else {
          break;
        }
      }

      tok.add(s.substring(i, j));
      i = j;
      if (i >= l) break;
      c = s.charAt(i);

      // scan for non-whitespace
      if (c == '\'' || c == '"') {
        char opener = c;
        ++j;
        while (j < l) {
          if (s.charAt(j) == opener) {
            ++j;
            break;
          } else if (s.charAt(j) == '\\' && j + 1 < l) {
            j += 2; // skip escaped character
          } else {
            ++j;
          }
        }
      } else if (Character.isJavaIdentifierStart(c)) {
        do ++j; while (j < l && Character.isJavaIdentifierPart(s.charAt(j)));
      } else if (Character.isDigit(c)) {
        do ++j; while (j < l && Character.isDigit(s.charAt(j)));
      } else if (s.startsWith("[[", i)) {
        do ++j; while (j + 1 < l && !s.startsWith("]]", j));
        j = Math.min(j + 2, l);
      } else {
        ++j;
      }

      tok.add(s.substring(i, j));
      i = j;
    }

    if ((tok.size() % 2) == 0) tok.add("");
    return tok;
  }

  /**
   * Wraps {@code s} in double quotes, escaping backslashes, double quotes,
   * carriage returns and newlines. Returns the text {@code null} for null.
   */
  public static String quote(String s) {
    if (s == null) return "null";
    return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\r", "\\r").replace("\n", "\\n") + "\"";
  }

  /** Cuts {@code s} to at most {@code max} characters, appending "..." when cut. */
  static String shorten(String s, int max) {
    return s.length() <= max ? s : s.substring(0, max) + "...";
  }

  /**
   * Decodes HTML entities. Handles the named entities in the lookup table
   * plus decimal and hexadecimal numeric escapes; only escapes with 2 to 6
   * characters between the ampersand and the semicolon are considered.
   * Anything that is not a valid escape is copied through unchanged.
   */
  static String htmldecode(final String input) {
    final int MIN_ESCAPE = 2;
    final int MAX_ESCAPE = 6;

    StringBuilder out = null; // created lazily on the first decoded escape
    int len = input.length();
    int i = 1;  // index just after a candidate '&'
    int st = 0; // start of the not-yet-copied part of input
    while (true) {
      // look for '&'
      while (i < len && input.charAt(i - 1) != '&')
        i++;
      if (i >= len)
        break;

      // found '&', look for ';'
      int j = i;
      while (j < len && j < i + MAX_ESCAPE + 1 && input.charAt(j) != ';')
        j++;
      if (j == len || j < i + MIN_ESCAPE || j == i + MAX_ESCAPE + 1) {
        i++;
        continue;
      }

      // found escape
      if (input.charAt(i) == '#') {
        // numeric escape
        int k = i + 1;
        int radix = 10;

        final char firstChar = input.charAt(k);
        if (firstChar == 'x' || firstChar == 'X') {
          k++;
          radix = 16;
        }

        int entityValue;
        try {
          entityValue = Integer.parseInt(input.substring(k, j), radix);
        } catch (NumberFormatException ex) {
          i++;
          continue;
        }
        // negative or out-of-range numbers are not characters: leave as text
        if (!Character.isValidCodePoint(entityValue)) {
          i++;
          continue;
        }

        if (out == null)
          out = new StringBuilder(len);
        out.append(input, st, i - 1);
        out.appendCodePoint(entityValue);
      } else {
        // named escape
        String value = htmldecode_lookupMap.get(input.substring(i, j));
        if (value == null) {
          i++;
          continue;
        }

        if (out == null)
          out = new StringBuilder(len);
        out.append(input, st, i - 1);
        out.append(value);
      }

      // skip escape
      st = j + 1;
      i = st;
    }

    if (out != null) {
      out.append(input, st, len);
      return out.toString();
    }
    return input;
  }

  /** Pairs of {character, entity name} known to {@link #htmldecode}. */
  private static final String[][] htmldecode_ESCAPES = {
    {"\"", "quot"}, // double-quote
    {"&", "amp"}, // ampersand
    {"<", "lt"}, // less-than
    {">", "gt"}, // greater-than

    // Mapping to escape ISO-8859-1 characters to their named HTML 3.x equivalents.
    {"\u00A0", "nbsp"}, // non-breaking space
    {"\u00A1", "iexcl"}, // inverted exclamation mark
    {"\u00A2", "cent"}, // cent sign
    {"\u00A3", "pound"}, // pound sign
    {"\u00A4", "curren"}, // currency sign
    {"\u00A5", "yen"}, // yen sign = yuan sign
    {"\u00A6", "brvbar"}, // broken bar = broken vertical bar
    {"\u00A7", "sect"}, // section sign
    {"\u00A8", "uml"}, // diaeresis = spacing diaeresis
    {"\u00A9", "copy"}, // copyright sign
    {"\u00AA", "ordf"}, // feminine ordinal indicator
    {"\u00AB", "laquo"}, // left-pointing double angle quotation mark = left pointing guillemet
    {"\u00AC", "not"}, // not sign
    {"\u00AD", "shy"}, // soft hyphen = discretionary hyphen
    {"\u00AE", "reg"}, // registered trademark sign
    {"\u00AF", "macr"}, // macron = spacing macron = overline = APL overbar
    {"\u00B0", "deg"}, // degree sign
    {"\u00B1", "plusmn"}, // plus-minus sign = plus-or-minus sign
    {"\u00B2", "sup2"}, // superscript two = superscript digit two = squared
    {"\u00B3", "sup3"}, // superscript three = superscript digit three = cubed
    {"\u00B4", "acute"}, // acute accent = spacing acute
    {"\u00B5", "micro"}, // micro sign
    {"\u00B6", "para"}, // pilcrow sign = paragraph sign
    {"\u00B7", "middot"}, // middle dot = Georgian comma = Greek middle dot
    {"\u00B8", "cedil"}, // cedilla = spacing cedilla
    {"\u00B9", "sup1"}, // superscript one = superscript digit one
    {"\u00BA", "ordm"}, // masculine ordinal indicator
    {"\u00BB", "raquo"}, // right-pointing double angle quotation mark = right pointing guillemet
    {"\u00BC", "frac14"}, // vulgar fraction one quarter = fraction one quarter
    {"\u00BD", "frac12"}, // vulgar fraction one half = fraction one half
    {"\u00BE", "frac34"}, // vulgar fraction three quarters = fraction three quarters
    {"\u00BF", "iquest"}, // inverted question mark = turned question mark
    {"\u00C0", "Agrave"}, // uppercase A, grave accent
    {"\u00C1", "Aacute"}, // uppercase A, acute accent
    {"\u00C2", "Acirc"}, // uppercase A, circumflex accent
    {"\u00C3", "Atilde"}, // uppercase A, tilde
    {"\u00C4", "Auml"}, // uppercase A, umlaut
    {"\u00C5", "Aring"}, // uppercase A, ring
    {"\u00C6", "AElig"}, // uppercase AE
    {"\u00C7", "Ccedil"}, // uppercase C, cedilla
    {"\u00C8", "Egrave"}, // uppercase E, grave accent
    {"\u00C9", "Eacute"}, // uppercase E, acute accent
    {"\u00CA", "Ecirc"}, // uppercase E, circumflex accent
    {"\u00CB", "Euml"}, // uppercase E, umlaut
    {"\u00CC", "Igrave"}, // uppercase I, grave accent
    {"\u00CD", "Iacute"}, // uppercase I, acute accent
    {"\u00CE", "Icirc"}, // uppercase I, circumflex accent
    {"\u00CF", "Iuml"}, // uppercase I, umlaut
    {"\u00D0", "ETH"}, // uppercase Eth, Icelandic
    {"\u00D1", "Ntilde"}, // uppercase N, tilde
    {"\u00D2", "Ograve"}, // uppercase O, grave accent
    {"\u00D3", "Oacute"}, // uppercase O, acute accent
    {"\u00D4", "Ocirc"}, // uppercase O, circumflex accent
    {"\u00D5", "Otilde"}, // uppercase O, tilde
    {"\u00D6", "Ouml"}, // uppercase O, umlaut
    {"\u00D7", "times"}, // multiplication sign
    {"\u00D8", "Oslash"}, // uppercase O, slash
    {"\u00D9", "Ugrave"}, // uppercase U, grave accent
    {"\u00DA", "Uacute"}, // uppercase U, acute accent
    {"\u00DB", "Ucirc"}, // uppercase U, circumflex accent
    {"\u00DC", "Uuml"}, // uppercase U, umlaut
    {"\u00DD", "Yacute"}, // uppercase Y, acute accent
    {"\u00DE", "THORN"}, // uppercase THORN, Icelandic
    {"\u00DF", "szlig"}, // lowercase sharps, German
    {"\u00E0", "agrave"}, // lowercase a, grave accent
    {"\u00E1", "aacute"}, // lowercase a, acute accent
    {"\u00E2", "acirc"}, // lowercase a, circumflex accent
    {"\u00E3", "atilde"}, // lowercase a, tilde
    {"\u00E4", "auml"}, // lowercase a, umlaut
    {"\u00E5", "aring"}, // lowercase a, ring
    {"\u00E6", "aelig"}, // lowercase ae
    {"\u00E7", "ccedil"}, // lowercase c, cedilla
    {"\u00E8", "egrave"}, // lowercase e, grave accent
    {"\u00E9", "eacute"}, // lowercase e, acute accent
    {"\u00EA", "ecirc"}, // lowercase e, circumflex accent
    {"\u00EB", "euml"}, // lowercase e, umlaut
    {"\u00EC", "igrave"}, // lowercase i, grave accent
    {"\u00ED", "iacute"}, // lowercase i, acute accent
    {"\u00EE", "icirc"}, // lowercase i, circumflex accent
    {"\u00EF", "iuml"}, // lowercase i, umlaut
    {"\u00F0", "eth"}, // lowercase eth, Icelandic
    {"\u00F1", "ntilde"}, // lowercase n, tilde
    {"\u00F2", "ograve"}, // lowercase o, grave accent
    {"\u00F3", "oacute"}, // lowercase o, acute accent
    {"\u00F4", "ocirc"}, // lowercase o, circumflex accent
    {"\u00F5", "otilde"}, // lowercase o, tilde
    {"\u00F6", "ouml"}, // lowercase o, umlaut
    {"\u00F7", "divide"}, // division sign
    {"\u00F8", "oslash"}, // lowercase o, slash
    {"\u00F9", "ugrave"}, // lowercase u, grave accent
    {"\u00FA", "uacute"}, // lowercase u, acute accent
    {"\u00FB", "ucirc"}, // lowercase u, circumflex accent
    {"\u00FC", "uuml"}, // lowercase u, umlaut
    {"\u00FD", "yacute"}, // lowercase y, acute accent
    {"\u00FE", "thorn"}, // lowercase thorn, Icelandic
    {"\u00FF", "yuml"}, // lowercase y, umlaut

    {"'", "apos"}, // the controversial (but who cares!) apos entity
    // stackoverflow.com/questions/2083754/why-shouldnt-apos-be-used-to-escape-single-quotes
  };

  /** Entity name to replacement text, built once from htmldecode_ESCAPES. */
  private static final Map<String, String> htmldecode_lookupMap;

  static {
    htmldecode_lookupMap = new HashMap<String, String>();
    for (final String[] seq : htmldecode_ESCAPES)
      htmldecode_lookupMap.put(seq[1], seq[0]);
  }
}