import javax.imageio.*; import java.awt.image.*; import java.awt.*; import java.security.NoSuchAlgorithmException; import java.security.MessageDigest; import java.lang.reflect.*; import java.net.*; import java.io.*; import javax.swing.*; import java.util.regex.*; import java.util.List; import java.util.zip.*; import java.util.*; public class main { public static void main(String[] args) throws Exception { String s = load("tinybrain.blog.de"); List tok = htmlcoarsetok(s); for (String _s : tok) print(_s); } static String load(String bla) { try { return loadPage(bla); } catch (Throwable __e) { throw __e instanceof RuntimeException ? (RuntimeException) __e : new RuntimeException(__e); }} static List htmlcoarsetok(String s) { List tok = new ArrayList(); int l = s.length(); int i = 0; while (i < l) { int j = i; char c; // scan for non-tags while (j < l) { if (s.charAt(j) != '<') // regular character ++j; else if (s.substring(j, Math.min(j+4, l)).equals("")); j = Math.min(j+3, l); } else // it's a tag break; } tok.add(s.substring(i, j)); i = j; if (i >= l) break; c = s.charAt(i); // scan for tags if (c == '<') { ++j; while (j < l && s.charAt(j) != '>') ++j; // TODO: strings? if (j < l) ++j; } tok.add(s.substring(i, j)); i = j; } return tok; } static void print() { System.out.println(); } static void print(Object o) { System.out.println(o); } static void print(long i) { System.out.println(i); } public static String loadPage(String url) throws IOException { if (url.indexOf("://") < 0) url = "http://" + url; return loadPage(new URL(url)); } public static String loadPage(URL url) throws IOException { System.out.println("Loading: " + url.toExternalForm()); URLConnection con = url.openConnection(); return loadPage(con, url); } public static String loadPage(URLConnection con, URL url) throws IOException { String contentType = con.getContentType(); if (contentType == null) throw new IOException("Page could not be read: " + url); //Log.info("Content-Type: " + contentType); String charset = loadPage_guessCharset(contentType); Reader r = new InputStreamReader(con.getInputStream(), charset); StringBuilder buf = new StringBuilder(); while (true) { int ch = r.read(); if (ch < 0) break; //Log.info("Chars read: " + buf.length()); buf.append((char) ch); } return buf.toString(); } static String loadPage_guessCharset(String contentType) { Pattern p = Pattern.compile("text/html;\\s+charset=([^\\s]+)\\s*"); Matcher m = p.matcher(contentType); /* If Content-Type doesn't match this pre-conception, choose default and hope for the best. */ return m.matches() ? m.group(1) : "ISO-8859-1"; } }