public static String loadPage(URL url) throws IOException { System.out.println("Loading: " + url.toExternalForm()); URLConnection con = url.openConnection(); return loadPage(con, url); } public static String loadPage(URLConnection con, URL url) throws IOException { String contentType = con.getContentType(); if (contentType == null) throw new IOException("Page could not be read: " + url); //Log.info("Content-Type: " + contentType); String charset = loadPage_guessCharset(contentType); Reader r = new InputStreamReader(con.getInputStream(), charset); StringBuilder buf = new StringBuilder(); while (true) { int ch = r.read(); if (ch < 0) break; //Log.info("Chars read: " + buf.length()); buf.append((char) ch); } return buf.toString(); } static String loadPage_guessCharset(String contentType) { Pattern p = Pattern.compile("text/html;\\s+charset=([^\\s]+)\\s*"); Matcher m = p.matcher(contentType); /* If Content-Type doesn't match this pre-conception, choose default and hope for the best. */ return m.matches() ? m.group(1) : "ISO-8859-1"; } }