Uses 0K of libraries. Click here for Pure Java version (1869L/12K).
lib 1400208 // tika - 50 MB! import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.apache.http.util.*; import org.apache.http.protocol.HTTP; static Map<S, O> tika(String url) { new DefaultHttpClient httpclient; //HttpClient httpclient = apacheHttp_trustingClient(); Map<String, Object> map = new HashMap<String, Object>(); pcall { HttpGet httpGet = new HttpGet(url); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); InputStream input = null; if (entity != null) { try { input = entity.getContent(); //S inputString = EntityUtils.toString(entity, HTTP.UTF_8); //input = new ByteArrayInputStream(toUTF8(inputString)); //new StringWriter stringWriter; //BodyContentHandler handler = new(stringWriter); new BodyContentHandler handler; //new ByteArrayOutputStream outputStream; //BodyContentHandler handler = new(outputStream); new Metadata metadata; new AutoDetectParser parser; new ParseContext parseContext; parser.parse(input, handler, metadata, parseContext); map.put("text", handler.toString() /*.replaceAll("\n|\r|\t", " ") */); //map.put("text", stringWriter.toString()); //map.put("text", fromUTF8(outputStream.toByteArray())); //map.put("text", fromUTF8(charsToSingleBytes(handler.toString()))); map.put("title", metadata.get(TikaCoreProperties.TITLE)); map.put("pageCount", metadata.get("xmpTPg:NPages")); map.put("status_code", response.getStatusLine().getStatusCode() + ""); } catch (Exception e) { e.printStackTrace(); } finally { if (input != null) input.close(); } } } ret map; }
Began life as a copy of #1004691
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1004722 | 
| Snippet name: | tika function - run Apache Tika to extract text from web page, PDF, ... | 
| Eternal ID of this version: | #1004722/14 | 
| Text MD5: | df054b30791ea35332ac8a07b34462d7 | 
| Transpilation MD5: | 5033868318b5fef2b64296f61563484d | 
| Author: | stefan | 
| Category: | javax | 
| Type: | JavaX fragment (include) | 
| Public (visible to everyone): | Yes | 
| Archived (hidden from active list): | No | 
| Created/modified: | 2019-05-16 21:50:56 | 
| Source code size: | 2221 bytes / 55 lines | 
| Pitched / IR pitched: | No / No | 
| Views / Downloads: | 1005 / 1102 | 
| Version history: | 13 change(s) | 
| Referenced in: | #1006654 - Standard functions list 2 (LIVE, continuation of #761) #1013650 - tikaText - Tika-extract, get text only - now uses simpleEmptyLines |