Uses 53509K of libraries. Click here for Pure Java version (355L/3K/11K).
!752 lib 1004690 // tika import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; static Map<S, O> tika(String url) { DefaultHttpClient httpclient = new DefaultHttpClient(); Map<String, Object> map = new HashMap<String, Object>(); pcall { HttpGet httpGet = new HttpGet(url); HttpResponse response = httpclient.execute(httpGet); HttpEntity entity = response.getEntity(); InputStream input = null; if (entity != null) { try { input = entity.getContent(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); AutoDetectParser parser = new AutoDetectParser(); ParseContext parseContext = new ParseContext(); parser.parse(input, handler, metadata, parseContext); map.put("text", handler.toString() /*.replaceAll("\n|\r|\t", " ") */); map.put("title", metadata.get(TikaCoreProperties.TITLE)); map.put("pageCount", metadata.get("xmpTPg:NPages")); map.put("status_code", response.getStatusLine().getStatusCode() + ""); } catch (Exception e) { e.printStackTrace(); } finally { if (input != null) input.close(); } } } return map; } p { S url = or(get(args, 0), "http://math.about.com/library/q20.pdf"); print("Loading " + url); Map<S, O> extractedMap = tika(url); print(extractedMap.get("text")); }
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, uwnvikuolobj, vouqrxazstgt
No comments. add comment
Snippet ID: | #1004691 |
Snippet name: | Apache Tika Test |
Eternal ID of this version: | #1004691/1 |
Text MD5: | a8c1f6b11f9c2763ed8853f211166129 |
Transpilation MD5: | 02175c5979180294ebe86dbcb434c006 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-08-28 14:03:05 |
Source code size: | 1826 bytes / 51 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 1152 / 1097 |
Referenced in: | #1004692 - Apache Tika Test On Local PDF #1004722 - tika function - run Apache Tika to extract text from web page, PDF, ... |