Uses 0K of libraries. Click here for Pure Java version (1869L/12K).
| 1 | lib 1400208 // tika - 50 MB! | 
| 2 | |
| 3 | import org.apache.http.HttpEntity; | 
| 4 | import org.apache.http.HttpResponse; | 
| 5 | import org.apache.http.client.methods.HttpGet; | 
| 6 | import org.apache.http.impl.client.DefaultHttpClient; | 
| 7 | import org.apache.tika.metadata.Metadata; | 
| 8 | import org.apache.tika.metadata.TikaCoreProperties; | 
| 9 | import org.apache.tika.parser.AutoDetectParser; | 
| 10 | import org.apache.tika.parser.ParseContext; | 
| 11 | import org.apache.tika.sax.BodyContentHandler; | 
| 12 | import org.apache.http.util.*; | 
| 13 | import org.apache.http.protocol.HTTP; | 
| 14 | |
| 15 | static Map<S, O> tika(String url) {
 | 
| 16 | new DefaultHttpClient httpclient; | 
| 17 | //HttpClient httpclient = apacheHttp_trustingClient(); | 
| 18 | |
| 19 | Map<String, Object> map = new HashMap<String, Object>(); | 
| 20 |   pcall {
 | 
| 21 | HttpGet httpGet = new HttpGet(url); | 
| 22 | HttpResponse response = httpclient.execute(httpGet); | 
| 23 | HttpEntity entity = response.getEntity(); | 
| 24 | InputStream input = null; | 
| 25 |     if (entity != null) {
 | 
| 26 |       try {
 | 
| 27 | input = entity.getContent(); | 
| 28 | //S inputString = EntityUtils.toString(entity, HTTP.UTF_8); | 
| 29 | //input = new ByteArrayInputStream(toUTF8(inputString)); | 
| 30 | //new StringWriter stringWriter; | 
| 31 | //BodyContentHandler handler = new(stringWriter); | 
| 32 | new BodyContentHandler handler; | 
| 33 | //new ByteArrayOutputStream outputStream; | 
| 34 | //BodyContentHandler handler = new(outputStream); | 
| 35 | new Metadata metadata; | 
| 36 | new AutoDetectParser parser; | 
| 37 | new ParseContext parseContext; | 
| 38 | parser.parse(input, handler, metadata, parseContext); | 
| 39 |         map.put("text", handler.toString() /*.replaceAll("\n|\r|\t", " ") */);
 | 
| 40 |         //map.put("text", stringWriter.toString());
 | 
| 41 |         //map.put("text", fromUTF8(outputStream.toByteArray()));
 | 
| 42 |         //map.put("text", fromUTF8(charsToSingleBytes(handler.toString())));
 | 
| 43 |         map.put("title", metadata.get(TikaCoreProperties.TITLE));
 | 
| 44 |         map.put("pageCount", metadata.get("xmpTPg:NPages"));
 | 
| 45 |         map.put("status_code", response.getStatusLine().getStatusCode() + "");
 | 
| 46 |       } catch (Exception e) {                     
 | 
| 47 | e.printStackTrace(); | 
| 48 |       } finally {
 | 
| 49 | if (input != null) | 
| 50 | input.close(); | 
| 51 | } | 
| 52 | } | 
| 53 | } | 
| 54 | ret map; | 
| 55 | } | 
Began life as a copy of #1004691
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1004722 | 
| Snippet name: | tika function - run Apache Tika to extract text from web page, PDF, ... | 
| Eternal ID of this version: | #1004722/14 | 
| Text MD5: | df054b30791ea35332ac8a07b34462d7 | 
| Transpilation MD5: | 5033868318b5fef2b64296f61563484d | 
| Author: | stefan | 
| Category: | javax | 
| Type: | JavaX fragment (include) | 
| Public (visible to everyone): | Yes | 
| Archived (hidden from active list): | No | 
| Created/modified: | 2019-05-16 21:50:56 | 
| Source code size: | 2221 bytes / 55 lines | 
| Pitched / IR pitched: | No / No | 
| Views / Downloads: | 1007 / 1104 | 
| Version history: | 13 change(s) | 
| Referenced in: | [show references] |