Uses 0K of libraries. Click here for Pure Java version (1869L/12K).
1 | lib 1400208 // tika - 50 MB! |
2 | |
3 | import org.apache.http.HttpEntity; |
4 | import org.apache.http.HttpResponse; |
5 | import org.apache.http.client.methods.HttpGet; |
6 | import org.apache.http.impl.client.DefaultHttpClient; |
7 | import org.apache.tika.metadata.Metadata; |
8 | import org.apache.tika.metadata.TikaCoreProperties; |
9 | import org.apache.tika.parser.AutoDetectParser; |
10 | import org.apache.tika.parser.ParseContext; |
11 | import org.apache.tika.sax.BodyContentHandler; |
12 | import org.apache.http.util.*; |
13 | import org.apache.http.protocol.HTTP; |
14 | |
15 | static Map<S, O> tika(String url) { |
16 | new DefaultHttpClient httpclient; |
17 | //HttpClient httpclient = apacheHttp_trustingClient(); |
18 | |
19 | Map<String, Object> map = new HashMap<String, Object>(); |
20 | pcall { |
21 | HttpGet httpGet = new HttpGet(url); |
22 | HttpResponse response = httpclient.execute(httpGet); |
23 | HttpEntity entity = response.getEntity(); |
24 | InputStream input = null; |
25 | if (entity != null) { |
26 | try { |
27 | input = entity.getContent(); |
28 | //S inputString = EntityUtils.toString(entity, HTTP.UTF_8); |
29 | //input = new ByteArrayInputStream(toUTF8(inputString)); |
30 | //new StringWriter stringWriter; |
31 | //BodyContentHandler handler = new(stringWriter); |
32 | new BodyContentHandler handler; |
33 | //new ByteArrayOutputStream outputStream; |
34 | //BodyContentHandler handler = new(outputStream); |
35 | new Metadata metadata; |
36 | new AutoDetectParser parser; |
37 | new ParseContext parseContext; |
38 | parser.parse(input, handler, metadata, parseContext); |
39 | map.put("text", handler.toString() /*.replaceAll("\n|\r|\t", " ") */); |
40 | //map.put("text", stringWriter.toString()); |
41 | //map.put("text", fromUTF8(outputStream.toByteArray())); |
42 | //map.put("text", fromUTF8(charsToSingleBytes(handler.toString()))); |
43 | map.put("title", metadata.get(TikaCoreProperties.TITLE)); |
44 | map.put("pageCount", metadata.get("xmpTPg:NPages")); |
45 | map.put("status_code", response.getStatusLine().getStatusCode() + ""); |
46 | } catch (Exception e) { |
47 | e.printStackTrace(); |
48 | } finally { |
49 | if (input != null) |
50 | input.close(); |
51 | } |
52 | } |
53 | } |
54 | ret map; |
55 | } |
Began life as a copy of #1004691
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1004722 |
Snippet name: | tika function - run Apache Tika to extract text from web page, PDF, ... |
Eternal ID of this version: | #1004722/14 |
Text MD5: | df054b30791ea35332ac8a07b34462d7 |
Transpilation MD5: | 5033868318b5fef2b64296f61563484d |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2019-05-16 21:50:56 |
Source code size: | 2221 bytes / 55 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 786 / 844 |
Version history: | 13 change(s) |
Referenced in: | [show references] |