Uses 0K of libraries. Click here for Pure Java version (1869L/12K).
1 | lib 1400208 // tika - 50 MB! |
2 | |
3 | import org.apache.http.HttpEntity; |
4 | import org.apache.http.HttpResponse; |
5 | import org.apache.http.client.methods.HttpGet; |
6 | import org.apache.http.impl.client.DefaultHttpClient; |
7 | import org.apache.tika.metadata.Metadata; |
8 | import org.apache.tika.metadata.TikaCoreProperties; |
9 | import org.apache.tika.parser.AutoDetectParser; |
10 | import org.apache.tika.parser.ParseContext; |
11 | import org.apache.tika.sax.BodyContentHandler; |
12 | import org.apache.http.util.*; |
13 | import org.apache.http.protocol.HTTP; |
14 | |
15 | static Map<S, O> tika(String url) {
|
16 | new DefaultHttpClient httpclient; |
17 | //HttpClient httpclient = apacheHttp_trustingClient(); |
18 | |
19 | Map<String, Object> map = new HashMap<String, Object>(); |
20 | pcall {
|
21 | HttpGet httpGet = new HttpGet(url); |
22 | HttpResponse response = httpclient.execute(httpGet); |
23 | HttpEntity entity = response.getEntity(); |
24 | InputStream input = null; |
25 | if (entity != null) {
|
26 | try {
|
27 | input = entity.getContent(); |
28 | //S inputString = EntityUtils.toString(entity, HTTP.UTF_8); |
29 | //input = new ByteArrayInputStream(toUTF8(inputString)); |
30 | //new StringWriter stringWriter; |
31 | //BodyContentHandler handler = new(stringWriter); |
32 | new BodyContentHandler handler; |
33 | //new ByteArrayOutputStream outputStream; |
34 | //BodyContentHandler handler = new(outputStream); |
35 | new Metadata metadata; |
36 | new AutoDetectParser parser; |
37 | new ParseContext parseContext; |
38 | parser.parse(input, handler, metadata, parseContext); |
39 | map.put("text", handler.toString() /*.replaceAll("\n|\r|\t", " ") */);
|
40 | //map.put("text", stringWriter.toString());
|
41 | //map.put("text", fromUTF8(outputStream.toByteArray()));
|
42 | //map.put("text", fromUTF8(charsToSingleBytes(handler.toString())));
|
43 | map.put("title", metadata.get(TikaCoreProperties.TITLE));
|
44 | map.put("pageCount", metadata.get("xmpTPg:NPages"));
|
45 | map.put("status_code", response.getStatusLine().getStatusCode() + "");
|
46 | } catch (Exception e) {
|
47 | e.printStackTrace(); |
48 | } finally {
|
49 | if (input != null) |
50 | input.close(); |
51 | } |
52 | } |
53 | } |
54 | ret map; |
55 | } |
Began life as a copy of #1004691
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1004722 |
| Snippet name: | tika function - run Apache Tika to extract text from web page, PDF, ... |
| Eternal ID of this version: | #1004722/14 |
| Text MD5: | df054b30791ea35332ac8a07b34462d7 |
| Transpilation MD5: | 5033868318b5fef2b64296f61563484d |
| Author: | stefan |
| Category: | javax |
| Type: | JavaX fragment (include) |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2019-05-16 21:50:56 |
| Source code size: | 2221 bytes / 55 lines |
| Pitched / IR pitched: | No / No |
| Views / Downloads: | 1051 / 1140 |
| Version history: | 13 change(s) |
| Referenced in: | [show references] |