Uses 53509K of libraries. Click here for Pure Java version (355L/3K/11K).
1 | !752 |
2 | |
3 | lib 1004690 // tika |
4 | |
5 | import org.apache.http.HttpEntity; |
6 | import org.apache.http.HttpResponse; |
7 | import org.apache.http.client.methods.HttpGet; |
8 | import org.apache.http.impl.client.DefaultHttpClient; |
9 | import org.apache.tika.metadata.Metadata; |
10 | import org.apache.tika.metadata.TikaCoreProperties; |
11 | import org.apache.tika.parser.AutoDetectParser; |
12 | import org.apache.tika.parser.ParseContext; |
13 | import org.apache.tika.sax.BodyContentHandler; |
14 | |
15 | static Map<S, O> tika(String url) { |
16 | DefaultHttpClient httpclient = new DefaultHttpClient(); |
17 | Map<String, Object> map = new HashMap<String, Object>(); |
18 | pcall { |
19 | HttpGet httpGet = new HttpGet(url); |
20 | HttpResponse response = httpclient.execute(httpGet); |
21 | HttpEntity entity = response.getEntity(); |
22 | InputStream input = null; |
23 | if (entity != null) { |
24 | try { |
25 | input = entity.getContent(); |
26 | BodyContentHandler handler = new BodyContentHandler(); |
27 | Metadata metadata = new Metadata(); |
28 | AutoDetectParser parser = new AutoDetectParser(); |
29 | ParseContext parseContext = new ParseContext(); |
30 | parser.parse(input, handler, metadata, parseContext); |
31 | map.put("text", handler.toString() /*.replaceAll("\n|\r|\t", " ") */); |
32 | map.put("title", metadata.get(TikaCoreProperties.TITLE)); |
33 | map.put("pageCount", metadata.get("xmpTPg:NPages")); |
34 | map.put("status_code", response.getStatusLine().getStatusCode() + ""); |
35 | } catch (Exception e) { |
36 | e.printStackTrace(); |
37 | } finally { |
38 | if (input != null) |
39 | input.close(); |
40 | } |
41 | } |
42 | } |
43 | return map; |
44 | } |
45 | |
// Program entry point (JavaX "p" block): picks the URL to analyse, announces
// it, runs it through tika() and prints the extracted text.
p {
  // Presumably the first command-line argument when given, otherwise the
  // sample PDF below - depends on the JavaX helpers or() and get().
  S target = or(get(args, 0), "http://math.about.com/library/q20.pdf");
  print("Loading " + target);
  print(tika(target).get("text"));
}
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, uwnvikuolobj, vouqrxazstgt
No comments. add comment
Snippet ID: | #1004691 |
Snippet name: | Apache Tika Test |
Eternal ID of this version: | #1004691/1 |
Text MD5: | a8c1f6b11f9c2763ed8853f211166129 |
Transpilation MD5: | 02175c5979180294ebe86dbcb434c006 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-08-28 14:03:05 |
Source code size: | 1826 bytes / 51 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 1165 / 1112 |
Referenced in: | [show references] |