Uses 53509K of libraries. Click here for Pure Java version (327L/3K/10K).
1 | !752 |
2 | |
3 | lib 1004690 // tika |
4 | |
5 | import org.apache.http.HttpEntity; |
6 | import org.apache.http.HttpResponse; |
7 | import org.apache.http.client.methods.HttpGet; |
8 | import org.apache.http.impl.client.DefaultHttpClient; |
9 | import org.apache.tika.metadata.Metadata; |
10 | import org.apache.tika.metadata.TikaCoreProperties; |
11 | import org.apache.tika.parser.AutoDetectParser; |
12 | import org.apache.tika.parser.ParseContext; |
13 | import org.apache.tika.sax.BodyContentHandler; |
14 | |
15 | // Parses the file with Tika's AutoDetectParser; returns a map with the keys "text", "title" and "pageCount". |
16 | static Map<S, O> processFile(File file) ctex { |
17 |   Map<S, O> result = new HashMap<S, O>(); |
18 |   InputStream stream = new FileInputStream(file); |
19 |   try { |
20 |     Metadata meta = new Metadata(); |
21 |     // -1 disables the handler's write limit (see the Tika BodyContentHandler docs) |
22 |     BodyContentHandler body = new BodyContentHandler(-1); |
23 |     new AutoDetectParser().parse(stream, body, meta, new ParseContext()); |
24 |     result.put("text", body.toString()); |
25 |     result.put("title", meta.get(TikaCoreProperties.TITLE)); |
26 |     result.put("pageCount", meta.get("xmpTPg:NPages")); |
27 |   } finally { |
28 |     stream.close(); // always release the file, whether or not parsing succeeded |
29 |   } |
30 |   return result; |
31 | } |
32 | |
33 | p { |
34 |   // Extract the text of the file named by the first argument (falling back to the default PDF path), then print it and l(text) (presumably its length). |
35 |   S text = (S) processFile(new File(or(get(args, 0), "/home/stefan/Desktop/maude-primer.pdf"))).get("text"); |
36 |   print(text); |
37 |   print(l(text)); |
38 | } |
Began life as a copy of #1004691
download show line numbers debug dex old transpilations
Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, uwnvikuolobj, vouqrxazstgt
No comments. add comment
Snippet ID: | #1004692 |
Snippet name: | Apache Tika Test On Local PDF |
Eternal ID of this version: | #1004692/1 |
Text MD5: | ad25d76a1835518f6836f7e72326db82 |
Transpilation MD5: | 85844c0ce85c497d2b97f17138a92612 |
Author: | stefan |
Category: | javax |
Type: | JavaX source code |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-08-27 12:37:38 |
Source code size: | 1269 bytes / 38 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 575 / 656 |
Referenced in: | [show references] |