tika function - run Apache Tika to extract text from web page, PDF, ... [1004722]

lib 1400208 // tika - 50 MB!

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.http.util.*;
import org.apache.http.protocol.HTTP;

// Downloads the given URL with Apache HttpClient and runs the response body
// through Apache Tika's auto-detecting parser.
//
// url - address of the document to fetch (web page, PDF, ...)
//
// Returns a map that may contain these keys:
//   "status_code" - HTTP status code of the response, as a string
//   "text"        - body text extracted by Tika
//   "title"       - value of the TikaCoreProperties.TITLE metadata entry (may be null)
//   "pageCount"   - value of the "xmpTPg:NPages" metadata entry (may be null)
//
// The function is best-effort: parse errors are printed and whatever was
// collected up to that point is returned. Anything that fails outside the
// parse step is left to the surrounding pcall block (JavaX; presumably it
// logs the error and continues), so the map may be partially filled or empty.
static Map<S, O> tika(String url) {
  new DefaultHttpClient httpclient;
  //HttpClient httpclient = apacheHttp_trustingClient();
  
  Map<String, Object> map = new HashMap<String, Object>();
  pcall {
    try {
      HttpGet httpGet = new HttpGet(url);
      HttpResponse response = httpclient.execute(httpGet);
      // Store the status right away so it is still reported when the
      // response has no body or the body cannot be parsed.
      map.put("status_code", response.getStatusLine().getStatusCode() + "");
      HttpEntity entity = response.getEntity();
      if (entity != null) {
        InputStream input = null;
        try {
          input = entity.getContent();
          // -1 disables BodyContentHandler's default 100,000 character write
          // limit. With the default, larger documents abort parsing with a
          // SAXException and no text would be returned at all.
          BodyContentHandler handler = new BodyContentHandler(-1);
          new Metadata metadata;
          new AutoDetectParser parser;
          new ParseContext parseContext;
          parser.parse(input, handler, metadata, parseContext);
          map.put("text", handler.toString());
          map.put("title", metadata.get(TikaCoreProperties.TITLE));
          map.put("pageCount", metadata.get("xmpTPg:NPages"));
        } catch (Exception e) {
          // Best effort: report the problem but still return what we have.
          e.printStackTrace();
        } finally {
          if (input != null)
            input.close();
        }
      }
    } finally {
      // Release the pooled connections held by this one-shot client.
      httpclient.getConnectionManager().shutdown();
    }
  }
  ret map;
}

Travelled to 15 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

Snippet ID:	#1004722
Snippet name:	tika function - run Apache Tika to extract text from web page, PDF, ...
Eternal ID of this version:	#1004722/14
Text MD5:	df054b30791ea35332ac8a07b34462d7
Transpilation MD5:	5033868318b5fef2b64296f61563484d
Author:	stefan
Category:	javax
Type:	JavaX fragment (include)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2019-05-16 21:50:56
Source code size:	2221 bytes / 55 lines
Pitched / IR pitched:	No / No
Views / Downloads:	1179 / 1283
Version history:	13 change(s)
Referenced in:	[show references]

< > BotCompany Repo | #1004722 // tika function - run Apache Tika to exract text from web page, PDF, ...

JavaX fragment (include) [tags: use-pretranspiled]

Author comment

1	lib 1400208 // tika - 50 MB!
2
3	import org.apache.http.HttpEntity;
4	import org.apache.http.HttpResponse;
5	import org.apache.http.client.methods.HttpGet;
6	import org.apache.http.impl.client.DefaultHttpClient;
7	import org.apache.tika.metadata.Metadata;
8	import org.apache.tika.metadata.TikaCoreProperties;
9	import org.apache.tika.parser.AutoDetectParser;
10	import org.apache.tika.parser.ParseContext;
11	import org.apache.tika.sax.BodyContentHandler;
12	import org.apache.http.util.*;
13	import org.apache.http.protocol.HTTP;
14
15	static Map<S, O> tika(String url) {
16	new DefaultHttpClient httpclient;
17	//HttpClient httpclient = apacheHttp_trustingClient();
18
19	Map<String, Object> map = new HashMap<String, Object>();
20	pcall {
21	HttpGet httpGet = new HttpGet(url);
22	HttpResponse response = httpclient.execute(httpGet);
23	HttpEntity entity = response.getEntity();
24	InputStream input = null;
25	if (entity != null) {
26	try {
27	input = entity.getContent();
28	//S inputString = EntityUtils.toString(entity, HTTP.UTF_8);
29	//input = new ByteArrayInputStream(toUTF8(inputString));
30	//new StringWriter stringWriter;
31	//BodyContentHandler handler = new(stringWriter);
32	new BodyContentHandler handler;
33	//new ByteArrayOutputStream outputStream;
34	//BodyContentHandler handler = new(outputStream);
35	new Metadata metadata;
36	new AutoDetectParser parser;
37	new ParseContext parseContext;
38	parser.parse(input, handler, metadata, parseContext);
39	map.put("text", handler.toString() /*.replaceAll("\n|\r|\t", " ") */);
40	//map.put("text", stringWriter.toString());
41	//map.put("text", fromUTF8(outputStream.toByteArray()));
42	//map.put("text", fromUTF8(charsToSingleBytes(handler.toString())));
43	map.put("title", metadata.get(TikaCoreProperties.TITLE));
44	map.put("pageCount", metadata.get("xmpTPg:NPages"));
45	map.put("status_code", response.getStatusLine().getStatusCode() + "");
46	} catch (Exception e) {
47	e.printStackTrace();
48	} finally {
49	if (input != null)
50	input.close();
51	}
52	}
53	}
54	ret map;
55	}