Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

58
LINES

< > BotCompany Repo | #1011993 // quickGoogle2 - returns triples of (link, text, desc)

JavaX fragment (include) [tags: use-pretranspiled]

Libraryless. Click here for Pure Java version (10927L/67K).

static L<T3<S>> quickGoogle2(S query) {
  ret quickGoogle2(query, null);
}

// Runs a Google web search by scraping the HTML result page and returns one
// triple (link, title text, description) per result heading found.
//
// query    - search terms; trimmed before use and URL-encoded into the request.
// language - value for Google's "lr" parameter; "lang_en" is used when none is
//            given. It is URL-encoded into the request as well.
// _        - optional named flags (read through the optPar declarations below):
//              safeSearch - adds "&safe=active" to the request and forces noCache
//              noCache    - skip both the cache lookup and the result logging
//              debug      - print diagnostics while parsing the page
//
// Returns the list of triples; it is empty when nothing could be parsed, in
// which case the fetched page is saved as "buggy-google.html" for inspection.
//
// Side effects: switches on loadPageWithUserAgent_verbose, writes the fetched
// page to "last-google-result-page.html" in the JavaX caches dir, and (unless
// noCache) appends the results to the google log.
static L<T3<S>> quickGoogle2(S query, S language, O... _) {
  optPar bool safeSearch;
  optPar bool noCache;
  optPar bool debug;
  // Safe-search requests bypass the cache (and are not logged below).
  if (safeSearch) set noCache;
  
  language = or2(language, "lang_en");
  query = trim(query);
  // NOTE(review): the lookup is keyed by query alone, so a cached result may
  // come from a request made with a different language - confirm this is intended.
  L<T3<S>> out = noCache ? null : lookupPossiblyIgnoringCase(parseGoogleLog(), query);
  if (out != null) ret out with print("cache hit");
  out = new L;
  
  // Present a desktop browser user agent for the request.
  S userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0";
  // NOTE(review): this flag is set and never reset here - presumably global; verify.
  set loadPageWithUserAgent_verbose;
  // Both user-supplied values are URL-encoded; language values may contain
  // characters such as '|' (e.g. "lang_en|lang_de") that are not URL-safe.
  S html = loadPageWithUserAgent("https://www.google.com/search?q=" + urlencode(query) + "&lr=" + urlencode(language) + "&hl=en"
    + (safeSearch ? "&safe=active" : ""), userAgent);
  // If the response carried a Location header, fetch that URL instead.
  // This must be read right after the request above, before any other page load.
  S url = first(loadPage_responseHeaders->get("Location"));
  if (url != null)
    html = loadPageWithUserAgent(url, userAgent);

  // Keep a copy of the most recent result page for debugging.
  saveTextFile(javaxCachesDir("last-google-result-page.html"), html);
  L<S> htmlTok = htmlTok(html);
  // Every <h3> container is treated as a candidate result heading.
  LL<S> h3s = findContainerTagDeep(htmlTok, "h3");
  for (L<S> tok : h3s) {
    // Locate the heading inside the full token list; the token directly
    // before it has to be the <a> tag that wraps the heading.
    int idx = magicIndexOfSubList(htmlTok, tok);
    S linkTag = get(htmlTok, idx-1);
    if (!tagIs(linkTag, "a")) continue with if (debug) print(+linkTag);

    // Only absolute URLs are accepted as result links.
    S link = tagGet(linkTag, "href");
    continue unless isAbsoluteURL(link);
    
    // Title: heading contents with tags removed and HTML entities decoded.
    S text = htmldecode(join(dropTags(contentsOfContainerTag(tok))));
    
    // TODO (from the original author; unspecified).
    // Description: the first <span class="st"> found in the tokens that follow
    // the heading. NOTE(review): presumably yields "" when no such span
    // exists - depends on first/join handling an empty list; verify.
    L<S> sub = subList(htmlTok, idx+l(tok)-1);
    LLS spans = findContainerTagWithParams(sub, "span", "class" := "st");
    if (debug) pnl("l(sub)=" + l(sub) + ", spans: " + l(spans));
    if (debug) pnl(+spans);
    S desc = trim(htmldecode(dropTags(join(first(spans)))));
    if (debug) printStruct(+desc);

    //S desc = "";
    out.add(triple(link, text, desc));
  }
  
  // Nothing parsed: keep the page for diagnosis and return without logging,
  // so an empty result is never written to the log.
  if (empty(out)) {
    saveTextFile(javaxCachesDir("buggy-google.html"), html);
    ret out;
  }
  
  // Record the successful search so later calls can be answered from the log.
  if (!noCache)
    logStructure(googleLog(), litorderedmap(+query, +language, date := localDateWithSeconds(), results := out));
  ret out;
}

Author comment

Began life as a copy of #1011241

download  show line numbers  debug dex  old transpilations   

Travelled to 16 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, irmadwmeruwu, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt, whxojlpjdney, xrpafgyirdlv

No comments. add comment

Snippet ID: #1011993
Snippet name: quickGoogle2 - returns triples of (link, text, desc)
Eternal ID of this version: #1011993/41
Text MD5: bc7cae08943ffc4299c78f821c04767d
Transpilation MD5: 909c625632eb81facb04a964396cf85b
Author: stefan
Category: javax / networking
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2019-07-22 21:29:12
Source code size: 2078 bytes / 58 lines
Pitched / IR pitched: No / No
Views / Downloads: 948 / 1132
Version history: 40 change(s)
Referenced in: [show references]