Libraryless. Click here for Pure Java version (5444L/37K).
// Link-following scraper: loads one queued URL per step and queues every
// link found on it, as long as that link starts with baseURL.
sclass WebScraper {
  // Prefix a URL must start with in order to be queued (checked in addLink).
  S baseURL;

  // URLs that have already been passed to _loadURL.
  new Set<S> urlsSeen;

  // URLs still waiting to be loaded; step() removes them via popFirst.
  new LinkedHashSet<S> linksToFollow;

  // Upper bound on the size of urlsSeen; null disables the check.
  // Includes cached pages.
  Int maxPages;

  // Uses a single URL both as the base prefix and as the starting point.
  *(S baseAndStartURL) {
    this(baseAndStartURL, baseAndStartURL);
  }

  // Stores the base prefix first, then queues startURL. The order matters:
  // addLink compares against baseURL.
  *(S baseURL, S startURL) {
    this.baseURL = baseURL;
    addLink(startURL);
  }

  // Offers every URL of the given iterable to addLink, in iteration order.
  void addLinks(Iterable<S> urls) {
    for (S candidate : urls) {
      addLink(candidate);
    }
  }

  // Queues url unless it was already seen or does not start with baseURL.
  void addLink(S url) {
    if (urlsSeen.contains(url)) return;
    if (!startsWith(url, baseURL)) return;
    linksToFollow.add(url);
  }

  // Performs one unit of work.
  // Returns true if a URL was taken from the queue and loaded; false if the
  // page limit has been reached or the queue is empty.
  bool step() {
    ping();

    bool limitReached = maxPages != null && l(urlsSeen) >= maxPages;
    if (limitReached) {
      print("Maximum number of pages reached: " + maxPages + ". Queue size: " + l(linksToFollow));
      return false;
    }

    if (empty(linksToFollow)) return false;

    S nextURL = popFirst(linksToFollow);
    _loadURL(nextURL);
    return true;
  }

  // Marks url as seen, then offers the links reported for it to addLinks.
  // NOTE(review): webScraper_getLinks is defined elsewhere; presumably it
  // yields pairs whose first components are the link URLs - confirm.
  void _loadURL(S url) {
    urlsSeen.add(url);
    Iterable<S> discovered = pairsA(webScraper_getLinks(url));
    addLinks(discovered);
  }

  // Calls step() until it returns false, printing progress after every
  // successful step and a summary at the end.
  run {
    while (step()) {
      print("URLs checked: " + l(urlsSeen) + ", queue size: " + l(linksToFollow));
    }
    print("Scraping done. " + n2(l(urlsSeen), "URL") + " checked.");
  }
}
download show line numbers debug dex old transpilations
Travelled to 6 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1023835 |
Snippet name: | WebScraper |
Eternal ID of this version: | #1023835/10 |
Text MD5: | 24cb860524028c47729c2a1985dc7bfa |
Transpilation MD5: | 3962b7fd5f86fbfe14276c5b2dfc9e20 |
Author: | stefan |
Category: | javax / html parsing |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2019-07-10 13:10:03 |
Source code size: | 1081 bytes / 40 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 247 / 699 |
Version history: | 9 change(s) |
Referenced in: | [show references] |