Libraryless. Click here for Pure Java version (5444L/37K).
// Crawls pages reachable from a start URL, following only links whose
// URL begins with baseURL. Pending links are kept in insertion order and
// a URL that has already been loaded is never queued again.
sclass WebScraper {
  S baseURL; // prefix a URL must start with in order to be queued
  new Set<S> urlsSeen; // URLs that have been passed to _loadURL
  new LinkedHashSet<S> linksToFollow; // URLs waiting to be loaded
  Int maxPages; // includes cached pages; null means no limit is enforced

  // Convenience constructor: one URL serves as both base prefix and start page.
  *(S baseAndStartURL) {
    this(baseAndStartURL, baseAndStartURL);
  }

  // Stores baseURL (through the starred parameter) and queues startURL.
  *(S *baseURL, S startURL) {
    addLink(startURL);
  }

  // Offers every URL in the given collection to addLink.
  void addLinks(Iterable<S> urls) {
    for (S link : urls)
      addLink(link);
  }

  // Queues a URL unless it was already loaded or lies outside baseURL.
  void addLink(S url) {
    if (urlsSeen.contains(url)) ret;
    if (!startsWith(url, baseURL)) ret;
    linksToFollow.add(url);
  }

  // Processes one queued URL.
  // Returns false when the page limit is reached or the queue is empty,
  // true after a URL has been loaded.
  bool step() {
    ping();

    if (maxPages != null && l(urlsSeen) >= maxPages) {
      print("Maximum number of pages reached: " + maxPages + ". Queue size: " + l(linksToFollow));
      ret false;
    }

    if (empty(linksToFollow)) ret false;

    S next = popFirst(linksToFollow);
    _loadURL(next);
    ret true;
  }

  // Marks the URL as seen, then queues the links reported for it by
  // webScraper_getLinks (after conversion through pairsA).
  void _loadURL(S url) {
    urlsSeen.add(url);
    addLinks(pairsA(webScraper_getLinks(url)));
  }

  // Calls step() until it returns false, printing progress after each
  // successful step and a summary at the end.
  run {
    while (step()) {
      print("URLs checked: " + l(urlsSeen) + ", queue size: " + l(linksToFollow));
    }

    print("Scraping done. " + n2(l(urlsSeen), "URL") + " checked.");
  }
}
download show line numbers debug dex old transpilations
Travelled to 6 computer(s): bhatertpkbcr, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1023835 |
| Snippet name: | WebScraper |
| Eternal ID of this version: | #1023835/10 |
| Text MD5: | 24cb860524028c47729c2a1985dc7bfa |
| Transpilation MD5: | 3962b7fd5f86fbfe14276c5b2dfc9e20 |
| Author: | stefan |
| Category: | javax / html parsing |
| Type: | JavaX fragment (include) |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2019-07-10 13:10:03 |
| Source code size: | 1081 bytes / 40 lines |
| Pitched / IR pitched: | No / No |
| Views / Downloads: | 449 / 957 |
| Version history: | 9 change(s) |
| Referenced in: | [show references] |