1 | #!/usr/bin/env python |
2 | |
3 | from __future__ import print_function |
4 | |
5 | import os |
6 | import sys |
7 | import time |
8 | import json |
9 | import requests |
10 | import argparse |
11 | import lxml.html |
12 | |
13 | from lxml.cssselect import CSSSelector |
14 | |
# Page requested first by download_comments(); `{youtube_id}` is filled in with str.format
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
# Endpoint that download_comments() POSTs to for further comments and for replies
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'

# Sent as the User-Agent header of every request made through the session
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.116 Safari/537.36'
)
19 | |
20 | |
def find_value(html, key, num_chars=2):
    """Return the double-quote-terminated value that follows `key` in `html`.

    The value starts `num_chars` characters after the end of the first
    occurrence of `key` and runs up to (not including) the next `"`.
    With the default of 2 this skips a separator such as `="`.

    Returns an empty string when `key` does not occur in `html` or when no
    closing `"` follows the value, instead of slicing from a meaningless
    offset and handing back unrelated text.
    """
    key_pos = html.find(key)
    if key_pos == -1:
        # str.find signals "not found" with -1; using it in arithmetic would
        # silently produce a bogus start offset.
        return ''

    pos_begin = key_pos + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    if pos_end == -1:
        # Unterminated value: a slice ending at -1 would just drop the last
        # character of the document.
        return ''

    return html[pos_begin:pos_end]
25 | |
26 | |
def extract_comments(html):
    """Yield one dict per `.comment-item` element found in `html`.

    Each dict has the keys 'cid' (the element's `data-cid` attribute),
    'text', 'time' (whitespace-stripped) and 'author', taken from the first
    `.comment-text-content`, `.time` and `.user-name` descendant respectively.
    An item lacking one of those descendants raises IndexError.
    """
    document = lxml.html.fromstring(html)
    select_items = CSSSelector('.comment-item')
    select_text = CSSSelector('.comment-text-content')
    select_time = CSSSelector('.time')
    select_author = CSSSelector('.user-name')

    for node in select_items(document):
        comment = {}
        comment['cid'] = node.get('data-cid')
        comment['text'] = select_text(node)[0].text_content()
        comment['time'] = select_time(node)[0].text_content().strip()
        comment['author'] = select_author(node)[0].text_content()
        yield comment
39 | |
40 | |
def extract_reply_cids(html):
    """Return the `data-cid` attribute of every reply-loading element in `html`.

    The elements are those matching
    `.comment-replies-header > .load-comments`; the result is a list in
    document order as returned by the selector.
    """
    document = lxml.html.fromstring(html)
    load_links = CSSSelector('.comment-replies-header > .load-comments')(document)

    cids = []
    for link in load_links:
        cids.append(link.get('data-cid'))
    return cids
45 | |
46 | |
def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to `url` and return the parsed comment payload, retrying on failure.

    `session` must provide `post(url, params=..., data=...)` returning an
    object with `status_code` and `text` (e.g. a `requests.Session`).

    On the first response with status 200 the body is parsed as JSON and a
    tuple `(page_token, html_content)` is returned; `page_token` is None when
    the response carries no such key. Any other status triggers another
    attempt after `sleep` seconds, up to `retries` attempts in total.

    Returns None when every attempt failed. Raises whatever `json.loads`
    raises for a malformed body, and KeyError when 'html_content' is missing.
    """
    for attempt in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']

        # Back off only when another attempt will follow; sleeping after the
        # final failure would just delay reporting it to the caller.
        if attempt < retries - 1:
            time.sleep(sleep)

    return None
55 | |
56 | |
def download_comments(youtube_id, sleep=1):
    """Yield the comments and replies of a Youtube video, one dict at a time.

    The dicts are those produced by `extract_comments`. Work happens lazily
    in three phases as the generator is consumed:

    1. GET the all-comments page and yield the comments it contains.
    2. While a page token is available, POST to the AJAX endpoint to load
       further comments.
    3. For every reply id collected in phases 1 and 2, POST to the AJAX
       endpoint to load that comment's replies.

    In phases 2 and 3 a comment whose 'cid' was already yielded is skipped.
    A phase stops early when `ajax_request` gives up (returns None).

    `sleep` is the number of seconds to pause after each AJAX request.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    # Ids of comments already yielded. A set keeps each duplicate check O(1);
    # a list would make the whole download quadratic in the comment count.
    seen_cids = set()
    for comment in extract_comments(html):
        seen_cids.add(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        # The first request asks for the time-ordered listing from the start;
        # only later requests send the page token of the previous response.
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        # Reply responses are not paginated here; only the HTML is used
        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment
        time.sleep(sleep)
128 | |
129 | |
def main(argv):
    """Command-line entry point: download a video's comments to a file.

    `argv` is the argument list without the program name. Both
    `--youtubeid/-y` and `--output/-o` are required. Each comment is written
    to the output file as one JSON object per line, while a running count is
    shown on stdout.

    On any error the message (preceded by the usage line when an argument is
    missing) is written to stderr and the process exits with status 1, so
    diagnostics do not mix with the progress output on stdout.
    """
    parser = argparse.ArgumentParser(add_help=False, description='Download Youtube comments without using the Youtube API')
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')

    try:
        args = parser.parse_args(argv)

        youtube_id = args.youtubeid
        output = args.output

        if not youtube_id or not output:
            parser.print_usage(sys.stderr)
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', youtube_id)
        count = 0
        with open(output, 'w') as fp:
            for comment in download_comments(youtube_id):
                print(json.dumps(comment), file=fp)
                count += 1
                # '\r' rewrites the same terminal line with the new count
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()
        print('\nDone!')

    except Exception as e:
        # Top-level boundary: report any failure and signal it via exit status
        print('Error:', str(e), file=sys.stderr)
        sys.exit(1)
160 | |
161 | |
# Run the command-line interface only when executed as a script, not on import
if __name__ == '__main__':
    main(sys.argv[1:])
https://github.com/egbertbouman/youtube-comment-downloader
Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1013723 |
Snippet name: | downloader.py (youtube comment downloader) |
Eternal ID of this version: | #1013723/2 |
Text MD5: | 8ba37a40bc248859abba431e6aee3867 |
Author: | stefan |
Category: | javax / networking |
Type: | Document |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2018-01-28 03:27:11 |
Source code size: | 5385 bytes / 163 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 440 / 455 |
Version history: | 1 change(s) |
Referenced in: | [show references] |