downloader.py (youtube comment downloader) [1013723]

#!/usr/bin/env python

from __future__ import print_function

import os
import sys
import time
import json
import requests
import argparse
import lxml.html

from lxml.cssselect import CSSSelector

# Page fetched with a GET request to obtain the first batch of comments;
# the ``{youtube_id}`` placeholder is filled in with the video ID.
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
# Endpoint that receives the POST requests for further comment pages and for
# the replies of individual comments.
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'

# Browser-like User-Agent header installed on the requests session.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'


def find_value(html, key, num_chars=2):
    """Return the double-quoted value that follows ``key`` in ``html``.

    The value is taken to start ``num_chars`` characters after the end of the
    first occurrence of ``key`` (skipping separators such as ``="``) and to
    run up to, but not including, the next ``"``.

    Args:
        html: Text to search.
        key: Marker that precedes the value.
        num_chars: Number of characters to skip between the end of ``key``
            and the first character of the value.

    Returns:
        The extracted value, or an empty string when ``key`` does not occur
        in ``html`` or when no closing ``"`` follows the value.
    """
    key_pos = html.find(key)
    if key_pos < 0:
        # str.find() reports "not found" as -1; doing arithmetic on that
        # would slice an unrelated part of the document.
        return ''

    pos_begin = key_pos + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    if pos_end < 0:
        # Unterminated value: slicing with -1 would silently drop the last
        # character, so treat it as missing instead.
        return ''

    return html[pos_begin:pos_end]


def extract_comments(html):
    """Lazily yield one dict per ``.comment-item`` element found in ``html``.

    Each dict has the keys ``cid`` (the element's ``data-cid`` attribute),
    ``text``, ``time`` (whitespace-stripped) and ``author``. The last three
    are the text content of the first descendant matching
    ``.comment-text-content``, ``.time`` and ``.user-name`` respectively.

    Raises:
        IndexError: If a comment item lacks one of those descendants.
    """
    root = lxml.html.fromstring(html)
    pick_items = CSSSelector('.comment-item')
    pick_text = CSSSelector('.comment-text-content')
    pick_time = CSSSelector('.time')
    pick_author = CSSSelector('.user-name')

    for node in pick_items(root):
        comment_id = node.get('data-cid')
        body = pick_text(node)[0].text_content()
        posted = pick_time(node)[0].text_content().strip()
        who = pick_author(node)[0].text_content()
        yield {'cid': comment_id, 'text': body, 'time': posted, 'author': who}


def extract_reply_cids(html):
    """Return the ``data-cid`` of every "load replies" element in ``html``.

    The elements are those matching ``.load-comments`` directly beneath a
    ``.comment-replies-header``; the IDs are returned as a list in the order
    the selector yields them.
    """
    root = lxml.html.fromstring(html)
    find_reply_links = CSSSelector('.comment-replies-header > .load-comments')
    reply_links = find_reply_links(root)
    return [link.get('data-cid') for link in reply_links]


def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to ``url`` until it answers with HTTP 200, then decode the reply.

    Args:
        session: Object with a ``post(url, params=..., data=...)`` method,
            e.g. a ``requests.Session``.
        url: Address to post to.
        params: Query-string parameters passed to ``session.post``.
        data: Form fields passed to ``session.post``.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between two consecutive attempts.

    Returns:
        A ``(page_token, html_content)`` tuple taken from the JSON body of
        the first 200 response (``page_token`` is ``None`` when the body has
        no such key), or ``None`` when every attempt returned another status.

    Raises:
        ValueError: If a 200 response does not contain valid JSON.
        KeyError: If the JSON body has no ``html_content`` entry.
    """
    for attempt in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']

        # Back off only when another attempt will follow; waiting after the
        # final failure would just delay reporting it to the caller.
        if attempt < retries - 1:
            time.sleep(sleep)

    return None


def download_comments(youtube_id, sleep=1):
    """Yield the comments of a video: first page, further pages, then replies.

    The comment page is fetched once with a GET request; further pages and the
    replies are then requested through ``ajax_request``. Comments from the
    first page are yielded as-is; from the later pages and the replies only
    those with a ``cid`` not seen before are yielded.

    Args:
        youtube_id: ID of the video whose comments are downloaded.
        sleep: Seconds to pause after each successful AJAX request.

    Yields:
        Comment dicts as produced by ``extract_comments``.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    # IDs of the comments yielded so far. A set keeps the duplicate check
    # constant-time however many comments the video has.
    ret_cids = set()
    for comment in extract_comments(html):
        ret_cids.add(comment['cid'])
        yield comment

    # Both tokens are scraped from the markup of the initial page.
    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        # The first request sends the 'order_menu' flag and no page token;
        # every later request sends the token returned by the previous one.
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            # All retries failed: stop paging and move on to the replies.
            break

        page_token, html = response

        reply_cids.extend(extract_reply_cids(html))
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.add(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            # All retries failed: give up on the remaining reply threads.
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.add(comment['cid'])
                yield comment
        time.sleep(sleep)


def main(argv):
    """Command-line entry point: write a video's comments to a file.

    ``argv`` is the list of command-line arguments without the program name.
    Each downloaded comment is written to the output file as one JSON
    document per line, while a running count is shown on stdout. Any
    exception is reported as ``Error: ...`` on stdout and the process exits
    with status 1.
    """
    cli = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    cli.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    cli.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    cli.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')

    try:
        options = cli.parse_args(argv)
        video, target = options.youtubeid, options.output

        # Both options are mandatory even though argparse treats them as optional.
        if not (video and target):
            cli.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', video)
        with open(target, 'w') as sink:
            for total, entry in enumerate(download_comments(video), 1):
                print(json.dumps(entry), file=sink)
                # The carriage return keeps the progress counter on one line.
                sys.stdout.write('Downloaded %d comment(s)\r' % total)
                sys.stdout.flush()
        print('\nDone!')

    except Exception as err:
        print('Error:', str(err))
        sys.exit(1)


# Run the command-line interface only when executed as a script, passing the
# arguments without the program name.
if __name__ == "__main__":
    main(sys.argv[1:])

Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

Snippet ID:	#1013723
Snippet name:	downloader.py (youtube comment downloader)
Eternal ID of this version:	#1013723/2
Text MD5:	8ba37a40bc248859abba431e6aee3867
Author:	stefan
Category:	javax / networking
Type:	Document
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2018-01-28 03:27:11
Source code size:	5385 bytes / 163 lines
Pitched / IR pitched:	No / No
Views / Downloads:	440 / 455
Version history:	1 change(s)
Referenced in:	[show references]

< > BotCompany Repo | #1013723 // downloader.py (youtube comment downloader)

Document

Author comment

1	#!/usr/bin/env python
2
3	from __future__ import print_function
4
5	import os
6	import sys
7	import time
8	import json
9	import requests
10	import argparse
11	import lxml.html
12
13	from lxml.cssselect import CSSSelector
14
15	YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
16	YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
17
18	USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
19
20
21	def find_value(html, key, num_chars=2):
22	pos_begin = html.find(key) + len(key) + num_chars
23	pos_end = html.find('"', pos_begin)
24	return html[pos_begin: pos_end]
25
26
27	def extract_comments(html):
28	tree = lxml.html.fromstring(html)
29	item_sel = CSSSelector('.comment-item')
30	text_sel = CSSSelector('.comment-text-content')
31	time_sel = CSSSelector('.time')
32	author_sel = CSSSelector('.user-name')
33
34	for item in item_sel(tree):
35	yield {'cid': item.get('data-cid'),
36	'text': text_sel(item)[0].text_content(),
37	'time': time_sel(item)[0].text_content().strip(),
38	'author': author_sel(item)[0].text_content()}
39
40
41	def extract_reply_cids(html):
42	tree = lxml.html.fromstring(html)
43	sel = CSSSelector('.comment-replies-header > .load-comments')
44	return [i.get('data-cid') for i in sel(tree)]
45
46
47	def ajax_request(session, url, params, data, retries=10, sleep=20):
48	for _ in range(retries):
49	response = session.post(url, params=params, data=data)
50	if response.status_code == 200:
51	response_dict = json.loads(response.text)
52	return response_dict.get('page_token', None), response_dict['html_content']
53	else:
54	time.sleep(sleep)
55
56
57	def download_comments(youtube_id, sleep=1):
58	session = requests.Session()
59	session.headers['User-Agent'] = USER_AGENT
60
61	# Get Youtube page with initial comments
62	response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
63	html = response.text
64	reply_cids = extract_reply_cids(html)
65
66	ret_cids = []
67	for comment in extract_comments(html):
68	ret_cids.append(comment['cid'])
69	yield comment
70
71	page_token = find_value(html, 'data-token')
72	session_token = find_value(html, 'XSRF_TOKEN', 4)
73
74	first_iteration = True
75
76	# Get remaining comments (the same as pressing the 'Show more' button)
77	while page_token:
78	data = {'video_id': youtube_id,
79	'session_token': session_token}
80
81	params = {'action_load_comments': 1,
82	'order_by_time': True,
83	'filter': youtube_id}
84
85	if first_iteration:
86	params['order_menu'] = True
87	else:
88	data['page_token'] = page_token
89
90	response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
91	if not response:
92	break
93
94	page_token, html = response
95
96	reply_cids += extract_reply_cids(html)
97	for comment in extract_comments(html):
98	if comment['cid'] not in ret_cids:
99	ret_cids.append(comment['cid'])
100	yield comment
101
102	first_iteration = False
103	time.sleep(sleep)
104
105	# Get replies (the same as pressing the 'View all X replies' link)
106	for cid in reply_cids:
107	data = {'comment_id': cid,
108	'video_id': youtube_id,
109	'can_reply': 1,
110	'session_token': session_token}
111
112	params = {'action_load_replies': 1,
113	'order_by_time': True,
114	'filter': youtube_id,
115	'tab': 'inbox'}
116
117	response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
118	if not response:
119	break
120
121	_, html = response
122
123	for comment in extract_comments(html):
124	if comment['cid'] not in ret_cids:
125	ret_cids.append(comment['cid'])
126	yield comment
127	time.sleep(sleep)
128
129
130	def main(argv):
131	parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
132	parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
133	parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
134	parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
135
136	try:
137	args = parser.parse_args(argv)
138
139	youtube_id = args.youtubeid
140	output = args.output
141
142	if not youtube_id or not output:
143	parser.print_usage()
144	raise ValueError('you need to specify a Youtube ID and an output filename')
145
146	print('Downloading Youtube comments for video:', youtube_id)
147	count = 0
148	with open(output, 'w') as fp:
149	for comment in download_comments(youtube_id):
150	print(json.dumps(comment), file=fp)
151	count += 1
152	sys.stdout.write('Downloaded %d comment(s)\r' % count)
153	sys.stdout.flush()
154	print('\nDone!')
155
156
157	except Exception as e:
158	print('Error:', str(e))
159	sys.exit(1)
160
161
162	if __name__ == "__main__":
163	main(sys.argv[1:])