Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

163
LINES

< > BotCompany Repo | #1013723 // downloader.py (youtube comment downloader)

Document

1  
#!/usr/bin/env python
2  
3  
from __future__ import print_function
4  
5  
import os
6  
import sys
7  
import time
8  
import json
9  
import requests
10  
import argparse
11  
import lxml.html
12  
13  
from lxml.cssselect import CSSSelector
14  
15  
# Page requested first for a video; ``{youtube_id}`` is substituted with
# str.format by the caller.
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
# Endpoint that receives the POST requests for further comment pages and
# for reply threads.
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'

# Browser-like User-Agent header value installed on the requests session.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/48.0.2564.116 Safari/537.36'
)
19  
20  
21  
def find_value(html, key, num_chars=2):
    """Return the double-quoted value that follows ``key`` in ``html``.

    The value is taken to start ``num_chars`` characters after the end of
    the first occurrence of ``key`` (skipping separators such as ``="``)
    and to run up to, but not including, the next ``"`` character.

    Args:
        html: Text to search.
        key: Marker that precedes the value.
        num_chars: Number of separator characters between the end of
            ``key`` and the first character of the value.

    Returns:
        The extracted value, or an empty string when ``key`` does not occur
        in ``html`` or no closing ``"`` follows the value.
    """
    key_pos = html.find(key)
    if key_pos == -1:
        # str.find signals "not found" with -1; slicing from that offset
        # would return unrelated text instead of a token.
        return ''

    pos_begin = key_pos + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    if pos_end == -1:
        # No terminating quote: there is no well-delimited value to return.
        return ''

    return html[pos_begin:pos_end]
25  
26  
27  
def extract_comments(html):
    """Yield one dict per ``.comment-item`` element found in ``html``.

    Each dict has the keys ``cid`` (the element's ``data-cid`` attribute),
    ``text``, ``time`` (whitespace-stripped) and ``author``; the last three
    are the text content of the first matching descendant of the item.

    Args:
        html: HTML markup to parse with lxml.

    Yields:
        A dict describing one comment, in document order.
    """
    document = lxml.html.fromstring(html)
    select_items = CSSSelector('.comment-item')
    select_text = CSSSelector('.comment-text-content')
    select_time = CSSSelector('.time')
    select_author = CSSSelector('.user-name')

    for node in select_items(document):
        comment_id = node.get('data-cid')
        body = select_text(node)[0].text_content()
        posted = select_time(node)[0].text_content().strip()
        author = select_author(node)[0].text_content()
        yield {'cid': comment_id,
               'text': body,
               'time': posted,
               'author': author}
39  
40  
41  
def extract_reply_cids(html):
    """Return the ``data-cid`` values of the reply-loading links in ``html``.

    The links are the elements matching
    ``.comment-replies-header > .load-comments``.

    Args:
        html: HTML markup to parse with lxml.

    Returns:
        A list with one ``data-cid`` attribute value per matching element,
        in document order.
    """
    document = lxml.html.fromstring(html)
    select_links = CSSSelector('.comment-replies-header > .load-comments')
    cids = []
    for link in select_links(document):
        cids.append(link.get('data-cid'))
    return cids
45  
46  
47  
def ajax_request(session, url, params, data, retries=10, sleep=20):
    """POST to ``url`` and return the parsed result, retrying on failure.

    A response counts as successful when its status code is 200; its body
    is then decoded as JSON.

    Args:
        session: Object with a ``post(url, params=..., data=...)`` method,
            e.g. a ``requests.Session``.
        url: Address to POST to.
        params: Query-string parameters passed through to ``post``.
        data: Form data passed through to ``post``.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between two attempts.

    Returns:
        A ``(page_token, html_content)`` tuple from the first successful
        response, where ``page_token`` is ``None`` if the JSON has no such
        key; or ``None`` when every attempt returned a non-200 status.

    Raises:
        KeyError: If a successful response lacks ``html_content``.
    """
    for attempt in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token'), response_dict['html_content']

        # Back off only when another attempt follows; waiting after the
        # final failure would just delay reporting it to the caller.
        if attempt < retries - 1:
            time.sleep(sleep)

    return None
55  
56  
57  
def download_comments(youtube_id, sleep=1):
    """Yield the comments and replies of a video, each at most once.

    The comments page is fetched first and its comments are yielded. Further
    pages are then requested through the AJAX endpoint for as long as a page
    token is available, and finally the replies of every comment that
    advertised a reply thread are requested. Either loop stops early when
    ``ajax_request`` gives up on a request.

    Args:
        youtube_id: ID of the video, used in the page URL and in every
            AJAX request.
        sleep: Seconds to pause after each AJAX request.

    Yields:
        Comment dicts as produced by ``extract_comments``.
    """
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    # IDs of the comments already yielded. A set keeps the duplicate check
    # constant-time instead of scanning a growing list for every comment.
    seen_cids = set()
    for comment in extract_comments(html):
        seen_cids.add(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}

        # The first request sends 'order_menu' instead of a page token;
        # later requests pass the token returned by the previous response.
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        # The page token of a replies response is not used.
        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in seen_cids:
                seen_cids.add(comment['cid'])
                yield comment
        time.sleep(sleep)
128  
129  
130  
def main(argv):
    """Run the command-line interface.

    Parses ``argv``, downloads the comments of the requested video and
    writes them to the output file as one JSON document per line while
    showing a running count on stdout.

    Args:
        argv: Command-line arguments without the program name.

    Any exception, including a missing video ID or output filename, is
    reported as ``Error: ...`` and ends the process with exit status 1.
    """
    arg_parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    arg_parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    arg_parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    arg_parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')

    try:
        options = arg_parser.parse_args(argv)
        youtube_id, output = options.youtubeid, options.output

        # Both options are mandatory even though argparse treats them as optional.
        if not (youtube_id and output):
            arg_parser.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print('Downloading Youtube comments for video:', youtube_id)
        with open(output, 'w') as out_file:
            for total, comment in enumerate(download_comments(youtube_id), 1):
                print(json.dumps(comment), file=out_file)
                # '\r' rewinds the cursor so the counter updates in place.
                sys.stdout.write('Downloaded %d comment(s)\r' % total)
                sys.stdout.flush()
        print('\nDone!')

    except Exception as error:
        print('Error:', str(error))
        sys.exit(1)
160  
161  
162  
# Script entry point: forward the command-line arguments (without the
# program name) to main() when the file is executed directly.
if __name__ == '__main__':
    main(sys.argv[1:])

Author comment

https://github.com/egbertbouman/youtube-comment-downloader

download  show line numbers   

Travelled to 12 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1013723
Snippet name: downloader.py (youtube comment downloader)
Eternal ID of this version: #1013723/2
Text MD5: 8ba37a40bc248859abba431e6aee3867
Author: stefan
Category: javax / networking
Type: Document
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2018-01-28 03:27:11
Source code size: 5385 bytes / 163 lines
Pitched / IR pitched: No / No
Views / Downloads: 440 / 455
Version history: 1 change(s)
Referenced in: [show references]