Learning Python: A Sina Weibo Comment Scraper

Scrape every comment under the specified Weibo posts. The script resolves each post URL to its numeric id and author uid, pages through the buildComments Ajax endpoint with its max_id cursor, and appends each batch to a per-post CSV file.
import csv
import json
import random
import time
import urllib.request


class Data:
    # shared scraper state, reset per post in setData()
    code = 'utf-8'   # response charset
    id = 0           # numeric post id
    uid = 0          # author uid
    max_id = 0       # pagination cursor for buildComments
    url = ''         # current API request URL
    total = 0        # total comment count reported by the API
    retry = 0        # retries used for the current request
    count = 0        # comments fetched so far
    filename = ''    # output CSV name, without the .csv extension


over = Data()  # module-level state shared by every function below


def http_r(url, c=over.code):
    # set a desktop-browser User-Agent so the request looks like a normal visit
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return urllib.request.urlopen(request).read().decode(c)
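
# http_r is the script's only network primitive: a plain urllib GET carrying a
# browser User-Agent, decoded with the charset stored in over.code.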


def setData(url):
    # example inputs:
    # url = 'https://m.weibo.cn/status/4639954594955978'
    # url = 'https://weibo.com/2803301701/KgLoTkNh8'

    over.count = 0
    over.total = 0

    # normalize the link: mark the scheme with a 'w.' prefix (so the host
    # checks below can tell weibo.com from m.weibo.cn), drop any fragment or
    # query string, then split the path
    u = url.replace('https://', 'w.') \
        .replace('http://', 'w.') \
        .split('#')[0].split('?')[0].split('/')

    if u[0].find('m.weibo.') != -1:
        # the mobile page embeds the post as a JSON literal between these markers
        s = 'var $render_data = ['
        e = '][0] || {};'
        back = http_r(url)
        try:
            back = json.loads(back[back.find(s) + len(s):back.find(e)])
        except Exception:
            # the render data is sometimes missing; fetch the page once more
            back = http_r(url)
            try:
                back = json.loads(back[back.find(s) + len(s):back.find(e)])
            except Exception:
                return False

        over.uid = back['status']['user']['id']
        over.id = u[2]
        newCsv(back['status']['status_title'], back['status']['user']['screen_name'])
        return True
    elif u[0].find('w.weibo.') != -1:
        back = json.loads(http_r('https://weibo.com/ajax/statuses/show?id=' + u[2]))
        over.uid = u[1]
        over.id = back['id']
        newCsv(back['text_raw'], back['user']['screen_name'])
        return True
    return False
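
# Both URL shapes converge on the same state: m.weibo.cn pages embed the post
# as a "$render_data" JSON literal in the HTML, while weibo.com short links
# are resolved through the /ajax/statuses/show endpoint. Either way, over.id
# and over.uid are filled in and newCsv() opens a fresh output file.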


def setUrl():
    if over.max_id > 0:
        over.url = 'https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=' \
                   + str(over.id) + '&is_show_bulletin=2&is_mix=0&max_id=' + str(over.max_id) \
                   + '&count=20&uid=' + str(over.uid)
    elif over.max_id == 0:
        over.url = 'https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=' \
                   + str(over.id) + '&is_show_bulletin=2&is_mix=0&count=20&uid=' + str(over.uid)


def getComment():
    # first page:
    # https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=4639954594955978&is_show_bulletin=2&is_mix=0&count=20&uid=2803301701
    # later pages carry the max_id cursor:
    # https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4639954594955978&is_show_bulletin=2&is_mix=0&max_id=273267091222258&count=20&uid=2803301701

    setUrl()
    print('    Fetching ', end='')
    try:
        back = json.loads(http_r(over.url))
    except Exception:
        back = {}

    # retry up to 10 times when the response is empty or fails to parse
    over.retry = 0
    while ('data' not in back or len(back['data']) == 0) and over.retry < 10:
        print('-', end='')
        over.retry += 1
        time.sleep(random.uniform(1, 2.5))
        try:
            back = json.loads(http_r(over.url))
        except Exception:
            back = {}
        if 'max_id' in back:
            over.max_id = back['max_id']
        setUrl()

    # judge success by the payload, not the retry counter: the tenth retry
    # may still have returned data
    if 'data' not in back or len(back['data']) == 0:
        print('\n    This batch failed')
        return False

    # advance the cursor so the next call requests the next page
    if 'max_id' in back:
        over.max_id = back['max_id']

    s = len(back['data'])
    over.count += s
    print('\n    Got ' + str(s) + ' new comments')

    if over.total == 0:
        over.total = back['total_number']  # total comment count, from the first page

    comments = {'name': [], 'comment': []}
    for c in back['data']:
        comments['name'].append(c['user']['name'])
        # flatten line breaks and tabs so each comment occupies one CSV row
        comments['comment'].append(c['text_raw'].replace('\r', '').replace('\n', '\\n').replace('\t', '\\t'))
    writeOut(comments)
    return True


def writeOut(info):  # append the parsed rows to the CSV created by newCsv()
    with open(over.filename + '.csv', 'a', encoding='utf-8', newline='') as file_obj:
        f_csv = csv.writer(file_obj)
        for i in range(len(info['name'])):
            f_csv.writerow([info['name'][i], info['comment'][i]])


def newCsv(title, author):
    # name the file after the 【...】 tag in the post title (if any) plus a timestamp
    over.filename = title[title.find('【') + 1:title.find('】')].replace('#', '') + '-' + str(time.time())
    with open(over.filename + '.csv', 'w', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow([title.replace('\r', '').replace('\n', '\\n').replace('\t', '\\t'), author])
        f_csv.writerow(['Commenter', 'Comment'])


def start(urls, num=-1):  # urls: post links to scrape; num: comment cap per post (-1 = all)
    for url in urls:
        if not setData(url):
            continue
        print('0/' + str(num if num > 0 else '?'))
        while over.count < num or num == -1:
            if not getComment():
                break  # getComment() has already retried 10 times; give up on this post
            print(str(over.count) + '/' + str(num if num > 0 else over.total))
            if over.total <= over.count:
                print('All comments of this post fetched')
                break
        time.sleep(random.uniform(1, 2.5))


url_list = [
    'https://weibo.com/1893892941/KlDpMDEnh',
    'https://m.weibo.cn/status/4639954594955978'
]
start(url_list)
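
By default start(url_list) keeps fetching until every comment of each post is
collected; pass a second argument to cap the volume per post, e.g.
start(url_list, 100). Batches arrive in groups of up to 20, so the cap is
approximate.

Each run writes one CSV per post: row 1 holds the post title and author, row 2
the column header, and every later row is a commenter/comment pair. A minimal
sketch for reading one back; the filename below is hypothetical, since
newCsv() derives the real one from the post title plus a timestamp:

import csv

# hypothetical filename; newCsv() builds the real one from the 【...】 tag in
# the post title plus a time.time() timestamp
with open('some-title-1624000000.0.csv', encoding='utf-8', newline='') as f:
    rows = list(csv.reader(f))

(title, author), header, comments = rows[0], rows[1], rows[2:]
print(str(len(comments)) + ' comments under "' + title + '" by ' + author)
for name, text in comments[:5]:  # preview the first five rows
    print(name + ': ' + text)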