爬取豆瓣和微博《镇魂》评论进行数据分析

爬取豆瓣《镇魂》评论并进行数据分析

#引入包
import requests
from bs4 import BeautifulSoup
import random
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud
import PIL
import numpy as np
from snownlp import SnowNLP
import csv
import codecs
import pandas as pd

#设置浏览器
# Pool of desktop and mobile User-Agent strings. One is picked at random per
# request so successive requests look like different browsers — a basic
# anti-ban measure for the crawler.
agents = [
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522  (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]

heads = {                                                # headers for the request that fetches the proxy-pool listing page
        'User-Agent': random.choice(agents),             # one UA picked at random, fixed at import time
        'ue': 'utf-8'  # intended as an encoding hint for Chinese text; NOTE(review): 'ue' is not a standard HTTP header — likely ignored by the server
    }

# Scrape the xicidaili free-proxy listing to build an anonymous-ip pool.
def get_ip_list():
    """Return a list of 'ip:port' strings scraped from xicidaili's proxy table."""
    listing_url = 'http://www.xicidaili.com/nn/'
    page = requests.get(listing_url, headers=heads).text
    soup = BeautifulSoup(page, 'html.parser')
    rows = soup.find_all('tr')
    # the first <tr> is the table header, so skip it; columns 1 and 2
    # of each remaining row hold the ip and the port
    return [cells[1].text + ':' + cells[2].text
            for cells in (row.find_all('td') for row in rows[1:])]
# Draw one proxy at random from the scraped pool.
def get_random_ip():
    """Return a requests-style proxies dict, e.g. {'http': 'http://1.2.3.4:80'}."""
    candidates = ['http://' + addr for addr in get_ip_list()]
    return {'http': random.choice(candidates)}

# Request settings used when fetching comment-page HTML.
heade = {
        'User-Agent': random.choice(agents),
        'proxies': get_random_ip(),                     # anti-scraping tactic — draw a random ip from the pool
        'ue': 'utf-8'  # encoding hint for Chinese text
    }
# NOTE(review): 'proxies' (and 'ue') are not HTTP headers. requests only
# applies proxies passed through the `proxies=` keyword of requests.get(),
# so this dict entry is never used as a proxy — confirm the intent.

# Fetch one comment page and return its decoded HTML text.
def get_html(url):
    """Download *url* and return the response body as UTF-8 text.

    BUG FIX: the original passed the header dict via ``params=``, which
    serialises it into the query string instead of sending real HTTP
    headers, and the proxy entry was never applied.  Headers and proxies
    now go to their dedicated keyword arguments.
    """
    response = requests.get(
        url,
        headers={'User-Agent': heade['User-Agent'], 'ue': heade['ue']},
        proxies=heade['proxies'],  # random proxy drawn once at import time
    )
    response.encoding = 'utf-8'  # force UTF-8 so Chinese text decodes correctly
    return response.text

## Extract the URL of the next comments page from the current page's HTML.
def get_url(html):
    """Return the absolute URL of the next comments page, or None.

    BUG FIX: on the last page douban still renders the ``paginator`` div
    but omits the ``next`` anchor, so the original subscripted ``None``
    and raised TypeError.  Both the missing-paginator and missing-link
    cases now return None so the caller can stop cleanly.
    """
    bs = BeautifulSoup(html, 'html.parser')
    paginator = bs.find('div', attrs={'id': "paginator"})
    if paginator is None:
        return None
    next_link = paginator.find('a', attrs={'class': "next"})
    if next_link is None:
        return None  # last page: no "next" link
    return 'https://movie.douban.com/subject/26979097/comments' + next_link['href']


# Write the CSV header row up front so the data rows appended later form a
# complete table.
# FIX: the csv module documents that its file object must be opened in text
# mode with newline='' — the legacy codecs.open(..., 'ab') binary append
# bypassed csv's own line-ending handling.
comment_columns = ['标号', '评论', '星标', '地区', '时间']
with open("D:\\comment4.csv", 'a', encoding="utf-8", newline='') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerow(comment_columns)

# b_comment_list accumulates one [id, comment, star, place, date] record
# per scraped review
b_comment_list = []

# Scrape one comment page: rating, comment text, reviewer region and date.
def get_star_and_comments(url_comment):
    """Parse every comment on *url_comment* and append
    [id, comment, star, place, date] records to the global b_comment_list.

    Fixes over the original:
      * the star rating is resolved per comment via a lookup table —
        previously ``star`` persisted across loop iterations, so an
        unrated comment silently inherited the previous comment's rating;
      * the region lookup catches only the lookup errors it can actually
        raise instead of a bare ``except``.
    """
    global b_comment_list
    star_by_class = {
        'allstar50': 5,
        'allstar40': 4,
        'allstar30': 3,
        'allstar20': 2,
        'allstar10': 1,
    }
    comment_html = get_html(url_comment)
    bs = BeautifulSoup(comment_html, 'html.parser')
    comment_list = bs.find_all('div', attrs={'class': "comment-item"})
    name = 0  # running id number assigned to each reviewer
    for comment in comment_list:
        name += 1
        comments = (comment.find('p')).text.strip('\n')
        # the 5th <span>'s class encodes the star rating, e.g. 'allstar40'
        span = (comment.find_all('span')[4])['class']
        star = star_by_class.get(span[0], 0)  # 0 = no rating on this comment
        # link to the reviewer's personal profile page
        info_url = (comment.find_all('a')[2])['href']
        info_html = get_html(info_url)
        info_bs = BeautifulSoup(info_html, "html.parser")
        # region — profile layout varies, so fall back to an empty string
        try:
            place = info_bs.find_all('div', attrs={"class": 'user-info'})[0].find('a').text.strip()
        except (AttributeError, IndexError):
            place = ''
        # comment date
        date = comment.find('span', attrs={"class": "comment-time"}).text.strip()
        # one record = [id, comment text, stars, region, date]
        b_comment_list.append([name, comments, star, place, date])

# Walk the comment pages: scrape the current page, follow its "next" link,
# and stop either when there is no next page or after every 5th page
# (the global counter i gates how many pages a single call consumes).
i = 0
def get_all(url):
    global i
    while True:
        i += 1
        print(url)
        get_star_and_comments(url)  # collect this page's rows into b_comment_list
        next_url = get_url(get_html(url))
        if next_url is None or i % 5 == 0:
            break
        url = next_url
# Main entry point: crawl the comment pages and persist them to CSV.
if __name__ == '__main__':
    b_comment_list=[]
    file_path='D:\\comment4.csv'  # kept on D: — writing under C: may hit permission limits
    # Append the collected records to the CSV file, one row per record.
    def make_dir(file_path,data_list):
        with codecs.open(file_path,'ab',"utf-8") as f:
            writer = csv.writer(f,delimiter ='\t',quotechar='"',quoting=csv.QUOTE_ALL)   
            writer.writerows(data_list) # write all rows at once
    url = 'https://movie.douban.com/subject/26979097/comments?start=240&limit=20&sort=new_score&status=P'
    get_all(url)  # after this, b_comment_list holds every scraped row
    make_dir(file_path,b_comment_list)
    #print(b_comment_list)

# Read the crawler's CSV back into a DataFrame.
# FIXES: the original passed both delimiter='\t' and sep='"' — modern pandas
# rejects specifying both, and the file is tab-separated anyway; the path is
# now a raw string so '\c' is not treated as an escape sequence.
cp = pd.read_csv(r'D:\comment4.csv', engine='python', delimiter='\t', encoding="utf-8")

# comment_list holds the raw comment strings from the '评论' column
comment_list = list(cp["评论"])
# file_list: non-blank, stripped lines of haha.txt
# NOTE(review): haha.txt's role isn't shown here — presumably auxiliary
# word/phrase data; file_list is not used below, confirm it is needed.
with open('haha.txt', "r", encoding='utf-8') as file:
    file_list = [line.strip() for line in file.readlines() if line != '\n']
# Sentiment analysis over the scraped comments.
def emotion():
    """Score each douban comment with SnowNLP (0 = negative, 1 = positive)
    and display a histogram of the score distribution.

    BUG FIX: the original ``except: break`` aborted the entire analysis at
    the first comment SnowNLP could not handle; bad comments are now
    skipped so the rest are still scored.
    """
    sentimentslist = []
    for text in comment_list:
        try:
            sentimentslist.append(SnowNLP(text).sentiments)
        except Exception:
            continue  # skip an unparseable comment instead of stopping
    plt.hist(sentimentslist, bins=np.arange(0, 1, 0.01), facecolor='g')
    plt.xlabel('Value of sentiments')
    plt.ylabel('Quantity')
    plt.title('Sentiments probability of douban')
    plt.show()


##### Split the comments into positive and negative sets to study what
##### drives each kind of review; also dump everything to text files.
def get_pos_neg_all(comment_list):
    """Classify each comment with SnowNLP (sentiment > 0.5 = positive).

    Writes pos.txt / neg.txt / comment.txt as a side effect and returns
    the concatenated (pos_text, neg_text, comment_text) strings.

    FIX: the bare ``except`` is narrowed to ``except Exception`` so that
    KeyboardInterrupt/SystemExit are no longer swallowed; comments that
    fail scoring are still skipped (the deliberate best-effort behaviour
    is kept).
    """
    pos_text = ''
    neg_text = ''
    comment_text = ''
    with open('pos.txt', 'w', encoding='utf-8') as pos, \
         open('neg.txt', 'w', encoding='utf-8') as neg, \
         open('comment.txt', 'w', encoding='utf-8') as comment:
        for raw in comment_list:
            comment.write(raw)
            s = raw.strip()
            comment_text += s
            try:
                if SnowNLP(s).sentiments > 0.5:
                    pos_text += s
                    pos.write(s)
                else:
                    neg_text += s
                    neg.write(s)
            except Exception:
                continue
    return pos_text, neg_text, comment_text



pos_text,neg_text,comment_text=get_pos_neg_all(comment_list)
# Build the word-cloud images with wordcloud and save each one as a .jpg
# (each *_text variable is a single string of concatenated comments).
def wc(text, name,
       font_path=r'C:\Windows\Fonts\STXINGKA.TTF',
       mask_path=r'C:\Users\ChengYiMing\Desktop\kuang.png'):
    """Segment *text* with jieba and save a word cloud as ``<name>.jpg``.

    Parameters:
        text: raw Chinese text to visualise.
        name: output file stem; the image is written to ``name + '.jpg'``.
        font_path / mask_path: generalised from the original hard-coded
            Windows paths; the defaults preserve the old behaviour.
    """
    alien_mask = np.array(PIL.Image.open(mask_path))
    cloud = WordCloud(font_path=font_path, background_color='white', margin=5,
                      mask=alien_mask, width=1800, height=800, max_words=2000,
                      max_font_size=60, random_state=42)
    # keep only multi-character tokens — single characters are mostly noise
    words = [w for w in jieba.cut(text) if len(w) > 1]
    cloud.generate(' '.join(words))
    cloud.to_file(name + '.jpg')


wc(pos_text, "pos")
# BUG FIX: the original called wc(pos_text, "pos") twice — neg_text was
# computed but never visualised; render the negative cloud instead.
wc(neg_text, "neg")



参考文献:
python爬取豆瓣《狂暴巨兽》评分影评,matplotlib和wordcloud制作评分图和词云图

以下是爬取《镇魂》官微评论数据的过程,最后的分析过程没有写出来,和上面的豆瓣评论分析过程如出一辙,可以参考上面的。

爬取《镇魂》微博数据

本文旨在通过爬虫方式爬取微博《镇魂》官微的评论数据。

  • 编程语言:python2.7
  • 依赖库:requests
  • 系统环境:windows

*本篇文章主要参考自Denise_hzf的博客https://www.cnblogs.com/Denise-hzf/p/7927852.html,感谢。


要爬取的数据来源

最近名为《镇魂》的剧十分火热,于是为了分析其热门的原因,有了这篇文章,主要爬取微博评论的数据。
比较推荐又简单的方法是通过手机网页的get请求来访问,原因就是简单粗暴(不过存在的问题是貌似最多只能访问101页)。
微博提供的接口为

https://m.weibo.cn/api/comments/show?id=微博文章的id&page=要获取评论的页数

从get请求获得的json数据中提取评论信息

由于在原文章https://www.cnblogs.com/Denise-hzf/p/7927852.html中执行

jsondata.get('data')

不能获取到数据(可能是因为微博json的格式不一样了)。通过分析以后发现改为

jsondata.get('data').get('data')

即可获取评论数据


使用正则表达式去除多余的回复信息和@信息

由于微博的评论,不仅仅有评论,还有回复,所以爬取以后难免有很多“回复@XXXX:DDDD”类似的信息和@信息,可以用正则表达式去掉

import re
dr = re.compile(r'<[^>]+>',re.S)
drr = re.compile(r'@.*:(.*?)',re.S)
drrr = re.compile(r'@.+?\s',re.S)

comment = dr.sub('',comment)#删除多余的信息
comment = drr.sub('',comment)#删除多余的信息
comment = drrr.sub('',comment)#删除多余的信息

完整代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#参考自Denise_hzf的博客https://www.cnblogs.com/Denise-hzf/p/7927852.html
import sys 
# Python 2-only setup: re-expose sys.setdefaultencoding (removed by site.py
# at startup) so implicit str<->unicode coercion defaults to UTF-8.  `reload`
# is a builtin only in Python 2; this script will not run unmodified on
# Python 3.
reload(sys) 
sys.setdefaultencoding('utf-8') 
import requests
import re
import time
import random


def readfromtxt(filename):
    """Return the entire contents of *filename* as one string.

    FIX: use a context manager so the handle is closed even if read()
    raises; this also stops shadowing the Python 2 builtin ``file``.
    """
    with open(filename, "r") as fh:
        return fh.read()

def writeintxt(dict, filename):
    """Append each weibo's comment list to *filename*.

    Every element of every value list is written on its own line, with a
    blank line separating one weibo's comments from the next.  (The first
    parameter shadows the builtin ``dict``; the name is kept so existing
    keyword callers keep working.)

    FIX: use a context manager so the handle is closed even on error.
    """
    with open(filename, 'a+') as output:
        for _, records in dict.items():
            block = ''.join(str(r) + '\n' for r in records)
            output.write(block + '\n')


# Pool of User-Agent strings for the weibo crawler; one is picked at random
# per run so the requests look like an ordinary browser.
ua_agents =[
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
  ]
# NOTE(review): hard-coded logged-in Sina session cookies (they embed real
# tokens and phone numbers).  These are credentials committed to source —
# they will have expired long ago; replace them with your own and never
# commit live cookies to a repository.
ck_cookies=["SINAGLOBAL=6061592354656.324.1489207743838; un=18240343109; TC-V5-G0=52dad2141fc02c292fc30606953e43ef; wb_cusLike_2140170130=N; _s_tentry=login.sina.com.cn; Apache=5393750164131.485.1511882292296; ULV=1511882292314:55:14:7:5393750164131.485.1511882292296:1511789163477; TC-Page-G0=1e758cd0025b6b0d876f76c087f85f2c; TC-Ugrow-G0=e66b2e50a7e7f417f6cc12eec600f517; login_sid_t=7cbd20d7f5c121ef83f50e3b28a77ed7; cross_origin_proto=SSL; WBStorage=82ca67f06fa80da0|undefined; UOR=,,login.sina.com.cn; WBtopGlobal_register_version=573631b425a602e8; crossidccode=CODE-tc-1EjHEO-2SNIe8-y00Hd0Yq79mGw3l1975ae; SSOLoginState=1511882345; SCF=AvFiX3-W7ubLmZwXrMhoZgCv_3ZXikK7fhjlPKRLjog0OIIQzSqq7xsdv-_GhEe8XWdkHikzsFJyqtvqej6OkaM.; SUB=_2A253GQ45DeThGeRP71IQ9y7NyDyIHXVUb3jxrDV8PUNbmtAKLWrSkW9NTjfYoWTfrO0PkXSICRzowbfjExbQidve; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFaVAdSwLmvOo1VRiSlRa3q5JpX5KzhUgL.FozpSh5pS05pe052dJLoIfMLxKBLBonL122LxKnLB.qL1-z_i--fiKyFi-2Xi--fi-2fiKyFTCH8SFHF1C-4eFH81FHWSE-RebH8SE-4BC-RSFH8SFHFBbHWeEH8SEHWeF-RegUDMJ7t; SUHB=04W-u1HCo6armH; ALF=1543418344; wvr=6",
"SINAGLOBAL=6061592354656.324.1489207743838; TC-V5-G0=52dad2141fc02c292fc30606953e43ef; wb_cusLike_2140170130=N; _s_tentry=login.sina.com.cn; Apache=5393750164131.485.1511882292296; ULV=1511882292314:55:14:7:5393750164131.485.1511882292296:1511789163477; TC-Page-G0=1e758cd0025b6b0d876f76c087f85f2c; TC-Ugrow-G0=e66b2e50a7e7f417f6cc12eec600f517; login_sid_t=7cbd20d7f5c121ef83f50e3b28a77ed7; WBStorage=82ca67f06fa80da0|undefined; WBtopGlobal_register_version=573631b425a602e8; crossidccode=CODE-tc-1EjHEO-2SNIe8-y00Hd0Yq79mGw3l1975ae; cross_origin_proto=SSL; UOR=,,login.sina.com.cn; SSOLoginState=1511882443; SCF=AvFiX3-W7ubLmZwXrMhoZgCv_3ZXikK7fhjlPKRLjog0-14gBQox9IhSK8vZVaZYWsLxUaOWNkudAR9iT6NFJkg.; SUB=_2A253GQ6bDeRhGeNH6FsZ8CjLzj2IHXVUb2dTrDV8PUNbmtAKLWTjkW9NSqHIBUvGapKd6-MQhJTejk3w_ivUUNXZ; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5gYdHWIHRmedh9Nyrij6XN5JpX5K2hUgL.Fo-4e0.RehqNSK22dJLoI0.LxK-L122LB.qLxK-LB.BLBKqLxKMLB.2LBKzLxKnL12-L122LxK.LBK2L12qLxKqLBKqL1KHiqc-t; SUHB=0auwlDzUYulNGs; ALF=1543418442; un=13728408992; wvr=6",
"SINAGLOBAL=6061592354656.324.1489207743838; TC-V5-G0=52dad2141fc02c292fc30606953e43ef; wb_cusLike_2140170130=N; _s_tentry=login.sina.com.cn; Apache=5393750164131.485.1511882292296; ULV=1511882292314:55:14:7:5393750164131.485.1511882292296:1511789163477; TC-Page-G0=1e758cd0025b6b0d876f76c087f85f2c; TC-Ugrow-G0=e66b2e50a7e7f417f6cc12eec600f517; login_sid_t=7cbd20d7f5c121ef83f50e3b28a77ed7; WBStorage=82ca67f06fa80da0|undefined; WBtopGlobal_register_version=573631b425a602e8; crossidccode=CODE-tc-1EjHEO-2SNIe8-y00Hd0Yq79mGw3l1975ae; wb_cusLike_5939806751=N; cross_origin_proto=SSL; UOR=,,login.sina.com.cn; SSOLoginState=1511882512; SCF=AvFiX3-W7ubLmZwXrMhoZgCv_3ZXikK7fhjlPKRLjog089iFKjxeT1Oc6cbJkkqgWrnQAuMVukRrJy3898cKIb8.; SUB=_2A253GQ9ADeRhGeNH6FsZ8ynJzz6IHXVUb2eIrDV8PUNbmtAKLVWhkW9NSqG4DzNeLkyPCmJIKq6bXfKXpSRCPLqO; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W50J-rDh2D6-QEqNOZ2NddF5JpX5K2hUgL.Fo-4e0.Re0MfShz2dJLoIEeLxK-LB--L1KeLxK-L1hqLBoMLxKnL1K5LBo8IC281xEfIg5tt; SUHB=0gHiPrbPWNJvao; ALF=1543418511; un=15614187608; wvr=6",
"SINAGLOBAL=6061592354656.324.1489207743838; TC-V5-G0=52dad2141fc02c292fc30606953e43ef; wb_cusLike_2140170130=N; _s_tentry=login.sina.com.cn; Apache=5393750164131.485.1511882292296; ULV=1511882292314:55:14:7:5393750164131.485.1511882292296:1511789163477; TC-Page-G0=1e758cd0025b6b0d876f76c087f85f2c; TC-Ugrow-G0=e66b2e50a7e7f417f6cc12eec600f517; login_sid_t=7cbd20d7f5c121ef83f50e3b28a77ed7; WBStorage=82ca67f06fa80da0|undefined; WBtopGlobal_register_version=573631b425a602e8; crossidccode=CODE-tc-1EjHEO-2SNIe8-y00Hd0Yq79mGw3l1975ae; wb_cusLike_5939806751=N; wb_cusLike_5939837542=N; cross_origin_proto=SSL; UOR=,,login.sina.com.cn; SSOLoginState=1511882567; SCF=AvFiX3-W7ubLmZwXrMhoZgCv_3ZXikK7fhjlPKRLjog02c5hBW41ia6vpj1cAqbFzE2KCcsXvDxToS_KOeUnwRc.; SUB=_2A253GQ8XDeRhGeNH6FsZ9CjKyjuIHXVUb2ffrDV8PUNbmtAKLU7wkW9NSqGOexL53l1CujvuLpAFNeOEsl05T_5E; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWuISqBnuGqpyxGiWdJ4bOv5JpX5K2hUgL.Fo-4e0.RShqceKM2dJLoI0YLxK-L1K5L1K2LxK.L1KnLBoeLxK-L1K5L1K2LxKqL1-2L1KqLxK.L1KMLBo-LxKMLB.zLB.qLxK-L1hML1-Bt; SUHB=0LcSwyK5XYMzbr; ALF=1543418566; un=13242833134; wvr=6"]

# Pick one UA string and one cookie at random for this run, then build the
# request headers the m.weibo.cn comment API expects.
user_agent = random.choice(ua_agents)
cookies = random.choice(ck_cookies)
headers = {
    'User-agent' : user_agent,
    'Host' : 'm.weibo.cn',
    'Accept' : 'application/json, text/plain, */*',  # the endpoint returns JSON
    'Accept-Language' : 'zh-CN,zh;q=0.8',
    'Accept-Encoding' : 'gzip, deflate, sdch, br',
    'Referer' : 'https://m.weibo.cn/u/6126633570',  # the show's official weibo account
    'Cookie' : cookies,
    'Connection' : 'keep-alive',
}


# Regexes used to clean comment text:
dr = re.compile(r'<[^>]+>',re.S)    # strip embedded HTML tags
drr = re.compile(r'@.*:(.*?)',re.S)  # drop "reply @user: ..." prefixes
drrr = re.compile(r'@.+?\s',re.S)    # drop @mentions followed by whitespace

# Comment-API endpoint; weibo post id and page number get appended below.
base_url = 'https://m.weibo.cn/api/comments/show?id='
# One weibo post id per line in weibo_id1.txt, read once at import time.
weibo_id_list = readfromtxt('weibo_id1.txt').split('\n')
# Crawl every listed weibo post: page through its comment API until the
# server stops returning ok==1, clean each comment, and collect the results
# per post id.  result_dict maps weibo_id -> list of cleaned comment strings.
result_dict = {}
for weibo_id in weibo_id_list:
    try:
        record_list = []
        i=1        # page counter for this weibo
        SIGN = 1   # loop flag: 1 = keep paging, 0 = done
        while(SIGN):
            url = base_url + str(weibo_id) + '&page=' + str(i)
            print(url)
            resp = requests.get(url,headers=headers,timeout=100)
            jsondata = resp.json()
            if jsondata.get('ok') == 1:
                SIGN = 1
                i = i + 1
                # the comment list sits under data.data in the current API
                # response format (a single .get('data') no longer works)
                data = jsondata.get('data').get('data')
                for d in data:
                    # strip the '$$' field separator used when saving,
                    # the literal reply marker, HTML tags and @mentions
                    comment = d.get('text').replace('$$','')
                    comment=comment.replace('回复','')
                    comment = dr.sub('',comment)
                    comment = drr.sub('',comment)
                    comment = drrr.sub('',comment)
                    #comment=comment[(comment.index(':')+1):]
                    #print(comment)
                    #like_count = d.get('like_counts')
                    #user_id = d.get("user").get('id')
                    #user_name = d.get("user").get('screen_name').replace('$$','')
                    one_record = comment.__str__()
                    record_list.append(one_record)
            else:
                SIGN = 0

        result_dict[weibo_id]=record_list
        time.sleep(random.randint(2,3))  # polite delay between posts to avoid bans
    except:
        # NOTE(review): bare except silently drops this whole weibo on any
        # error (network, JSON shape, Ctrl-C) — consider narrowing.
        print("error!")

# Dump everything collected to a text file, one comment per line.
writeintxt(result_dict,'comments_4.txt')

关注微信公众号 吉客街404
春城最炫酷的黑科技,都在这里。

猜你喜欢

转载自blog.csdn.net/the_little_fairy___/article/details/81357513