爬取豆瓣影评,告诉你《流浪地球》在讲什么!

今天忽然想爬取《流浪地球》的豆瓣影评,并分析这部电影在讲些什么内容。在还没开始写的时候,我认为这个爬虫应该很简单,但是写完之后才发现并不是那么容易:豆瓣设置了很多反爬虫机制,运行程序的时候总是在不经意间报错,最后豆瓣这个网站把我的豆瓣账号给封了!最终的结果是程序运行一个半小时左右,爬取了1160条评论。开始我们的目标吧!

分析网页,寻找规律

 首先找到评论区的页面,然后我们再分析页面的规律。我们的目标是获得评论、用户名、用户评分和用户所在地区。

 根据上图请求头中的信息,我们就可以构造出相应请求函数来获取相应网页的源代码。

根据上面的那一页,我们可以获得用户名,用户评论的URL,用户详细信息的URL,用户的评分,然后根据我们获得的URL继续获取相应的数据,最终组成一个字典存入数据库。具体的获取数据的代码和成果展示如下图:

 数据的获取及成果

import requests
from lxml import etree
import time
import pymysql
import jieba

# Generic page-fetching helper
def getHtml(URL):
    """Fetch *URL* and return its body decoded as UTF-8 text.

    On any request failure the literal string "爬取失败" is returned
    instead (callers treat the return value as a plain str either way).
    The hard-coded Referer/User-Agent/Cookie headers mimic a logged-in
    browser session to get past Douban's anti-scraping checks.
    """
    try:
        headers = {

            "Referer":"https://movie.douban.com/subject/26266893/?from=showing",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400",
            'Cookie':'bid=dBLxsRMMRbs; __utmc=30149280; __utmc=223695111; ll="118184"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.19232; _vwo_uuid_v2=DD9A4D81803D3AC581031A21E0B6F1628|eb32f81d72deea36fe07c889241a8846; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.219998368.1550975340.1550979553.1550999110.3; __utmz=30149280.1550999110.3.2.utmcsr=movie.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/subject/26266893/reviews; dbcl2="192325248:8Qm6nZGk5Co"; ck=n3HL; __utma=223695111.1281168398.1550975340.1550979553.1550999422.3; __utmb=223695111.0.10.1550999422; __utmz=223695111.1550999422.3.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utmt=1; gr_user_id=95b206b3-7bf4-40ca-94d8-14f39be020e1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=1b9e9583-a672-486f-8b8d-7c6a6ba6294b; gr_cs1_1b9e9583-a672-486f-8b8d-7c6a6ba6294b=user_id%3A1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_1b9e9583-a672-486f-8b8d-7c6a6ba6294b=true; __utmt_douban=1; __utmb=30149280.7.10.1550999110; _pk_id.100001.4cf6=138e377200b95457.1550975332.3.1551000296.1550979600.'
        }
        response = requests.get(URL, headers=headers)
        response.raise_for_status()
        # Decode the raw bytes explicitly; Douban serves UTF-8.
        # (The old `response.encoding = response.apparent_encoding` line was
        # dead code: .content.decode() never consults response.encoding.)
        return response.content.decode("utf-8")
    except (requests.RequestException, UnicodeDecodeError):
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit and masked programming errors.
        return "爬取失败"

# Parse one review-list page, then visit every review and reviewer profile
def get_all_info(page):
    """Extract all review records from one review-list page.

    For each review on *page* this fetches the review-detail page (full
    comment text) and the reviewer's profile page (name, join time,
    location), sleeping 1s between requests to be polite.

    Returns a list of dicts whose keys match the DB columns used by
    push_data, or False when the page looks malformed (the caller then
    skips to the next page).
    """
    # Accumulates one dict per review for the later DB insert.
    all_info = []

    html = etree.HTML(page)

    # Review-detail URLs
    com_urls = html.xpath("//a[@class = 'reply ']/@href")
    # Reviewer profile URLs
    user_urls = html.xpath("//header[@class = 'main-hd']//a[1]/@href")
    # Star-rating titles shown next to each review header
    user_scores = html.xpath("//header[@class = 'main-hd']//span[1]/@title")
    # Review publish timestamps
    release_times = html.xpath("//header[@class = 'main-hd']//span[2]/text()")

    # A mismatch means the page layout differs (e.g. an unrated review),
    # so index-based pairing below would be wrong — skip the whole page.
    if len(user_urls) != len(user_scores):
        print("这个界面出现了问题,直接爬取下一个界面")
        return False

    for i, user_url in enumerate(user_urls):
        print("正在爬取这一页中的第{}个用户信息".format(i + 1))

        # Fetch the full comment text from the review-detail page.
        # Use a dedicated variable instead of re-binding `html`, which
        # previously shadowed the list-page tree inside the loop.
        res = getHtml(com_urls[i])
        time.sleep(1)
        comment_html = etree.HTML(res)
        comments = comment_html.xpath("//div[@id = 'link-report']//p/text()")
        new_comment = "".join(comments)

        # Fetch the reviewer's profile page.
        res = getHtml(user_url)
        time.sleep(1)
        profile_html = etree.HTML(res)
        # Reviewer display name
        user_name = profile_html.xpath("//div[@class = 'info']//h1/text()")[0].strip()
        # Account-creation date (second text node of the .pl div)
        user_creat_time = profile_html.xpath("//div[@class = 'user-info']//div[@class = 'pl']/text()")[1].strip()

        # Location is optional on profiles; only an empty XPath result
        # (IndexError) means it is missing — was a bare `except:` before.
        try:
            user_addr = profile_html.xpath("//div[@class = 'user-info']//a/text()")[0].strip()
        except IndexError:
            user_addr = "用户没有填写"

        all_info.append({
            "user_name": user_name,
            "user_creat_time": user_creat_time,
            "user_addr": user_addr,
            "user_score": user_scores[i],
            "new_comment": new_comment,
            "release_time": release_times[i],
        })
    return all_info



def push_data(data_dict):
    """Insert one review record into the `liulangdiqiu` table.

    *data_dict* maps column names to values; the column list is built
    from its keys and the values are bound as %s parameters so pymysql
    escapes them (no SQL injection through comment text).
    """
    conn = pymysql.connect(host="localhost", user="root", password="yanzhiguo140710", port=3306, db="doubanyingping")
    try:
        keys = ",".join(data_dict.keys())
        placeholders = ",".join(['%s'] * len(data_dict))
        sql = "insert into liulangdiqiu ({keys}) values ({value})".format(keys=keys, value=placeholders)
        # pymysql cursors are context managers: the cursor is closed even
        # if execute() raises.
        with conn.cursor() as cur:
            cur.execute(sql, tuple(data_dict.values()))
        conn.commit()
    finally:
        # Previously the connection leaked whenever execute/commit raised.
        conn.close()




# Placeholder for comment analytics (word cloud etc.)
def parse_data(p):
    """Analyze the collected comments (word cloud / text mining).

    Not implemented yet — intentionally does nothing and returns None.
    """


if __name__ == '__main__':
    # Walk the review-list pages 20 reviews at a time; the loop ends when
    # a request/parse failure raises (or the account gets blocked).
    try:
        for i in range(10000):
            print("---------------正在爬取第{}页的评论的URL---------------".format(i+1))
            URL = "https://movie.douban.com/subject/26266893/reviews?start="+str(i*20)
            page = getHtml(URL)
            time.sleep(1)
            single_data = get_all_info(page)
            # get_all_info returns False for a malformed page — skip it.
            if single_data is False:
                continue
            for item in single_data:
                push_data(item)
            print("---------------第{}页的信息爬取结束-----------------".format(i+1))
    # NOTE: the original `except:` was indented one space too far, which
    # made the whole file an IndentationError; it was also a bare except
    # that swallowed KeyboardInterrupt.
    except Exception:
        print("爬取所有评论的URL完毕")



对爬取的数据进行分析

pass(对这些数据有时间再处理)

猜你喜欢

转载自blog.csdn.net/yanzhiguo98/article/details/87898181