Crawling book information and review data from Douban Reading

I have been working on my graduation project recently and need to collect users' rating data for a collaborative filtering algorithm, as well as comment data for sentiment analysis.

Pitfalls

  1. A Douban book may have no ratings at all, or a user may have commented without rating. Douban's book ID scheme is also unhelpful: the IDs adjacent to a popular book usually belong to unpopular books with no ratings and no comments, so the script frequently prints "failed".
  2. You can't crawl too fast: only about 40-50 pages per minute, and a single request header can only be used for roughly a thousand requests before Douban starts returning status code 403 (see the sketch after this list for one way to pace requests).
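
To stay under that limit, here is a minimal sketch of a paced request helper (my own addition; the function name, delay values, and 403 handling are assumptions, not part of the original script):

import random
import time
import requests

def polite_get(url, headers, min_delay=3, max_delay=6):
    """Fetch a URL with a random pause; back off once if Douban answers 403."""
    time.sleep(random.uniform(min_delay, max_delay))   # pause a few seconds between requests
    response = requests.get(url=url, headers=headers)
    if response.status_code == 403:
        # assumption: wait several minutes before giving this IP one more try
        time.sleep(300)
        response = requests.get(url=url, headers=headers)
    return response

The actual script below simply calls time.sleep(random.uniform(3, 6)) between plain requests.get calls.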

fake_useragent

This crawler uses fake_useragent to forge the request header, because I heard Douban's anti-crawling mechanism is fairly strict. fake_useragent is simple to use, as shown below; ua.random generates a random User-Agent string.

from fake_useragent import UserAgent
import requests

ua = UserAgent()
url = "https://www.baidu.com"                      # URL to request
headers = {"User-Agent": ua.random}                # request headers with a random User-Agent
response = requests.get(url=url, headers=headers)  # send the request
print(headers)
print(response.status_code)                        # response status code
text = response.headers
for line in text.items():                          # print the response headers
    print(line)
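
Note that ua.random returns a different User-Agent string each time it is accessed, so the main script below, which builds its header dict only once, keeps a single agent for the whole run. If you want a fresh agent per request, a minimal variation (my own sketch, not part of the original code) is:

from fake_useragent import UserAgent
import requests

ua = UserAgent()

def fetch(url):
    headers = {"User-Agent": ua.random}   # pick a new random User-Agent for every call
    return requests.get(url=url, headers=headers)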

Crawling the book pages and comment pages

The first thing to observe is these links:
https://book.douban.com/subject/26953606/ — the book information page
https://book.douban.com/subject/26953606/comments/ — the first page of comments
https://book.douban.com/subject/26953606/comments/hot?p=2 — the second page of comments
You can see that the prefix is always https://book.douban.com/subject/ plus a book ID. The comments page appends /comments/, the second page of comments appends hot?p=2, so the third page is hot?p=3, and so on.
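
As a quick sketch of that URL pattern (the helper names are my own, not from the original script):

def book_url(book_id):
    return "https://book.douban.com/subject/" + str(book_id) + "/"

def comments_url(book_id, page=1):
    url = book_url(book_id) + "comments/"
    if page > 1:
        url += "hot?p=" + str(page)   # page 2 is hot?p=2, page 3 is hot?p=3, and so on
    return url

print(comments_url(26953606, 2))   # https://book.douban.com/subject/26953606/comments/hot?p=2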
There are also some file-writing operations. This was my second time collecting data, so I revised the script: popular books are distributed too sparsely among the IDs, so the program first checks whether a book's total number of comments exceeds 1,000. If it does, the book is crawled; otherwise the script moves on to the next ID. One bug to watch out for: numeric values must be converted to str before being written to a file.
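
As a tiny illustration of that last point (the values here are made up):

book_id = 26953606
score = 8.9
with open("BookInfo.txt", "a", encoding="utf-8") as file:
    # file.write(book_id) would raise TypeError: write() argument must be str, not int
    file.write(str(book_id) + "," + str(score) + "\n")   # convert numbers to str first

The full script follows.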

#coding=utf-8
# Download Douban book scores and comments; four tables (files) are produced. author: wuyou
# Table 1: book ID, book title, average score
# Table 2: user ID, user name
# Table 3: book ID, hot comments
# Table 4: user ID, book ID, score, year of rating
import requests
import time
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
header = {
    'User-Agent': ua.random
}
def get_score(book_id, text):                   # extract (book ID, book title, average score)
    soup = BeautifulSoup(text, 'lxml')
    try:
        book_name = soup.select("#wrapper > h1 > span")   # list containing the book title
        name = book_name[0].string
        book_score = soup.select("#interest_sectl > div > div.rating_self.clearfix > strong")  # list containing the score
        score = book_score[0].string
        #print("book name is " + str(name) + " and score is " + str(score))  # print title and score
        line = str(book_id) + "," + name + "," + str(score) + "\n"   # assemble the book info line
        with open("BookInfo.txt", "a", encoding="utf-8") as file:    # Table 1: book ID, book title, average score
            file.write(line)                                         # the with block closes the file automatically
    except:
        print("book " + str(book_id) + " get score failed!")



def write_txt(soup, book_id):    # parameters: parsed comment page (soup) and the book ID
    try:     # guard against errors: some users comment without rating, so user_info then contains only one span
        comment_list = soup.find_all("span", "short")     # region containing the comment text
        comments = ""
        flag = 0
        for line in comment_list:       # replace all commas with periods
            bc = line.string
            bc = bc.replace(",", "。")     # replace English commas with periods
            bc = bc.replace(",", "。")    # replace Chinese commas with periods
            bc = bc.replace(";", "。")    # replace semicolons with periods
            if flag == 0:     # first comment: no separator needed
                flag += 1
            else:
                comments += ";"      # separate comments with semicolons
            comments += bc
        with open("BookComments.txt", "a", encoding="utf-8") as file:    # Table 3: book ID, hot comments
            BookComments = str(book_id) + "," + comments + "\n"
            file.write(BookComments)
        user_list = soup.find_all("span", "comment-info")   # region containing user name and rating
        user_info_txt = open("UserInfo.txt", "a", encoding="utf-8")
        user_score_txt = open("UserScore.txt", "a", encoding="utf-8")
        for user_info in user_list:
            user_name = user_info.find("a").string         # user name inside the <a></a> tag
            user_url = user_info.find("a").attrs["href"]   # extract the hyperlink
            user_id = user_url.split("/")[-2]              # extract the user ID from the URL
            score = user_info.find_all("span")[0].attrs["title"]   # the rating is in the title attribute
            time_info = user_info.find_all("span")[1].string   # the rating timestamp
            time_info = time_info.split("-")
            score_year = time_info[0]       # keep only the year of the rating
            user_info_txt.write(user_id + "," + user_name + "\n")            # Table 2: user ID, user name
            user_score_txt.write(user_id + "," + str(book_id) + "," + score + "," + str(score_year) + "\n")  # Table 4: user ID, book ID, score, year
            #print("book_id is " + book_id + " user name is " + user_name + ", id is " + user_id + ", score is " + score + " " + score_year)   # print the collected fields
        user_info_txt.close()
        user_score_txt.close()
    except:
        print("cannot find!")


def get_comments(soup, comment_url, book_id, page):         # collect (book ID, comments), (user ID, book ID, score), (user ID, user name)
    while page <= 2:           # number of comment pages to crawl
        if int(page) == 1:     # first page: the caller has already downloaded and parsed it
            write_txt(soup, book_id)          # parse the page that was passed in
            page += 1           # next page
        else:
            page_url = comment_url + "hot?p=" + str(page)   # build the URL for this page (do not mutate comment_url)
            time.sleep(random.uniform(3, 6))
            html = requests.get(url=page_url, headers=header)
            if html.status_code == 200:
                comment_text = html.text
                soup = BeautifulSoup(comment_text, "lxml")
                write_txt(soup, book_id)          # parse the downloaded page
                page += 1           # next page
            else:
                print(str(book_id) + " comment page " + str(page) + " failed, status " + str(html.status_code))
                break              # avoid looping forever on a non-200 response



#https://book.douban.com/subject/1007305/
if __name__ == '__main__':
    url = "https://book.douban.com/subject/"
    startID = 1007304  # starting book ID
    st = 0   # counter of books actually crawled
    lens = 20000   # target number of popular books to crawl
    while st < lens:           # st and lens ensure only popular books count toward the total
        if startID - 1007304 >= 1000:
            print("stop! " + str(startID))
            break
        try:
            startID += 1           # move on to the next book ID
            score_url = url + str(startID) + "/"       # URL of the book information page
            html = requests.get(url=score_url, headers=header)
            html.encoding = "utf-8"
            time.sleep(random.uniform(3, 6))  # pause for a random 3-6 seconds
            if html.status_code == 200:
                comment_url = score_url + "comments/"  # URL of the comments page
                comment_html = requests.get(url=comment_url, headers=header).text
                time.sleep(random.uniform(3, 6))  # pause for a random 3-6 seconds
                soup = BeautifulSoup(comment_html, "lxml")
                total_comments = soup.select("#total-comments")[0].string
                comment_num = total_comments.replace("全部共 ", "")   # strip the "全部共 ... 条" wrapper to get the count
                comment_num = comment_num.replace(" 条", "")
                if int(comment_num) >= 1000:
                    st += 1
                    print(str(startID) + " is success! " + score_url + " comment_num is " + comment_num)
                    text = html.text
                    get_score(startID, text)
                    get_comments(soup, comment_url, startID, 1)       # collect the comment information
                else:
                    print(score_url + " is failed!" + " comment_num is " + comment_num)
            else:
                print(str(startID) + " is failed!")
        except:
            print(str(startID) + " is failed! ", end='')
            print(html.status_code)

The output is as follows (this is from a run with the extra print statements still enabled); a long stretch of data in the middle is omitted. [Screenshots of the console output are omitted here.] The crawl also hits some unpopular books whose comment counts are far too low, and those are simply skipped.


Origin blog.csdn.net/qq_39905917/article/details/104784103