xlwt在爬虫中的实战(爬取豆瓣图书)

爬虫相关知识请阅读我的其他文章

import re
import xlwt
import requests
from bs4 import BeautifulSoup

def getHtml(url):
    """Fetch a page and return its HTML body as text.

    Args:
        url: The page URL to download.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: if the server returns a 4xx/5xx status.
        requests.Timeout: if no response arrives within the timeout.
    """
    # Send a browser-like User-Agent so Douban does not reject the crawler.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'
    }
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    page = requests.get(url, headers=headers, timeout=10)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    page.raise_for_status()
    return page.text

if __name__ == '__main__':
    # Workbook that will hold the scraped Douban Top-250 book list.
    Workbook = xlwt.Workbook()
    # cell_overwrite_ok allows re-writing a cell without xlwt raising.
    sheet = Workbook.add_sheet('豆瓣图书Top250', cell_overwrite_ok=True)
    # Header row; column titles are kept in Chinese for the target audience.
    column_titles = ['书名', '作者', '译者', '出版单位', '出版时间',
                     '定价', '豆瓣评分', '评价人数', '一句话']
    for col, title in enumerate(column_titles):
        sheet.write(0, col, title)
    # Independent row cursors per field group: the four result lists scraped
    # below are not guaranteed to stay aligned, so each keeps its own counter.
    i = 1  # title rows
    j = 1  # author/publisher rows
    k = 1  # rating rows
    m = 1  # quote rows
    # Douban paginates 25 books per page: start = 0, 25, ..., 225.
    for page in range(0, 250, 25):
        url = 'https://book.douban.com/top250?start={0}'.format(page)
        html = getHtml(url)
        Soup = BeautifulSoup(html, 'html.parser')
        # Book titles live in <div class="pl2"><a>; strip layout whitespace.
        names = Soup.find_all('div', class_='pl2')
        for name in names:
            book = name.find('a').text.strip().replace(' ', '')
            sheet.write(i, 0, book)
            i += 1
        # Publication info is a slash-separated string:
        # "author / [translator /] publisher / date / price".
        Infos = Soup.find_all('p', class_='pl')
        for Info in Infos:
            r = 1
            authorinfo = Info.text
            authors = authorinfo.split('/')
            if len(authors) < 4:
                # Short/malformed entry: write the first three fields as-is.
                sheet.write(j, 1, authors[0])
                sheet.write(j, 2, authors[1])
                sheet.write(j, 3, authors[2])
                j += 1
                continue
            sheet.write(j, 1, authors[0])
            # Exactly 4 slashes (5 fields) means a translator column exists.
            if authorinfo.count('/') == 4:
                sheet.write(j, 2, authors[r])
                r += 1
            sheet.write(j, 3, authors[r])
            sheet.write(j, 4, authors[r + 1])
            sheet.write(j, 5, authors[r + 2])
            j += 1
        # Rating and vote count: spans inside <div class="star clearfix">.
        rating_nums = Soup.find_all('div', class_='star clearfix')
        for rating in rating_nums:
            star = rating.find_all('span')
            sheet.write(k, 6, star[1].text)
            # Extract the digits from e.g. "(123456人评价)".
            vote = re.findall(r'\d+', star[2].text)
            # BUG FIX: re.findall returns a list and xlwt cannot serialize a
            # list; write the matched number string (empty if no digits found).
            sheet.write(k, 7, vote[0] if vote else '')
            k += 1
        # One-sentence quote per book.
        quotes = Soup.find_all('p', class_='quote')
        for quote in quotes:
            sheet.write(m, 8, quote.text)
            m += 1
    Workbook.save('豆瓣图书Top250.xls')

（此处原有一张运行结果截图，导出时图片丢失，仅余占位文字）

发布了60 篇原创文章 · 获赞 6 · 访问量 7775

猜你喜欢

转载自blog.csdn.net/qq_44205272/article/details/103213737
今日推荐