A Hands-On Python Crawler Project: Scraping Novel Metadata

We'll use Qisuu (奇书网) as the example site.

Site: https://www.qisuu.la

Step 1: Create a new folder (any name you like) to hold the .py file and the scraped data.

Step 2: Find the URL of the target site and your own browser's request header. (I'm using Qisuu as the example, with Firefox as the browser.)

        url = 'https://www.qisuu.la/soft/sort01/'

        Request header: 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'

(If you can't find your own request header, feel free to use this one.)
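
Before writing any spider code, it's worth confirming that this URL and header actually get a response. A minimal check using only the standard library (note the header key must be spelled 'User-Agent'; servers silently ignore misspellings such as 'User_Anger'):

    from urllib import request

    url = 'https://www.qisuu.la/soft/sort01/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'
    }

    req = request.Request(url=url, headers=headers)
    response = request.urlopen(req)
    print(response.getcode())      # 200 means the page came back
    print(len(response.read()))    # size of the returned HTML in bytes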

Step 3: Start scraping.

1. Create a class and define its initializer. The initializer holds the start URL and the pool of request headers; since the later steps need a fair amount of shared state, it also sets up counters and the Excel workbook. The code is as follows:

class NovelSpider(object):
    def __init__(self):
        self.url = 'https://www.qisuu.la/soft/sort01/'
        self.html = ''
        # Pool of User-Agent strings; get_html picks one at random per request
        # (the full listing at the end adds several more)
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        ]
        self.total = 0        # total number of index pages
        self.count = 0        # number of novels saved so far (also the Excel row)
        self.retry_count = 0  # retries used for the current request
        # Create the Excel workbook that stores the scraped data
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('novel_data')
        self.create_excel()
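
The ua_list defined above feeds random.choice inside get_html, so every request goes out with a randomly picked User-Agent. A two-line illustration of the mechanism:

    from random import choice

    ua_list = ['UA-one', 'UA-two', 'UA-three']  # stand-ins for real User-Agent strings
    print(choice(ua_list))                      # picks one at random, e.g. 'UA-two'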

2. Create the Excel sheet's header row, which will store the scraped data:

    def create_excel(self):
        # Header row: one column per scraped field
        self.sheet.write(0, 0, 'Title')
        self.sheet.write(0, 1, 'Clicks')
        self.sheet.write(0, 2, 'File size')
        self.sheet.write(0, 3, 'Category')
        self.sheet.write(0, 4, 'Update date')
        self.sheet.write(0, 5, 'Status')
        self.sheet.write(0, 6, 'Author')
        self.sheet.write(0, 7, 'Platform')
        self.sheet.write(0, 8, 'Description')
        self.sheet.write(0, 9, 'Download link')
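
If xlwt is new to you, here is a self-contained example of the three calls used above (assumes pip install xlwt; note that xlwt only produces the legacy .xls format, not .xlsx):

    import xlwt

    workbook = xlwt.Workbook(encoding='utf-8')  # create an in-memory workbook
    sheet = workbook.add_sheet('demo')          # add one worksheet
    sheet.write(0, 0, 'Title')                  # write(row, column, value)
    sheet.write(1, 0, 'An example novel')
    workbook.save('demo.xls')                   # write the file to disk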

3. Mimic a browser to send the request and receive the returned page source:

    def get_html(self, url):
        # 1. Build the request object with a randomly chosen User-Agent
        req = request.Request(url=url, headers={
            'User-Agent': choice(self.ua_list)
        })
        try:
            self.retry_count += 1
            # 2. Send the request
            response = request.urlopen(req)
            # 3. Read the response body
            self.html = response.read().decode('utf-8')
        except Exception:
            # More than 3 attempts: give up on this URL
            if self.retry_count > 3:
                print('Request failed for: {}'.format(url))
                self.retry_count = 0  # reset so the next URL gets fresh retries
                return
            # Otherwise retry
            print('Request failed, retrying...')
            self.get_html(url)
        else:
            # Reset the retry counter on success
            self.retry_count = 0
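
The version above retries a failed request by calling get_html again from inside itself. The same logic reads a bit more simply as a loop, and a loop cannot hit Python's recursion limit if a URL keeps failing. A sketch of an equivalent iterative version:

    def get_html(self, url):
        # Try at most 3 times, then give up on this URL
        for attempt in range(3):
            req = request.Request(url=url, headers={
                'User-Agent': choice(self.ua_list)
            })
            try:
                response = request.urlopen(req)
                self.html = response.read().decode('utf-8')
                return
            except Exception:
                print('Request failed, retrying...')
        print('Request failed for: {}'.format(url))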

4. Use a regular expression to parse the page source and collect the links to each novel's detail page:

    def parse_index(self):
        """Collect the detail-page links from an index page"""
        # Match each novel's link in the page source
        pattern = re.compile(r'<li.*?<div.*?class="s".*?<a href="(.*?)"', re.S)
        results = re.findall(pattern, self.html)
        # Loop over the links that were found
        for link in results:
            # Build the absolute URL and fetch the detail page
            url = 'https://www.qisuu.la' + link
            self.get_html(url)
            # Parse the detail page's data
            self.parse_detail()
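
To see what the pattern captures, you can run it against a hand-written fragment that imitates the site's list markup (the fragment below is an assumption about the page's structure, not copied from the live site):

    import re

    html = '''
    <li><img src="cover.jpg">
    <div class="s">1.2 MB</div>
    <a href="/du/0_1/">An Example Novel</a></li>
    '''

    pattern = re.compile(r'<li.*?<div.*?class="s".*?<a href="(.*?)"', re.S)
    print(re.findall(pattern, html))   # prints ['/du/0_1/']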

5. Parse the detail page and extract the data we want:

(Here we pull out the novel's metadata, one field per Excel column, plus the download link.)

    def parse_detail(self):
        """Parse the data on the novel's detail page"""
        # Match the fields we want with one regular expression
        pattern = re.compile(r"""<div class="detail_right.*?<h1>(.*?)</h1.*?<li.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?<div class="showInfo".*?<p.*?>(.*?)</p.*?get_down_url.*?,'(.*?)'""", re.S)
        results = re.findall(pattern, self.html)
        if not results:
            # Layout changed or the request failed: skip rather than crash
            return
        # Extract the fields
        title = results[0][0]
        click_num = results[0][1]
        file_size = results[0][2]
        novel_type = results[0][3]
        datetime = results[0][4]
        status = results[0][5]
        author = results[0][6]
        run_sys = results[0][7]
        # '&#12288;' is the HTML entity for a full-width ideographic space
        description = results[0][8].replace('&#12288;', ' ')
        download = results[0][9]
        # Save the data
        self.save_data(title, click_num, file_size, novel_type, datetime,
                       status, author, run_sys, description, download)
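
One detail worth a note: '&#12288;' pads the description text on the page, which is why the code strips it by hand. The standard library's html.unescape decodes this and any other entity generically; a small sketch:

    from html import unescape

    raw = '&#12288;&#12288;An example description, with entities such as &amp; inside.'
    print(unescape(raw))   # the entities decode to '\u3000' characters and '&'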



6. Save the data to the Excel sheet:

    def write_to_excel(self, idx, data):
        # Helper that writes one cell of the current row
        print(idx, data)
        self.sheet.write(self.count, idx, data)

    def save_data(self, *args):
        self.count += 1
        print('Saving novel #{}: {}'.format(self.count, args[0]))
        # 1. The straightforward way
        self.sheet.write(self.count, 0, args[0])
        self.sheet.write(self.count, 1, args[1])
        self.sheet.write(self.count, 2, args[2])
        self.sheet.write(self.count, 3, args[3])
        self.sheet.write(self.count, 4, args[4])
        self.sheet.write(self.count, 5, args[5])
        self.sheet.write(self.count, 6, args[6])
        self.sheet.write(self.count, 7, args[7])
        self.sheet.write(self.count, 8, args[8])
        self.sheet.write(self.count, 9, args[9])

        # 2. A tidier way: enumerate the argument tuple
        # for idx, data in enumerate(args):
        #     if idx == 8:
        #         data = data.replace('&#12288;', ' ')
        #
        #     self.write_to_excel(idx, data)

        # 3. The one-liner: map column indices onto the values
        # rs = map(lambda idx, data: self.sheet.write(self.count, idx, data), range(10), args)
        # for x in rs:
        #     pass
        self.workbook.save('novel_data.xls')
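
A note on the third variant: it only writes anything because of the trailing for loop. In Python 3, map is lazy, so the lambda does not run until the map object is consumed. A quick demonstration:

    # map() in Python 3 is lazy: the lambda runs only when the result is iterated
    rs = map(lambda x: print('writing cell', x), range(3))
    # nothing has printed yet
    for _ in rs:
        pass   # now the three print calls actually execute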

7. The run function. It uses two helpers that appear in the full listing below: get_total, which reads the total page count from the index page, and parse_type, which collects the address and name of every category:

    def run(self):
        # Get the total page count
        self.get_total()
        # Get the addresses of all categories
        types = self.parse_type()

        for t_info in types:
            print('Scraping the novels under {}...'.format(t_info[1]))
            # To fetch more or fewer pages, change the range() bound
            for x in range(1, self.total + 1):
                print(''.center(50, '*'))
                print('Fetching page {} under {}, please wait...'.format(x, t_info[1]))
                # Build the full URL for this page
                url = t_info[0] + 'index_{}.html'.format(x)
                # Fetch the page source
                self.get_html(url)
                # Parse it and extract the data
                self.parse_index()
                break  # remove this break to crawl every page, not just the first

        self.workbook.save('novel_data.xls')

That covers every step of the scrape. The complete code is below:

# -*- coding: utf-8 -*-
__author__ = 'wj'
__date__ = '2018/8/10 9:08'
import re
from random import choice
from urllib import request

import xlwt


class NovelSpider(object):

    def __init__(self):

        self.url = 'https://www.qisuu.la/soft/sort01/'
        self.html = ''
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
        ]
        self.total = 0
        self.count = 0
        self.retry_count = 0
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('novel_data')
        self.create_excel()

    def create_excel(self):
        # Header row: one column per scraped field
        self.sheet.write(0, 0, 'Title')
        self.sheet.write(0, 1, 'Clicks')
        self.sheet.write(0, 2, 'File size')
        self.sheet.write(0, 3, 'Category')
        self.sheet.write(0, 4, 'Update date')
        self.sheet.write(0, 5, 'Status')
        self.sheet.write(0, 6, 'Author')
        self.sheet.write(0, 7, 'Platform')
        self.sheet.write(0, 8, 'Description')
        self.sheet.write(0, 9, 'Download link')

    def get_html(self, url):

        # 1. Build the request object with a randomly chosen User-Agent
        req = request.Request(url=url, headers={
            'User-Agent': choice(self.ua_list)
        })
        try:
            self.retry_count += 1
            # 2. Send the request
            response = request.urlopen(req)
            # 3. Read the response body
            self.html = response.read().decode('utf-8')
        except Exception:
            # More than 3 attempts: give up on this URL
            if self.retry_count > 3:
                print('Request failed for: {}'.format(url))
                self.retry_count = 0  # reset so the next URL gets fresh retries
                return
            # Otherwise retry
            print('Request failed, retrying...')
            self.get_html(url)
        else:
            # Reset the retry counter on success
            self.retry_count = 0

    def get_total(self):
        # 1. Fetch the index page source
        self.get_html(self.url)
        # 2. Prepare the regex that captures the total page count from the pager
        pattern = re.compile(r'<div class="tspage.*?/(.*?)&nbsp;', re.S)
        # 3. Search the page source
        rs = re.search(pattern, self.html)

        if rs:
            self.total = int(rs.group(1))
            print(self.total)

    def parse_index(self):
        # 1. Prepare the regex
        pattern = re.compile(r'<li.*?<div.*?class="s".*?<a href="(.*?)"', re.S)
        # 2. Find every detail-page link
        results = re.findall(pattern, self.html)
        # 3. Loop over each link
        for link in results:
            url = 'https://www.qisuu.la' + link
            # 4. Fetch the detail page's source
            self.get_html(url)
            # 5. Parse the detail page's data
            self.parse_detail()

    def parse_detail(self):

        # 1. Prepare the regex
        pattern = re.compile(r"""<div class="detail_right.*?<h1>(.*?)</h1.*?<li.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?<div class="showInfo".*?<p.*?>(.*?)</p.*?get_down_url.*?,'(.*?)'""", re.S)

        results = re.findall(pattern, self.html)
        if not results:
            # Layout changed or the request failed: skip rather than crash
            return

        # 2. Extract the fields
        title = results[0][0]
        click_num = results[0][1]
        file_size = results[0][2]
        novel_type = results[0][3]
        datetime = results[0][4]
        status = results[0][5]
        author = results[0][6]
        run_sys = results[0][7]
        # '&#12288;' is the HTML entity for a full-width ideographic space
        description = results[0][8].replace('&#12288;', ' ')
        download = results[0][9]
        # Save the data
        self.save_data(title, click_num, file_size, novel_type, datetime, status, author, run_sys, description, download)

    # Helper that writes one cell of the current row
    def write_to_excel(self, idx, data):
        print(idx, data)
        self.sheet.write(self.count, idx, data)

    def save_data(self, *args):
        self.count += 1
        print('Saving novel #{}: {}'.format(self.count, args[0]))
        # 1. The straightforward way
        self.sheet.write(self.count, 0, args[0])
        self.sheet.write(self.count, 1, args[1])
        self.sheet.write(self.count, 2, args[2])
        self.sheet.write(self.count, 3, args[3])
        self.sheet.write(self.count, 4, args[4])
        self.sheet.write(self.count, 5, args[5])
        self.sheet.write(self.count, 6, args[6])
        self.sheet.write(self.count, 7, args[7])
        self.sheet.write(self.count, 8, args[8])
        self.sheet.write(self.count, 9, args[9])

        # 2. A tidier way: enumerate the argument tuple
        # for idx, data in enumerate(args):
        #     if idx == 8:
        #         data = data.replace('&#12288;', ' ')
        #
        #     self.write_to_excel(idx, data)

        # 3. The one-liner: map column indices onto the values
        # (map is lazy in Python 3, hence the loop that consumes it)
        # rs = map(lambda idx, data: self.sheet.write(self.count, idx, data), range(10), args)
        # for x in rs:
        #     pass
        self.workbook.save('novel_data.xls')

    def parse_type(self):
        # Grab the navigation bar that lists the categories
        pattern = re.compile(r'<div class="nav">(.*?)</div>', re.S)
        res = re.search(pattern, self.html)

        if res:
            html = res.group(1)
            results = re.findall(re.compile(r'<a.*? href="(.*?)".*?>(.*?)</a>', re.S), html)

            # Return the absolute address and name of every category;
            # each x is an (href, name) tuple, and the first nav link is skipped
            return map(lambda x: ('https://www.qisuu.la' + x[0], x[1]), results[1:])

    def run(self):
        # Get the total page count
        self.get_total()
        # Get the addresses of all categories
        types = self.parse_type()

        for t_info in types:
            print('Scraping the novels under {}...'.format(t_info[1]))
            for x in range(1, self.total + 1):
                print(''.center(50, '*'))
                print('Fetching page {} under {}, please wait...'.format(x, t_info[1]))

                # Build the full URL for this page
                url = t_info[0] + 'index_{}.html'.format(x)
                # Fetch the page source
                self.get_html(url)
                # Parse it and extract the data
                self.parse_index()
                break  # remove this break to crawl every page, not just the first

        self.workbook.save('novel_data.xls')


if __name__ == '__main__':

    novel = NovelSpider()
    novel.run()


Reposted from blog.csdn.net/p_xiaobai/article/details/81584756