爬取奇书网书籍信息并存储在自动生成的xlsx表中(正则初级)

# -*- coding: utf-8 -*-
__author__ = '木之易'
__date__ = '2018/8/9 9:34'

import re
from urllib import request

import xlwt

"""爬取奇书网书籍信息初极版"""

class QishuSpider(object):
    """Scrape book listings from qisuu.la and store them in an Excel workbook.

    Fetches every index page of one category, follows each book's detail
    page, extracts its metadata with regular expressions and writes one
    row per book via ``xlwt``.
    """

    def __init__(self, t_id):
        """
        :param t_id: category path fragment, e.g. ``'/soft/sort01/'``
        """
        self.url = 'https://www.qisuu.la' + t_id
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}
        self.html = ''       # source of the most recently fetched page
        self.total = 0       # total number of index pages in the category
        self.title = ''      # <title> of the category index page
        self.workbook = None
        self.i = 0           # current sheet row (row 0 is the header)
        self.sheet1 = None
        self.work_book()

    def work_book(self):
        """Create the workbook, add the data sheet and write the header row."""
        # 1. Create the workbook object with an explicit encoding.
        self.workbook = xlwt.Workbook(encoding='utf-8')
        # 2. Add one data sheet.
        self.sheet1 = self.workbook.add_sheet('奇书网小说大全')
        # 3. Header row; write(row, col, value).
        # BUGFIX: column 8 was a duplicate '运行环境' and column 9 was
        # mislabeled '小说介绍' — save_text() actually writes the intro
        # paragraph to column 8 and the download URL to column 9.
        headers = ('书名', '点击次数', '文件大小', '书籍类型', '更新日期',
                   '连载状态', '书籍作者', '运行环境', '小说介绍', '下载地址')
        for col, header in enumerate(headers):
            self.sheet1.write(0, col, header)

    def get_html(self, url):
        """Fetch *url* and store the decoded page source in ``self.html``.

        :param url: absolute URL to request
        """
        # Build the request with a browser User-Agent so the site serves us.
        req = request.Request(url=url, headers=self.headers)
        response = request.urlopen(req)
        # Decode leniently: pages occasionally contain stray non-UTF-8 bytes.
        self.html = response.read().decode('utf-8', 'ignore')

    def get_title_total(self):
        """Parse ``self.html`` for the page title and total page count.

        Stores the results in ``self.title`` and ``self.total``; leaves
        them untouched when the patterns do not match.
        """
        # Page title.
        tit_res = re.search(r'<title>(.*?)</title>', self.html, re.S)
        if tit_res:
            self.title = tit_res.group(1)

        # Locate the pagination block, then pull <total> out of
        # the "页次:1/<total>&nbsp;" fragment inside it.
        page_res = re.search(r'<div.*?tspage">(.*?)</a>', self.html, re.S)
        if page_res:
            last_page_res = re.search(r'页次.*?1/(.*?)&', page_res.group(), re.S)
            if last_page_res:
                self.total = int(last_page_res.group(1))
                print(self.total)

    def earch_html(self):
        """Extract every book's relative URL from the current index page
        and hand each one to :meth:`parse_html`.

        NOTE(review): the name keeps the original typo ('earch' for
        'search') so any existing callers keep working.
        """
        links = re.findall(r'<div.*?"s".*?<a href="(.*?)">', self.html, re.S)
        for link in links:
            self.parse_html(link)

    def parse_html(self, r):
        """Fetch one book's detail page and extract its metadata.

        :param r: relative path of the book page, e.g. ``'/Shtml36877.html'``
        """
        # Build the absolute URL and fetch the detail page.
        url = 'https://www.qisuu.la' + r
        self.get_html(url)
        # Ten capture groups: title, clicks, size, type, date, status,
        # author, environment, intro paragraph, download URL.
        pattern = re.compile(r"""<div.*?detail_right".*?<h1>(.*?)</h1>.*?次数:(.*?)<.*?大小:(.*?)<.*?类型:(.*?)<.*?日期:(.*?)<.*?状态:(.*?)<.*?作者:(.*?)<.*?环境:(.*?)<.*?<p>(.*?)</p>.*?get_down_url.*?'(.*?)'""", re.S)
        for record in re.findall(pattern, self.html):
            self.i += 1
            self.save_text(record)
            print(record)
            print(self.i)

    def save_text(self, r):
        """Write one book record into row ``self.i`` and persist the workbook.

        Saving after every row is deliberate (keeps partial results if the
        crawl dies mid-run), though it re-serializes the whole file each time.

        :param r: 10-tuple of strings matching the header columns
        """
        for col, value in enumerate(r):
            self.sheet1.write(self.i, col, '{}'.format(value))
        # BUGFIX: xlwt emits the legacy BIFF (.xls) format; saving it with
        # an .xlsx extension produced a file Excel warns about or rejects.
        self.workbook.save('奇书网小说大全.xls')

    def run(self):
        """Crawl every index page of the category and save all book records."""
        # Fetch the first index page to learn the title and page count.
        self.get_html(self.url)
        self.get_title_total()
        print('正在爬取:{},共{}页,请稍后.....'.format(self.title, self.total))

        # Crawl each index page in turn.
        for x in range(1, self.total + 1):
            print('****************************************')
            print('...正在下载第{}页...请稍后...'.format(x))
            print('****************************************')
            # Full address of index page x.
            url = self.url + 'index_{}.html'.format(x)
            self.get_html(url=url)
            # Extract and download every book listed on this page.
            self.earch_html()


if __name__ == '__main__':
    # Crawl category sort01 of qisuu.la end to end.
    spider = QishuSpider(t_id='/soft/sort01/')
    spider.run()
# 参考网址
# 第一页
# https://www.qisuu.la/soft/sort01/index_1.html
# https://www.qisuu.la/Shtml36877.html
# https://www.qisuu.la/Shtml37331.html
# 第二页
# https://www.qisuu.la/Shtml37312.html
# https://www.qisuu.la/Shtml37771.html
# 第三页
# https://www.qisuu.la/soft/sort01/index_3.html
# https://www.qisuu.la/Shtml37487.html
# https://www.qisuu.la/Shtml37285.html

猜你喜欢

转载自blog.csdn.net/A_fool_Program_ape/article/details/81570121