Python crawler: download the personal insurance clause files (.pdf) of Sino-British Life Insurance (Aviva-COFCO)

Description

Support original work, support open source!

Statement

This article is original; please include a link to it when reprinting. Hyperlink or address of this article: Link to this article

Environment

This code was written and run with Anaconda2 (Python 2.7) on Windows 10, using the PyCharm editor. The third-party packages used are bs4 (BeautifulSoup) and requests.
Code

# encoding:utf-8

from bs4 import BeautifulSoup
import urllib
import re
import urlparse
import requests
import time


class HtmlParder_pdf(object):
    # Containers: a set of pending URLs and a set of collected filenames
    def __init__(self):
        self.url_list = set()
        self.filenames = set()

    # Parser: fetch a page and return its parsed soup
    def parser(self, url):
        response = urllib.urlopen(url)
        html_cont = response.read()
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding='utf-8')
        return soup

    # Find the valid file links on one page
    def find_file_in_one_page(self, soup):
        pdf_url_in_One = soup.find_all('a', href=re.compile(r"/data/."))
        return pdf_url_in_One

    # Get the link address of the next page; returns None when there is no next page
    def get_next_page(self, soup):
        next_page_url = soup.find('a', id="next", href=re.compile(r'/website/xxzx/gkxxpl/gsjbxx/grbxtk/rsbx/.'))
        print next_page_url
        if next_page_url is None:
            return None
        next_page_link = urlparse.urljoin(root_url, next_page_url['href'])
        return next_page_link
    #为set添加,写出到文件中使用,本pro旨在下载以及中文转义问题,没有使用这个方法提取文件名
    def add_filenames(self, pdf_url_in_One):
        filename_list = self.filenames
        for link in pdf_url_in_One:
            pdf_url_in_One_link = urlparse.urljoin(root_url, link['href'])
            filename = pdf_url_in_One_link.split("/")[-1]
            filename_list.add(filename)

    # Add links to the URL queue
    def add_url_list(self, url):
        for link in url:
            pdf_url_in_One_link = urlparse.urljoin(root_url, link['href'])
            self.url_list.add(pdf_url_in_One_link)

    # Downloader
    """
    The links scraped directly contain Chinese characters, which caused runtime errors.
    A packet-capture tool revealed the workaround: issue a GET request for the raw
    link with requests, read the resolved file URL from the response, and then
    download from that resolved URL instead.
    """
    def download(self):
        # download one PDF and pop its URL from the queue
        url = self.url_list.pop().encode("utf-8")
        request = requests.request("GET", url)
        new_url = request.url  # the resolved URL returned by requests; safe to download from
        if url.split("/")[-1].split(".")[-1].decode("utf-8") == "pdf":
            urllib.urlretrieve(new_url, url.split("/")[-1].decode("utf-8"))
        request.close()

    # Whether there is another page to crawl
    def hasnext_page(self, url):
        return url is not None


if __name__ == '__main__':
    # First page: the starting page of the download listing
    root_url = "http://www.aviva-cofco.com.cn/website/xxzx/gkxxpl/gsjbxx/grbxtk/rsbx/list-1.shtml"
    count = 1
    obj = HtmlParder_pdf()

    while obj.hasnext_page(root_url):
        soup = obj.parser(root_url)
        file_list = obj.find_file_in_one_page(soup)
        obj.add_url_list(file_list)
        obj.add_filenames(file_list)
        for i in range(len(obj.url_list)):
            obj.download()
            time.sleep(2)
            print 'Crawling page', count, 'url', i + 1

        next_url = obj.get_next_page(soup)
        if next_url == root_url:
            break
        root_url = next_url
        count = count + 1

    print count
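
Since Python 2 has reached end of life, here is a minimal Python 3 sketch of the same crawl-and-download loop. This is a hedged port, not the original code: it assumes the site layout at aviva-cofco.com.cn is unchanged and maps urllib/urlparse to their Python 3 homes (urllib.request / urllib.parse); the redirect-resolution trick stays the same.

# encoding:utf-8
# Minimal Python 3 sketch of the same logic (a port, not the original code;
# assumes the site layout is unchanged).
import re
import time
import urllib.request              # Python 3 home of urlopen / urlretrieve
from urllib.parse import urljoin   # Python 3 home of urlparse.urljoin

import requests
from bs4 import BeautifulSoup

ROOT_URL = "http://www.aviva-cofco.com.cn/website/xxzx/gkxxpl/gsjbxx/grbxtk/rsbx/list-1.shtml"

def parse(url):
    # Fetch a listing page and return its parsed soup
    html = urllib.request.urlopen(url).read()
    return BeautifulSoup(html, "html.parser")

def download_pdf(url):
    # Resolve the final address first; the raw link may contain Chinese characters
    final_url = requests.get(url).url
    if final_url.split(".")[-1].lower() == "pdf":
        urllib.request.urlretrieve(final_url, final_url.split("/")[-1])

url = ROOT_URL
while url:
    soup = parse(url)
    for link in soup.find_all('a', href=re.compile(r"/data/.")):
        download_pdf(urljoin(url, link['href']))
        time.sleep(2)
    nxt = soup.find('a', id="next", href=re.compile(r'/website/xxzx/gkxxpl/gsjbxx/grbxtk/rsbx/.'))
    next_url = urljoin(url, nxt['href']) if nxt else None
    url = None if next_url == url else next_url  # stop when the "next" link loops back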

This code has been partially encapsulated, tested, and debugged. It originally came from helping someone in a Python discussion group review their code, which turned out to be messy and only conveyed its general purpose; the problems it solves are noted in the brief comments above. The files it finally downloads are *.pdf.
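
As a quick sanity check after a run, the downloaded files can be listed from the working directory (a minimal sketch for the Python 2.7 environment above; the crawler saves the PDFs into the current directory):

import glob
print glob.glob("*.pdf")  # names of the PDFs downloaded so far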

If there are any errors, please feel free to point them out~

Running result

(Screenshot of the run omitted.)

Origin blog.csdn.net/Uridis/article/details/86540728