使用xpath解析站长素材_免费简历模板

关于使用什么模块写爬虫做数据解析,并没有刻板的规定,鉴于正在学习仍使用xpath
分析站长素材网的免费简历模板,需求分析不再赘述,且看编码过程

#!/usr/bin/env python
# encoding: utf-8

"""
@file: 解析站长素材_免费简历模板.py
@time: 2020/2/29 14:30
"""

import requests
from lxml import etree
import random
import os


def resume():
    """Download free resume templates from sc.chinaz.com.

    Prompts the user for how many listing pages to scrape, then for each
    page extracts every template's detail page, picks a random download
    mirror, and saves the archive into ./免费简历模板.
    """
    index = int(input('您要下载几页:'))

    file = './免费简历模板'
    if not os.path.exists(file):
        os.mkdir(file)

    headers = {
        'Connection': 'close',  # release the connection after each request
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.116 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free_%d.html'

    # BUG FIX: range(1, index) processed only index-1 pages; include the
    # last page the user asked for.
    for page in range(1, index + 1):
        if page == 1:
            # The first listing page has a different URL pattern.
            new_url = 'http://sc.chinaz.com/jianli/free.html'
        else:
            # Build the URL for page 2 onwards (redundant format() removed).
            new_url = url % page

        # BUG FIX: the scraping logic below was nested inside the else
        # branch, so page 1 was never actually fetched. It now runs for
        # every page.
        response = requests.get(url=new_url, headers=headers)

        # Force UTF-8 so Chinese text decodes correctly.
        response.encoding = 'utf-8'
        tree = etree.HTML(response.text)

        # Each resume card is a direct child div of #container.
        div_list = tree.xpath('//div[@id="container"]/div')
        for div in div_list:
            # Detail-page URL and the img alt text (used as the file name).
            detail_url = div.xpath('./a/@href')[0]
            resume_name = div.xpath('./a/img/@alt')[0]

            # Fetch the detail page with its own tree so the listing-page
            # `tree` is not shadowed.
            detail_page = requests.get(url=detail_url, headers=headers).text
            detail_tree = etree.HTML(detail_page)

            # Collect all mirror links and pick one at random to spread
            # requests across download servers (avoids per-IP throttling).
            download_list = detail_tree.xpath(
                '//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
            download_url = random.choice(download_list)

            # Persist the archive to disk.
            data = requests.get(url=download_url, headers=headers).content
            filename = resume_name + '.rar'
            filepath = os.path.join(file, filename)
            with open(filepath, 'wb') as fp:
                fp.write(data)
                print(filename, '下载成功')


if __name__ == '__main__':
    # Run the scraper when executed as a script, then signal completion.
    resume()
    print('Over!!!')

发布了48 篇原创文章 · 获赞 55 · 访问量 4481

猜你喜欢

转载自blog.csdn.net/qq_43562262/article/details/104588622