Batch download resume template

Batch download resume template

Thinking

1. First, study the pagination: the request for the first page is a special case, while subsequent pages follow a regular URL pattern

2. Study the HTML to locate each resume's download-page link

3. Go to the download page, find the download link

4. Fetch the data from the download link and store it locally

import requests
from lxml import etree
import os

# Spoof a desktop Chrome User-Agent so the site serves normal pages
# instead of rejecting the crawler's default client string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}

def work(page):
    """Download free resume templates from sc.chinaz.com.

    Crawls list pages 1..page, follows each template's detail page,
    picks the last download mirror link, and saves the archive into
    the local ./dic directory.

    :param page: number of list pages to crawl (inclusive).
    """
    # Create the output directory on first run.
    if not os.path.exists("dic"):
        os.mkdir("dic")

    for count in range(1, page + 1):
        # The first page's URL is a special case; later pages carry a number.
        if count == 1:
            url = "http://sc.chinaz.com/jianli/free.html"
        else:
            url = f"http://sc.chinaz.com/jianli/free_{count}.html"
        res = requests.get(url=url, headers=headers)
        # Let requests guess the real encoding (the site is GBK-ish).
        res.encoding = res.apparent_encoding
        tree = etree.HTML(res.text)
        # Collect the <a> hrefs pointing to each template's detail page.
        a_s = tree.xpath("//div[@class='box col3 ws_block']/a/@href")
        for detail_url in a_s:
            res2 = requests.get(url=detail_url, headers=headers)
            tree2 = etree.HTML(res2.text)
            # Download mirror links; skip pages that expose none
            # (previously this indexed [-1] and raised IndexError).
            links = tree2.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
            if not links:
                continue
            do_url = links[-1]
            # Derive the file name from the URL path instead of the old
            # hard-coded 11-character suffix, which mangled longer names.
            file_name = do_url.rsplit("/", 1)[-1]
            # Fetch the archive and write it to disk.
            res3 = requests.get(url=do_url, headers=headers).content
            with open(f"dic/{file_name}", "wb") as fw:
                fw.write(res3)
                print(file_name + "写入成功!")

if __name__ == '__main__':
    # Crawl the first 5 list pages when run as a script.
    work(5)



'''
由于第一页的分页请求比较特殊,需要进行单独处理
http://sc.chinaz.com/jianli/free.html
http://sc.chinaz.com/jianli/free_2.html
http://sc.chinaz.com/jianli/free_3.html
'''

Guess you like

Origin www.cnblogs.com/zx125/p/11410687.html