Python 爬虫学习(十五):使用 XPath 下载模板

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import os
if __name__ == '__main__':
    # Walk the free resume-template listing page, open every template's detail
    # page, and print each template's archive name next to the first download
    # link found on that detail page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    # requests has no default timeout; without one a stalled connection
    # would block the script forever.
    timeout = 10

    url = 'http://sc.chinaz.com/jianli/free.html'
    page_text = requests.get(url=url, headers=headers, timeout=timeout).text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="main"]/div/div')

    # Output directory for the templates; exist_ok avoids the check-then-create
    # race of os.path.exists() + os.mkdir().
    os.makedirs('./muban', exist_ok=True)

    url_list = []
    name_list = []
    for div in div_list:
        # xpath() always returns a list; skip cells that carry no link instead
        # of crashing on [0].
        hrefs = div.xpath('./a/@href')
        if not hrefs:
            continue
        muban_href = hrefs[0]

        detail_response = requests.get(url=muban_href, headers=headers, timeout=timeout)
        # The detail page text has to be read as UTF-8 (presumably the server
        # omits the charset, so requests falls back to ISO-8859-1). Declaring
        # the encoding up front replaces the fragile
        # encode('iso-8859-1').decode('utf-8') round-trip, which raises as soon
        # as requests picks any other charset.
        detail_response.encoding = 'utf-8'
        download_tree = etree.HTML(detail_response.text)

        # Third breadcrumb link holds the template title; it becomes the
        # archive file name.
        titles = download_tree.xpath('//div[@class="bread clearfix"]/a[3]/text()')
        # First entry of the download list is used as the download link.
        links = download_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')
        if not titles or not links:
            # Page layout differs from what the XPath expects; nothing usable.
            continue

        download_name = titles[0] + '.rar'
        # Store the URL string itself, not the one-element list xpath() returns.
        download_url = links[0]
        url_list.append(download_url)
        name_list.append(download_name)

    for name, url in zip(name_list, url_list):
        print(name, url)



发布了23 篇原创文章 · 获赞 0 · 访问量 666

猜你喜欢

转载自blog.csdn.net/haimian_baba/article/details/103820485