1.使用传统方式爬取“斗图啦”网站的图片
# -*- coding: utf-8 -*-
"""Download the still (non-GIF) images listed on doutula.com photo pages."""
import os.path
import re
from urllib import request

import requests
from lxml import etree

# Browser-like user agent sent with every list-page request.
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

# Raw string: '\s' is not a valid escape sequence in a normal literal.
SAVE_DIR = r'E:\study'

# Seconds to wait for a list page before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Punctuation stripped from the alt text before it is used as a file name:
# ASCII and full-width marks, plus the characters Windows forbids in names.
_UNSAFE_NAME_CHARS = re.compile(r'[??!!./,,。\\:*"<>|]')


def _build_save_path(img_url, alt_text):
    """Return the local path for an image, named after its alt text."""
    stem, postfix = os.path.splitext(os.path.basename(img_url))
    img_name = _UNSAFE_NAME_CHARS.sub('', alt_text or '')
    # Fall back to the remote file name so nameless images do not all
    # overwrite the same file.
    return os.path.join(SAVE_DIR, (img_name or stem) + postfix)


def parse_page(url):
    """Download every non-GIF image found on one list page.

    Args:
        url: Address of the list page to scan.

    Raises:
        requests.RequestException: If the page cannot be fetched, times out,
            or the server answers with an HTTP error status.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.fromstring(response.text, parser=parser)
    images = html.xpath('//div[@class="page-content text-center"]//a//img[@class!="gif"]')

    os.makedirs(SAVE_DIR, exist_ok=True)
    for img in images:
        img_url = img.get("data-original")
        if not img_url:
            # No download address on this tag; nothing to fetch.
            continue
        img_save_path = _build_save_path(img_url, img.get("alt"))
        try:
            request.urlretrieve(img_url, img_save_path)
        except (OSError, ValueError) as exc:
            # One broken image should not abort the rest of the page.
            print('skip %s: %s' % (img_url, exc))


def main():
    """Process list pages 1 through 100 in order."""
    for page in range(1, 101):
        url = 'https://www.doutula.com/photo/list/?page=%d' % page
        try:
            parse_page(url)
        except requests.RequestException as exc:
            # Report the failed page and carry on with the next one.
            print('skip page %d: %s' % (page, exc))


if __name__ == '__main__':
    main()
2.使用生产者与消费者模式多线程下载表情包
>>>>>>>>>>待续