爬虫之多线程、异步

1.使用传统方式爬取“斗图啦”网站的图片

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import re
import os.path
from urllib import request

def parse_page(url, save_dir=r'E:\study'):
    """Download every non-GIF image found on one listing page.

    The page is fetched with a browser-like user agent, parsed as HTML, and
    each matching ``<img>`` element is saved inside ``save_dir`` as
    ``<cleaned alt text><extension taken from the image URL>``.

    Args:
        url: Address of the listing page to scrape.
        save_dir: Directory that receives the downloaded files; it is
            created when missing. Defaults to the folder this script has
            always written to.

    Raises:
        requests.exceptions.RequestException: If fetching the listing page
            fails or times out.
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    # Without a timeout a single stalled connection would block the crawl
    # forever.
    response = requests.get(url=url, headers=headers, timeout=30)
    text = response.text
    parse = etree.HTMLParser(encoding='utf-8')
    html = etree.fromstring(text, parser=parse)
    images = html.xpath('//div[@class="page-content text-center"]//a//img[@class!="gif"]')

    os.makedirs(save_dir, exist_ok=True)
    for img in images:
        img_url = img.get("data-original")
        if not img_url:
            # No data-original attribute means there is no URL to download.
            continue
        img_postfix = os.path.splitext(img_url)[1]

        # Strip punctuation from the alt text before using it as a file name.
        # NOTE(review): the duplicated characters in this class may be
        # full-width variants that were lost in transcription - confirm.
        img_name = re.sub(r'[??!!./,,。]', '', img.get("alt") or '')
        if not img_name:
            # Fall back to the URL's own file name so the image is not saved
            # as a bare extension (and overwritten by the next nameless one).
            img_name = os.path.splitext(os.path.basename(img_url))[0]

        img_save_path = os.path.join(save_dir, img_name + img_postfix)
        request.urlretrieve(img_url, img_save_path)

def main():
    """Scrape listing pages 1 through 100, one after another."""
    page_template = 'https://www.doutula.com/photo/list/?page=%d'
    for page_number in range(1, 101):
        parse_page(page_template % page_number)
       

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
View Code

 2.使用生产者与消费者模式多线程下载表情包

>>>>>>>>>>待续

猜你喜欢

转载自www.cnblogs.com/wuxunyan/p/10648135.html