Python crawler-data analysis (regular)

Python crawler-data analysis (regular)

Regular Analysis Case-Crawling Pictures of Embarrassment Encyclopedia

Embarrassment Encyclopedia URL
https://www.qiushibaike.com/imgrank/page/2/

Insert picture description here
Check the source code of the webpage and find the address where the picture is stored

import requests
import re
import os


if __name__ == '__main__':
    headers = {
    
    
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    #判断是否存在qiushi文件夹,如果不存在就创建一个
    if not os.path.exists('./qiushi'):
        os.mkdir('./qiushi')
        
    url = 'https://www.qiushibaike.com/imgrank/page/%d/'
    for pageNum in range(1,3):
        new_url = format(url%pageNum)
		#通用爬虫请求页面数据
        gate_text = requests.get(url=new_url,headers=headers).text
        
        ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
        #正则匹配,匹配出图片地址
        ex_data = re.findall(ex,gate_text,re.S)
        
        for src in ex_data:
        	#拼接出完整的图片URL
            src = 'https:'+src
            #请求图片二进制数据
            img_data = requests.get(url=src,headers=headers).content
            img_name = src.split('/')[-1]
            img_path = './qiushi/' + img_name
            #创建并写入图片二进制数据
            with open(img_path,'wb') as fp:
                fp.write(img_data)
                print(img_name,'success!!')

Crawl the result
Insert picture description here
practice

https://pic.netbian.com/4kmeinv/

Here is the quote

import re
import requests
import os

if __name__ == '__main__':
    if not os.path.exists('./meinv'):
        os.mkdir('./meinv')

    url = 'https://pic.netbian.com/4kmeinv/'

    headers = {
    
    
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    page_text = requests.get(url=url,headers=headers).text
    ex = '<li><a href=".*?target=".*?<img src="(.*?)" alt=.*?</a></li>'
    ex_data = re.findall(ex,page_text,re.S)
    for src in ex_data:
        src = 'https://pic.netbian.com/'+src
        img_data = requests.get(url=src,headers=headers).content
        img_name = src.split('/')[-1]
        img_path = './meinv/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(img_name, 'success!!')


result
Insert picture description here

Guess you like

Origin blog.csdn.net/qq_43710889/article/details/115003880