Python crawler: data parsing with regular expressions
Regex parsing example: crawling images from Qiushibaike ("Embarrassment Encyclopedia")
Embarrassment Encyclopedia URL
https://www.qiushibaike.com/imgrank/page/2/
Inspect the page's HTML source to find where the image addresses are stored.
import requests
import re
import os
if __name__ == '__main__':
    # Crawl the qiushibaike.com image-ranking pages and save every thumbnail
    # matched on them into ./qiushi.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # Seconds to wait for the server; without a timeout a stalled
    # connection would block the script indefinitely.
    timeout = 10

    # Create the output folder if it is missing. exist_ok avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs('./qiushi', exist_ok=True)

    url = 'https://www.qiushibaike.com/imgrank/page/%d/'
    # Captures the src attribute of the <img> inside each
    # <div class="thumb"> element. Loop-invariant, so defined once.
    ex = r'<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'

    # range(1, 3) visits pages 1 and 2.
    for page_num in range(1, 3):
        new_url = url % page_num
        # Fetch the listing page.
        page = requests.get(url=new_url, headers=headers, timeout=timeout)
        if not page.ok:
            # Do not parse an error page; move on to the next listing page.
            print(new_url, 'failed with HTTP status', page.status_code)
            continue

        # re.S lets '.' span newlines, since each thumb block is multi-line.
        for src in re.findall(ex, page.text, re.S):
            # Build the full image URL; the captured src is expected to be
            # scheme-less ("//host/path"), hence the 'https:' prefix.
            src = 'https:' + src
            img_name = src.split('/')[-1]
            img_path = './qiushi/' + img_name

            # Request the raw image bytes. A single failing image is
            # skipped so it cannot abort the rest of the crawl.
            try:
                img = requests.get(url=src, headers=headers, timeout=timeout)
            except requests.RequestException as err:
                print(img_name, 'failed:', err)
                continue
            if not img.ok:
                # Avoid saving an HTML error body under an image file name.
                print(img_name, 'failed with HTTP status', img.status_code)
                continue

            # Create the file and write the binary image data.
            with open(img_path, 'wb') as fp:
                fp.write(img.content)
            print(img_name, 'success!!')
Crawl result
Exercise: crawl the images from the following page in the same way
https://pic.netbian.com/4kmeinv/
import re
import requests
import os
if __name__ == '__main__':
    # Crawl the pic.netbian.com/4kmeinv/ listing page and save every
    # thumbnail matched on it into ./meinv.
    from urllib.parse import urljoin

    # exist_ok avoids the check-then-create race of os.path.exists()
    # followed by os.mkdir().
    os.makedirs('./meinv', exist_ok=True)

    url = 'https://pic.netbian.com/4kmeinv/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # Seconds to wait for the server; without a timeout a stalled
    # connection would block the script indefinitely.
    timeout = 10

    page = requests.get(url=url, headers=headers, timeout=timeout)
    # There is only one listing page, so stop with a clear HTTP error
    # instead of silently parsing an error page.
    page.raise_for_status()

    # Captures the src attribute of the <img> inside each <li><a ...> entry.
    ex = r'<li><a href=".*?target=".*?<img src="(.*?)" alt=.*?</a></li>'

    # re.S lets '.' span newlines in case an entry is split across lines.
    for src in re.findall(ex, page.text, re.S):
        # Resolve src against the site root. Plain concatenation produced
        # "https://pic.netbian.com//..." for a root-relative src and a
        # broken URL for an already-absolute one.
        src = urljoin('https://pic.netbian.com/', src)
        img_name = src.split('/')[-1]
        img_path = './meinv/' + img_name

        # Request the raw image bytes. A single failing image is skipped
        # so it cannot abort the rest of the crawl.
        try:
            img = requests.get(url=src, headers=headers, timeout=timeout)
        except requests.RequestException as err:
            print(img_name, 'failed:', err)
            continue
        if not img.ok:
            # Avoid saving an HTML error body under an image file name.
            print(img_name, 'failed with HTTP status', img.status_code)
            continue

        with open(img_path, 'wb') as fp:
            fp.write(img.content)
        print(img_name, 'success!!')
Result of the exercise crawl