Requests 校花网图片爬取

纪念我们闹过的矛盾
import requests
import re
# Download the pictures referenced on the first four list pages of
# xiaohuar.com and save each one in the current working directory as
# "<alt text><original file name>".
url = 'http://www.xiaohuar.com/list-1-%s.html'

# Captures (alt text, site-relative .jpg path) for each matching <img> tag.
# Compiled once because the same pattern is applied to every page.
img_pattern = re.compile(
    r'<img \w+.*="\d+".*? alt="(.*?)".*"(/d/file/\d+/\w+\.jpg)"')

for i in range(4):
    response = requests.get(url % i, timeout=10)
    # requests falls back to ISO-8859-1 for text responses whose headers
    # carry no charset; decoding with the detected encoding instead is
    # presumably what fixes the garbled names -- TODO confirm against the
    # live site.
    response.encoding = response.apparent_encoding
    html = response.text

    for img_name, img_path in img_pattern.findall(html):
        img_url = "http://www.xiaohuar.com%s" % img_path
        img_response = requests.get(img_url, timeout=10)
        if not img_response.ok:
            # Do not store an error page under a .jpg name.
            print("skip %s (HTTP %s)" % (img_url, img_response.status_code))
            continue
        xiaohua = img_response.content

        # Last URL path component, e.g. "abc.jpg"; used as the file suffix.
        name = img_url.split('/')[-1]
        print(name)

        # The alt text comes from the page, so replace characters that are
        # not allowed in file names before using it as one.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', img_name)
        with open(safe_name + name, 'wb') as f:
            f.write(xiaohua)



爬取结果还是有点不好,有空想想把那些乱码去掉


猜你喜欢

转载自www.cnblogs.com/python2687806834/p/9687108.html
今日推荐