import os
import re
import urllib.error
import urllib.request
def craw(url, page):
    """Download the product thumbnails referenced on one listing page.

    Fetches ``url``, isolates the markup between ``<div id="plist"`` and the
    first following ``<div class="clr">``, collects every 220x220 ``.jpg``
    referenced there through either ``src`` or ``data-lazy-img``, and saves
    each one as ``jd/<page><n>.jpg``, where ``n`` is the 1-based position of
    the image in the combined list (``src`` matches first, lazy ones after).

    Args:
        url: Address of the listing page to fetch.
        page: Page number; only used as the prefix of the saved file names.

    Returns:
        None. The outcome is reported on standard output.

    Raises:
        urllib.error.URLError: If the listing page itself cannot be fetched.
            Failures of individual image downloads are reported and skipped.
    """
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # Decode rather than str(bytes): str() yields the "b'...'" repr with
    # escaped newlines. Assumes the page is UTF-8 (TODO confirm); bytes that
    # do not decode are replaced, since only the ASCII markup is matched.
    html = raw.decode("utf-8", errors="replace")

    # re.S lets "." cross line breaks; the product list spans many lines.
    sections = re.findall(r'<div id="plist".+?<div class="clr">', html, re.S)
    if not sections:
        print('抓取失败,未获得内容')
        return
    listing = sections[0]

    img_prefix = r'<img width="220" height="220" data-img="1" '
    eager = re.findall(img_prefix + r'src="//(.+?\.jpg)"', listing, re.S)
    lazy = re.findall(
        img_prefix + r'data-lazy-img="//(.+?\.jpg)"', listing, re.S
    )

    # urlretrieve does not create missing directories.
    os.makedirs('jd', exist_ok=True)
    for x, image_path in enumerate(eager + lazy, start=1):
        # NOTE(review): page and index are joined without a separator (kept
        # so existing file names stay valid); page 1 / image 11 and
        # page 11 / image 1 therefore both map to "jd/111.jpg".
        imagename = 'jd/' + str(page) + str(x) + ".jpg"
        imageurl = "http://" + image_path
        try:
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as err:
            # Best effort: one broken image (HTTP error or connection
            # failure) must not abort the rest of the page.
            print('download failed: %s (%s)' % (imageurl, err))
    print('抓取成功')
# Crawl every listing page numbered in range(1, 2) -- at present page 1 only.
# Each page number is appended to the listing URL and also passed to craw(),
# which uses it as the file-name prefix.
for i in range(1, 2):
    url = ''.join(
        ['https://list.jd.com/list.html?cat=9987,653,655&page=', str(i)]
    )
    craw(url, i)
# Output of the run above: 抓取成功