对京东上的手机图片进行爬取:按 F12 找到相应的页面代码后,用正则表达式进行模式匹配,并过滤掉无用信息。实现代码如下:
import re
import urllib.request as request
import urllib
def craw(url, page):
    """Download the product thumbnails found on one search-result page.

    The page at ``url`` is fetched, narrowed to the region between the
    ``J_goodsList`` container and the pager, and every lazy-loaded 220x220
    product image in that region is saved as ``<page>-<n>.jpg`` where ``n``
    is the 1-based position of the image in the page.

    Args:
        url: Address of the search-result page to scan.
        page: Page label used as the prefix of the saved file names.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        urllib.error.URLError: If the result page itself cannot be fetched.
            Failures on individual images are reported and skipped.
    """
    # Decode the body to real text. str(bytes) would give the "b'...'" repr
    # with escaped newlines and escaped non-ASCII characters instead.
    # NOTE(review): UTF-8 is assumed for the page encoding -- confirm.
    with request.urlopen(url) as response:
        html = response.read().decode("utf-8", errors="ignore")

    # Keep only the goods list so images elsewhere on the page are ignored.
    # re.S lets ".+?" run across the line breaks of the decoded document.
    goods_pattern = re.compile(
        r'<div id="J_goodsList".+?<div class="page clearfix">', re.S)
    goods_match = goods_pattern.search(html)
    if goods_match is None:
        # Nothing to download; avoid the IndexError of indexing an empty
        # result list.
        print("No goods list found on page", page)
        return

    # The capture group yields the image address directly, replacing the
    # fixed-offset slice of the whole matched tag.
    image_pattern = re.compile(
        r'<img width="220" height="220" class="err-product" data-img="1" '
        r'source-data-lazy-img="([^"]+?\.jpg)" />')
    image_sources = image_pattern.findall(goods_match.group(0))

    # enumerate keeps one stable number per image, so a failed download
    # neither resets the counter nor shifts the names of later files.
    for index, source in enumerate(image_sources, start=1):
        imagename = "F:\\C\\mobilepic\\" + str(page) + '-' + str(index) + '.jpg'

        # The attribute normally holds a protocol-relative address
        # ("//host/path.jpg"); add a scheme only when one is missing.
        if source.startswith("//"):
            imageurl = "http:" + source
        elif source.startswith(("http://", "https://")):
            imageurl = source
        else:
            imageurl = "http://" + source

        try:
            request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as e:
            # Best effort: report the failure and move on to the next image.
            print("Skipped", imageurl, getattr(e, 'reason', e))
        else:
            print(imagename)
# Crawl result pages 1 and 2 in turn, reporting each page once it is done.
# NOTE(review): the literal '3' that follows the page index makes the query
# read page=13 and page=23 -- confirm that this is the intended paging.
URL_HEAD = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page="
URL_TAIL = '3&s=58&click=0'
for i in (1, 2):
    url = URL_HEAD + str(i) + URL_TAIL
    craw(url, i)
    print("Finish:", i)
结果: