Scraping product images from JD.com for any search term

The most recent version of the code is at the bottom of this post.
The original code is as follows:

import re
import urllib.request
import urllib.error


def craw(url, page):
    # Download the search-result page and decode it to text.
    html1 = urllib.request.urlopen(url).read()
    html1 = html1.decode('utf-8', 'ignore')
    # Cut out the product-list block; re.S lets '.' match across line breaks.
    pat1 = '<div id="J_goodsList".*<div class="p-commit">'
    result1 = re.compile(pat1, re.S).findall(html1)
    result1 = result1[0]
    # Match the whole lazy-load <img> attribute up to the .jpg URL.
    pat2 = 'img width.*?//img.*?jpg'
    imag = re.compile(pat2, re.S).findall(result1)
    x = 1
    for imagurl in imag:
        # room is the save directory entered in __main__.
        imagname = room + '第' + str(page) + '页' + '第' + str(x) + '个' + '.jpg'
        # Strip the attribute prefix, leaving ':' + '//img...jpg', then prepend 'https'.
        reurl = imagurl.replace('img width="220" height="220" class="err-product" data-img="1" source-data-lazy-img="', ':')
        imagurl = 'https' + reurl
        try:
            urllib.request.urlretrieve(imagurl, filename=imagname)
            print('已输出第', page, '页,第', x, '个')
        except urllib.error.URLError as e:
            # Report the failure; the counter still advances below, so numbering stays consistent.
            if hasattr(e, 'code'):
                print('download failed, HTTP code:', e.code)
            if hasattr(e, 'reason'):
                print('download failed, reason:', e.reason)
        x += 1


if __name__ == "__main__":
    name = input('请输入要爬的类别:')
    page_ = int(input('请输入要爬的页数:'))
    room = input('请输入地址:(确保文件夹存在比如F:/pachong/)')
    for i in range(0, page_):
        h = 2 * i + 1          # JD search pages are numbered 1, 3, 5, ...
        key2 = i + 1           # human-readable page number used in file names
        key1 = name
        key = urllib.request.quote(key1)   # percent-encode the Chinese keyword
        url2 = 'https://search.jd.com/Search?keyword=' + key + '&enc=utf-8&page=' + str(h)
        url = urllib.request.Request(url2)
        craw(url, key2)

Key points:
1. Chinese characters in the URL must be percent-encoded (see the short sketch after this list).
2. Because of my limited skill, only the 30 images on each page's first load are scraped.
3. What the textbook gives you is fixed; you have to work at pushing past it yourself.
4. For filtering the data, the regular expression has to be exact.
PS: the code is full of holes, but luckily it works.
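
To make key point 1 concrete, here is a minimal standalone sketch of the encoding step; the keyword '手机' and the page number are only illustrative values, and the urllib.request.quote used in the code above is the same quote function that lives in urllib.parse.

import urllib.parse

# Illustrative keyword only; any Chinese search term works the same way.
keyword = '手机'

# Percent-encode the non-ASCII characters so they are legal in a URL.
encoded = urllib.parse.quote(keyword)      # '%E6%89%8B%E6%9C%BA'

url = 'https://search.jd.com/Search?keyword=' + encoded + '&enc=utf-8&page=1'
print(url)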
Notes:
2018.9.11 original version
2018.9.12 first revision (regular expression improved, code restructured)
2018.9.16 revised again (regular expression refined further)
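
The "regex optimization" mentioned in the notes boils down to switching from match-then-replace to a capture group. A minimal sketch, run against a made-up <img> tag whose attributes imitate what JD's lazy-loaded product images looked like at the time:

import re

# Hypothetical HTML fragment; the attribute layout mirrors the patterns used in the code below.
sample = ('<img width="220" height="220" class="err-product" data-img="1" '
          'source-data-lazy-img="//img10.360buyimg.com/n7/jfs/example.jpg">')

# The capture group grabs only the protocol-relative URL, so no replace() step is needed.
pat = 'source-data-lazy-img="(//.*?jpg)'
for relative in re.compile(pat).findall(sample):
    print('https:' + relative)   # https://img10.360buyimg.com/n7/jfs/example.jpg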

import re
import urllib.request
import urllib.error


def craw(url, page):
    # Download the search-result page and decode it to text.
    html1 = urllib.request.urlopen(url).read()
    html1 = html1.decode('utf-8', 'ignore')
    # Cut out the product-list block; re.S lets '.' match across line breaks.
    pat1 = '<div id="J_goodsList".*<div class="p-commit">'
    result1 = re.compile(pat1, re.S).findall(html1)
    result1 = result1[0]
    # Capture the protocol-relative image URL directly instead of stripping a prefix.
    pat2 = 'source-data-lazy-img="(//.*?jpg)'
    imag = re.compile(pat2).findall(result1)
    x = 1
    for imagurl in imag:
        # room is the save directory entered in __main__.
        imagname = room + '第' + str(page) + '页' + '第' + str(x) + '个' + '.jpg'
        imagurl = 'https:' + imagurl
        try:
            urllib.request.urlretrieve(imagurl, filename=imagname)
            print('已输出第', page, '页,第', x, '个')
        except urllib.error.URLError as e:
            # Report the failure; the counter still advances below, so numbering stays consistent.
            if hasattr(e, 'code'):
                print('download failed, HTTP code:', e.code)
            if hasattr(e, 'reason'):
                print('download failed, reason:', e.reason)
        x += 1


if __name__ == "__main__":
    name = input('请输入要爬的类别:')
    page_ = int(input('请输入要爬的页数:'))
    room = input('请输入地址:(确保文件夹存在比如F:/pachong/)')
    for i in range(1, 2 * page_ + 1):
        key2 = (i + 1) // 2     # human-readable page number used in file names
        key1 = name
        key = urllib.request.quote(key1)   # percent-encode the Chinese keyword
        url2 = 'https://search.jd.com/Search?keyword=' + key + '&enc=utf-8&page=' + str(i)
        # Pretend to be a browser. This request object is the one actually passed to craw();
        # Accept-Encoding is deliberately not set, since urllib does not decompress gzip automatically.
        req = urllib.request.Request(url2)
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,zh;q=0.9")
        req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
        craw(req, key2)
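
As a side note, the repeated add_header calls can be folded into the headers argument that urllib.request.Request accepts; this is plain standard-library usage, not anything specific to JD. A minimal sketch with an example User-Agent (the keyword in the URL is just the percent-encoded form of '手机'):

import urllib.request

headers = {
    # Browser-like headers; any reasonably modern User-Agent string will do.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&page=1'
req = urllib.request.Request(url, headers=headers)
html = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
print(len(html))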

Source: blog.csdn.net/weixin_43131464/article/details/82632491