爬虫练习一,爬取京东图片

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u011529752/article/details/78317072

爬虫入门小项目,爬取京东的图片。还不会处理动态加载,只是简单的爬取图片和名称。

#-*- coding: utf-8 -*-
# Standard library
import os
import re
from urllib import error
from urllib import request

# Third-party
import chardet

def crawler(urladdr, page, img_id):
    """Fetch one JD listing page, download its product images, and log names.

    Every product image found on the page is saved as ``img/<id><ext>`` and
    a line ``"<id> <name>"`` is appended to ``resList.txt``.

    Args:
        urladdr: listing URL ending in ``page=`` (the page number is appended).
        page: 1-based page number to fetch.
        img_id: id to assign to the next downloaded image.

    Returns:
        The next unused img_id, so callers can thread it across pages.
    """
    page_url = urladdr + str(page)
    print(page_url)
    html_bytes = request.urlopen(page_url).read()

    # Keep a raw copy of the page on disk for offline debugging.
    # (The original left this handle open -- resource leak.)
    with open("jd.html", 'wb') as htmlfile:
        htmlfile.write(html_bytes)

    html1 = str(html_bytes, 'utf-8')

    # Isolate the product-list <div>. Bail out early when the layout regex
    # misses (the original crashed with IndexError on res1[0] here).
    pattern1 = '<div id="plist" class="goods-list-v2 J-goods-list gl-type-1 ">(.+?)<div class="clr"></div>\n</div>'
    res1 = re.compile(pattern1, re.S).findall(html1)
    if not res1:
        return img_id

    # One match per product entry.
    pattern2 = u'<li class="gl-item">.+?</li>'
    items = re.compile(pattern2, re.S).findall(res1[0])

    # urlretrieve does not create directories; make sure img/ exists.
    os.makedirs("img", exist_ok=True)

    # Append so results from earlier pages survive ("w" per page was a bug
    # that overwrote the list on every call).
    with open("resList.txt", "a", encoding="utf-8") as file:
        for x, m in enumerate(items, start=1):
            # Lazily-loaded images carry the real URL in data-lazy-img ...
            pattern3 = u'<img width=".+?" height=".+?" data-+?img=".+?" data-lazy-img="//(.+?)"'
            imgurl = re.compile(pattern3, re.S).findall(m)
            if not imgurl:
                # ... eagerly-loaded ones carry it in src.
                pattern3 = u'<img width=".+?" height=".+?" data-+?img=".+?" src="//(.+?)"'
                imgurl = re.compile(pattern3, re.S).findall(m)
            print(x)  # progress indicator: item index within the page
            if not imgurl:
                continue
            full_url = "http://" + imgurl[0]
            # Product name sits inside the <em> of the title link.
            pattern5 = u'<a target="_blank" title=.+?>\n.+?<em>\n.+?(\S.+?)</em>'
            names = re.compile(pattern5, re.S).findall(m)
            if not names:
                continue  # layout mismatch: skip instead of IndexError
            imgname = names[0]
            try:
                # NOTE(review): assumes a 3-letter extension like ".jpg".
                ext = full_url[-4:]
                request.urlretrieve(full_url, "img/" + str(img_id) + ext)
            except error.URLError as e:
                # Best effort: report and keep going. The original bumped
                # img_id up to twice here and then once more below, which
                # corrupted the numbering for every later image.
                print("download failed:", getattr(e, "reason", e))
            write_str = str(img_id) + " " + imgname + '\n'
            img_id = img_id + 1
            print(write_str)
            file.write(write_str)
    return img_id
# Alternative keyword-search entry point, kept for reference:
# urladdr = "https://search.jd.com/search?keyword=%E5%8D%95%E5%8F%8D%E5%85%A5%E9%97%A8%E7%9B%B8%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&spm=2.1.1&vt=2&page="
urladdr = "http://list.jd.com/list.html?cat=652,654,832&page="

if __name__ == "__main__":
    # Crawl listing pages 1..204, threading the running image id through
    # so numbering stays globally monotonic across pages.
    img_id = 1
    for page in range(1, 205):
        img_id = crawler(urladdr, page, img_id)

(原文此处有两张运行结果截图,抓取时未能保留。)

猜你喜欢

转载自blog.csdn.net/u011529752/article/details/78317072