[Scraper] PyCharm + requests + PyQuery: crawling image galleries from Jandan (煎蛋网), Douban (豆瓣), and Okbuy (好乐买)

Notes:

The basic scraping workflow is the same for all three sites: fetch the HTML of a URL as a string, then parse it with a regex, BeautifulSoup, or (as here) PyQuery. Images usually appear in the HTML as links, typically in an img tag's src attribute, so it is enough to find each image URL, open it, and write the response content to a file. hashlib is used here purely for file naming: hashing each image's content with MD5 gives every image a distinct name. All three sites are fairly easy to scrape; only Jandan adds an anti-scraping measure: instead of listing image URLs directly, it ships a piece of JS that runs on every page load to decode each image URL from a hash.
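As a minimal sketch of this pipeline (the URL and the bare 'img' selector below are placeholders for illustration, not taken from the three sites):

import requests
from hashlib import md5
from pyquery import PyQuery as pq

# Hypothetical example: fetch one page, pull every <img src=...>,
# and save each image under the MD5 hex digest of its content.
doc = pq(requests.get('http://example.com/gallery').text)
for img in doc('img'):
    src = pq(img).attr('src')
    if not src:  # skip <img> tags without a src attribute
        continue
    content = requests.get(src).content
    with open('D:/image/' + md5(content).hexdigest() + '.jpg', 'wb') as f:
        f.write(content)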

Jandan (because of the JS-based anti-scraping, an extra decoding step is needed):

import requests
from pyquery import PyQuery as pq
import hashlib
import base64
from hashlib import md5

# MD5-hash a string and return the hex digest
def handle_md5(hd_object):
    return hashlib.md5(hd_object.encode('utf-8')).hexdigest()

# Base64-decode a string; str(...)[2:-1] strips the b'...' wrapper of the bytes repr
def handle_base64(hd_object):
    return str(base64.b64decode(hd_object))[2:-1]

# Decode an img-hash into an image URL. The body replicates the RC4-style
# decoder from jandan's JS, but note the final assignment: at the time of
# writing the hash is effectively just base64 of the URL, so the RC4 result
# is computed and then discarded.
def parse(ig_hs, ct):
    count = 4
    contains = handle_md5(ct)
    ig_hs_copy = ig_hs
    p = handle_md5(contains[0:16])
    m = ig_hs[0:count]
    c = p + handle_md5(p + m)
    n = ig_hs[count:]
    l = handle_base64(n)
    # key scheduling: initialize the state array k and shuffle it with the key c
    k = list(range(256))
    b = [ord(c[h % len(c)]) for h in range(256)]
    g = 0
    for h in range(256):
        g = (g + k[h] + b[h]) % 256
        k[h], k[g] = k[g], k[h]
    # keystream generation: XOR each ciphertext byte with a keystream byte
    u = ''
    q = 0
    z = 0
    for h in range(len(l)):
        q = (q + 1) % 256
        z = (z + k[q]) % 256
        k[q], k[z] = k[z], k[q]
        u += chr(ord(l[h]) ^ (k[(k[q] + k[g]) % 256]))
    u = u[26:]
    # the decrypted value above is overridden: the hash decodes directly
    u = handle_base64(ig_hs_copy)
    return u

arg = '5HTs9vFpTZjaGnG2M473PomLAGtI37M8'  # decoding key (a constant from the page's JS at the time)
for i in range(1, 10):
    url = 'http://jandan.net/ooxx/page-' + str(i) + '#comments'
    response = requests.get(url)
    doc = pq(response.text)
    # each image is published as a hash inside a span.img-hash
    links = doc('#wrapper #body #content #comments .commentlist .row .text .img-hash')
    for link in links:
        u = parse(link.text, arg)
        u1 = 'http:' + u  # the decoded URL is protocol-relative
        print(u1)
        r = requests.get(u1)
        # name the file after the MD5 of its content so names never collide
        with open('D:/image/' + md5(r.content).hexdigest() + '.jpg', 'wb') as f:
            f.write(r.content)
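Since the img-hash was, at the time of writing, effectively just base64 of the protocol-relative image URL, parse ends up throwing away its RC4 result. A quick self-contained round-trip shows the idea (the URL below is made up for illustration):

import base64

# Hypothetical round-trip: encode a protocol-relative URL the way the
# img-hash encoded it, then decode it back.
fake_hash = base64.b64encode(b'//wx1.sinaimg.cn/mw600/example.jpg').decode()
print(base64.b64decode(fake_hash).decode())  # //wx1.sinaimg.cn/mw600/example.jpg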

Result: (screenshot omitted)

Douban:

import requests
from pyquery import PyQuery as pq
from hashlib import md5

url = 'https://read.douban.com/ebooks/?dcs=book-nav&dcm=douban'
response = requests.get(url)
doc = pq(response.text)
# drill down to the cover <img> tags in the featured-books carousel
covers = doc('.wrapper .main .col.col10 .showcase.featured-books.slide-5-items .bd .carousel .slide-list'
             ' .slide-item.list-col.list-col5.cover-list .border-wrap .cover.shadow-cover a img')
for u in covers:
    # the covers are lazy-loaded, so the real URL sits in data-src, not src
    link = pq(u).attr('data-src')
    print(link)
    r = requests.get(link)
    with open('D:/image/' + md5(r.content).hexdigest() + '.jpg', 'wb') as f:
        f.write(r.content)

Result: (screenshot omitted)

Okbuy:

import requests
from pyquery import PyQuery as pq
from hashlib import md5

for i in range(200, 235):
    url = 'http://www.okbuy.com/shoe/' + str(i)
    response = requests.get(url)
    doc = pq(response.text)
    # product thumbnails live in the src attribute of each gl-img <img>
    imgs = doc('div .w1200 .prolist-container .prolist-main .clearfix .gl-item .gl-wrap .gl-img a img')
    for u in imgs:
        uu = pq(u).attr('src')
        print(uu)
        r = requests.get(uu)
        with open('D:/image1/' + md5(r.content).hexdigest() + '.jpg', 'wb') as f:
            f.write(r.content)

Result: (screenshot omitted)

Reposted from blog.csdn.net/bubbler_726/article/details/81092317