说明:
爬虫的基本思路都是获取某一url的网页的字符串,然后可以通过正则,beautifulsoup或者这里使用的PyQuery来进行解析。图片一般都是以链接的形式出现在html文本中,因此只需要找到图片链接即可(一般是在img src中),这时再把图片url打开,将里面的content保存成具体的文件。这里使用的hashlib实际上是一个哈希(摘要)库,为了使得每一个图片的名字不一样,就用md5这个方法对图片的内容计算了一个摘要作为文件名。爬取的这三个网站整体来说比较简单,除了煎蛋加入了一定的反爬措施,即并不是直接将图片的url列出来,而是加入了一段js,在每一次加载图片的时候都要执行这个js,进而把图片的url解析出来。
煎蛋网(由于加入了一个js的反爬,所以加入了一步解码):
import requests
from pyquery import PyQuery as pq
import hashlib
import base64
from hashlib import md5
def ty(body):
    """Debugging aid: print the runtime type of *body*."""
    kind = type(body)
    print(kind)
# Helper: hex MD5 digest of a text string.
def handle_md5(hd_object):
    """Return the hexadecimal MD5 digest of *hd_object* (UTF-8 encoded)."""
    digest = hashlib.md5(hd_object.encode('utf-8'))
    return digest.hexdigest()
# Helper: base64-decode a string and return the result as text.
def handle_base64(hd_object):
    """Decode the base64 string *hd_object* and return it as a str.

    The original implementation used ``str(base64.b64decode(x))[2:-1]``,
    i.e. it sliced the quotes off the bytes *repr*.  That mangles any byte
    repr() escapes (backslashes, quotes, bytes >= 0x80 become ``\\x..``
    sequences), which corrupts the byte-wise ``ord()`` arithmetic in
    ``parse``.  latin-1 maps every byte 0-255 to the code point of the
    same value, so ``ord()`` on the result recovers the raw byte, and
    plain-ASCII URLs decode identically to before.
    """
    return base64.b64decode(hd_object).decode('latin-1')
# Decrypt an obfuscated image link.
# ig_hs is the "img-hash" string scraped from the page; ct is the
# site-supplied constant key.  The bulk of the body is an RC4-style key
# schedule (KSA) plus keystream XOR (PRGA), but note that its result is
# discarded at the end: the final assignment replaces u with a plain
# base64 decode of the original hash — presumably the site later dropped
# the RC4 layer, leaving only the base64 step in effect.
def parse(ig_hs, ct):
    count = 4
    contains = handle_md5(ct)
    ig_hs_copy = ig_hs  # kept so the original hash survives the slicing below
    p = handle_md5(contains[0:16])
    m = ig_hs[0:count]  # first 4 chars act as a per-image salt
    c = p + handle_md5(p + m)  # derived RC4 key material
    n = ig_hs[count:]
    l = handle_base64(n)  # ciphertext bytes (as a str)
    # --- RC4 key-scheduling: initialise state array k and scramble it ---
    k = []
    for h in range(256):
        k.append(h)
    b = []
    for h in range(256):
        b.append(ord(c[h % len(c)]))
    g = 0
    for h in range(256):
        g = (g + k[h] + b[h]) % 256
        tmp = k[h]
        k[h] = k[g]
        k[g] = tmp
    # --- RC4-like keystream generation: XOR keystream into l ---
    # NOTE(review): true RC4 PRGA would use k[(k[q] + k[z]) % 256];
    # this variant indexes with g — faithful to the site's JS, not RC4 proper.
    u = ''
    q = 0
    z = 0
    for h in range(len(l)):
        q = (q + 1) % 256
        z = (z + k[q]) % 256
        tmp = k[q]
        k[q] = k[z]
        k[z] = tmp
        u += chr(ord(l[h]) ^ (k[(k[q] + k[g]) % 256]))
    u = u[26:]  # strip a fixed-length prefix from the decrypted text
    # The decrypted value above is deliberately thrown away: the link is
    # recovered by base64-decoding the original hash directly.
    u = handle_base64(ig_hs_copy)
    return u
# Crawl jandan.net "ooxx" pages 1-9: decode each obfuscated image hash
# with parse() and save the image bytes under an MD5-derived filename.
for i in range(1, 10):
    url = 'http://jandan.net/ooxx/page-' + str(i) + '#comments'
    response = requests.get(url)
    doc = pq(response.text)
    # Each obfuscated image link sits in an element with class "img-hash".
    links = doc('#wrapper #body #content #comments .commentlist .row .text .img-hash')
    print(links)
    arg = '5HTs9vFpTZjaGnG2M473PomLAGtI37M8'  # site decryption key constant
    for link in links:
        l = link.text
        print(type(l))
        u = parse(l, arg)
        u1 = 'http:' + u  # hashes decode to protocol-relative URLs
        print(u1)
        r = requests.get(u1)
        # Name the file by the MD5 of its content so duplicates collapse.
        with open('D:/image/' + md5(r.content).hexdigest() + '.jpg', 'wb') as f:
            f.write(r.content)
        # (redundant f.close() removed — the with-statement closes the file)
运行结果:
豆瓣:
import requests
from pyquery import PyQuery as pq
from hashlib import md5
# Crawl the Douban Read e-book showcase page and save each cover image.
url = 'https://read.douban.com/ebooks/?dcs=book-nav&dcm=douban'
response = requests.get(url)
doc = pq(response.text)
# Cover <img> tags inside the featured-books carousel.
url1 = doc('.wrapper .main .col.col10 .showcase.featured-books.slide-5-items .bd .carousel .slide-list'
           ' .slide-item.list-col.list-col5.cover-list .border-wrap .cover.shadow-cover a img')
print(url1)
for u in url1:
    # Douban lazy-loads covers: the real URL lives in data-src, not src.
    l = pq(u).attr('data-src')
    print(l)
    r = requests.get(l)
    # MD5-of-content filename keeps every saved image name unique.
    with open('D:/image/' + md5(r.content).hexdigest() + '.jpg', 'wb') as f:
        f.write(r.content)
    # (redundant f.close() removed — the with-statement closes the file)
运行结果:
好乐买:
import requests
from pyquery import PyQuery as pq
from hashlib import md5
# Crawl okbuy.com shoe-listing pages 200-234 and save each product image.
for i in range(200, 235):
    url = 'http://www.okbuy.com/shoe/' + str(i)
    response = requests.get(url)
    doc = pq(response.text)
    # Product thumbnails live in the listing grid's <img src=...> tags.
    url1 = doc('div .w1200 .prolist-container .prolist-main .clearfix .gl-item .gl-wrap .gl-img a img')
    print(url1)
    for u in url1:
        uu = pq(u).attr('src')
        print(uu)
        r = requests.get(uu)
        # MD5-of-content filename keeps every saved image name unique.
        with open('D:/image1/' + md5(r.content).hexdigest() + '.jpg', 'wb') as f:
            f.write(r.content)
        # (redundant f.close() removed — the with-statement closes the file)
运行结果: