Version: Python3.10
Modules: requests, urllib, etc.
For copyright reasons, the request URLs are omitted from the source code for now; if you need the complete source, please ask in the comments.
Let's first talk about dynamically loaded pictures. When the HTML first loads, it contains no images; the image data is then fetched as JSON and inserted into the HTML, which lets the page open quickly. So the question arises: how do we find the JSON file that supplies the images? Answering that is the first step in crawling a certain du's images.
see json url
参数解析:
word: %E6%80%A7%E6%84%9F%E5%A7%90%E5%A7%90
queryWord: %E6%80%A7%E6%84%9F%E5%A7%90%E5%A7%90
These two parameters are the same, there should be no difference, they all mean search keywords. The following value is the UrlEncode encoding.
pn is the number of pictures to get a certain du picture. When the picture slides down, 30 pictures are displayed at a time by default.
The red mark is the URL-encoded form of the keyword word, which can be decoded with urllib.parse.unquote(); the second parameter is the new offset for each refresh, calculated as rn plus the previous pn.
Here you just need to remember that the image we are looking for is linked by objURL. You can find all the objURLs in this JSON file with the re module's compile and findall methods. With these objURLs, we are not far from successfully crawling a certain du's images. P.S. Because the objURL is encrypted, the decryption process will not be explained here.
Not much to say, directly on the source code of a certain du image crawler:
import json
import itertools
import urllib
import requests
import os
import re
import sys
word = input("请输入图片关键字:")
path = "./image"  # local directory downloaded images are written into
# makedirs(..., exist_ok=True) replaces the exists()/mkdir pair and also
# creates intermediate directories if the path is ever made deeper.
os.makedirs(path, exist_ok=True)
word = urllib.parse.quote(word)  # URL-encode the keyword for the query string
print('正在抓取图片...')
url = "" # 版权原因 url暂时不放 评论要完整代码
# Endless stream of page URLs; pn advances by 60 per request.
urls = (url.format(word=word, pn=x) for x in itertools.count(start=0, step=60))
# Maps the site's obfuscated multi-character tokens back to URL punctuation.
str_table = {
    '_z2C$q': ':',
    '_z&e3B': '.',
    'AzdH3F': '/'
}
print('请求中...')
# Single-character substitution cipher used by the site to obfuscate objURL.
char_table = {
    'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f',
    '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l',
    '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r',
    'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1',
    'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7',
    'b': '8', 'l': '9', 'a': '0'
}
i = 0  # running counter used as the output file-name suffix
# str.translate needs a mapping of ordinals, not characters.
char_table = {
    ord(key): ord(value) for key, value in char_table.items()}
# Hoisted out of the loop: the pattern never changes between pages.
obj_url_pattern = re.compile(r'"objURL":"(.*?)"')
print('准备开始,请稍后...')
for url in urls:
    html = requests.get(url, timeout=10).text
    downURL = obj_url_pattern.findall(html)
    for t in downURL:
        # Undo the multi-character token obfuscation, then the char cipher.
        for key, value in str_table.items():
            t = t.replace(key, value)
        t = t.translate(char_table)
        try:
            # BUG FIX: the image request previously had no timeout and could
            # hang the crawler forever on an unresponsive host.
            html_1 = requests.get(t, timeout=10)
            # Clearer (and broader: also catches 5xx) than str(code)[0] == "4".
            if html_1.status_code >= 400:
                print('失败1')
                continue
        except requests.RequestException:  # narrow: only network errors
            print('失败2')
            continue
        with open(path + "/" +'img_'+ str(i) + ".png", 'wb') as f:
            f.write(html_1.content)
        i = i + 1
        print('正在爬取第 '+str(i)+' 张图片...')
run directly
Two less friendly source codes are attached later. It is error-prone or slow to crawl, you can optimize it.
1. Crawls slowly
import time
import requests
import urllib
page = input("请输入要爬取多少页:")
page = int(page) + 1  # range(1, page) below then iterates exactly int(input) pages
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
n = 0  # running image counter, used as the file-name suffix
pn = 1 # pn是从第几张图片获取 某du图片下滑时默认一次性显示30张
for m in range(1, page):
    url = '' # 版权原因 url暂时不放 评论要完整代码
    param = {
        'tn': 'resultjson_com',
        'logid': '8846269338939606587',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': '清纯',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': '清纯',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': 'girl',
        'pn': pn,
        'rn': '30',
        'gsm': '1e',
    }
    page_info = requests.get(url=url, headers=header, params=param, timeout=10)
    page_info.encoding = 'utf-8'  # 确保解析的格式是utf-8的
    page_info = page_info.json()  # 转化为json格式在后面可以遍历字典获取其值
    info_list = page_info['data']  # 观察发现data中存在 需要用到的url地址
    # The API pads 'data' with one empty trailing element; drop it only when
    # present — the original unconditional del raised IndexError on empty data.
    if info_list and not info_list[-1]:
        del info_list[-1]
    # Only entries that actually carry a thumbnail URL.
    img_path_list = [item['thumbURL'] for item in info_list if 'thumbURL' in item]
    for img_url in img_path_list:
        n = n + 1
        print('第 ' + str(n) + ' 张图片')
        image = requests.get(img_url, timeout=10).content
        # BUG FIX: the original built the file name in a second loop from the
        # FINAL value of n, so every image of a page overwrote the same file.
        # Naming inside the download loop gives each image a unique name.
        imagenname = '图片' + str(n) + '.png'
        with open('./image/%s' % imagenname.split("&")[0], 'wb') as file:
            file.write(image)
    # BUG FIX: advance by a full page (rn=30); stepping by 29 made
    # consecutive pages overlap by one image.
    pn += 30
2. Error prone
import requests
import re
# 确定网址
url = '' # 版权原因 url暂时不放 评论要完整代码
form_header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
    "Host": "", # 版权原因 url暂时不放 评论要完整代码
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
}
res = requests.get(url, headers=form_header, timeout=10).text
print(res)
image_urls = re.findall('"objURL":"(.*?)",', res)
n = 0  # running image counter, used as the file-name suffix
for image_url in image_urls:
    print('图片url地址:'+image_url)
    # 图片名称
    n = n + 1
    image_name = str(n)
    print('第 '+image_name+' 张图片')
    # BUG FIX: the original searched the counter string (image_name) with the
    # malformed pattern '(.jpg/.png/...)$' — '/' is not alternation and the
    # dots were unescaped, so it never matched. Test the URL's real extension.
    image_end = re.search(r'\.(jpg|png|jpeg|gif|webp|bmp)$', image_url)
    if image_end is None:
        image_name = '清纯壁纸_'+image_name+ '.png'
    else:
        # Keep the extension the URL actually carries.
        image_name = '清纯壁纸_' + image_name + image_end.group(0)
    # 保存
    image = requests.get(image_url, timeout=10).content
    with open('./image/%s' % image_name.split("&")[0], 'wb') as file:
        file.write(image)
The following two are crawler scripts originally written by Xiaocutie himself to crawl fengniao.com and huitu.com.
huitu.com crawler:
import re
import requests
import os
import urllib
header = {
    'content-type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
url = "" # 版权原因 url暂时不放 评论要完整代码
word = input("请输入关键字:")
word = urllib.parse.quote(word)  # URL-encode the keyword for the query string
urls = [str(url).format(word=word, num=x) for x in range(1, 2)]
i = 1  # running counter used as the output file-name suffix
save_dir = "F:\\im"
# BUG FIX: os was imported but never used — the script crashed with
# FileNotFoundError when F:\im did not exist. Create it like the
# fengniao script below does.
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
# Hoisted out of the loop: the pattern never changes between pages.
img_url_pattern = re.compile(r'"imgUrl":"(.*?)"')
for url in urls:
    print(url)
    html = requests.get(url, timeout=10).text
    print(html)
    for s in img_url_pattern.findall(html):
        htmls = requests.get(s, timeout=10)
        with open(save_dir + "\\" + str(i) + ".jpg", 'wb') as f:
            f.write(htmls.content)
        i = i + 1
Feng bird web crawler:
import re
import requests
import os
import itertools
url = "" # 版权原因 url暂时不放 评论要完整代码
i = 1  # running counter used as the output file-name suffix
path = "F:\\fengniao"
if not os.path.exists(path):
    os.mkdir(path)
urls = [url.format(num=x) for x in range(1, 100)]
# Hoisted out of the loop: the pattern never changes between pages.
image_pattern = re.compile(r'"image":"(.*?)"')
for url in urls:
    # Timeouts keep the crawler from hanging forever on a dead host.
    html = requests.get(url, timeout=10).text
    for downurl in image_pattern.findall(html):
        # The URL is JSON-escaped: strip backslashes, then drop any query string.
        downurl = str(downurl).replace("\\", "").split("?")[0]
        htmls = requests.get(downurl, timeout=10)
        with open(path + "\\" + str(i) + ".jpg", 'wb') as f:
            f.write(htmls.content)
        i = i + 1
        print(downurl)