Use Python 3 to write a crawler script that downloads pretty-girl wallpaper images. The front-end guy next door is begging me for the script

Version: Python 3.10
Modules: requests, urllib, etc.

For copyright reasons, the request URL is not included in the source code for now. Ask in the comments if you want the complete source.

Let's talk about dynamically loaded pictures first. When the HTML first loads there are no pictures in it; the image data is fetched as JSON and inserted into the HTML afterwards, which is what makes the page open quickly. So the question is: how do we find the JSON request that carries the image data? Answering that is the first step in crawling images from a certain "du" site.
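A quick way to confirm this behaviour (a minimal sketch; the real page URL is withheld above, so PAGE_URL below is just a placeholder): fetch the initial HTML and search it for the image field. On a dynamically loaded page it will not be there; the links only show up in the later JSON (XHR) responses.

import requests

PAGE_URL = "https://example.com/search?word=..."  # placeholder; the real URL is withheld above

# Fetch the page as it is delivered before any JavaScript runs.
html = requests.get(PAGE_URL, timeout=10).text

# If the gallery were rendered server-side, the "objURL" links would already be in the HTML.
# On a dynamically loaded page this prints False; the links arrive later via JSON.
print('"objURL" in initial HTML:', '"objURL"' in html)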


Looking at the JSON request URL in the browser's network panel:

Parameter breakdown:
word: %E6%80%A7%E6%84%9F%E5%A7%90%E5%A7%90
queryWord: %E6%80%A7%E6%84%9F%E5%A7%90%E5%A7%90

These two parameters carry the same value; there is no real difference between them, both are the search keyword. The value that follows is its URL-encoded form.

pn is the offset of the picture to start fetching from. As you scroll down, 30 pictures are loaded at a time by default.

The highlighted value is the URL-encoded keyword word, which can be decoded with urllib.parse.unquote(). The other thing to note is the step size of each refresh: the next pn is the previous pn plus rn, as the small sketch below shows.
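The encoding/decoding and the pn stepping look roughly like this (a sketch; BASE_URL is a placeholder for the withheld request address):

import urllib.parse
import itertools

word = urllib.parse.quote("清纯")       # keyword -> the percent-encoded form used by word/queryWord
print(word)                             # %E6%B8%85%E7%BA%AF
print(urllib.parse.unquote(word))       # decodes back to 清纯

# pn advances by the page size each refresh (rn=30 in the request seen above).
BASE_URL = "https://example.com/acjson?word={word}&pn={pn}&rn=30"  # placeholder
urls = (BASE_URL.format(word=word, pn=pn) for pn in itertools.count(start=0, step=30))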

Here you only need to remember that the image we want is linked in the objURL field. You can pull every objURL out of this JSON response with the re module's compile and findall. Once you have those objURLs, we are not far from successfully crawling images from a certain "du" site. PS: because the objURL values are obfuscated, the decoding step is used in the code below but not explained in detail.
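In isolation, the extraction step is just this (a tiny sketch on a made-up response fragment):

import re

# A made-up fragment in the same shape as the real JSON response.
fragment = '{"objURL":"ippr_z2C$qAzdH3Fwkv...","fromURL":"..."},{"objURL":"ippr_z2C$q..."}'

pattern = re.compile(r'"objURL":"(.*?)"')
print(pattern.findall(fragment))   # every (still obfuscated) objURL in the response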

Without further ado, here is the source code of the crawler for a certain "du" site:


import json
import itertools
import urllib
import requests
import os
import re
import sys

word = input("请输入图片关键字:")
path = "./image"  # local storage directory
if not os.path.exists(path):
    os.mkdir(path)
word = urllib.parse.quote(word)
print('正在抓取图片...')
url = ""  # URL omitted for copyright reasons; ask in the comments for the full code
urls = (url.format(word=word, pn=x) for x in itertools.count(start=0, step=60))
index = 0
str_table = {
    # substring replacements used to de-obfuscate objURL
    '_z2C$q': ':',
    '_z&e3B': '.',
    'AzdH3F': '/'
}
print('请求中...')
char_table = {
    # single-character substitutions used to de-obfuscate objURL
    'w': 'a',
    'k': 'b',
    'v': 'c',
    '1': 'd',
    'j': 'e',
    'u': 'f',
    '2': 'g',
    'i': 'h',
    't': 'i',
    '3': 'j',
    'h': 'k',
    's': 'l',
    '4': 'm',
    'g': 'n',
    '5': 'o',
    'r': 'p',
    'q': 'q',
    '6': 'r',
    'f': 's',
    'p': 't',
    '7': 'u',
    'e': 'v',
    'o': 'w',
    '8': '1',
    'd': '2',
    'n': '3',
    '9': '4',
    'c': '5',
    'm': '6',
    '0': '7',
    'b': '8',
    'l': '9',
    'a': '0'
}
i = 0  # counter used to name the downloaded files
# str.translate() needs a mapping of ordinals, so convert the table once up front
char_table = {ord(key): ord(value) for key, value in char_table.items()}

print('准备开始,请稍后...')
for url in urls:
    html = requests.get(url, timeout=10).text
    a = re.compile(r'"objURL":"(.*?)"')
    downURL = re.findall(a, html)

    for t in downURL:
        # de-obfuscate the objURL: first the substring table, then the character table
        for key, value in str_table.items():
            t = t.replace(key, value)
        t = t.translate(char_table)
        try:
            html_1 = requests.get(t, timeout=10)
            if str(html_1.status_code)[0] == "4":
                print('失败1')
                continue
        except Exception as e:
            print('失败2')
            continue
        with open(path + "/" + 'img_' + str(i) + ".png", 'wb') as f:
            f.write(html_1.content)
        i = i + 1
        print('正在爬取第 ' + str(i) + ' 张图片...')

Run it directly and the images start downloading into ./image.

Two less polished versions are attached below. One crawls slowly and the other is error-prone; feel free to optimize them.

1. The slow version

import time
import requests
import urllib

page = input("请输入要爬取多少页:")
page = int(page) + 1  # +1 so that range(1, page) below runs exactly the requested number of pages
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
n = 0   # numeric prefix for the image files, e.g. 0.png
pn = 1  # pn is the offset to fetch from; 30 images are loaded per scroll by default

storage = "D:\\Python_demo\\crawler_image\\image"  # local storage path (only used by the commented-out urlretrieve line)
img_name = "\\清纯小姐姐_"  # image file name prefix
for m in range(1, page):
    url = ''  # URL omitted for copyright reasons; ask in the comments for the full code
    param = {
        'tn': 'resultjson_com',
        'logid': '8846269338939606587',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': '清纯',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': '清纯',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': 'girl',
        'pn': pn,
        'rn': '30',
        'gsm': '1e',
    }
    page_info = requests.get(url=url, headers=header, params=param)
    page_info.encoding = 'utf-8'  # make sure the response is decoded as utf-8
    page_info = page_info.json()  # parse the JSON so the values can be read from the dict
    info_list = page_info['data']  # the image URLs we need live under 'data'
    del info_list[-1]  # each page returns 30 images (indices 0-29); drop the trailing element
    img_path_list = []
    for i in info_list:
        img_path_list.append(i['thumbURL'])
        n = n + 1
        print('第 ' + str(n) + ' 张图片')
        # NOTE: this inner loop re-downloads the whole list on every pass and keeps
        # overwriting the same file name, which is why this version is so slow
        for index in range(len(img_path_list)):
            # print('图片url地址:' + img_path_list[index])  # download address of every image
            time.sleep(0.0001)
            # urllib.request.urlretrieve(img_path_list[index], storage + img_name + str(n) + '.jpg')
            image = requests.get(img_path_list[index]).content
            imagenname = '图片' + str(n) + '.png'
            with open('./image/%s' % imagenname.split("&")[0], 'wb') as file:
                file.write(image)
    pn += 29
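One way to speed this version up (a sketch, keeping everything else in the script unchanged): inside the for m loop, download each thumbURL exactly once as it is encountered instead of re-walking img_path_list on every pass.

    # replaces the nested loop above; the rest of the script stays the same
    for i in info_list:
        n = n + 1
        print('第 ' + str(n) + ' 张图片')
        image = requests.get(i['thumbURL'], timeout=10).content  # fetch each image exactly once
        with open('./image/图片' + str(n) + '.png', 'wb') as file:
            file.write(image)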

2. The error-prone version


import requests
import re

# target URL

url = ''   # URL omitted for copyright reasons; ask in the comments for the full code
form_header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
    "Host": "",  # host omitted for copyright reasons; ask in the comments for the full code
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
}
res = requests.get(url, headers=form_header).text
print(res)
image_urls = re.findall('"objURL":"(.*?)",', res)
# print(image_urls)
n = 0  # numeric prefix for the image files, e.g. 0.png
for image_url in image_urls:
    print('图片url地址:' + image_url)

    # image name
    n = n + 1
    image_name = str(n)
    print('第 ' + image_name + ' 张图片')
    # keep the extension from the URL if it has a recognised one, otherwise default to .png
    image_end = re.search(r'\.(jpg|png|jpeg|gif|webp|bmp)$', image_url)
    if image_end is None:
        image_name = '清纯壁纸_' + image_name + '.png'
    else:
        image_name = '清纯壁纸_' + image_name + image_end.group(0)

    # save
    image = requests.get(image_url).content
    with open('./image/%s' % image_name.split("&")[0], 'wb') as file:
        file.write(image)
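Why it is error-prone: the download loop has no timeout and no exception handling, and some objURL values point at dead hosts, so one bad link kills the whole run. A more defensive loop body could look like this (a sketch, reusing image_urls and n from the script above):

for image_url in image_urls:
    n = n + 1
    try:
        resp = requests.get(image_url, timeout=10)  # don't hang forever on a dead host
        resp.raise_for_status()                     # skip 4xx/5xx responses
    except Exception as e:
        print('跳过第 ' + str(n) + ' 张: ' + str(e))
        continue
    with open('./image/清纯壁纸_' + str(n) + '.png', 'wb') as file:
        file.write(resp.content)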

The following two crawlers are ones Xiaocutie originally wrote himself, for fengniao.com and huitu.com.

huitu.com crawler:

import re
import requests
import os
import urllib
header = {
    'content-type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
}
url=""    # 版权原因 url暂时不放 评论要完整代码
word=input("请输入关键字:")
word=urllib.parse.quote(word)
urls=[str(url).format(word=word,num=x)for  x in  range(1,2)]
i=1

for url in urls:
    print(url)
    html=requests.get(url).text

    print(html)
    r=re.compile(r'"imgUrl":"(.*?)"')
    u=re.findall(r,html)

    for s in u:

        htmls=requests.get(s)
        with open("F:\\im\\"+str(i)+".jpg",'wb')as f:

            f.write(htmls.content)
            i=i+1

fengniao.com crawler:

import re
import requests
import os
import itertools
url=""    # 版权原因 url暂时不放 评论要完整代码
i=1
path="F:\\fengniao"
if not os.path.exists(path):
    os.mkdir(path)
urls = [url.format(num=x) for x in range(1,100)]
for url in urls:

    html = requests.get(url).text
    r=re.compile(r'"image":"(.*?)"')
    u=re.findall(r, html)
    for downurl in u:
        downurl=str(downurl).replace("\\","").split("?")[0]
        htmls=requests.get(downurl)


        with open(path+"\\"+str(i)+".jpg",'wb') as f:
            f.write(htmls.content)
            i=i+1

        print(downurl)
