Web crawler in practice: if it weren't for learning, who would scrape street photos? Analyzing Ajax requests to crawl Toutiao (Today's Headlines) street-photography images (Python)

For some web pages, the HTML we get back from a request does not contain the content we see in the browser,
because part of the information is loaded through Ajax and rendered by JavaScript.

1. Target site analysis

Toutiao street photography (search keyword: 街拍)

Inspect the Ajax requests
In the browser developer tools, open the Network tab, check "Preserve log" and then filter by XHR; the data request is the entry on the left whose URL starts with the aid parameter

The title can be found under the data

As we keep scrolling down the page, we find that new requests are sent whose offset changes to 20, 40, and 60, as shown.
So we can get different data by changing the value of offset.

By observing the data, it is found that the data is json data.

Actual combat

1. Grab the content of the index page.
1. Look at the URL. The
part highlighted in blue is the base URL, and the rest are the query parameters.

These parameters are shown in the figure below.

When the page is scrolled down, only the offset parameter changes, increasing by 20 each time

Get the html code as follows

from urllib.parse import urlencode
import requests
from requests.exceptions import ConnectionError

def get_page(offest,keyword):
    """Request one page of Toutiao search results and return the response text.

    Args:
        offest: Value sent as the ``offset`` query parameter. The misspelled
            parameter name is kept so existing callers keep working.
        keyword: Value sent as the ``keyword`` query parameter
            (the article uses '街拍', i.e. street photography).

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offest,  # paging offset
        'format': 'json',
        'keyword': keyword,  # search keyword
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): timestamp and _signature are hard-coded; presumably
        # copied from a browser session and may stop being accepted - confirm.
        'timestamp': '1612660795006',
        '_signature': '_02B4Z6wo00f01bVt4zgAAIDCfdEqJspzHQm1SeeAAA1FfgsJs85FLGn5fddPtscCGmt-RCmotIguRxATrRA1jejsf0LuGWhNYZDSWZIqUdhBN1ivlGKkDtAdcHKqYiKRWjlQZt4s9AU2aI2d0c',
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }
    base_url = 'https://www.toutiao.com/api/search/content/?'
    url = base_url + urlencode(params)
    try:
        # Without a timeout a stalled server would block the crawler forever.
        response = requests.get(url, headers=headers, timeout=10)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Timeout is listed explicitly because a read timeout is not a
        # ConnectionError in requests.
        print('程序错误')
        return None
    if response.status_code == 200:
        return response.text
    # Make the "no usable page" result explicit instead of falling off the end.
    return None

def main():
    """Fetch the first result page for the keyword '街拍' and print the raw response."""
    # The URL is built inside get_page, so no base_url is needed here.
    html = get_page(0, '街拍')
    print(html)


if __name__ == '__main__':
    main()
You can find that there are many hyperlinks in the results

2. Data analysis
Go back to the browser and check the response of the returned result. The data format is json format.

Find data in Preview

Expand data: the entries 0, 1, 2, ... are all street-photography posts

Expand entry 0:
the picture URLs are in image_list

the picture name is in title

# Data parsing
import json
def parse_page_index(html):
    """Yield one ``{'image': url, 'title': title}`` dict per image in a search response.

    Args:
        html: JSON text of the search response. ``None`` or an empty string
            (what a failed page fetch produces) yields nothing.

    Yields:
        A dict with the image URL under ``'image'`` and the title of the
        entry the image belongs to under ``'title'``.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
    """
    if not html:
        return
    data = json.loads(html)
    if not isinstance(data, dict):
        return
    # 'data' may be missing or null; treat both as "no entries".
    for item in data.get('data') or []:
        title = item.get('title')
        # 'image_list' may be missing, null or empty; skip such entries.
        for image in item.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title,
            }


See the example below for how yield is used. A function
that uses yield returns an iterable object (a generator)

def getnum(n):
    """Generate 0, 1, 2, ... for as long as the value does not exceed ``n``."""
    current = 0
    while True:
        if not current <= n:
            return
        yield current
        current += 1


# Calling the generator function only creates a generator object.
a = getnum(5)
print(a)
# Iterating over the generator is what actually produces the values.
for i in a:
    print(i)

3. Picture save

import os
from hashlib import md5
def save_image(item):
    """Download one image and store it in a folder named after its title.

    Args:
        item: Mapping with the image URL under ``'image'`` and the title
            under ``'title'``; the title is used as the directory name.

    The file name is the MD5 hex digest of the downloaded bytes plus a
    ``.jpg`` extension, so identical content maps to the same path and is
    not written twice. Nothing is returned; progress is printed.
    """
    # Titles come from remote data: replace characters that are not allowed
    # in file names and fall back to a fixed name when the title is missing.
    unsafe = '\\/:*?"<>|'
    title = str(item.get('title') or 'untitled')
    folder = title.translate(str.maketrans(unsafe, '_' * len(unsafe)))
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)
    try:
        # Without a timeout a stalled download would block the crawler forever.
        response = requests.get(item['image'], timeout=10)
    except (requests.ConnectionError, requests.Timeout):
        print('图片保存失败')
        return
    if response.status_code != 200:
        return
    # md5() builds a hash object; hexdigest() returns its hexadecimal string.
    file_path = '{0}/{1}.{2}'.format(folder, md5(response.content).hexdigest(), 'jpg')
    if os.path.exists(file_path):
        print('图片已经下载', file_path)
        return
    with open(file_path, 'wb') as f:
        f.write(response.content)
    print('图片保存路径是: ', file_path)

md5(response.content).hexdigest(): MD5 is a digest (hash) algorithm that turns the content into a fixed-length value. When saving, each file's name is generated automatically from the hash of its content, so identical images always get the same file name.
Example

from hashlib import md5

# Hash constructors listed for this example (only MD5).
hash_functions = [md5]


def get_hash_code(s):
    """Return ``[(algorithm name, hex digest, digest length)]`` for the MD5 of ``s``.

    ``s`` is passed straight to ``md5``; the example below calls this with
    UTF-8 encoded bytes.
    """
    digest = md5(s)
    hex_text = digest.hexdigest()
    return [(digest.name, hex_text, len(hex_text))]


if __name__ == '__main__':
    # Hash the text "123" after encoding it to bytes, then show the result.
    s = "123"
    result = get_hash_code(s.encode("utf-8"))
    print(result)


Total code

from urllib.parse import urlencode
import requests
from requests.exceptions import ConnectionError
import json
def get_page(offest,keyword):
    """Request one page of Toutiao search results and return the response text.

    Args:
        offest: Value sent as the ``offset`` query parameter. The misspelled
            parameter name is kept so existing callers keep working.
        keyword: Value sent as the ``keyword`` query parameter
            (the article uses '街拍', i.e. street photography).

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offest,  # paging offset
        'format': 'json',
        'keyword': keyword,  # search keyword
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): timestamp and _signature are hard-coded; presumably
        # copied from a browser session and may stop being accepted - confirm.
        'timestamp': '1612660795006',
        '_signature': '_02B4Z6wo00f01bVt4zgAAIDCfdEqJspzHQm1SeeAAA1FfgsJs85FLGn5fddPtscCGmt-RCmotIguRxATrRA1jejsf0LuGWhNYZDSWZIqUdhBN1ivlGKkDtAdcHKqYiKRWjlQZt4s9AU2aI2d0c',
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }
    base_url = 'https://www.toutiao.com/api/search/content/?'
    url = base_url + urlencode(params)
    try:
        # Without a timeout a stalled server would block the crawler forever.
        response = requests.get(url, headers=headers, timeout=10)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Timeout is listed explicitly because a read timeout is not a
        # ConnectionError in requests.
        print('程序错误')
        return None
    if response.status_code == 200:
        return response.text
    # Make the "no usable page" result explicit instead of falling off the end.
    return None
# Data parsing
import json
def parse_page_index(html):
    """Yield one ``{'image': url, 'title': title}`` dict per image in a search response.

    Args:
        html: JSON text of the search response. ``None`` or an empty string
            (what a failed page fetch produces) yields nothing.

    Yields:
        A dict with the image URL under ``'image'`` and the title of the
        entry the image belongs to under ``'title'``.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
    """
    if not html:
        return
    data = json.loads(html)
    if not isinstance(data, dict):
        return
    # 'data' may be missing or null; treat both as "no entries".
    for item in data.get('data') or []:
        title = item.get('title')
        # 'image_list' may be missing, null or empty; skip such entries.
        for image in item.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title,
            }


import os
from hashlib import md5
def save_image(item):
    """Download one image and store it in a folder named after its title.

    Args:
        item: Mapping with the image URL under ``'image'`` and the title
            under ``'title'``; the title is used as the directory name.

    The file name is the MD5 hex digest of the downloaded bytes plus a
    ``.jpg`` extension, so identical content maps to the same path and is
    not written twice. Nothing is returned; progress is printed.
    """
    # Titles come from remote data: replace characters that are not allowed
    # in file names and fall back to a fixed name when the title is missing.
    unsafe = '\\/:*?"<>|'
    title = str(item.get('title') or 'untitled')
    folder = title.translate(str.maketrans(unsafe, '_' * len(unsafe)))
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)
    try:
        # Without a timeout a stalled download would block the crawler forever.
        response = requests.get(item['image'], timeout=10)
    except (requests.ConnectionError, requests.Timeout):
        print('图片保存失败')
        return
    if response.status_code != 200:
        return
    # md5() builds a hash object; hexdigest() returns its hexadecimal string.
    file_path = '{0}/{1}.{2}'.format(folder, md5(response.content).hexdigest(), 'jpg')
    if os.path.exists(file_path):
        print('图片已经下载', file_path)
        return
    with open(file_path, 'wb') as f:
        f.write(response.content)
    print('图片保存路径是: ', file_path)



def main():
    """Crawl the result pages at offsets 0, 20 and 40 for '街拍' and save every image found."""
    for offset in range(0, 60, 20):
        html = get_page(offset, '街拍')
        if html is None:
            # get_page returns None when the request fails or the status is
            # not 200; parsing None would raise, so skip this page.
            continue
        for item in parse_page_index(html):
            save_image(item)


if __name__ == '__main__':
    main()
Result: open the output directory and you will see one folder per title

That creates too many folders, so we modify the code to save everything into a single folder

Result

folder

Insert picture description here
Author: Electrical - Yudeng Wu.
The codeword is not easy, please click like before leaving.

Guess you like

Origin blog.csdn.net/kobeyu652453/article/details/113711839