# Crawl images from Toutiao (今日头条) search results; the number of result pages to fetch can be specified.

import requests,re
import json
from urllib import request
import os

# Search API URL template; the `{}` placeholder takes the result offset.
# The keyword parameter is the percent-encoded search term.
url = (
    'https://www.toutiao.com/search_content/'
    '?offset={}&format=json'
    '&keyword=%E8%A1%97%E6%8B%8D'
    '&autoload=true&count=20&cur_tab=1&from=search_tab'
)

# Request headers sent with every HTTP call: a desktop Chrome user-agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.62 Safari/537.36'
    ),
}
# Main crawl loop.
#
# For each search-result page: fetch the JSON listing, visit every article
# that has an `article_url`, extract the gallery JSON embedded in the article
# HTML, and save each gallery image into the download directory.
#
# Any single failure (network error, non-JSON reply, missing field) skips
# only the affected page / article / image instead of aborting the whole run.

page_count = 3        # number of search-result pages to fetch
page_size = 20        # results per page; matches `count=20` in the query
request_timeout = 10  # seconds; without a timeout a stalled server blocks forever
download_dir = 'download'

# The article page embeds its gallery as `gallery: JSON.parse(<JSON string>),`.
# Compiled once here instead of once per article.
gallery_pattern = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Create the target directory once; exist_ok avoids the exists()/mkdir() race.
os.makedirs(download_dir, exist_ok=True)

for i in range(page_count):
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(page_size * i)
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        res = response.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a body that is not valid JSON.
        print('skipping search page {}: {}'.format(i, err))
        continue

    # `data` may be absent or null when the page has no results.
    data_list = res.get('data') or []

    for data_item in data_list:
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        try:
            article_response = requests.get(
                article_url, headers=headers, timeout=request_timeout)
            article_response.raise_for_status()
        except requests.RequestException as err:
            print('skipping article {}: {}'.format(article_url, err))
            continue

        match = gallery_pattern.search(article_response.text)
        if not match:
            # Not a gallery article.
            continue

        try:
            # The captured text is a JSON string literal whose content is
            # itself JSON, hence the two decoding passes.
            gallery = json.loads(json.loads(match.group(1)))
        except (ValueError, TypeError):
            continue
        if not isinstance(gallery, dict):
            continue

        for image in gallery.get('sub_images') or []:
            image_url = image.get('url')
            if not image_url:
                continue
            print(image_url)
            filename = os.path.join(
                download_dir, image_url.split('/')[-1] + '.jpg')

            # Download the image with the same headers as the page requests;
            # fetch fully before opening the file so a failed request does
            # not leave an empty or partial file behind.
            try:
                image_response = requests.get(
                    image_url, headers=headers, timeout=request_timeout)
                image_response.raise_for_status()
            except requests.RequestException as err:
                print('skipping image {}: {}'.format(image_url, err))
                continue
            with open(filename, 'wb') as image_file:
                image_file.write(image_response.content)

# (Blog page residue: "You may also like" section heading.)
#
# Reposted from blog.csdn.net/weixin_42958164/article/details/81750196