# Toutiao (今日头条) images: download the gallery pictures of "街拍" (street snap) search results.

import requests,json,re,os
from urllib import request

# HTTP headers passed to each requests.get() call in this script: a desktop
# browser user-agent plus a cookie string captured from a browser session.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/53.0.2785.104 Safari/537.36 "
        "Core/1.53.4620.400 QQBrowser/9.7.13014.400"
    ),
    # The cookie is assembled from its individual "name=value" pairs so each
    # one can be read (and refreshed) on its own line.
    "cookie": "; ".join((
        "tt_webid=6590204151702865412",
        "tt_webid=6590204151702865412",
        "WEATHER_CITY=%E5%8C%97%E4%BA%AC",
        "UM_distinctid=16541746a22ab-0980602f6a1886-34497b51-100200-16541746a23108",
        "__tasessionId=qtlh48pt41534422027645",
        "CNZZDATA1259612802=1044114658-1534399527-https%253A%252F%252Fwww.baidu.com%252F%7C1534421128",
        "csrftoken=d2b46370bc8e2e2b6ba59384b3292811",
        "tt_webid=6590204151702865412",
    )),
}
# Build the proxy mapping: plain-HTTP and HTTPS traffic both go through the
# same proxy endpoint.
# NOTE(review): the "[email protected]" part of the URL looks like e-mail
# obfuscation residue from the page this script was copied from -- confirm the
# real "user:password@host" value before running.
proxy = dict.fromkeys(
    ('http', 'https'),
    'http://alice:[email protected]:6666',
)

# Crawl three pages of search results (offset 0, 20, 40), open every result
# that has an article URL, extract the gallery JSON embedded in the detail
# page, and download each gallery image into download/<article title>/.

# Seconds to wait on the search/detail requests, so a stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Characters that are not usable inside a single directory name (path
# separators, Windows-reserved punctuation, control whitespace).
_UNSAFE_DIR_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]+')

# Captures the argument of the `gallery: JSON.parse(...)` call found in the
# detail page source. Compiled once instead of on every article.
_GALLERY_PATTERN = re.compile(r"gallery: JSON\.parse\((.*)\),")

for offset in range(0, 60, 20):
    search_url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(offset)
    response = requests.get(
        search_url, headers=headers, proxies=proxy, timeout=REQUEST_TIMEOUT
    )
    # response.json() decodes the body straight into a Python object (dict).
    index_dict = response.json()

    # Guard against a missing or null 'data' field instead of crashing.
    index_list = index_dict.get('data') or []
    for item in index_list:
        if 'article_url' not in item:
            print('报错了, 不应该来我这')
            continue

        article_url = item['article_url']
        title = item['title']

        # Fetch the detail page; response.text is the decoded body (str).
        detail = requests.get(
            article_url, headers=headers, proxies=proxy, timeout=REQUEST_TIMEOUT
        )
        match_res = _GALLERY_PATTERN.search(detail.text)
        if not match_res:
            continue
        json_origin = match_res.group(1)
        if not json_origin:
            continue

        # The captured text is a JSON string literal whose content is itself
        # JSON: the first loads() yields a str, the second yields the dict.
        gallery_text = json.loads(json_origin)
        res_dict = json.loads(gallery_text)
        # List of image descriptors, each carrying a 'url' key.
        sub_images_list = res_dict['sub_images']

        # Titles are free text; replace characters that would break the path
        # so a '/' or '?' in a title cannot make mkdir/urlretrieve fail.
        safe_title = _UNSAFE_DIR_CHARS.sub('_', title).strip() or 'untitled'
        album_dir = os.path.join('download', safe_title)

        # Create the album folder on first sight. makedirs also creates the
        # parent 'download' directory, which plain mkdir would not.
        if not os.path.exists(album_dir):
            os.makedirs(album_dir, exist_ok=True)
            print(title)

        for image in sub_images_list:
            image_url = image['url']
            # File name: last URL path segment plus a '.jpg' suffix.
            filename = os.path.join(album_dir, image_url.split('/')[-1] + '.jpg')
            # The `headers` and `proxy` dicts are not passed to urlretrieve;
            # it is called with only the URL and the target path.
            request.urlretrieve(image_url, filename)
# (Web-page residue: "猜你喜欢" / "You may also like" -- not part of the script.)