# Python: 今日头条微信头像图集的爬取 (crawling Toutiao "WeChat avatar" image galleries)

# 版权声明: 本文为博主原创文章, 未经博主允许不得转载。 https://blog.csdn.net/Yk_0311/article/details/82556312
# 爬取今日头条微信头像图集
# 网页地址:https://www.toutiao.com/search/?keyword=%E5%BE%AE%E4%BF%A1%E5%A4%B4%E5%83%8F
import requests
import re
import os

def get_json(offset):
    """Fetch one page of Toutiao gallery search results as parsed JSON.

    Parameters
    ----------
    offset : int
        Paging offset passed to the search API (20 results per page).

    Returns
    -------
    dict or None
        The decoded JSON payload, or None when the request fails or the
        body is not valid JSON (caller should skip this page).
    """
    url = 'https://www.toutiao.com/search_content/?'
    hd = {'User-Agent': 'Mozilla/5.0'}
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '微信头像',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }
    try:
        # timeout so a stalled connection cannot hang the crawl forever
        r = requests.get(url, headers=hd, params=params, timeout=10)
        r.raise_for_status()
        # NOTE: .json() decodes from the response bytes itself, so the
        # original `r.encoding = r.apparent_encoding` line was a no-op here.
        return r.json()
    except (requests.RequestException, ValueError) as e:
        # Narrow except replacing the original bare `except` + print('11111111');
        # return None explicitly so the caller can skip the page.
        print('get_json failed for offset %s: %s' % (offset, e))
        return None


def get_article_url(json):
    """Yield the ``article_url`` of every entry in a search-result payload.

    Parameters
    ----------
    json : dict or None
        Payload returned by ``get_json``.  ``None`` (a failed request) or
        a payload without a ``data`` list yields nothing instead of
        raising, which the original code did not tolerate.

    Yields
    ------
    str or None
        The ``article_url`` field of each non-null entry (None when an
        entry lacks the key, mirroring ``dict.get``).
    """
    if not json:
        # failed request upstream: nothing to yield
        return
    for item in json.get('data') or []:
        if item is None:  # `is None`, not `== None` (PEP 8)
            continue
        yield item.get('article_url')


def get_HTML(article_url):
    """Download an article page and return its decoded HTML text.

    Parameters
    ----------
    article_url : str
        Full URL of the article page to fetch.

    Returns
    -------
    str or None
        The page source, or None when the request fails (caller skips
        the page).
    """
    hd = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(article_url, headers=hd, timeout=10)
        r.raise_for_status()
        # pages are Chinese; let requests guess the charset from the body
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Narrow except replacing the original bare `except` + print('222222').
        print('get_HTML failed for %s: %s' % (article_url, e))
        return None


def get_image_url(html, ulist):
    """Extract unique image URLs from an article page into ``ulist``.

    The page embeds its gallery as ``gallery: JSON.parse("...")``; the
    escaped JSON is unescaped and every quoted http URL is pulled out
    with a regex.

    Fixes over the original:
    * no ``eval()`` on scraped web content (arbitrary-code-execution
      hazard) — the surrounding quotes are simply sliced off;
    * real deduplication: the original compared only the last 5 chars of
      the most recently appended URL, so it missed non-adjacent
      duplicates and wrongly dropped distinct URLs sharing a suffix;
    * tolerates ``html=None`` (failed download) and a page with no
      gallery, instead of hiding every error with ``except: pass``.

    Parameters
    ----------
    html : str or None
        Page source from ``get_HTML``; None is a no-op.
    ulist : list of str
        Output accumulator, mutated in place; existing entries are also
        used for deduplication across articles.
    """
    if not html:
        return
    match = re.search(r'gallery: JSON.parse\(\"(.*?)\"\),\n', html, re.S)
    if not match:
        return  # page layout changed or no gallery on this page
    payload = re.sub(r'\\', '', match.group(1))  # strip the JSON escaping
    seen = set(ulist)
    for quoted in re.findall(r'"http.*?"', payload):
        url = quoted[1:-1]  # drop the surrounding double quotes
        if url not in seen:
            seen.add(url)
            ulist.append(url)


def saveimages(ulist):
    """Download every image URL in ``ulist`` into the pictures directory.

    File names come from the last path segment of each URL plus ``.jpg``;
    files that already exist are skipped.  Unlike the original (whose
    bare ``except`` around the whole loop aborted the batch on the first
    failure), a failure on one image is logged and the rest are still
    attempted.

    Parameters
    ----------
    ulist : list of str
        Image URLs collected by ``get_image_url``.
    """
    root = 'D://IDE\Pycharm//《网络爬虫实战开发》//Ajax结果提取//pictures'
    hd = {'User-Agent': 'Mozilla/5.0'}
    if not os.path.exists(root):
        # makedirs (vs mkdir) also creates missing intermediate dirs
        os.makedirs(root)
    for imageurl in ulist:
        # os.path.join instead of manual '//' concatenation
        path = os.path.join(root, imageurl.split('/')[-1] + '.jpg')
        if os.path.exists(path):
            continue  # already downloaded
        try:
            r = requests.get(imageurl, headers=hd, timeout=10)
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)
        except (requests.RequestException, OSError) as e:
            print('failed to save %s: %s' % (imageurl, e))

def main():
    """Crawl the gallery search results and save every image found.

    Pulls the JSON listing page by page, follows each article link,
    harvests the (deduplicated) image URLs, then downloads them all.
    """
    collected = []
    # range(0, 20, 20) yields only offset 0 — a single results page
    for offset in range(0, 20, 20):
        payload = get_json(offset)
        for article_url in get_article_url(payload):
            page = get_HTML(article_url)
            get_image_url(page, collected)
    saveimages(collected)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

# 猜你喜欢 (blog footer residue: "you may also like")
#
# 转载自 blog.csdn.net/Yk_0311/article/details/82556312