抓取今日头条的街拍美女图片

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u014535666/article/details/83348813
由于今日头条的反爬机制有所更新,这里给出调整后的抓取代码:利用多进程(进程池)抓取,将图片保存在文件夹中,将路径保存在 MongoDB 中。
import codecs
import pymongo
import requests
import json
import re
import os
from hashlib import md5
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from requests import RequestException
from confug import *
from multiprocessing import Pool

# Module-level MongoDB handles shared by the functions below.
# MONGO_URL and MONGO_DB are not defined in this file; presumably they are
# supplied by the star import of `confug` -- verify against that module.
client=pymongo.MongoClient(MONGO_URL)
db=client[MONGO_DB]

def get_page_index(offset,keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers HTTP 200,
        otherwise ``None`` (including when the request itself fails).
    """
    # Use the caller's arguments: they used to be hard-coded to 0 and '街拍',
    # so every offset handed to main() fetched the same first page.
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # A timeout keeps a stalled connection from blocking the worker forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        # Same best-effort policy as the other fetchers in this file.
        print('请求索引页出错')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(html):
    """Yield article URLs found in a search-index JSON payload.

    Args:
        html: JSON text of the index response, or ``None``/empty when the
            fetch failed.

    Yields:
        Each non-empty ``article_url`` value from the entries under the
        top-level ``data`` key. Nothing is yielded when the payload is
        missing, is not valid JSON, is not a JSON object, or has no usable
        ``data`` list.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        print('解析索引页出错')
        return
    if not isinstance(data, dict):
        return
    # `data` may be present but null, hence the `or []`.
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article_url would otherwise yield None.
        if article_url:
            yield article_url

def get_data_detail(url):
    """Fetch an article detail page using a desktop browser User-Agent.

    Args:
        url: Address of the article page.

    Returns:
        The page body as text when the response status is 200; ``None`` for
        any other status or when ``requests`` raises a ``RequestException``.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    try:
        page = requests.get(url, headers=browser_headers)
        return page.text if page.status_code == 200 else None
    except RequestException:
        print('请求详情页出错')
        return None

def parsee_page_detail(html,url):
    """Extract the title and gallery image URLs from an article page.

    The image list is read from the escaped JSON string embedded in the page
    as ``gallery: JSON.parse("...")``. Every image URL found is passed to
    ``donwload_image``.

    Args:
        html: Article page markup as text; may be ``None``/empty when the
            fetch failed.
        url: Address the page came from; echoed back in the result.

    Returns:
        A dict with keys ``'title'``, ``'url'`` and ``'images'``, or ``None``
        when there is no markup, no gallery payload, or the payload cannot
        be decoded.
    """
    if not html:
        return None
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (`\(`), which newer Python versions warn about.
    image_pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\)', re.S)
    result = image_pattern.search(html)
    if result is None:
        return None
    try:
        # The captured text is a backslash-escaped JSON string; unescape it
        # before parsing.
        data_str = codecs.getdecoder('unicode_escape')(result.group(1))[0]
        data_json = json.loads(data_str)
    except ValueError:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        print('解析图集数据出错')
        return None
    if not isinstance(data_json, dict):
        return None

    # `sub_images` may be absent or null; skip entries without a URL.
    sub_images = data_json.get('sub_images') or []
    images = [item.get('url') for item in sub_images if item.get('url')]

    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Fall back to an empty title instead of raising IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    for image in images:
        donwload_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def save_to_mongo(result):
    """Insert one scraped record into the MONGO_TABLE collection.

    Args:
        result: Document (dict) to store.

    Returns:
        ``True`` when the insert produced an id, ``False`` otherwise.
    """
    # Collection.insert() is deprecated in PyMongo 3.x and removed in 4.x;
    # insert_one() is the supported single-document call.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('成功存储')
        return True
    return False

def donwload_image(url):
    """Download one image and pass its bytes to ``save_image``.

    Args:
        url: Address of the image.

    Returns:
        The response body decoded as text when the status is 200, otherwise
        ``None`` (including when the request raises ``RequestException``).
    """
    print('正在下载',url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    try:
        # A timeout keeps one stalled image from blocking the whole worker.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            # response.content is the raw binary body.
            save_image(response.content)
            # NOTE(review): decoding image bytes as text is of no use to the
            # in-file caller, which ignores the result; kept only so the
            # return type stays unchanged -- consider returning None.
            return response.text
    except RequestException:
        # The message used to say "detail page" (copy-paste from
        # get_data_detail); this is the image download path.
        print('请求图片出错')
        return None
    return None
def save_image(content, directory='G:\\pic'):
    """Write image bytes to ``<directory>/<md5 of content>.jpg``.

    The MD5 digest of the content is used as the file name so that identical
    images are stored only once.

    Args:
        content: Raw image bytes.
        directory: Folder to store the image in; created if missing.
            Defaults to the location the script originally hard-coded.
    """
    os.makedirs(directory, exist_ok=True)
    file_name = os.path.join(
        directory, '{0}.{1}'.format(md5(content).hexdigest(), 'jpg'))
    try:
        # 'x' mode fails if the file already exists, which makes the
        # "skip duplicates" check atomic across the pool's worker processes;
        # the context manager closes the file even if the write raises.
        with open(file_name, 'xb') as image_file:
            image_file.write(content)
    except FileExistsError:
        # Same content was already saved; nothing to do.
        pass

def main(offset):
    """Scrape one index page and store every gallery found on it.

    Fetches the index page at ``offset`` for KEYWORD, then for each article
    URL fetches the detail page, parses it, saves a successful result to
    MongoDB and prints it.

    Args:
        offset: Index-page offset to fetch.
    """
    html = get_page_index(offset, KEYWORD)
    if not html:
        # Index fetch failed; an exception here would abort the whole
        # pool.map() call.
        return
    for url in parse_page_index(html):
        if not url:
            # Index entries without an article URL cannot be fetched.
            continue
        htmll = get_data_detail(url)
        if not htmll:
            continue
        result = parsee_page_detail(htmll, url)
        if result:
            save_to_mongo(result)
        print(result)

if __name__ == '__main__':
    # One offset per index page; the step of 20 matches the 'count' value
    # requested in get_page_index.
    group = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # Create the process pool. map() blocks until every offset has been
    # processed, so the pool teardown on leaving the with-block cannot cut
    # work short, and the worker processes are always released.
    with Pool() as pool:
        pool.map(main, group)


猜你喜欢

转载自blog.csdn.net/u014535666/article/details/83348813