# python3网络爬虫学习 第六章 Ajax数据爬取(爬取今日头条街拍美图)

#有时候使用requests抓取页面时,得到的结果可能和在浏览器中看到的不一样,在浏览器中可以看到的数据,使用requests获取不到。
#这是因为requests获取到的都是原始的HTML文档,而浏览器中的页面则是经过JavaScript处理数据后生成的结果,这些数据的来源有多种,可能是
#通过Ajax加载的,可能包含在HTML文档中,可能是经过JavaScript和特定算法生成的

#1、什么是Ajax?
#Ajax  就是异步的JavaScript和XML。它不是一门编程语言,而是利用JavaScript在保证页面不被刷新,页面链接不改变的情况下与服务器交换数据并更新部分网页内容的技术

#基本原理
#(1)发送请求  (2)解析内容   (3)渲染网页

#6.2 Ajax
# https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
# https://www.toutiao.com/search_content/?offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
# https://www.toutiao.com/search_content/?offset=40&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
import requests
from urllib.parse import urlencode
import json
def get_page(offset):
    """Fetch one page of Toutiao "street photo" search results.

    :param offset: pagination offset (0, 20, 40, ... — the API returns
        20 results per page, see the sample URLs above).
    :return: the decoded JSON response as a dict, or ``None`` when the
        request fails or the server answers with a non-200 status.
    """
    params = {
        'offset':offset,
        "format":"json",
        "keyword":"街拍",
        "autoload":"true",
        "count":"20",
        "cur_tab":"1",
        "from":"search_tab"
    }
    headers = {
        # Properly spaced UA string — the original had every space stripped,
        # which some servers reject as a malformed User-Agent.
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(params)
    try:
        # timeout keeps a dead connection from hanging a pool worker forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
        return None
    except requests.RequestException:
        # RequestException covers ConnectionError *and* Timeout etc.;
        # the original only caught ConnectionError, so a timeout would
        # have propagated and killed the worker.
        return None
# with open("./1.json","w")as f:
#     json.dump(get_page(0),f)
def get_images(json):
    """Yield one ``{"image": url, "title": title}`` dict per picture found
    in a Toutiao search-result page.

    :param json: decoded JSON dict produced by ``get_page()``, or ``None``
        when that request failed.  (The parameter name shadows the stdlib
        ``json`` module but is kept for interface compatibility.)
    :yields: dicts with an absolute image URL ("http:" + protocol-relative
        ``url`` field) and the owning article's title.
    """
    # get_page() returns None on failure; the original crashed here with
    # AttributeError (None.get) in that case — guard against falsy input.
    if not json:
        return
    for item in json.get("data") or []:
        image_list = item.get("image_list")
        if not image_list:
            # entries without pictures (ads, plain articles) are skipped
            continue
        title = item.get("title")
        for picture in image_list:
            yield {
                "image": "http:" + picture.get("url"),
                "title": title
            }
# a = get_images(get_page(0))
# for x in a:
#     print(x)
import os
from hashlib import md5

def save_image(item):
    """Download one image and store it under D:\\Ajxa\\<title>\\<md5>.jpg.

    :param item: dict produced by ``get_images()`` with "image" (absolute
        URL) and "title" keys.  The MD5 of the image bytes is used as the
        filename, which also de-duplicates identical images.
    """
    # NOTE(review): titles may contain characters that are invalid in
    # Windows file names (e.g. '?' or '|') — consider sanitizing.
    path = os.path.join(r"D:\Ajxa", item.get("title") or "untitled")
    # makedirs creates the parent directory too (os.mkdir raised when
    # D:\Ajxa itself was missing) and exist_ok=True is race-free when
    # several pool workers hit the same title concurrently.
    os.makedirs(path, exist_ok=True)
    try:
        response = requests.get(item.get("image"), timeout=10)
        if response.status_code == 200:
            file_path = os.path.join(path, md5(response.content).hexdigest()) + ".jpg"
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print("已经下载过了")
    except requests.RequestException:
        # broader than the original ConnectionError so timeouts are
        # reported instead of crashing the worker
        print("连接失败")
from multiprocessing.pool import Pool

def main(offset):
    """Fetch one result page at *offset* and download every image in it."""
    page = get_page(offset)
    for image_item in get_images(page):
        print(image_item)
        save_image(image_item)
# Offsets 0, 20, ..., 380 — one per result page (20 pages total).  The
# sample URLs above start at offset=0; the original began at group 1 and
# therefore never fetched the first page.
group_start = 0
group_end = 19

if __name__ == "__main__":
    # a process pool lets pages download in parallel across CPU cores
    pool = Pool()
    groups = [x * 20 for x in range(group_start, group_end + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()

# 猜你喜欢
#
# 转载自 blog.csdn.net/luslin/article/details/81747501