今天抽出时间写了一个小爬虫 来爬取今日头条的图片
简要的说下
1. 图片首页是通过 ajax 发起请求得到 json 数据，然后渲染到网页；
2. 每个详情页获取到的网页源代码中包含图片地址，但不能直接通过 img 元素获取，需要用正则提取出来，再生成 json 数据来获取图片地址。
主要就是这两点 明白这两点 基本就完事了
# coding=utf-8
import time
import requests
import urllib.parse
import os
from lxml import etree
import hashlib
import string
import re
import json
class toutiao(object):
    """Crawl image posts from a Toutiao ("今日头条") keyword search.

    The search listing is served as JSON by an AJAX endpoint (built in
    ``get_url``); each detail page embeds its image URLs inside inline
    JavaScript, which ``get_detail`` extracts with regular expressions
    before handing individual URLs to ``save_pic``.
    """

    def __init__(self):
        self.header = {
            "content-type": "application/x-www-form-urlencoded",
            "referer": "https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
        }
        # Raw cookie strings copied from a browser session: one for the
        # search API, one for the image detail pages.  (Renamed from `str`,
        # which shadowed the builtin.)
        cookie_str = "tt_webid=6706425614288946700; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16b8e5a1a2424d-0e0eb957d32c8e-19174638-1fa400-16b8e5a1a254a4; s_v_web_id=588f2ef74e1ceeaf59f3208561768055; __tasessionId=fjqbezvwc1561461398100; csrftoken=bbe2498f2893b007a4b41af04b99c838; tt_webid=6706425614288946700; CNZZDATA1259612802=1882149716-1561460036-https%253A%252F%252Fwww.baidu.com%252F%7C1561460036"
        img_cookie_str = "tt_webid=6706441381927290371; UM_distinctid=16b8e92149daa7-064dc97ae3da06-39395704-1fa400-16b8e92149eb16; tt_webid=6706441381927290371; csrftoken=247aa3e21fc9f5f62f6dae721f4fb01d; tt_webid=6706441381927290371; WEATHER_CITY=%E5%8C%97%E4%BA%AC; CNZZDATA1259612802=429449312-1561463830-%7C1561507030; s_v_web_id=89b01385acfcbede0b21c08deae4878d; __tasessionId=54mvwi6qw1561512119813"
        self.cookies = self._parse_cookies(cookie_str)
        # BUG FIX: the original parsed the *search* cookie string here a
        # second time, so the image-page cookies were never actually used.
        self.img_cookies = self._parse_cookies(img_cookie_str)
        # Pre-build the listing URLs: 20 pages, 20 results per page.
        self.url_list = [self.get_url(i * 20) for i in range(20)]
        self.page = 0  # index of the next listing page to fetch

    @staticmethod
    def _parse_cookies(cookie_str):
        """Turn a browser ``k1=v1; k2=v2`` cookie string into a dict.

        Splits on the first '=' only (so values containing '=' survive) and
        strips the space that follows each ';' — the original kept keys like
        ``' csrftoken'`` with a leading space.
        """
        return {
            key.strip(): value
            for key, value in (
                pair.split("=", 1) for pair in cookie_str.split(";") if "=" in pair
            )
        }

    def get_url(self, offset):
        """Build the AJAX search-API URL for the given result offset."""
        base = "https://www.toutiao.com/api/search/content/?"
        params = {
            "aid": "24",
            "app_name": "web_search",
            "offset": offset,
            "format": "json",
            "keyword": "美女",
            "autoload": "true",
            "count": "20",
            "en_qc": "1",
            "cur_tab": "1",
            "from": "search_tab",
            "pd": "synthesis",
            "timestamp": "%d" % time.time(),
        }
        return base + urllib.parse.urlencode(params)

    def parse_url(self, url, header, cookies):
        """GET ``url``; return the Response on HTTP 200, else None."""
        response = requests.get(url, headers=header, cookies=cookies, timeout=10)
        if response.status_code == 200:
            return response
        print("请求失败")
        return None

    def run(self):
        self.start()

    def start(self):
        """Fetch the next listing page, if any remain, and process it."""
        if self.page >= len(self.url_list):
            return
        time.sleep(2)  # throttle: be polite to the server
        print(self.url_list[self.page])
        response = self.parse_url(self.url_list[self.page], self.header, self.cookies)
        self.page += 1
        # BUG FIX: parse_url returns None on a non-200 status; skip to the
        # next page instead of crashing on None.json().
        if response is None:
            self.start()
            return
        self.get_url_item(response.json())

    def get_url_item(self, json_data):
        """Pull per-post metadata out of a listing JSON payload, crawl each
        post's detail page, then continue with the next listing page."""
        try:
            entries = json_data["data"]
        except (KeyError, TypeError):
            # BUG FIX: the original did `"没有data" + json_data`, which raises
            # TypeError because json_data is a dict, not a string.
            print("没有data" + str(json_data))
            self.start()
            return
        for data in entries:
            try:
                item = {
                    "title": data["title"],
                    "share_url": data["share_url"],
                    "id": data["item_id"],
                    "group_id": data["group_id"],
                }
            except (KeyError, TypeError):
                # Non-article entries (ads, widgets) lack these keys.
                continue
            try:
                self.get_detail(item)
            except Exception as exc:
                # Best effort: one broken detail page must not stop the crawl
                # (the original hid this behind a bare `except:`).
                print("详情页处理失败: %s" % exc)
        self.start()

    def get_detail(self, item: dict):
        """Fetch one post's detail page and save every image found on it.

        Three page layouts are handled:
        1. toutiao.com "content" pages — image URLs sit inside an inline
           ``content: '<div><p>...'`` JavaScript string;
        2. gallery pages — a JSON blob inside ``gallery: JSON.parse("...")``;
        3. anything else — fall back to scraping ``<img src>`` attributes.
        """
        url = item["share_url"]
        title = item["title"]
        if not url:
            # BUG FIX: the original used `url is not ""`, an identity check
            # that is True even for empty strings.
            print(item["title"] + "连接为空")
            return
        response = self.parse_url(url, self.header, self.img_cookies)
        if response is None:
            return
        if url.find("http://toutiao.com") >= 0:
            # Layout 1: each paragraph of the inline `content` string embeds
            # URLs delimited by &quot; entities.
            # NOTE(review): the original pattern was blog-mangled into the
            # syntax error r""(.*?)"" — reconstructed as &quot;-delimited;
            # confirm against a live page.
            paragraphs = re.findall(r"content: \'<div><p>(.*?)<\/p><\/div>\',", response.text)
            for paragraph in paragraphs:
                candidates = re.findall(r"&quot;(.*?)&quot;", paragraph)
                img_urls = [u for u in candidates if u.find("http://") >= 0]
                print('*' * 50)
                print(img_urls)
                for img in img_urls:
                    self.save_pic(img, title)
        else:
            gallery = re.findall(r'gallery: JSON\.parse\("(.*?)"\),', response.text)
            if gallery:
                # The JSON is escaped inside a JS string literal; drop the
                # backslashes before parsing.
                json_str = json.loads(gallery[0].replace('\\', ''))
                print('-' * 50)
                print(json_str)
                for sub in json_str["sub_images"]:
                    self.save_pic(sub["url"], title)
            else:
                print("dddddddddddd")
                # Layout 3: generic fallback — every <img src> on the page.
                html = etree.HTML(response.content)
                for img in html.xpath("//img/@src"):
                    self.save_pic(img, title)

    def save_pic(self, img, title):
        """Download ``img`` into ./pic/<sanitized title>/<md5-of-url>.png."""
        if not img:
            # BUG FIX: the original used `img is ""` (identity, not equality).
            return
        # Keep only alphanumeric characters for a filesystem-safe dir name.
        title_new = "".join(c for c in title if c not in string.punctuation and c.isalnum())
        path_dir = "./pic/" + title_new
        # makedirs creates ./pic and the title directory in one call
        # (the original needed two os.mkdir calls, one level at a time).
        os.makedirs(path_dir, exist_ok=True)
        # Hash the URL so identical images get a stable, unique filename.
        file_path = path_dir + "/" + hashlib.md5(img.encode()).hexdigest() + ".png"
        response = requests.get(img, headers=self.header)
        with open(file_path, "wb") as f:
            f.write(response.content)
        print("下载完成" + img + file_path)
if __name__ == "__main__":
    # Entry point: build the crawler and kick off the listing-page loop.
    crawler = toutiao()
    crawler.run()
知识点:
1 去掉标点符号
isalnum():string中至少有一个字符,而且全是字母或者数字或者是字母和数字混合返回True,其他情况返回False:
title_new="".join([i for i in title if i not in string.punctuation and i.isalnum()])
2 创建文件夹时 os.mkdir 不能多级创建，只能创建一级（多级目录可用 os.makedirs）
3 urllib 将网址和参数组合成网址 参数是字典形式
def get_url(self, offset):
url = "https://www.toutiao.com/api/search/content/?"
timescape = "%d" % time.time()
params = {
"aid": "24",
"app_name": "web_search",
"offset": offset,
"format": "json",
"keyword": "美女",
"autoload": "true",
"count": "20",
"en_qc": "1",
"cur_tab": "1",
"from": "search_tab",
"pd": "synthesis",
"timestamp": timescape
}
return (url + urllib.parse.urlencode(params))