python学习(五)爬取今日头条图库

今天抽出时间写了一个小爬虫 来爬取今日头条的图片
简要的说下
1 图片首页是通过ajax 发送请求 得到json数据 然后渲染到网页,
2 然后每个详情页中 获取到的网页源代码里是包含图片地址的 但无法直接通过img元素获取 这就需要用正则来提取 提取后可以生成json数据 从中获取图片地址
主要就是这两点 明白这两点 基本就完事了

# coding=utf-8

import time
import requests
import urllib.parse
import os
from lxml import etree
import hashlib
import string
import re
import json

class toutiao(object):
    """Crawler that downloads image-search results from Toutiao.

    Workflow: paginated search-index pages are fetched as JSON from the
    search API (``get_url`` / ``start``), each result's ``share_url`` is
    opened and image URLs are extracted from embedded gallery JS or from
    plain ``<img>`` tags (``get_detail``), then every image is saved
    under ``./pic/<sanitized title>/`` (``save_pic``).
    """

    def __init__(self):
        # Browser-like headers; the referer and user-agent are required
        # or the search API refuses to return real data.
        self.header = {
            "content-type": "application/x-www-form-urlencoded",
            "referer": "https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
        }
        # Cookie header strings copied verbatim from a browser session.
        # (Renamed from `str`, which shadowed the builtin.)
        cookie_str = "tt_webid=6706425614288946700; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16b8e5a1a2424d-0e0eb957d32c8e-19174638-1fa400-16b8e5a1a254a4; s_v_web_id=588f2ef74e1ceeaf59f3208561768055; __tasessionId=fjqbezvwc1561461398100; csrftoken=bbe2498f2893b007a4b41af04b99c838; tt_webid=6706425614288946700; CNZZDATA1259612802=1882149716-1561460036-https%253A%252F%252Fwww.baidu.com%252F%7C1561460036"
        self.cookies = self._parse_cookies(cookie_str)
        img_cookie_str = "tt_webid=6706441381927290371; UM_distinctid=16b8e92149daa7-064dc97ae3da06-39395704-1fa400-16b8e92149eb16; tt_webid=6706441381927290371; csrftoken=247aa3e21fc9f5f62f6dae721f4fb01d; tt_webid=6706441381927290371; WEATHER_CITY=%E5%8C%97%E4%BA%AC; CNZZDATA1259612802=429449312-1561463830-%7C1561507030; s_v_web_id=89b01385acfcbede0b21c08deae4878d; __tasessionId=54mvwi6qw1561512119813"
        # BUG FIX: the original parsed the search cookie string again
        # here, so the dedicated image-page cookies were never used.
        self.img_cookies = self._parse_cookies(img_cookie_str)
        # 20 index pages, 20 results per page -> offsets 0, 20, ..., 380.
        self.url_list = [self.get_url(i * 20) for i in range(0, 20)]
        self.page = 0  # index of the next entry in url_list to fetch

    @staticmethod
    def _parse_cookies(cookie_str):
        """Turn a raw ``k=v; k2=v2`` cookie header string into a dict.

        ``partition('=')`` keeps any '=' characters inside the value,
        and ``strip()`` removes the space that follows each ';' (the
        original split left leading spaces in every key but the first).
        """
        cookies = {}
        for part in cookie_str.split(';'):
            name, _, value = part.strip().partition('=')
            cookies[name] = value
        return cookies

    def get_url(self, offset):
        """Build one search-API URL for the given result offset."""
        url = "https://www.toutiao.com/api/search/content/?"
        timescape = "%d" % time.time()  # API expects a unix timestamp
        params = {
            "aid": "24",
            "app_name": "web_search",
            "offset": offset,
            "format": "json",
            "keyword": "美女",
            "autoload": "true",
            "count": "20",
            "en_qc": "1",
            "cur_tab": "1",
            "from": "search_tab",
            "pd": "synthesis",
            "timestamp": timescape
        }
        return url + urllib.parse.urlencode(params)

    def parse_url(self, url, header, cookies):
        """GET ``url``; return the Response on HTTP 200, else None."""
        try:
            # Timeout keeps a dead connection from hanging the crawl.
            response = requests.get(url, headers=header, cookies=cookies,
                                    timeout=10)
        except requests.RequestException:
            print("请求失败")
            return None
        if response.status_code == 200:
            return response
        print("请求失败")
        return None

    def run(self):
        """Public entry point: crawl every prepared index page."""
        self.start()

    def start(self):
        """Fetch the index pages one by one.

        The original drove pagination by mutual recursion between
        ``start`` and ``get_url_item``; a plain loop does the same job
        without growing the call stack.
        """
        while self.page < len(self.url_list):
            time.sleep(2)  # be polite to the server
            print(self.url_list[self.page])
            response = self.parse_url(self.url_list[self.page],
                                      self.header, self.cookies)
            self.page += 1
            if response is None:
                continue  # request failed; move on to the next page
            try:
                json_data = response.json()
            except ValueError:
                continue  # body was not valid JSON
            self.get_url_item(json_data)

    def get_url_item(self, json_data):
        """Pull title/share_url/ids out of one page of search JSON and
        crawl each result's detail page."""
        try:
            data_list = json_data["data"] or []
        except (KeyError, TypeError):
            # BUG FIX: the original did `print("没有data" + json_data)`,
            # which raises TypeError (cannot concatenate str and dict).
            print("没有data" + str(json_data))
            return
        for data in data_list:
            try:
                item = {
                    "title": data['title'],
                    "share_url": data['share_url'],
                    "id": data['item_id'],
                    "group_id": data["group_id"],
                }
            except (KeyError, TypeError):
                continue  # ads / non-article entries lack these keys
            self.get_detail(item)

    def get_detail(self, item: dict):
        """Open one result's share_url and extract its image URLs.

        Toutiao serves two page flavours: toutiao.com share pages embed
        the image list inside inline JS (extracted with regexes below),
        anything else is parsed as ordinary HTML ``<img>`` tags.
        """
        url = item['share_url']
        title = item['title']
        # BUG FIX: the original used `url is not ""` — an identity
        # comparison that is True for any non-interned string.
        if not url:
            print(item["title"] + "连接为空")
            return
        response = self.parse_url(url, self.header, self.img_cookies)
        if response is None:
            return  # BUG FIX: original dereferenced a None response
        if url.find("http://toutiao.com") >= 0:
            # Flavour 1: HTML-escaped image markup inside `content: '...'`.
            str_temp = re.findall(r"content: \'&lt;div&gt;&lt;p&gt;(.*?)&lt;\/p&gt;&lt;\/div&gt;\',", response.text)
            if len(str_temp) > 0:
                for str_url in str_temp:
                    str_img = re.findall(r"&quot;(.*?)&quot;", str_url)
                    str_img = [i for i in str_img if i.find("http://") >= 0]
                    print('*' * 50)
                    print(str_img)
                    for img in str_img:
                        self.save_pic(img, title)
            else:
                # Flavour 2: gallery JSON passed to JSON.parse("...").
                str_temp = re.findall(r'gallery: JSON\.parse\("(.*?)"\),', response.text)
                if len(str_temp) > 0:
                    try:
                        json_str = json.loads(str_temp[0].replace('\\', ''))
                    except ValueError:
                        return  # unescaping broke the JSON; skip page
                    print('-' * 50)
                    print(json_str)
                    for sub in json_str.get("sub_images", []):
                        self.save_pic(sub["url"], title)
        else:
            print("dddddddddddd")
            html = etree.HTML(response.content)
            img_url_list = html.xpath("//img/@src")
            for img in img_url_list:
                self.save_pic(img, title)

    def save_pic(self, img, title):
        """Download one image into ``./pic/<sanitized title>/<md5>.png``."""
        # BUG FIX: the original used `img is ""` (identity comparison).
        if not img:
            return
        # Keep only alphanumeric characters so the title is a safe
        # directory name (strips punctuation, spaces, emoji, ...).
        title_new = "".join([i for i in title if i not in string.punctuation and i.isalnum()])
        path_dir = "./pic/" + title_new
        # makedirs creates ./pic and the title directory in one call
        # (the original needed two os.mkdir steps).
        os.makedirs(path_dir, exist_ok=True)
        # Name the file by the MD5 of its URL so repeats are deduped.
        str_md5 = hashlib.md5(img.encode()).hexdigest()
        response = requests.get(img, headers=self.header)
        if response.status_code != 200:
            return  # don't write an error page as a .png
        file_path = path_dir + "/" + str_md5 + ".png"
        with open(file_path, "wb") as f:
            f.write(response.content)
        print("下载完成" + img + file_path)


# Script entry point: build the spider and start crawling.
if __name__ == '__main__':
    toutiao().run()



知识点:
1 去掉标点符号
isalnum():string中至少有一个字符,而且全是字母或者数字或者是字母和数字混合返回True,其他情况返回False:

  title_new="".join([i for i in title if i not in string.punctuation and i.isalnum()])

2 创建文件夹时 os.mkdir 不能多级创建 只能创建一级；多级创建可用 os.makedirs
3 urllib 将网址和参数组合成网址 参数是字典形式

    def get_url(self, offset):
        url = "https://www.toutiao.com/api/search/content/?"
        timescape = "%d" % time.time()
        params = {
            "aid": "24",
            "app_name": "web_search",
            "offset": offset,
            "format": "json",
            "keyword": "美女",
            "autoload": "true",
            "count": "20",
            "en_qc": "1",
            "cur_tab": "1",
            "from": "search_tab",
            "pd": "synthesis",
            "timestamp": timescape
        }
        return (url + urllib.parse.urlencode(params))

猜你喜欢

转载自blog.csdn.net/m_cainiaokuaifei/article/details/93739225