今天抽出时间写了一个小爬虫 来爬取今日头条的图片
简要的说下
1. 图片首页是通过 ajax 发起请求得到 json 数据，然后渲染到网页；
2. 每个详情页获取到的网页源代码中包含图片地址，但不能直接通过 img 元素获取，需要用正则提取出来，再生成 json 数据来获取图片地址。
主要就是这两点 明白这两点 基本就完事了
# coding=utf-8
import time
import requests
import urllib.parse
import os
from lxml import etree
import hashlib
import string
import re
import json
class toutiao(object):
    """Crawl image posts from a Toutiao ("今日头条") keyword search.

    The search listing is served as JSON by an AJAX endpoint (built in
    ``get_url``); each detail page embeds its image URLs inside inline
    JavaScript, which ``get_detail`` extracts with regular expressions
    before handing individual URLs to ``save_pic``.
    """

    def __init__(self):
        self.header = {
            "content-type": "application/x-www-form-urlencoded",
            "referer": "https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
        }
        # Raw cookie strings copied from a browser session: one for the
        # search API, one for the image detail pages.  (Renamed from `str`,
        # which shadowed the builtin.)
        cookie_str = "tt_webid=6706425614288946700; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16b8e5a1a2424d-0e0eb957d32c8e-19174638-1fa400-16b8e5a1a254a4; s_v_web_id=588f2ef74e1ceeaf59f3208561768055; __tasessionId=fjqbezvwc1561461398100; csrftoken=bbe2498f2893b007a4b41af04b99c838; tt_webid=6706425614288946700; CNZZDATA1259612802=1882149716-1561460036-https%253A%252F%252Fwww.baidu.com%252F%7C1561460036"
        img_cookie_str = "tt_webid=6706441381927290371; UM_distinctid=16b8e92149daa7-064dc97ae3da06-39395704-1fa400-16b8e92149eb16; tt_webid=6706441381927290371; csrftoken=247aa3e21fc9f5f62f6dae721f4fb01d; tt_webid=6706441381927290371; WEATHER_CITY=%E5%8C%97%E4%BA%AC; CNZZDATA1259612802=429449312-1561463830-%7C1561507030; s_v_web_id=89b01385acfcbede0b21c08deae4878d; __tasessionId=54mvwi6qw1561512119813"
        self.cookies = self._parse_cookies(cookie_str)
        # BUG FIX: the original parsed the *search* cookie string here a
        # second time, so the image-page cookies were never actually used.
        self.img_cookies = self._parse_cookies(img_cookie_str)
        # Pre-build the listing URLs: 20 pages, 20 results per page.
        self.url_list = [self.get_url(i * 20) for i in range(20)]
        self.page = 0  # index of the next listing page to fetch

    @staticmethod
    def _parse_cookies(cookie_str):
        """Turn a browser ``k1=v1; k2=v2`` cookie string into a dict.

        Splits on the first '=' only (so values containing '=' survive) and
        strips the space that follows each ';' — the original kept keys like
        ``' csrftoken'`` with a leading space.
        """
        return {
            key.strip(): value
            for key, value in (
                pair.split("=", 1) for pair in cookie_str.split(";") if "=" in pair
            )
        }

    def get_url(self, offset):
        """Build the AJAX search-API URL for the given result offset."""
        base = "https://www.toutiao.com/api/search/content/?"
        params = {
            "aid": "24",
            "app_name": "web_search",
            "offset": offset,
            "format": "json",
            "keyword": "美女",
            "autoload": "true",
            "count": "20",
            "en_qc": "1",
            "cur_tab": "1",
            "from": "search_tab",
            "pd": "synthesis",
            "timestamp": "%d" % time.time(),
        }
        return base + urllib.parse.urlencode(params)

    def parse_url(self, url, header, cookies):
        """GET ``url``; return the Response on HTTP 200, else None."""
        response = requests.get(url, headers=header, cookies=cookies, timeout=10)
        if response.status_code == 200:
            return response
        print("请求失败")
        return None

    def run(self):
        self.start()

    def start(self):
        """Fetch the next listing page, if any remain, and process it."""
        if self.page >= len(self.url_list):
            return
        time.sleep(2)  # throttle: be polite to the server
        print(self.url_list[self.page])
        response = self.parse_url(self.url_list[self.page], self.header, self.cookies)
        self.page += 1
        # BUG FIX: parse_url returns None on a non-200 status; skip to the
        # next page instead of crashing on None.json().
        if response is None:
            self.start()
            return
        self.get_url_item(response.json())

    def get_url_item(self, json_data):
        """Pull per-post metadata out of a listing JSON payload, crawl each
        post's detail page, then continue with the next listing page."""
        try:
            entries = json_data["data"]
        except (KeyError, TypeError):
            # BUG FIX: the original did `"没有data" + json_data`, which raises
            # TypeError because json_data is a dict, not a string.
            print("没有data" + str(json_data))
            self.start()
            return
        for data in entries:
            try:
                item = {
                    "title": data["title"],
                    "share_url": data["share_url"],
                    "id": data["item_id"],
                    "group_id": data["group_id"],
                }
            except (KeyError, TypeError):
                # Non-article entries (ads, widgets) lack these keys.
                continue
            try:
                self.get_detail(item)
            except Exception as exc:
                # Best effort: one broken detail page must not stop the crawl
                # (the original hid this behind a bare `except:`).
                print("详情页处理失败: %s" % exc)
        self.start()

    def get_detail(self, item: dict):
        """Fetch one post's detail page and save every image found on it.

        Three page layouts are handled:
        1. toutiao.com "content" pages — image URLs sit inside an inline
           ``content: '<div><p>...'`` JavaScript string;
        2. gallery pages — a JSON blob inside ``gallery: JSON.parse("...")``;
        3. anything else — fall back to scraping ``<img src>`` attributes.
        """
        url = item["share_url"]
        title = item["title"]
        if not url:
            # BUG FIX: the original used `url is not ""`, an identity check
            # that is True even for empty strings.
            print(item["title"] + "连接为空")
            return
        response = self.parse_url(url, self.header, self.img_cookies)
        if response is None:
            return
        if url.find("http://toutiao.com") >= 0:
            # Layout 1: each paragraph of the inline `content` string embeds
            # URLs delimited by &quot; entities.
            # NOTE(review): the original pattern was blog-mangled into the
            # syntax error r""(.*?)"" — reconstructed as &quot;-delimited;
            # confirm against a live page.
            paragraphs = re.findall(r"content: \'<div><p>(.*?)<\/p><\/div>\',", response.text)
            for paragraph in paragraphs:
                candidates = re.findall(r"&quot;(.*?)&quot;", paragraph)
                img_urls = [u for u in candidates if u.find("http://") >= 0]
                print('*' * 50)
                print(img_urls)
                for img in img_urls:
                    self.save_pic(img, title)
        else:
            gallery = re.findall(r'gallery: JSON\.parse\("(.*?)"\),', response.text)
            if gallery:
                # The JSON is escaped inside a JS string literal; drop the
                # backslashes before parsing.
                json_str = json.loads(gallery[0].replace('\\', ''))
                print('-' * 50)
                print(json_str)
                for sub in json_str["sub_images"]:
                    self.save_pic(sub["url"], title)
            else:
                print("dddddddddddd")
                # Layout 3: generic fallback — every <img src> on the page.
                html = etree.HTML(response.content)
                for img in html.xpath("//img/@src"):
                    self.save_pic(img, title)

    def save_pic(self, img, title):
        """Download ``img`` into ./pic/<sanitized title>/<md5-of-url>.png."""
        if not img:
            # BUG FIX: the original used `img is ""` (identity, not equality).
            return
        # Keep only alphanumeric characters for a filesystem-safe dir name.
        title_new = "".join(c for c in title if c not in string.punctuation and c.isalnum())
        path_dir = "./pic/" + title_new
        # makedirs creates ./pic and the title directory in one call
        # (the original needed two os.mkdir calls, one level at a time).
        os.makedirs(path_dir, exist_ok=True)
        # Hash the URL so identical images get a stable, unique filename.
        file_path = path_dir + "/" + hashlib.md5(img.encode()).hexdigest() + ".png"
        response = requests.get(img, headers=self.header)
        with open(file_path, "wb") as f:
            f.write(response.content)
        print("下载完成" + img + file_path)
if __name__ == "__main__":
    # Entry point: build the crawler and kick off the listing-page loop.
    crawler = toutiao()
    crawler.run()
知识点:
1 去掉标点符号
isalnum():string中至少有一个字符,而且全是字母或者数字或者是字母和数字混合返回True,其他情况返回False:
title_new="".join([i for i in title if i not in string.punctuation and i.isalnum()])
2 创建文件夹时 os.mkdir 不能多级创建，只能创建一级（多级目录可用 os.makedirs）
3 urllib 将网址和参数组合成网址 参数是字典形式
def get_url(self, offset):
url = "https://www.toutiao.com/api/search/content/?"
timescape = "%d" % time.time()
params = {
"aid": "24",
"app_name": "web_search",
"offset": offset,
"format": "json",
"keyword": "美女",
"autoload": "true",
"count": "20",
"en_qc": "1",
"cur_tab": "1",
"from": "search_tab",
"pd": "synthesis",
"timestamp": timescape
}
return (url + urllib.parse.urlencode(params))