用Python分析Ajax请求,抓取今日头条上特定关键字的图片

#_____author: LiZebin
#_____date: 2018/4/26

import requests
from urllib.parse import urlencode

def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results and return the parsed JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        ``None`` on any other status, on a request failure, or when the
        body is not valid JSON.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': '1',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors, timeouts, and every other request failure.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # A 200 response with a non-JSON body (e.g. an HTML error page).
        return None

import json
def parse_page_index(html):
    """Yield the ``article_url`` of every entry in a search-result payload.

    Args:
        html: The decoded JSON payload (a dict) returned by the search
            request, or a falsy value such as ``None`` when the request
            failed.

    Yields:
        ``item.get('article_url')`` for each item in ``html['data']``;
        this is ``None`` for items that carry no such key. Nothing is
        yielded when the payload is empty, has no ``'data'`` key, or has
        an empty or null ``'data'`` value.
    """
    if not html:
        return
    # `or ()` guards against a payload of {'data': None}, which would
    # otherwise raise TypeError when iterated.
    for item in html.get('data') or ():
        yield item.get('article_url')


# Request headers sent when fetching article pages; the user-agent string
# identifies the client as a desktop Chrome browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Safari/537.36'
    ),
}

def get_tupian_index(index, timeout=10):
    """Download an article page and return its HTML text.

    Args:
        index: URL of the article page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status or on a request failure.
    """
    try:
        response = requests.get(index, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Also covers timeouts and malformed URLs, not only connection
        # errors, so one bad article URL cannot abort the whole crawl.
        return None
    if response.status_code == 200:
        return response.text
    return None


import re
def parse_tupian_address(text):
    """Yield the image URLs embedded in an article page's gallery data.

    The page is searched for a ``gallery: JSON.parse("...")`` assignment.
    The captured payload is converted with ``str()`` of the match's groups
    tuple, so every backslash appears doubled in the text that the second
    pattern scans. For each ``url_list`` occurrence the first ``url`` value
    after it is extracted, its escaped slashes are turned back into ``/``,
    and the two trailing backslashes left in front of the closing quote
    are removed.

    Args:
        text: HTML of the article page; any non-``str`` value (such as
            ``None``) produces no URLs.

    Yields:
        Each extracted image URL. Every URL is also printed, prefixed by
        a space.
    """
    if not isinstance(text, str):
        return
    gallery_re = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    gallery = gallery_re.search(text)
    if gallery is None:
        return
    # repr of the groups tuple: backslashes in the payload come out doubled.
    payload = str(gallery.groups())
    url_re = re.compile(r'url_list.*?\\\\"url\\\\":\\\\"(.*?)"', re.S)
    for raw in url_re.findall(payload):
        address = raw.replace(r"\\\\/", "/")[:-2]
        print(" ", address)
        yield address

import os
def save_image(item, i, timeout=10):
    """Download one image into the current directory as ``<i>.jpg``.

    Args:
        item: URL of the image to download.
        i: Value used as the file name stem; the file is written to
            ``'{i}.jpg'`` relative to the current working directory.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. If the file already exists it is left untouched and a
        notice is printed; a failed request prints a failure message;
        a non-200 response writes nothing.
    """
    file_path = '{0}.{1}'.format(i, 'jpg')
    # Check before downloading so an existing file costs no network request.
    if os.path.exists(file_path):
        print('Already Download', file_path)
        return
    try:
        response = requests.get(item, timeout=timeout)
    except requests.RequestException:
        # Covers timeouts and malformed URLs as well as connection errors.
        print('Failed to save Image')
        return
    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(response.content)


# Search keyword; also used by main() as the name of the output directory.
KEYWORD = '湖南大学'
# Number of search-result pages main() requests.
SIZE = 10
def main():
    """Crawl the search results for KEYWORD and save every image found.

    Creates a directory named after KEYWORD if it does not exist and makes
    it the working directory. It then requests SIZE result pages, visits
    each article URL they list, and saves every extracted image under a
    running number (``0.jpg``, ``1.jpg``, ...).
    """
    if not os.path.exists(KEYWORD):
        os.mkdir(KEYWORD)
    os.chdir(KEYWORD)

    image_no = 0
    # NOTE(review): offsets run 20, 40, ..., SIZE * 20, so offset 0 is never
    # requested — confirm this is intended.
    for offset in range(20, SIZE * 20 + 1, 20):
        listing = get_page_index(offset, KEYWORD)
        for article_url in parse_page_index(listing):
            if article_url is None:
                continue
            print("图片来源", article_url)
            page_text = get_tupian_index(article_url)
            for image_url in parse_tupian_address(page_text):
                save_image(image_url, image_no)
                image_no += 1


if __name__ == '__main__':
    main()

这是我自己写的分析Ajax请求、抓取图片的程序,可以直接运行。SIZE设定抓取的搜索结果页数(每页请求20条),从而大致决定图片数量,SIZE为1时大约能抓到几十张图片;KEYWORD设定搜索关键字。
欢迎下载运行,有任何问题请与我交流。
我的博客园账号就是我的邮箱,所以留言我第一时间就可以收到。程序中包含了我在
实际调试过程中针对一些可能遇到、也可能遇不到的bug所做的处理,所以有些地方比较复杂。

猜你喜欢

转载自www.cnblogs.com/nanjingli/p/8953080.html