# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Yk_0311/article/details/82556312
# 爬取今日头条微信头像图集
# 网页地址:https://www.toutiao.com/search/?keyword=%E5%BE%AE%E4%BF%A1%E5%A4%B4%E5%83%8F
import requests
import re
import os
def get_json(offset):
    """Request one page of Toutiao search results for '微信头像'.

    offset: paging offset forwarded to the search API (20 items per page).
    Returns the parsed JSON payload as a dict, or None on any request or
    decode failure.
    """
    url = 'https://www.toutiao.com/search_content/?'
    hd = {'User-Agent': 'Mozilla/5.0'}
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '微信头像',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }
    try:
        r = requests.get(url, headers=hd, params=params)
        r.raise_for_status()
        # apparent_encoding guards against a wrong/missing charset header
        # before .json() decodes the body text.
        r.encoding = r.apparent_encoding
        return r.json()
    # RequestException covers connection/HTTP errors; ValueError covers
    # a body that is not valid JSON (json.JSONDecodeError subclasses it).
    except (requests.RequestException, ValueError) as e:
        print('search request failed at offset', offset, ':', e)
        return None
def get_article_url(json):
    """Yield each article_url found in a search-result payload.

    json: the dict returned by get_json(), or None when that request
    failed (this function now tolerates None instead of raising).
    Skips None entries inside the 'data' list; yields nothing when the
    payload is missing or has no truthy 'data' key.
    """
    if not json:  # get_json() returns None on failure
        return
    if json.get('data'):  # only iterate when the result list exists
        for item in json.get('data'):
            if item is None:
                continue
            yield item.get('article_url')
def get_HTML(article_url):
    """Fetch one article page; return its decoded text, or None on failure.

    article_url: absolute URL yielded by get_article_url().
    """
    hd = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(article_url, headers=hd)
        r.raise_for_status()
        # Let requests guess the real charset so r.text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        print('article page request failed:', article_url, ':', e)
        return None
def get_image_url(html, ulist):
    """Extract image URLs from an article page and append new ones to ulist.

    The page embeds its image list as ``gallery: JSON.parse("...")``; we
    pull that blob out, strip the escaping backslashes, then collect every
    double-quoted http URL, skipping any URL already present in ulist.

    html: page text from get_HTML(), or None when that fetch failed.
    ulist: accumulator list of plain URL strings, mutated in place.
    """
    if not html:  # get_HTML() returns None on failure — nothing to parse
        return
    match = re.search(r'gallery: JSON.parse\(\"(.*?)\"\),\n', html, re.S)
    if not match:
        return
    # The blob escapes its quotes (\" ... \"); dropping every backslash
    # leaves plainly quoted URLs like "http://...".
    blob = re.sub(r'\\', '', match.group(1))
    for quoted in re.findall(r'"http.*?"', blob):
        # Slice off the surrounding quotes instead of eval(): eval must
        # never run on scraped content, and a plain slice is enough here.
        url = quoted[1:-1]
        # Full-string membership test replaces the old last-5-characters
        # comparison, which both missed real duplicates and could drop
        # distinct URLs that merely shared a suffix.
        if url not in ulist:
            ulist.append(url)
def saveimages(ulist):
    """Download every image URL in ulist into the local pictures folder.

    Files are named after the last URL path segment plus '.jpg'; images
    whose target file already exists are skipped. Stops with a message on
    the first request or filesystem error (best-effort, like the original).
    """
    # NOTE(review): machine-specific path inherited from the original
    # script — adjust for your own environment.
    root = 'D://IDE\Pycharm//《网络爬虫实战开发》//Ajax结果提取//pictures'
    hd = {'User-Agent': 'Mozilla/5.0'}
    # makedirs creates missing parents too; mkdir would fail on them.
    os.makedirs(root, exist_ok=True)
    try:
        for imageurl in ulist:
            path = os.path.join(root, imageurl.split('/')[-1] + '.jpg')
            if os.path.exists(path):
                continue  # already downloaded on a previous run
            r = requests.get(imageurl, headers=hd)
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)
    except (requests.RequestException, OSError):
        print('请求错误3')
def main():
    """Crawl the first page of '微信头像' search results and save its images."""
    ulist = []
    # range(0, 20, 20) yields only offset 0 — raise the stop value to
    # crawl additional pages (20 results per page).
    for offset in range(0, 20, 20):
        payload = get_json(offset)  # avoids shadowing the json module name
        for article_url in get_article_url(payload):
            html = get_HTML(article_url)
            get_image_url(html, ulist)
    saveimages(ulist)


# Guarded entry point so importing this module does not trigger a crawl.
if __name__ == '__main__':
    main()