"""Download street-snap images from Toutiao's search Ajax endpoint."""
import os
import time
from hashlib import md5
from urllib.parse import urlencode

import requests


class ToutiaoSpider:
    """Crawl Toutiao search results for a keyword and save the images found.

    Each image is written to ``<title>/<md5-of-content>.jpg`` relative to the
    current working directory, where ``<title>`` is the (sanitized) title of
    the search result the image belongs to.
    """

    # Seconds to wait for a server response before giving up on a request.
    REQUEST_TIMEOUT = 10

    # Characters that cannot appear in a directory name on common file
    # systems; each one is replaced by an underscore.
    _UNSAFE_CHARS = str.maketrans({c: '_' for c in '\\/:*?"<>|'})

    def __init__(self):
        # Query-string parameters for the search endpoint; 'offset' is filled
        # in per request by get_page().
        self.params = {
            'offset': None,
            'format': 'json',
            'keyword': '街拍',
            'autoload': 'true',
            'count': '20',
            'cur_tab': '1',
            'from': 'search_tab',
            'pd': 'synthesis'
        }
        self.ua = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        self.headers = {
            'user-agent': self.ua,
            # Header values must be latin-1 encodable, so the keyword is
            # percent-encoded instead of being embedded as raw text.
            'referer': 'https://www.toutiao.com/search/?'
                       + urlencode({'keyword': self.params['keyword']}),
            'content-type': 'application/x-www-form-urlencoded'
        }

    def get_page(self, offset):
        """Fetch one page of Ajax-loaded search results.

        Args:
            offset: Result offset to request (the query asks for 20 per page).

        Returns:
            The decoded JSON payload, or ``None`` when the request fails, the
            status code is not 200, or the body is not valid JSON.
        """
        self.params['offset'] = offset
        p = urlencode(self.params)
        url = 'https://www.toutiao.com/search_content/?' + p
        print(url)
        try:
            r = requests.get(url, headers=self.headers,
                             timeout=self.REQUEST_TIMEOUT)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                return r.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print('Failed to fetch page:', e)
        return None

    def get_images(self, json_data):
        """Yield one ``{'image': url, 'title': title}`` dict per image.

        Args:
            json_data: Payload returned by get_page(); may be ``None``.

        Yields:
            A dict holding the image URL and the title of the result it
            belongs to. Results without a 'data' or 'image_list' entry
            contribute nothing.
        """
        if not json_data:
            return
        for item in json_data.get('data') or []:
            title = item.get('title')
            # Not every search result carries an image list.
            for image in item.get('image_list') or []:
                yield {'image': image.get('url'), 'title': title}

    def _safe_dirname(self, title):
        """Return *title* turned into a usable directory name."""
        name = str(title or '').translate(self._UNSAFE_CHARS).strip().rstrip('.')
        return name or 'untitled'

    @staticmethod
    def _absolute_url(url):
        """Return *url* with an ``http:`` scheme unless it already has one."""
        if url.startswith(('http://', 'https://')):
            return url
        return 'http:' + url

    def save_image(self, item):
        """Download the image described by *item* and store it on disk.

        Args:
            item: Dict with an 'image' URL and a 'title', as yielded by
                get_images().

        Failures are reported on stdout and never raised, so one bad image
        does not abort the crawl.
        """
        url = item.get('image')
        if not url:
            print('Failed to save Image')
            return
        directory = self._safe_dirname(item.get('title'))
        full_url = self._absolute_url(url)
        try:
            os.makedirs(directory, exist_ok=True)
            print('download running:', full_url)
            r = requests.get(full_url, headers=self.headers,
                             timeout=self.REQUEST_TIMEOUT)
            if r.status_code != 200:
                print('页面下载失败!')
                return
            # Naming the file after the content hash de-duplicates downloads.
            file_path = '{}/{}.{}'.format(
                directory, md5(r.content).hexdigest(), 'jpg')
            if os.path.exists(file_path):
                print('Already Download,', file_path)
                return
            with open(file_path, 'wb') as fp:
                fp.write(r.content)
            print('download finished')
        except (requests.RequestException, OSError) as e:
            print(e)
            print('Failed to save Image')

    def run(self, offsets):
        """Crawl every offset in *offsets* and save all images found.

        Args:
            offsets: Iterable of result offsets to pass to get_page().
        """
        for offset in offsets:
            json_data = self.get_page(offset)
            for item in self.get_images(json_data):
                print(item)
                self.save_image(item)
            # Pause between page requests to avoid hammering the server.
            time.sleep(1)


if __name__ == '__main__':
    t = ToutiaoSpider()
    offsets = [i * 20 for i in range(0, 15)]
    t.run(offsets)
# 获取今日头条街拍图片
# 猜你喜欢
# 转载自www.cnblogs.com/wl443587/p/10295157.html
# 今日推荐
# 周排行