# Scrape Toutiao "jiepai" (street-snap) galleries via their Ajax search endpoint:
# store each gallery's title/URL/image-list as one JSON line, and download every image.
# coding:utf-8
"""Toutiao street-snap (街拍) gallery spider.

Pages through the Ajax search index, extracts the gallery JSON embedded in
each detail page, appends the metadata to result_jiepai.txt as JSON lines,
and downloads every image into ./images/ (named by MD5 to deduplicate).
"""
import json
import os
import re
from hashlib import md5  # BUG FIX: was `from _md5 import md5` (private CPython module)
from multiprocessing import Pool
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

# Toutiao rejects requests without a browser User-Agent; share one headers dict
# instead of repeating it in every fetch function.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
}


def get_page_index(offset, keyword):
    """Fetch one page of the Ajax search index.

    :param offset: paging offset (multiples of 20).
    :param keyword: search keyword, e.g. '街拍'.
    :return: response body text, or None on HTTP error / connection failure.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('请求索引值出错')
        return None


def parse_page_index(html):
    """Yield each article URL found in an index-page JSON payload.

    Safely yields nothing when *html* is None (the fetch failed) —
    the original crashed in json.loads(None).
    """
    if not html:
        return
    data = json.loads(html)
    if data and 'data' in data:
        for item in data.get('data'):
            url = item.get('article_url')
            if url:
                yield url


def get_page_detail(url):
    """Fetch a gallery detail page; return its HTML text or None on failure."""
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('请求详情页出错')
        return None


def parse_page_detail(html, url):
    """Extract the title and image URLs from a gallery detail page.

    :param html: detail-page HTML.
    :param url: the page's own URL (echoed into the result for provenance).
    :return: {'title', 'url', 'images'} dict, or None if no gallery JSON found.
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    # The gallery data is embedded in an inline script as gallery: JSON.parse("...").
    images_pattern = re.compile(r'gallery:.*?\("(.*?)"\)', re.S)
    # Drop the backslash escaping around the embedded JSON string.
    html = re.sub(r'\\', '', html)
    result = re.search(images_pattern, html)
    if result:
        # BUG FIX: original passed "UTF-8" positionally — that slot was the
        # deprecated (ignored) `encoding` parameter, removed in Python 3.9.
        data = json.loads(result.group(1))
        if data and 'sub_images' in data:
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            return {
                'title': title,
                'url': url,
                'images': images,
            }
    return None


def write_to_file(content):
    """Append *content* as one JSON line to result_jiepai.txt."""
    with open('result_jiepai.txt', 'a', encoding='utf-8') as f:
        # `with` closes the file; the original's extra f.close() was redundant.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def get_parse_image(url):
    """Download one image and hand its bytes to image_save(); None on failure."""
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            image_save(response.content)
        return None
    except ConnectionError:
        print('图片解析出错')
        return None


def image_save(content):
    """Write image bytes under ./images/, named by MD5 digest to deduplicate."""
    directory = os.path.join(os.getcwd(), 'images')
    # BUG FIX: the original never created the directory, so the first
    # download raised FileNotFoundError.
    os.makedirs(directory, exist_ok=True)
    file_path = '{0}/{1}.{2}'.format(directory, md5(content).hexdigest(), 'jpg')
    with open(file_path, 'wb') as f:
        f.write(content)


def main(offset):
    """Process one index page: save each gallery's metadata and download images."""
    index_html = get_page_index(offset, '街拍')
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        # BUG FIX: the original wrote `null` lines and then iterated None
        # when parsing failed; skip cleanly instead.
        if not result:
            continue
        write_to_file(result)
        try:
            for item in result.get('images') or []:
                get_parse_image(item)
                print('正在下载:', item)
        except Exception as e:
            # Best-effort: keep crawling even if a single image download blows up.
            print('下载出错:', e)


if __name__ == "__main__":
    pool = Pool()
    pool.map(main, [x * 20 for x in range(1, 21)])
# TODO: inspect the downloaded images under ./images/.