""" Grab today's headlines and street photography """ import os import time import requests from hashlib import md5 class SpiderToutiao(object): def __init__ (self): #Specify the download directory self.download_dir = os.path.join(os.path.dirname(os.path.abspath( __file__ )), " download " ) #By analyzing the request, we found what we needed The address is as follows, and the pagination is controlled by offset + 20 self.url = " https://www.toutiao.com/search_content/ " \ " ?offset={0}&format=json&keyword=%E8%A1%97%E6 %8B%8D&autoload=true&count=20&cur_tab=3&from=gallery " #Construct request header, disguised as Ajax request self.headers = { " User-Agent " :"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/66.0.3359.139 Safari/537.36", "Referer": "https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D", "X-Requested-With": "XMLHttpRequest" } def handler(self, offset=0): while True: response = requests.get(self.url.format(offset), headers=self.headers) if response.status_code == 200: print("INFO -> Current URL: <%s>" % response.url) json_data = response.json().get( " data " ) #Start parsing data if json_data: for item in json_data: _title = item.get("title") _imgDict = item.get( " image_list " ) # Fix the URL, the default image address is a small image, we want a high-definition large image _imgList = [str( " http: " + _.get( " url " )).replace ( " list " , " large " ) for _ in _imgDict] #Create storage directory _downloadDir = os.path.join(self.download_dir, _title) if not os.path.exists(_downloadDir): os.makedirs(_downloadDir) #Download and save the file for img in _imgList: r = requests.get(img) _file = os.path.join(_downloadDir, md5(r.content).hexdigest() + ".jpg") if not os.path.exists(_file): with open(_file, "wb") as f: f.write(r.content) else : print ( " INFO -> ig <%s> " % _file) # indicates that there is no data, the program exits else : break #Pagination auto increment offset += 20 #Interval time time.sleep(.9 ) else : print (response.reason) exit(999) if __name__ == "__main__": spider = SpiderToutiao() spider.handler()