# Jiemian news crawler — technology section: http://www.jiemian.com/lists/65.html
#
# Analysis:
# 1. Open DevTools (F12) and check whether the data we want (news links,
#    titles, and image links) is present in the page response — it is, so
#    it can be scraped directly.
# 2. Pagination is a click-to-load event:
#       <div class="load-more" onclick="AsyncLoadList(this)"
#            url="https://a.jiemian.com/index.php?m=lists&a=cLists&id=242&type=card&notid=2080130,2074075,2070788"
#            page="2">Load more</div>
#    Clicking it simply requests the URL stored in the `url` attribute.
#    Clearing the Network panel and clicking "load more" reveals a request
#    to index.php with these parameters:
#       m: lists
#       a: cLists
#       id: 242
#       type: card
#       notid: 2080130,2074075,2070788
#       callback: jQuery1102017521587218181867_1524626577671
#       page: 2
#       _: 1524626577673
# 3. Experimentation shows that `callback` and `_` only affect the JSONP
#    wrapper the server returns, so both can be omitted.
# 4. The paginated URL we need can therefore be constructed as:
#       "https://a.jiemian.com/index.php?m=lists&a=cLists&id=242&type=card&notid=2080130,2074075,2070788&page={0}".format(page)
# Code:
import json

import requests
from lxml import etree


class JieMianSpider(object):
    """Crawl the Jiemian tech-news list API page by page and dump each
    article's link, image URL, and title as JSON into jiedian.txt."""

    # Paginated list endpoint discovered from the "load more" button.
    # The `callback` and `_` parameters only shape the JSONP wrapper, so
    # they are omitted.  NOTE: the original source contained the mojibake
    # "¬id" here — an HTML-entity mangling of "&notid=".
    URL_TEMPLATE = (
        "https://a.jiemian.com/index.php?m=lists&a=cLists&id=242"
        "&type=card&notid=2080130,2074075,2070788&page={0}"
    )

    def __init__(self):
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
            ),
        }
        # NOTE(review): the original proxy mapping used the invalid key
        # "https:" (trailing colon), which requests silently ignores — so
        # no proxy was ever actually applied.  The hard-coded endpoints
        # below are almost certainly dead; the mapping is left empty and
        # the dead entries are kept only as a record.  Fill in a working
        # proxy (key "https", no colon) if one is needed.
        self.proxies = {
            # "https": "https://113.87.160.73:8181",
            # "https": "https://218.72.110.144:18118",
        }

    def build_url(self, page):
        """Return the list-API URL for the given 1-based page number."""
        return self.URL_TEMPLATE.format(page)

    def get_page(self, url):
        """GET *url* and return its decoded text body.

        Raises requests.RequestException on network failure and
        requests.HTTPError on a non-2xx status.
        """
        response = requests.get(
            url,
            headers=self.headers,
            # requests treats an empty proxies dict and None the same way;
            # pass None explicitly for clarity when no proxy is configured.
            proxies=self.proxies or None,
            timeout=10,
        )
        response.raise_for_status()
        # The server does not always declare a charset; let requests guess.
        response.encoding = response.apparent_encoding
        return response.text

    @staticmethod
    def _first(values):
        """Return the first item of an xpath result list, or None.

        The original code stored the raw one-element lists returned by
        xpath; extracting the scalar gives clean output values.
        """
        return values[0] if values else None

    def _parse_items(self, fragment):
        """Parse the 'rst' HTML fragment into a list of item dicts with
        keys "url", "img", and "title"."""
        html = etree.HTML(fragment)
        items = []
        for node in html.xpath('//div[@class="news-img"]'):
            items.append({
                "url": self._first(node.xpath('./a/@href')),
                "img": self._first(node.xpath('./a/img/@src')),
                "title": self._first(node.xpath('./a/@title')),
            })
        return items

    def run(self, max_page=20):
        """Crawl pages 1..max_page and write each page's items, serialized
        as a JSON array, to jiedian.txt."""
        with open('jiedian.txt', 'w', encoding='utf-8') as f:
            for page in range(1, max_page + 1):
                # 1. Fetch the payload for this page.
                response = self.get_page(self.build_url(page))
                # 2. With `callback` omitted the body is still wrapped in
                #    one pair of parentheses: ({...}) — strip them before
                #    handing the text to json.loads.
                res_dict = json.loads(response.strip()[1:-1])
                # 3. The useful payload is an HTML fragment under "rst".
                items = self._parse_items(res_dict['rst'])
                result = json.dumps(items, ensure_ascii=False)
                print(result)
                f.write(result)


if __name__ == '__main__':
    JM_spider = JieMianSpider()
    JM_spider.run()