Interface News Crawl | Crawler


#Interface News Crawl # Technology News Section: http://www.jiemian.com/lists/65.html


# Analysis:
#   Open DevTools (F12) and check whether the desired data — news links,
#   titles and image links — appears directly in the page response. It does,
#   so the data can be scraped without rendering JavaScript.
#   Question: how is pagination fetched? "Load more" is a click event:
#   <div class="load-more" onclick="AsyncLoadList(this)"
#        url="https://a.jiemian.com/index.php?m=lists&a=cLists&id=242&type=card&notid=2080130,2074075,2070788"
#        page="2">Load more</div>
#   i.e. clicking simply requests the URL held in the "url" attribute.
#   Clear the Network panel, click "load more", and an index.php request
#   appears with the following query parameters:
#       m: lists
#       a: cLists
#       id: 242
#       type: card
#       notid: 2080130,2074075,2070788
#       callback: jQuery1102017521587218181867_1524626577671
#       page: 2
#       _: 1524626577673
#
# After experimenting, the "callback" and "_" parameters only control the
# JSONP wrapper the server adds around the response, so they can be omitted.
# The paging URL can therefore be built from the page number alone:
#   "https://a.jiemian.com/index.php?m=lists&a=cLists&id=242&type=card&notid=2080130,2074075,2070788&page={0}".format(page)
#Code :
import json
import requests
from lxml import etree


class JieMianSpider(object):
    """Crawler for the Jiemian (界面新闻) technology news card list.

    Fetches paginated JSON from the site's AJAX endpoint, extracts each
    card's news link, image link and title from the embedded HTML fragment,
    and writes the results as JSON lines to ``jiedian.txt``.
    """

    # AJAX endpoint discovered via DevTools; pagination is driven by the
    # ``page`` query parameter.  NOTE(review): the scraped source had this
    # mangled as "card¬id=..." — restored to "&notid=" here.
    BASE_URL = (
        "https://a.jiemian.com/index.php?m=lists&a=cLists&id=242"
        "&type=card&notid=2080130,2074075,2070788&page={0}"
    )

    def __init__(self):
        # Desktop Chrome UA so the endpoint serves the normal card markup.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        }
        # Optional proxy mapping passed straight to requests.  The original
        # hard-coded free proxies (long dead) under the malformed key
        # "https:"; requests expects the scheme WITHOUT the colon, e.g.
        # {"https": "https://host:port"}.  Left empty so the crawler works
        # out of the box; uncomment/replace to route through a proxy.
        self.proxies = {
            # "https": "https://113.87.160.73:8181",
            # "https": "https://218.72.110.144:18118",
        }

    def get_page(self, url):
        """GET *url* and return the response body as text.

        Uses ``apparent_encoding`` so Chinese content decodes correctly,
        and raises for HTTP error statuses instead of silently parsing an
        error page.
        """
        response = requests.get(
            url, headers=self.headers, proxies=self.proxies, timeout=10
        )
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text

    @staticmethod
    def parse_page(html_fragment):
        """Parse one card-list HTML fragment into a list of news dicts.

        Each dict has scalar ``url``/``img``/``title`` values (or ``None``
        when an attribute is missing) — xpath() returns lists, so we take
        the first match rather than storing one-element lists.
        """
        html = etree.HTML(html_fragment)
        items = []
        for el in html.xpath('//div[@class="news-img"]'):
            href = el.xpath('./a/@href')
            img = el.xpath('./a/img/@src')
            title = el.xpath('./a/@title')
            items.append({
                "url": href[0] if href else None,
                "img": img[0] if img else None,
                "title": title[0] if title else None,
            })
        return items

    def run(self, max_pages=20):
        """Crawl pages 1..*max_pages* and write the parsed items to jiedian.txt.

        ``max_pages`` defaults to 20, matching the original hard-coded stop.
        """
        # encoding= is required: the JSON is written with ensure_ascii=False,
        # so Chinese titles would crash on a non-UTF-8 default locale.
        with open('jiedian.txt', 'w', encoding='utf-8') as f:
            for page in range(1, max_pages + 1):
                url = self.BASE_URL.format(page)
                # 1. Fetch the raw response.
                response = self.get_page(url)
                # 2. JSON -> dict.  The [1:-1] slice strips one wrapper
                #    character on each side (kept from the original; the
                #    server presumably still parenthesizes the payload —
                #    verify against a live response).
                res_dict = json.loads(response[1:-1])
                # 'rst' holds the rendered HTML fragment with the cards.
                res_data = res_dict['rst']
                # 3. Parse and persist, one JSON array per page/line.
                result = json.dumps(self.parse_page(res_data), ensure_ascii=False)
                print(result)
                f.write(result + '\n')

                




if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    spider = JieMianSpider()
    spider.run()
Interface News

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324842737&siteId=291194637