Exercise 1 -- crawling the BTC forum's thread titles and their corresponding URLs

This attempt did not manage to crawl the forum's HTML source, which is probably due to the site's anti-scraping measures; I will address that later. The code is as follows:

import requests
from lxml import etree
import json


class BtcSpider(object):
    """Collect thread titles and links from the forum listing pages.

    Each listing page is downloaded with ``requests`` and parsed with
    ``lxml``; one ``{"name": ..., "url": ...}`` record per title is appended
    to ``self.data_list``, which ``save_data`` finally dumps as JSON.
    """

    # Prefix prepended to every href extracted from a listing page.
    SITE_ROOT = "https://www.chainnode.com"

    # Anchor elements that carry a thread title and its link.
    # NOTE(review): the class list was reconstructed from a garbled copy of
    # this script -- confirm it against the live page markup.
    TITLE_LINK_XPATH = '//a[@class="link-dark-major font-bold bbt-block"]'

    def __init__(self):
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        self.base_url = "https://www.chainnode.com/forum/2"
        self.data_list = []

    # Fetch data
    def get_data(self, url):
        """Download ``url`` with the spider's headers and return the decoded body.

        Args:
            url: Address of the page to request.

        Returns:
            The response body decoded with ``bytes.decode()``'s default (UTF-8).
        """
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    # Parse data
    def parse_data(self, data):
        """Extract title/URL records from page markup into ``self.data_list``.

        Args:
            data: HTML source of one listing page, as returned by ``get_data``.
        """
        # Convert the raw markup into an element tree that supports XPath.
        x_data = etree.HTML(data)
        # Both queries use the same predicate so titles and links line up.
        title_list = x_data.xpath(self.TITLE_LINK_XPATH + '/text()')
        url_list = x_data.xpath(self.TITLE_LINK_XPATH + '/@href')
        url_list = [self.SITE_ROOT + href for href in url_list]
        # zip pairs each title with its link and cannot index past the end
        # of the shorter list.
        for title, link in zip(title_list, url_list):
            self.data_list.append({'name': title, 'url': link})

    # Save data
    def save_data(self, filename='03-btc.html'):
        """Write the collected records to ``filename`` as a JSON string.

        Args:
            filename: Destination path. The default keeps the original
                ``03-btc.html`` name even though the content is JSON.
        """
        data_str = json.dumps(self.data_list)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(data_str)

    def _page_url(self, page):
        """Return the listing URL for ``page``; page 1 is the bare base URL."""
        if page == 1:
            return self.base_url
        # Later pages append "-<page>" to the base URL.
        return self.base_url + str(-page)

    # Start
    def run(self):
        """Fetch and parse listing pages 1 through 4, then save the records."""
        for page in range(1, 5):
            # Build the complete URL for this page.
            url = self._page_url(page)
            # Send the request.
            data = self.get_data(url)
            # Parse the response.
            self.parse_data(data)
        self.save_data()


if __name__ == "__main__":
    BtcSpider().run()

 

Guess you like

Origin www.cnblogs.com/jj1106/p/11228748.html