import json
from urllib.parse import urljoin

import requests
from lxml import etree
class BtcSpider():
    """Crawl ChainNode forum listing pages and collect post titles and links.

    Each collected post is stored in ``self.data_list`` as a dict with the
    keys ``name`` (post title) and ``url`` (absolute link to the post).
    """

    # Site root used to turn the hrefs found on a listing page into absolute URLs.
    SITE_ROOT = "https://www.chainnode.com"
    # XPath selecting the anchor element of every post on a listing page.
    POST_LINK_XPATH = '//a[@class="link-dark-major font-bold bbt-block"]'

    def __init__(self):
        """Set up the listing URL prefix, request headers and result list."""
        # The page number is appended to this prefix to form a listing URL.
        self.base_url = "https://www.chainnode.com/forum/61-"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
        }
        self.data_list = []

    # 1. Send the request
    def get_response(self, url, timeout=10):
        """Fetch *url* and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the crawl indefinitely.

        Returns:
            The response body as a ``str``.

        Raises:
            requests.RequestException: On connection errors or timeout.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content.decode("utf-8")

    # 2. Parse the response
    def parse_data(self, data):
        """Extract the posts from one listing page into ``self.data_list``.

        Title and link are read from the same anchor element, so they can
        never get out of step with each other.

        Args:
            data: HTML source of a listing page.
        """
        if not data:
            return
        tree = etree.HTML(data)
        # Defensive: without a root element there is nothing to extract.
        if tree is None:
            return

        for anchor in tree.xpath(self.POST_LINK_XPATH):
            href = anchor.get("href")
            if href is None:
                # An anchor without a link cannot produce a usable record.
                continue
            # Join the anchor's direct text nodes and trim surrounding whitespace.
            title = "".join(anchor.xpath("./text()")).strip()
            self.data_list.append({
                "name": title,
                # urljoin also copes with hrefs that are already absolute.
                "url": urljoin(self.SITE_ROOT, href),
            })

    # 3. Save the data
    def save_data(self, path="001.json"):
        """Write ``self.data_list`` to *path* as UTF-8 encoded JSON.

        Args:
            path: Destination file; an existing file is overwritten.
        """
        with open(path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII titles human-readable.
            json.dump(self.data_list, f, ensure_ascii=False)

    # 4. Run
    def run(self, pages=1000):
        """Crawl listing pages ``0 .. pages - 1`` and save the results.

        Args:
            pages: Number of listing pages to request.

        The collected records are saved even when a request or parse step
        raises, so a failure late in the crawl does not discard earlier pages.
        The exception itself still propagates to the caller.
        """
        try:
            for page in range(pages):
                url = self.base_url + str(page)
                print(url)
                self.parse_data(self.get_response(url))
        finally:
            self.save_data()
# Script entry point: build the spider and start the crawl.
crawler = BtcSpider()
crawler.run()
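# python爬虫——爬取链节点区块链社区所有帖子标题和链接,整理成json文件并保存
# (Python crawler: scrape all post titles and links from the ChainNode blockchain community and save them to a JSON file.)
# You might also like
# Origin: blog.csdn.net/weixin_43912367/article/details/105001687
# Ranking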