# Page 1 URL
# https://xueqiu.com/v4/statuses/public_timeline_by_category.json?
# since_id=-1&max_id=-1&count=10&category=111
# Page 2 URL
# https://xueqiu.com/v4/statuses/public_timeline_by_category.json?
# since_id=-1&max_id=184255&count=15&category=111
# Page 3 URL
# https://xueqiu.com/v4/statuses/public_timeline_by_category.json?
# since_id=-1&max_id=184006&count=15&category=111
import requests
import json
import pymysql
# Small wrapper class around a MySQL connection
class mysql_conn():
    """Thin wrapper around a pymysql connection to the local `wang` database.

    Opens the connection and a cursor on construction; `execute_modify_mysql`
    runs a write statement and commits immediately.
    """

    def __init__(self):
        # Keyword arguments: positional connect() arguments are deprecated
        # in pymysql and easy to misorder.
        self.db = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='******',
            database='wang',
            charset='utf8mb4',  # listings contain Chinese text
        )
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql, args=None):
        """Execute a write statement and commit.

        sql:  the statement, optionally with %s placeholders.
        args: optional parameter tuple; prefer placeholders + args over
              string-formatted SQL to avoid injection and quoting bugs.
        """
        self.cursor.execute(sql, args)
        self.db.commit()

    def __del__(self):
        # __init__ may have failed before self.db/self.cursor existed, and
        # __del__ must never raise; swallow any close-time error.
        try:
            self.cursor.close()
            self.db.close()
        except Exception:
            pass
# Fetch the listings from a single page of the API
def xueqiu(url):
    """Fetch one page of the Xueqiu timeline API and return its 'list' items.

    url: full public_timeline_by_category.json URL with since_id/max_id/
         count/category already encoded in the query string.
    Returns the page's item dicts (each carries a JSON-encoded 'data' field).
    Raises requests.HTTPError on a 4xx/5xx response.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    headers = {
        'Cookie': 'aliyungf_tc=AQAAAHEe4kB1aggAUhVFeSTHOQA3F9Tr; xq_a_token=584d0cf8d5a5a9809761f2244d8d272bac729ed4; xq_a_token.sig=x0gT9jm6qnwd-ddLu66T3A8KiVA; xq_r_token=98f278457fc4e1e5eb0846e36a7296e642b8138a; xq_r_token.sig=2Uxv_DgYTcCjz7qx4j570JpNHIs; u=591534314233508; device_id=6cc7e6153f6fc5c2ee23a704fa3cfc88; _ga=GA1.2.1900937437.1534314236; _gid=GA1.2.501674476.1534314236; Hm_lvt_1db88642e346389874251b5a1eded6e3=1534314242,1534314439,1534314451; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1534314451',
        'User-Agent': user_agent,
    }
    # timeout: never let one hung connection block the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of trying to JSON-decode an
    # error page (which would raise a confusing decode error or KeyError).
    response.raise_for_status()
    # response.json() replaces json.loads(response.text); the original
    # append-one-by-one loop was just a list copy of the 'list' field.
    return response.json()['list']
if __name__ == '__main__':
    sq = mysql_conn()
    # Page URL list — only the first three pages here. To crawl more pages,
    # take max_id from the previous page's response and splice it into the
    # next URL the same way.
    url_list = ['https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=111',
                'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=184255&count=15&category=111',
                'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=184006&count=15&category=111']
    esc = pymysql.converters.escape_string
    for url in url_list:
        for line_data in xueqiu(url):
            # 'data' is a JSON string embedded in each item: parse it once
            # instead of four times per row (was also shadowing builtin `id`).
            data = json.loads(line_data['data'])
            # Escape every value before formatting it into the statement:
            # a title/description containing a quote would otherwise break
            # the INSERT (and is a SQL-injection vector).
            sql = ("insert into xueqiu(xid,title,description,target) "
                   "values('{}','{}','{}','{}')").format(
                esc(str(data['id'])),
                esc(str(data['title'])),
                esc(str(data['description'])),
                esc(str(data['target'])))
            sq.execute_modify_mysql(sql)
# Stores the first three pages of Xueqiu real-estate listings into a MySQL
# database. Adapted from:
# blog.csdn.net/weixin_38920937/article/details/81711500