[Python crawler] Scraping Xueqiu (xueqiu.com)

Import the packages

from urllib import request,parse
from piaot import *    # custom helper module; pa() below supplies a User-Agent string
import json
import pymysql

yeshu is the number of pages to crawl, passed in by the caller

Write your own database helper function

def sql(sql_z, args=None):
    # open the database connection (host, user, password, database name)
    db = pymysql.connect(host="192.168.43.128", user="root", password="123456",
                         database="xq", charset='utf8')

    # use the cursor() method to create a cursor object
    cursor = db.cursor()

    # use execute() to run the SQL statement; args keeps the values properly escaped
    cursor.execute(sql_z, args)

    # commit so the INSERT is actually written
    db.commit()

    # close the database connection
    db.close()

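The crawler below inserts rows into a table named xq_1 with the columns uid, title, description and target. The original post never shows the table definition, so the following is only a minimal sketch of a schema that would accept those rows; the column types are assumptions.

import pymysql

db = pymysql.connect(host="192.168.43.128", user="root", password="123456",
                     database="xq", charset="utf8")
cursor = db.cursor()
# assumed schema: the post never shows it, so the column types are a guess
cursor.execute("""
    CREATE TABLE IF NOT EXISTS xq_1 (
        uid         BIGINT,
        title       TEXT,
        description TEXT,
        target      TEXT
    )
""")
db.commit()
db.close()
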
Wrap the scraping of the page in a function

The xq function takes the parameters: xq_url (the id value from the previous page, sent as max_id; omit it on the first call), shu (a counter used internally by the recursion; no need to pass it) and yeshu (the total number of pages to crawl)

def xq(xq_url=None, shu=0, yeshu=1):

    # check whether an id was passed in (the max_id that drives the paging through the timeline)
    if xq_url:
        url = "https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=" + str(xq_url) + "&count=15&category=-1"
        # print(xq_url[0])
    else:
        url = "https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1"
    # request headers: a User-Agent supplied by pa() plus a Cookie captured from a browser session
    headers = {
        "User-Agent": pa(),
        "Cookie":"device_id=40472336a6cdeeadffefa199fa0bf24a; _ga=GA1.2.280017243.1531224612; s=do126sw0xe; __utmz=1.1531224738.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=1.280017243.1531224612.1531224738.1531234688.3; aliyungf_tc=AQAAAAGJygtrxwQAJ0V5akZpYkAxmbvB; xq_a_token=584d0cf8d5a5a9809761f2244d8d272bac729ed4; xq_a_token.sig=x0gT9jm6qnwd-ddLu66T3A8KiVA; xq_r_token=98f278457fc4e1e5eb0846e36a7296e642b8138a; xq_r_token.sig=2Uxv_DgYTcCjz7qx4j570JpNHIs; u=811534314860368; Hm_lvt_1db88642e346389874251b5a1eded6e3=1533223538,1534314860; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1534314860; _gid=GA1.2.1854937153.1534314882"}
    print(url)
    # build the request with the custom headers
    req = request.Request(url, headers=headers)
    # send the request
    ht = request.urlopen(req)
    # read the raw response bytes
    html = ht.read()
    # decode and parse the JSON body
    html = json.loads(html.decode('utf-8'))

    tz = ''
    # stop the recursion once the requested number of pages has been crawled
    if yeshu == shu:
        return 'Crawl finished'

    # loop over the items in the 'list' field of the response
    for i in html['list']:
        tz = i['id']
        b = i['data']
        # the 'data' field is itself a JSON string; convert it to a dict
        b = json.loads(b)
        c = [[b['id']], [b['title']], [b['description']], [b['target']]]
        print(c)
        # store the row in MySQL with a parameterised INSERT
        sql_z = "insert into xq_1(uid,title,description,target) values(%s,%s,%s,%s);"
        sql(sql_z, (b['id'], b['title'], b['description'], b['target']))
    # increment the page counter
    shu += 1

    # recurse, passing the last id (the next page's max_id), the counter and the page total
    return xq(tz,shu,yeshu)
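
The recursion pages through the timeline by passing the last id of each page as the next request's max_id. For a large yeshu this recursion would eventually hit Python's recursion limit, so below is a minimal non-recursive sketch of the same loop. xq_iter and its cookie parameter are hypothetical names, not from the original post; the sketch assumes the pa() helper from piaot and the sql() function defined above.

def xq_iter(yeshu=1, cookie=""):
    max_id = -1
    for shu in range(yeshu):
        url = ("https://xueqiu.com/v4/statuses/public_timeline_by_category.json"
               "?since_id=-1&max_id=" + str(max_id) + "&count=15&category=-1")
        req = request.Request(url, headers={"User-Agent": pa(), "Cookie": cookie})
        # fetch and parse one page of the public timeline
        html = json.loads(request.urlopen(req).read().decode('utf-8'))
        for i in html['list']:
            max_id = i['id']            # the last id becomes the next page's max_id
            b = json.loads(i['data'])   # the 'data' field is itself a JSON string
            sql("insert into xq_1(uid,title,description,target) values(%s,%s,%s,%s);",
                (b['id'], b['title'], b['description'], b['target']))
    return 'Crawl finished'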

Call the function

if __name__ == "__main__":
    print(xq(yeshu=1))

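For example, to crawl five pages of the timeline instead of one, only yeshu needs to change; the count parameter in the URL controls how many items come back per request.

if __name__ == "__main__":
    # yeshu controls how many pages the recursion fetches before it stops
    print(xq(yeshu=5))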

Reprinted from blog.csdn.net/Black_God1/article/details/81750714