python爬取软考每日一练试题存入数据库

好久没有更新博客园了,一直在我的csdn(https://blog.csdn.net/u013252962)更新,今天挪动一篇。最近打算刷关于数据库的题,对于非会员,只能做每日一练了,可是刷题时间基本都在地铁上,所以将题爬下来,用vue做个简单的页面,希望有同样需求的小伙伴来看下,别忘记点赞哦!

python源码(无框架)

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import json
import re
import pandas as pd
import numpy as np
import pymysql

# Session cookie copied from a logged-in browser session; the daily-practice
# pages require login. NOTE(review): empty here — fill in before running.
cookie = ""
# Headers sent with every request: a desktop Chrome User-Agent plus the cookie.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    "Cookie": cookie,
}
# pymysql database access helper.
class MysqlAct(object):
    """Thin wrapper around a single pymysql connection + cursor.

    NOTE(review): credentials and database name are hard-coded; assumes a
    local MySQL server with user root/root and database ``tpcommon``.
    """

    def __init__(self):
        # PyMySQL 1.0+ removed positional connect() arguments; keywords are
        # required (and are clearer in any version).
        self.connect = pymysql.connect(
            host='localhost', user='root', password='root',
            database='tpcommon', use_unicode=True, charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def select(self, sql, args=None):
        """Run a query and return all rows.

        :param sql: SQL text, may contain %s placeholders.
        :param args: optional parameters for a parameterized query.
        """
        self.cursor.execute(sql, args)
        return self.cursor.fetchall()

    def find(self, sql, args=None):
        """Run a query and return only the matched row count."""
        self.cursor.execute(sql, args)
        return self.cursor.rowcount

    def insert(self, sql, args=None):
        """Execute an INSERT and commit immediately."""
        self.cursor.execute(sql, args)
        self.connect.commit()

    def update(self, sql, args=None):
        """Execute an UPDATE and commit immediately."""
        self.cursor.execute(sql, args)
        self.connect.commit()

    def close(self):
        """Close the cursor, then the underlying connection."""
        self.cursor.close()
        self.connect.close()

    # Backward-compatible alias: existing callers use the misspelled name.
    colose = close

# Scraper: crawls the daily-practice question lists, starts a test report
# for each, then downloads every question/answer into MySQL.
class Spider(object):

    def __init__(self):
        # Next list page to fetch; GetClist advances this counter.
        self.page = 1

    def GetClist(self, totalpage=2):
        """Step 1: crawl the paginated question list, store (title, tcid) rows.

        :param totalpage: last page number to fetch (inclusive).
        """
        mysql = MysqlAct()
        # Parameterized INSERT: scraped titles may contain quotes, which
        # would break (or inject into) a string-formatted statement.
        insert_sql = "insert into fcxlt_a_ruankao_list (title, tcid) VALUES (%s, %s)"
        while self.page <= totalpage:
            print("begin----", self.page)
            # NOTE(review): the list URL was redacted to "url-{}.html" in the
            # original post; restore the real paginated URL before running.
            classurl = "url-{}.html".format(self.page)
            r = requests.get(classurl, headers=header)
            html = etree.HTML(r.content)
            # Avoid shadowing the builtins `list` and `id`.
            items = html.xpath("//div[@class='ecv2_tikucom_doItem clearfix']")
            for item in items:
                title = item.xpath(".//div[@class='ecv2_tikucom_doTitle ecv2_marginbottom16']/text()")[0]
                href = item.xpath(".//a//@href")[0]
                if href == 'javascript:;':
                    # Fallback: some entries keep the id in a data attribute.
                    href = item.xpath(".//a//@data-accessid")[0]
                tcid = re.findall(r'\d+', href)[0]
                mysql.cursor.execute(insert_sql, (title, tcid))
                mysql.connect.commit()
            self.page += 1
            print("over---", self.page)
        mysql.colose()

    def GetChecks(self):
        """Step 2: start an exam for each stored tcid; save report id and URL."""
        # On a re-run after 404s, restrict to unfinished rows instead:
        # "select * from fcxlt_a_ruankao_list where checkid is null order by id asc"
        sql = "select * from fcxlt_a_ruankao_list order by id asc"
        mysql = MysqlAct()
        res = mysql.select(sql)
        df = pd.DataFrame(res)
        df.columns = ['id', 'title', 'tcid', 'checkid', 'checkurl']
        url = "https://uc.educity.cn/ucapi/uc/paper/startExam.do"
        upsql = "UPDATE fcxlt_a_ruankao_list SET checkid = %s, checkurl = %s WHERE id = %s"
        for i in range(df.shape[0]):
            row_id = df['id'][i]
            oldtcid = df['tcid'][i]
            r = requests.post(url, data={'tcId': oldtcid, 'model': 'Exam'}, headers=header)
            json_a = json.loads(r.content)
            newid = json_a['model']['data']
            teata = "https://uc.educity.cn/tiku/testReport.html?id=" + str(newid)
            mysql.cursor.execute(upsql, (int(newid), teata, int(row_id)))
            mysql.connect.commit()
        mysql.colose()

    def GetQes(self):
        """Step 3: download each test report's questions/answers and store them."""
        sql = "select * from fcxlt_a_ruankao_list order by id asc"
        mysql = MysqlAct()  # single connection (the original leaked a second one)
        res = mysql.select(sql)
        df = pd.DataFrame(res)
        df.columns = ['id', 'title', 'tcid', 'checkid', 'checkurl']
        url = "https://uc.educity.cn/ucapi/uc/testPaperLog/loadShitiLogByTestId.do"
        insert_sql = (
            "insert into fcxlt_a_ruankao_shiti "
            "(title,ansy,xuanxiang,answer,num,checkid,shitiid) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s)"
        )
        for i in range(df.shape[0]):
            checkid = df['checkid'][i]
            r = requests.post(url, data={'paperLogId': checkid}, headers=header)
            json_a = json.loads(r.content)
            # Iterate the actual number of returned questions instead of a
            # hard-coded 10, and use a distinct index so `i` is not clobbered.
            for j in range(len(json_a['model'])):
                s = json_a['model'][j]['shiti']
                # Parameterized values: no manual quote-wrapping, so the
                # stored text no longer carries stray quote characters.
                row = (
                    s['tigan'],
                    s['analysis'],
                    s['questionDelHTMLTag'],
                    s['answerStr'],
                    s['questionNum'],
                    int(checkid),
                    s['id'],
                )
                mysql.cursor.execute(insert_sql, row)
                mysql.connect.commit()
        mysql.colose()


if __name__ == '__main__':
    # Entry point: run the three steps below in order, one at a time.
    spider = Spider()
    # spider.GetClist(30)  # step 1: crawl the question-list pages
    # spider.GetChecks()   # step 2: create a test report for each tcid
    # spider.GetQes()      # step 3: download questions and answers

mysql建表语句

-- Daily-practice entries scraped by Spider.GetClist;
-- checkid/checkurl are filled in later by Spider.GetChecks.
CREATE TABLE `fcxlt_a_ruankao_list` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,   -- list-item title text
  `tcid` int(11) DEFAULT NULL,         -- id extracted from the item link
  `checkid` int(11) DEFAULT NULL,      -- test-report id returned by startExam.do
  `checkurl` varchar(255) DEFAULT NULL, -- testReport.html URL built from checkid
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4

-- Questions and answers stored by Spider.GetQes, one row per question.
CREATE TABLE `fcxlt_a_ruankao_shiti` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `listid` int(11) DEFAULT NULL,   -- NOTE(review): never written by the crawler
  `title` text,                    -- question stem (tigan)
  `answer` varchar(255) DEFAULT NULL,
  `ansy` text,                     -- analysis/explanation
  `xuanxiang` text,                -- answer options
  `num` int(11) DEFAULT NULL,      -- question number within the paper
  `checkid` int(11) DEFAULT NULL,  -- test-report id (fcxlt_a_ruankao_list.checkid)
  `shitiid` int(11) DEFAULT NULL,  -- question id from the API
  PRIMARY KEY (`id`)
  -- Fixed: original had "AUTO_INCREMENT=1DEFAULT" (missing space — syntax error).
) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4

猜你喜欢

转载自www.cnblogs.com/huahua2018/p/12073605.html
今日推荐