模拟登录爬取大学智慧校园的成绩单

我爬取的地址是:http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html

应该只有北部湾大学使用了吧?好吧,应该不止北部湾大学在用。不管了,先贴着当笔记吧!

基本步骤:

1.获取验证码

 def get_captcha(self):
        """Download the current captcha image and store it as ``captcha.png``.

        The request goes through ``self.session`` so the captcha is tied to
        the same cookies as the later login POST.
        """
        captcha_bytes = self.session.get(self.captcha_url).content
        with open('captcha.png', 'wb') as image_file:
            image_file.write(captcha_bytes)
  
    def captcha_ocr(self):
        """Recognise the text in ``captcha.png`` and return it without spaces.

        The image is converted to greyscale and binarised with a fixed
        threshold before being handed to tesserocr; only the first line of
        the OCR output is kept.

        Returns:
            The recognised captcha text (may be empty or wrong when the OCR
            fails; the caller is expected to retry).
        """
        threshold = 110
        # Lookup table for Image.point: grey levels below the threshold map
        # to 0, everything else to 1.
        table = [0 if level < threshold else 1 for level in range(256)]

        # The context manager closes the underlying file as soon as the
        # greyscale copy exists, instead of leaving it open until GC.
        with Image.open('captcha.png') as raw:
            gray = raw.convert('L')

        binary = gray.point(table, '1')
        text = tesserocr.image_to_text(binary)
        return text.split('\n')[0].replace(" ", "")

2.提交登录表单

3.获取成绩token

4.获取并下载成绩单

我把成绩保存为 CSV 文件

import requests
import tesserocr
from PIL import Image
from pyquery import PyQuery as pq
from urllib.parse import unquote
import re
import json
import csv

class Login(object):
    """Log in to the bbgu.edu.cn CAS portal and export the transcript as CSV.

    Typical use: ``csv_init()`` to create the output file, ``login()`` to
    authenticate (the captcha is solved with OCR and retried), then
    ``get_messeges()`` to download every configured school year / semester
    and append the records to ``save_path``.
    """

    # Values stored in ``admin_flag`` by ``post_login``.
    BAD_CREDENTIALS = 0   # server rejected the user name / password
    BAD_CAPTCHA = 1       # server rejected the captcha (also the initial state)
    LOGGED_IN = 2         # neither of the two known error messages was shown

    # Browser User-Agent sent with the requests that carry explicit headers.
    USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36')

    # Header row of the CSV file; the column order matches ``datas_handle``.
    CSV_HEADER = ["学年", "学期", "课程代码", "课程性质", "课程名称", "学分", "成绩", "开课学院", "重修标记"]

    # JSON keys copied from each record, in CSV column order (the first CSV
    # column, the school year, is built separately from two keys).
    RECORD_KEYS = ('semester', 'lessonCode', 'courseNature', 'lessonName',
                   'credits', 'results', 'beginCollege', 'reconstructionSign')

    def __init__(self):
        self.captcha_url = 'http://authserver.bbgu.edu.cn/authserver/captcha.html'
        self.login_url = 'http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html'
        self.session = requests.Session()
        # Starts as "captcha rejected" so that login() enters its retry loop.
        self.admin_flag = 1
        self.save_path = 'data.csv'  # output CSV path
        self.username = ''  # account
        self.password = ''  # password
        self.startSchoolYear = 2017
        self.endSchoolYear = 2020
        self.start_semester = 1
        # With start_semester=1 and end_semester=1 only the first semester
        # is fetched; the range is inclusive on both ends.
        self.end_semester = 2

    def csv_init(self):
        """Create (or truncate) the CSV file and write the header row.

        NOTE(review): the file is opened without an explicit encoding, so the
        platform default is used; confirm that is what downstream tools expect.
        """
        with open(self.save_path, 'w', newline='') as file:
            csv.writer(file).writerow(self.CSV_HEADER)

    def get_login(self):
        """Fetch the login page and return its hidden form tokens.

        Returns:
            A ``(lt, execution)`` tuple read from the ``casLoginForm`` hidden
            inputs; either value is ``None`` when the input is missing.
        """
        headers = {
            'Host': 'authserver.bbgu.edu.cn',
            'User-Agent': self.USER_AGENT,
        }
        response = self.session.get(self.login_url, headers=headers)
        html = pq(response.text)
        lt = html("form#casLoginForm>input[name='lt']").attr('value')
        execution = html("form#casLoginForm>input[name='execution']").attr('value')
        return lt, execution

    def get_captcha(self):
        """Download the captcha image for this session into ``captcha.png``."""
        response = self.session.get(self.captcha_url)
        with open('captcha.png', 'wb') as f:
            f.write(response.content)

    def captcha_ocr(self):
        """Recognise the text in ``captcha.png`` and return it without spaces.

        The image is converted to greyscale and binarised with a fixed
        threshold before being handed to tesserocr; only the first line of
        the OCR output is kept.
        """
        threshold = 110
        # Lookup table for Image.point: grey levels below the threshold map
        # to 0, everything else to 1.
        table = [0 if level < threshold else 1 for level in range(256)]

        # The context manager closes the underlying file as soon as the
        # greyscale copy exists, instead of leaving it open until GC.
        with Image.open('captcha.png') as raw:
            gray = raw.convert('L')

        binary = gray.point(table, '1')
        text = tesserocr.image_to_text(binary)
        return text.split('\n')[0].replace(" ", "")

    def post_login(self, username, password, captcha, lt, execution):
        """Submit the login form and record the outcome in ``admin_flag``.

        Args:
            username: account name.
            password: account password.
            captcha: captcha text as recognised by ``captcha_ocr``.
            lt: hidden ``lt`` token from ``get_login``.
            execution: hidden ``execution`` token from ``get_login``.

        The outcome is decided from the message shown inside the login form:
        bad credentials -> 0, invalid captcha -> 1, anything else -> 2.
        """
        post_headers = {
            'Host': 'authserver.bbgu.edu.cn',
            'Origin': 'http://authserver.bbgu.edu.cn',
            'Referer': self.login_url,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.USER_AGENT,
        }
        post_data = {
            'username': username,
            'password': password,
            'captchaResponse': captcha,
            'lt': lt,
            'dllt': 'userNamePasswordLogin',
            'execution': execution,
            '_eventId': 'submit',
            'rmShown': '1',
        }

        response = self.session.post(self.login_url, data=post_data, headers=post_headers)
        admin_status = pq(response.text)('form#casLoginForm>span').text()
        if admin_status == '您提供的用户名或者密码有误':
            print('您提供的用户名或者密码有误')
            print('登录失败,请重新输入账号密码')
            self.admin_flag = self.BAD_CREDENTIALS
        elif admin_status == "无效的验证码":
            print('无效的验证码')
            print('正在重新获取验证码,重新登录')
            print('..........................')
            self.admin_flag = self.BAD_CAPTCHA
        else:
            self.admin_flag = self.LOGGED_IN

    def login(self):
        """Run the whole login flow, retrying the captcha up to 10 times.

        Returns:
            1 when the login succeeded, 0 otherwise (wrong credentials or the
            captcha was rejected on every attempt).
        """
        # NOTE(review): lt/execution are fetched once and reused for every
        # retry; confirm the server accepts them again after a failed attempt.
        lt, execution = self.get_login()
        attempts = 0
        while self.admin_flag == self.BAD_CAPTCHA and attempts < 10:
            attempts += 1
            self.get_captcha()
            captcha = self.captcha_ocr()
            self.post_login(self.username, self.password, captcha, lt, execution)

        if self.admin_flag == self.LOGGED_IN:
            print('login successed')
            return 1
        return 0

    def _follow_score_redirects(self):
        """Walk the four-hop redirect chain once and return the final Location.

        Raises:
            KeyError: when one of the responses carries no ``Location``
                header, i.e. the expected redirect did not happen.
        """
        url1 = 'http://xqcxht.bbgu.edu.cn:8082/qinzhouh5/cas/studentRecord/list.html?amp_sec_version_=1&gid_=RHdsVlJDOC84UUMyQkJKTmVIWWIyNjZOeE9Nd0RlUlNSeGFva3RqL0ZrTS9iZzc4anRSaFpzSGozMDEvTVA2SUhzaTBNUVpaZjN6SGlLK29nY1N0TWc9PQ&EMAP_LANG=zh&THEME=millennium'
        score_headers = {
            'Host': 'xqcxht.bbgu.edu.cn:8082',
            'User-Agent': self.USER_AGENT,
        }
        auth_headers = {
            'Host': 'authserver.bbgu.edu.cn',
            'User-Agent': self.USER_AGENT,
        }

        # Redirects are followed by hand so each hop can be sent with the
        # headers used for it below.
        response1 = self.session.get(url1, headers=score_headers, allow_redirects=False)
        url2 = unquote(response1.headers['Location'])
        response2 = self.session.get(url2, headers=auth_headers, allow_redirects=False)
        url3 = unquote(response2.headers['Location'])
        response3 = self.session.get(url3, headers=score_headers, allow_redirects=False)
        url4 = response3.headers['Location']
        response4 = self.session.get(url4, headers=score_headers, allow_redirects=False)
        return response4.headers['Location']

    def get_score_url(self, max_retries=5):
        """Resolve the transcript page URL that carries ``uid`` and ``token``.

        The redirect chain is retried when a hop does not redirect. The
        retry used to be an unbounded recursive call whose result was
        discarded, so a retried call always returned ``None``.

        Args:
            max_retries: maximum number of attempts at the redirect chain.

        Returns:
            The final redirect URL, or ``None`` when every attempt failed.
        """
        for _ in range(max_retries):
            try:
                return self._follow_score_redirects()
            except KeyError as e:
                print('keyError', e.args)
        return None

    @staticmethod
    def _parse_uid_token(score_url):
        """Extract ``(uid, token)`` from the transcript URL.

        Raises:
            ValueError: when the URL is missing or does not contain
                ``uid=...&token=...``.
        """
        result = re.search('uid=(.*)&token=(.*)', score_url or '')
        if result is None:
            raise ValueError('uid/token not found in score url: %r' % (score_url,))
        return result.group(1), result.group(2)

    def get_score(self, score_url, startSchoolYear, endSchoolYear, semester):
        """Download one semester's records and append them to the CSV file.

        Args:
            score_url: URL returned by ``get_score_url``.
            startSchoolYear: first calendar year of the school year.
            endSchoolYear: second calendar year of the school year.
            semester: semester number within the school year.

        Raises:
            ValueError: when ``score_url`` carries no uid/token pair.
        """
        uid, token = self._parse_uid_token(score_url)
        headers = {
            'Host': 'xqcxht.bbgu.edu.cn:8082',
            'Origin': 'http://xqcxht.bbgu.edu.cn:8082',
            'Proxy-Connection': 'keep-alive',
            'Referer': score_url,
            'User-Agent': self.USER_AGENT,
            'X-Requested-With': 'XMLHttpRequest',
        }
        # NOTE(review): only page 1 with pageSize 20 is requested; a semester
        # with more than 20 records would presumably be truncated - confirm.
        post_data = {
            "pageNum": 1,
            "pageSize": 20,
            "stuNumber": uid,
            "startSchoolYear": startSchoolYear,
            "endSchoolYear": endSchoolYear,
            "semester": semester,
            "uid": uid,
            "token": token,
        }
        url = 'http://xqcxht.bbgu.edu.cn:8082/qinzhouh5/studentRecord/getStuRecordList'
        response = self.session.post(url, json=post_data, headers=headers)
        self.datas_handle(response.json())

    def datas_handle(self, datas):
        """Convert a record-list response into CSV rows and append them.

        Missing or null sections of the response are treated as "no records"
        instead of raising, and nothing is written in that case.

        Args:
            datas: decoded JSON; records are read from
                ``datas['RetData']['studentRecordPage']['records']``.
        """
        ret_data = (datas or {}).get('RetData') or {}
        page = ret_data.get('studentRecordPage') or {}
        records = page.get('records') or []

        rows = []
        for item in records:
            # format() keeps working when the years arrive as numbers.
            school_year = '{}-{}'.format(item.get('startSchoolYear'), item.get('endSchoolYear'))
            rows.append([school_year] + [item.get(key) for key in self.RECORD_KEYS])

        if rows:
            # One open for the whole batch instead of one per record.
            with open(self.save_path, 'a+', newline='') as csvfile:
                csv.writer(csvfile).writerows(rows)

    def save_to_csv(self, list):
        """Append a single row (a sequence of cell values) to the CSV file."""
        # The parameter name shadows the builtin but is kept so existing
        # keyword callers continue to work.
        with open(self.save_path, 'a+', newline='') as csvfile:
            csv.writer(csvfile).writerow(list)

    def get_messeges(self):
        """Download every configured school year / semester into the CSV.

        School years run from ``startSchoolYear`` up to (not including)
        ``endSchoolYear``; semesters run from ``start_semester`` to
        ``end_semester`` inclusive.
        """
        score_url = self.get_score_url()
        if score_url is None:
            print('could not resolve the score url, nothing was downloaded')
            return
        for year in range(self.startSchoolYear, self.endSchoolYear):
            for semester in range(self.start_semester, self.end_semester + 1):
                self.get_score(score_url, year, year + 1, semester)
        print('datas saved in', self.save_path)

def main():
    """Log in and, on success, export the transcript to the CSV file.

    The CSV file is (re)created only after a successful login, so a failed
    run leaves a previously exported file untouched instead of truncating
    it to a bare header row.
    """
    login = Login()
    if not login.login():
        return
    login.csv_init()
    login.get_messeges()


if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/weixin_40943540/article/details/105890621