大学のスマートキャンパスから成績表をクロールするための模擬ログイン

クロール対象のアドレスは次のとおりです: http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html

おそらく北部湾大学でしか使えません。いや、北部湾大学以外でも使えるかもしれません。とにかく、まずはメモとして投稿しておきます!

基本的な手順:

1.認証コード(CAPTCHA)を取得して文字認識します

 def get_captcha(self):
        """Download the captcha image through the shared session and save it as captcha.png."""
        image_bytes=self.session.get(self.captcha_url).content
        with open('captcha.png','wb') as image_file:
            image_file.write(image_bytes)
  
    def captcha_ocr(self):
        """Read captcha.png, binarize it and return the OCR text of its first line.

        The image is converted to grayscale, every pixel below the threshold
        is mapped to 0 and every other pixel to 1, and the resulting 1-bit
        image is passed to tesserocr. Only the first line of the recognized
        text is kept, with spaces removed.
        """
        threshold = 110
        # Lookup table for Image.point: one output value per gray level 0..255.
        lookup = [0 if level < threshold else 1 for level in range(256)]

        binarized = Image.open('captcha.png').convert('L').point(lookup, '1')
        recognized = tesserocr.image_to_text(binarized)
        first_line = recognized.split('\n')[0]
        return first_line.replace(" ","")

2.ログインフォームを送信します

3.成績照会用のトークンを取得します

4.成績表のデータを取得します

取得したデータはCSVファイルとして保存します

import requests
import tesserocr
from PIL import Image
from pyquery import PyQuery as pq
from urllib.parse import unquote
import re
import json
import csv

class Login(object):
    """Log in to the bbgu.edu.cn CAS portal and export transcript records to CSV.

    Typical use: fill in ``username`` / ``password``, call ``csv_init()`` to
    create the output file, ``login()`` to authenticate (captcha is solved
    with OCR and retried), then ``get_messeges()`` to download every
    configured school year / semester and append the rows to ``save_path``.
    """

    # Browser User-Agent sent with every request.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'

    def __init__(self):
        self.captcha_url='http://authserver.bbgu.edu.cn/authserver/captcha.html'
        self.login_url='http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html'
        self.session=requests.Session()
        # Login state set by post_login():
        #   0 = wrong username or password
        #   1 = captcha rejected (also the initial value, so login() tries at least once)
        #   2 = any other response, treated as a successful login
        self.admin_flag=1
        self.save_path='data.csv'  # output CSV path
        self.username=''  # account
        self.password=''  # password
        self.startSchoolYear=2017
        self.endSchoolYear=2020
        self.start_semester=1
        self.end_semester=2  # start 1 / end 1 would crawl only the first semester

    def csv_init(self):
        """Create (or truncate) the CSV at ``save_path`` and write the header row."""
        header=["学年","学期","课程代码","课程性质","课程名称","学分","成绩","开课学院","重修标记"]
        with open(self.save_path,'w',newline='') as file:
            csv.writer(file).writerow(header)

    def get_login(self):
        """Fetch the login page and return its hidden ``lt`` and ``execution`` form values."""
        headers={
            'Host': 'authserver.bbgu.edu.cn',
            'User-Agent': self.USER_AGENT
            }
        response=self.session.get(self.login_url,headers=headers)
        html=pq(response.text)
        lt=html("form#casLoginForm>input[name='lt']").attr('value')
        execution=html("form#casLoginForm>input[name='execution']").attr('value')
        return lt,execution

    def get_captcha(self):
        """Download the captcha image through the shared session and save it as captcha.png."""
        response=self.session.get(self.captcha_url)
        with open('captcha.png','wb') as f:
            f.write(response.content)

    def captcha_ocr(self):
        """Binarize captcha.png and return the OCR text of its first line, spaces removed."""
        threshold = 110
        # Lookup table for Image.point: gray levels below the threshold map to 0, the rest to 1.
        table = [0 if level < threshold else 1 for level in range(256)]

        image = Image.open('captcha.png').convert('L').point(table, '1')
        captcha = tesserocr.image_to_text(image)
        return captcha.split('\n')[0].replace(" ","")

    def post_login(self,username,password,captcha,lt,execution):
        """Submit the login form and record the outcome in ``self.admin_flag``.

        The status text shown inside the login form decides the flag:
        wrong credentials -> 0, invalid captcha -> 1, anything else -> 2.
        NOTE(review): every unrecognized response (including other error
        pages) is counted as success; confirm this is intended.
        """
        post_headers={
            'Host': 'authserver.bbgu.edu.cn',
            'Origin': 'http://authserver.bbgu.edu.cn',
            'Referer': 'http://authserver.bbgu.edu.cn/authserver/login?service=http%3A%2F%2Fehall.bbgu.edu.cn%2Flogin%3Fservice%3Dhttp%3A%2F%2Fehall.bbgu.edu.cn%2Fnew%2Findex.html',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.USER_AGENT
            }
        post_data={
            'username': username,
            'password': password,
            'captchaResponse': captcha,
            'lt': lt,
            'dllt': 'userNamePasswordLogin',
            'execution':execution,
            '_eventId': 'submit',
            'rmShown': '1'
            }

        response=self.session.post(self.login_url,data=post_data,headers=post_headers)
        html=pq(response.text)
        admin_status=html('form#casLoginForm>span').text()
        if admin_status=='您提供的用户名或者密码有误':
            print('您提供的用户名或者密码有误')
            print('登录失败,请重新输入账号密码')
            self.admin_flag=0   # wrong username or password
        elif admin_status=="无效的验证码":
            print('无效的验证码')
            print('正在重新获取验证码,重新登录')
            print('..........................')
            self.admin_flag=1   # invalid captcha, caller may retry
        else:
            self.admin_flag=2

    def login(self):
        """Run the whole login flow; return 1 on success and 0 on failure.

        The captcha is re-downloaded and re-recognized up to 10 times while
        the server keeps rejecting it. A wrong username/password stops the
        loop immediately.
        """
        # NOTE(review): lt/execution are fetched once and reused for every
        # captcha retry - confirm the server accepts a reused login ticket.
        lt,execution=self.get_login()
        attempts=0  # number of captcha attempts so far
        while self.admin_flag==1 and attempts<10:
            attempts+=1
            self.get_captcha()
            captcha=self.captcha_ocr()
            self.post_login(self.username,self.password,captcha,lt,execution)
        if self.admin_flag==2:
            print('login successed')
            return 1
        return 0

    def get_score_url(self,max_attempts=3):
        """Follow the redirect chain manually and return the final score-page URL.

        Four requests are issued with automatic redirects disabled; each
        next URL is read from the previous response's ``Location`` header.
        The last ``Location`` is returned; ``get_score`` extracts ``uid``
        and ``token`` from it.

        A missing ``Location`` header (KeyError) restarts the chain, at most
        ``max_attempts`` times in total. Returns ``None`` when every attempt
        fails. (Previously the retry recursed without bound and its result
        was discarded, so a successful retry still returned ``None``.)
        """
        url1='http://xqcxht.bbgu.edu.cn:8082/qinzhouh5/cas/studentRecord/list.html?amp_sec_version_=1&gid_=RHdsVlJDOC84UUMyQkJKTmVIWWIyNjZOeE9Nd0RlUlNSeGFva3RqL0ZrTS9iZzc4anRSaFpzSGozMDEvTVA2SUhzaTBNUVpaZjN6SGlLK29nY1N0TWc9PQ&EMAP_LANG=zh&THEME=millennium'
        headers={
            'Host': 'xqcxht.bbgu.edu.cn:8082',
            'User-Agent': self.USER_AGENT
            }
        headers2={
            'Host': 'authserver.bbgu.edu.cn',
            'User-Agent': self.USER_AGENT
            }
        for _ in range(max_attempts):
            try:
                response1=self.session.get(url1,headers=headers,allow_redirects=False)

                url2=unquote(response1.headers['Location'])
                response2=self.session.get(url2,headers=headers2,allow_redirects=False)

                url3=unquote(response2.headers['Location'])
                response3=self.session.get(url3,headers=headers,allow_redirects=False)

                url4=response3.headers['Location']
                response4=self.session.get(url4,headers=headers,allow_redirects=False)

                return response4.headers['Location']
            except KeyError as e:
                print('keyError',e.args)
        return None

    def get_score(self,score_url,startSchoolYear,endSchoolYear,semester):
        """Request one semester's records and append them to the CSV.

        ``uid`` and ``token`` are parsed from ``score_url`` and sent along
        with the school-year range and semester as a JSON body.

        Raises:
            ValueError: if ``score_url`` is empty or carries no
                ``uid=...&token=...`` part.
        """
        result=re.search('uid=(.*)&token=(.*)',score_url) if score_url else None
        if result is None:
            raise ValueError('score_url does not contain uid and token: %r' % (score_url,))
        uid=result.group(1)
        token=result.group(2)
        headers={
            'Host': 'xqcxht.bbgu.edu.cn:8082',
            'Origin': 'http://xqcxht.bbgu.edu.cn:8082',
            'Proxy-Connection': 'keep-alive',
            'Referer': score_url,
            'User-Agent': self.USER_AGENT,
            'X-Requested-With': 'XMLHttpRequest'
            }

        # NOTE(review): only page 1 with 20 records is requested; a semester
        # with more records would be truncated - confirm against the API.
        post_data={
            "pageNum":1,
            "pageSize":20,
            "stuNumber":uid,
            "startSchoolYear":startSchoolYear,
            "endSchoolYear":endSchoolYear,
            "semester":semester,
            "uid":uid,
            "token":token}
        url='http://xqcxht.bbgu.edu.cn:8082/qinzhouh5/studentRecord/getStuRecordList'
        response=self.session.post(url,json=post_data,headers=headers)
        datas=response.json()
        self.datas_handle(datas)

    def datas_handle(self,datas):
        """Turn the JSON reply into CSV rows and append them to ``save_path``.

        Records are read from ``datas['RetData']['studentRecordPage']['records']``.
        A missing or null level in that path is treated as "no records"
        instead of raising, and nothing is written in that case.
        """
        ret_data=(datas or {}).get('RetData') or {}
        page=ret_data.get('studentRecordPage') or {}
        items=page.get('records') or []

        rows=[]
        for item in items:
            # format() instead of "+" so non-string year values do not raise.
            school_year='{}-{}'.format(item.get('startSchoolYear'),item.get('endSchoolYear'))
            rows.append([
                school_year,
                item.get('semester'),            # semester
                item.get('lessonCode'),          # course code
                item.get('courseNature'),        # course nature
                item.get('lessonName'),          # course name
                item.get('credits'),             # credits
                item.get('results'),             # score
                item.get('beginCollege'),        # offering college
                item.get('reconstructionSign'),  # retake flag
            ])

        if rows:
            # One open per batch instead of one per row.
            with open(self.save_path,'a+',newline='') as csvfile:
                csv.writer(csvfile).writerows(rows)

    def save_to_csv(self,list):
        """Append a single row to the CSV at ``save_path``."""
        # The parameter keeps its original name for keyword-argument callers.
        with open(self.save_path, 'a+',newline='') as csvfile:
            csv.writer(csvfile).writerow(list)

    def get_messeges(self):
        """Download every configured school year / semester into the CSV.

        School years run from ``startSchoolYear`` up to (not including)
        ``endSchoolYear``; each is requested as ``year``-``year+1`` for the
        semesters ``start_semester``..``end_semester`` inclusive.
        """
        score_url=self.get_score_url()
        if not score_url:
            # Without the URL there is no uid/token to query with.
            print('failed to resolve score url')
            return
        for year in range(self.startSchoolYear,self.endSchoolYear):
            for semester in range(self.start_semester,self.end_semester+1):
                self.get_score(score_url,year,year+1,semester)
        print('datas saved in',self.save_path)

def main():
    """Create the CSV with its header, log in, and on success download the records."""
    client=Login()
    client.csv_init()
    # login() returns a truthy value only when authentication succeeded.
    if client.login():
        client.get_messeges()


if __name__ == '__main__':
    main()

 

おすすめ

転載: blog.csdn.net/weixin_40943540/article/details/105890621