Crawling course information from an academic affairs system with requests + re

Crawling data from the academic affairs (course selection) system

The crawler uses requests to fetch pages and re to parse them. Note that it targets this particular type of academic affairs system. (Here Insert Picture Description)

The output is shown below. (Here Insert Picture Description)

#### Straight to the source code

Adapt the code to your own account and courses; working through the changes yourself is the best way to understand it. Note that I have removed my user name and password.

The complete original source code follows. The regular-expression part contains a small error; readers who need more precise results can revise the patterns and run the extraction again.

// A code block
var foo = 'bar';
// A highlighted block
# coding:utf-8
import requests
import json
import re
import pandas as pd
# Login endpoint of the academic affairs system (reached through the wrdvpn host).
url = 'http://wrdvpn.zufe.edu.cn/http/77726476706e69737468656265737421fae05988692a7d567b468ca88d1b203b/jwglxt/xtgl/login_slogin.html'

# Headers sent with every request: a desktop-browser User-Agent and a fixed
# cookie string.  NOTE(review): the ticket value is hard-coded and has
# presumably expired; replace it with one from your own session.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    "Cookie": "wengine_vpn_ticket=665a8f93264e9d45; refresh=1",
}

# Login form body; both values are left blank in the published source.
data = {
    "yhm": "",  # account
    "mm": "",  # password
}

# Send the login request on a shared session that the functions below reuse.
session = requests.Session()
session.post(url, headers=headers, data=data)

# Header row for a course-detail table:
# course category, teaching location, teaching time, remarks.
course_list_detail = [["课程板块", "教学地点", "教学时间", "备注"]]

def _first_field_value(text, key):
    """Return the first ``"key":"value"`` string value found in *text*, or ``""``."""
    found = re.search(r'"%s":"(.*?)"' % re.escape(key), text)
    return found.group(1) if found else ""


def get_course_detail(url_index_2):
    """Fetch the detail record of one course and publish it in ``tmp_list_2``.

    POSTs to ``url_index_2`` through the module-level ``session`` using the
    module-level ``headers``, removes every space, newline and carriage
    return from the decoded body, and extracts four string fields from it.

    Args:
        url_index_2: Full URL of the course-detail endpoint.

    Returns:
        The list ``[kcgsmc, jxdd, sksj, xkbz]``.  The same list is stored in
        the global ``tmp_list_2``, which is how existing callers read it.
        A field that does not occur in the response is ``""``.
    """
    global tmp_list_2

    r = session.post(url_index_2, headers=headers)
    result = r.content.decode().replace(' ', '').replace('\n', '').replace('\r', '')

    # The previous code sliced str(re.findall(...)), i.e. the repr of a list.
    # That only yields the bare value when there is exactly one match; with
    # several matches the list punctuation and the following keys ended up
    # inside the value.  A capture group on the first match avoids both
    # problems.  NOTE(review): when the response lists several teaching
    # classes only the first one is kept; confirm that is the wanted row.
    tmp_list_2 = [
        _first_field_value(result, key)
        for key in ("kcgsmc", "jxdd", "sksj", "xkbz")
    ]
    return tmp_list_2
    
def set_kch_id(kch_id):
    """Build the course-detail URL for ``kch_id`` and pass it to ``get_course_detail``.

    Args:
        kch_id: Course identifier; it is converted with ``str`` and inserted
            as the ``kch_id`` query parameter.
    """
    endpoint = 'http://wrdvpn.zufe.edu.cn/http/77726476706e69737468656265737421fae05988692a7d567b468ca88d1b203b/jwglxt/xsxk/zzxkyzb_cxJxbWithKchZzxkYzb.html'
    # Fixed query parameters that precede the course id.
    query_head = '?vpn-12-o1-jwxt.zufe.edu.cn&gnmkdm=N253512&su=160103900104&rwlx=2&xkly=0&bklx_id=0&xqh_id=2&jg_id=03&zyh_id=0390&zyfx_id=wfx&njdm_id=2016&bh_id=16039001&xbm=1&xslbdm=wlb&ccdm=w&xsbj=4294967296&sfkknj=0&sfkkzy=0&sfznkx=0&zdkxms=0&sfkxq=0&sfkcfx=0&kkbk=0&kkbkdj=0&xkxnm=2019&xkxqm=12&rlkz=0&kklxdm=10&kch_id='
    # Fixed query parameters that follow the course id.
    query_tail = '&xkkz_id=A1E3328A1E0B79B1E053A40810AC64CF&cxbj=0&fxbj=0'

    detail_url = ''.join((endpoint, query_head, str(kch_id), query_tail))
    get_course_detail(detail_url)
    
def get_course_brief(url_index):
    """Fetch one page of the course list and append a row per course to ``course_list``.

    POSTs to ``url_index`` through the module-level ``session`` and
    ``headers``, removes every space, newline and carriage return from the
    decoded body, and cuts it into per-course chunks.  For each chunk the
    fields ``jxbmc``, ``kcmc``, ``xf`` and ``kch`` are extracted, the detail
    record is loaded via ``set_kch_id`` (which leaves it in the global
    ``tmp_list_2``), and ``[jxbmc, kcmc, xf] + tmp_list_2`` is appended to
    the global ``course_list``.

    Args:
        url_index: Full URL of the paged course-list endpoint.
    """
    def first_value(chunk, key):
        # Value of the first `"key":"value"` pair in chunk, "" when absent.
        found = re.search(r'"%s":"(.*?)"' % re.escape(key), chunk)
        return found.group(1) if found else ""

    r = session.post(url_index, headers=headers)
    content = r.content.decode().replace(' ', '').replace('\n', '').replace('\r', '')

    # A course chunk is the shortest text running from `"cxbj` to `year`.
    for result in re.findall('"cxbj.*?year', content, re.S):
        # The previous code sliced str(re.findall(...)), which only gives the
        # bare value for exactly one match and embeds list punctuation
        # otherwise; a capture group returns the value itself.
        jxbmc = first_value(result, "jxbmc")
        kcmc = first_value(result, "kcmc")
        xf = first_value(result, "xf")
        kch_id = first_value(result, "kch")

        # Populates the global tmp_list_2 with this course's detail fields.
        set_kch_id(kch_id)
        course_list.append([jxbmc, kcmc, xf] + tmp_list_2)
    
# Page window sent as kspage / jspage: 1-10, 11-20, ... (ten entries per request).
num1 = 1
num2 = 10
# Filled by get_course_brief with one row per course.
course_list = []
for i in range(6):
    url_index='http://wrdvpn.zufe.edu.cn/http/77726476706e69737468656265737421fae05988692a7d567b468ca88d1b203b/jwglxt/xsxk/zzxkyzb_cxZzxkYzbPartDisplay.html?vpn-12-o1-jwxt.zufe.edu.cn&gnmkdm=N253512&su=160103900104&rwlx=2&xkly=0&bklx_id=0&xqh_id=2&jg_id=03&zyh_id=0390&zyfx_id=wfx&njdm_id=2016&bh_id=16039001&xbm=1&xslbdm=wlb&ccdm=w&xsbj=4294967296&sfkknj=0&sfkkzy=0&sfznkx=0&zdkxms=0&sfkxq=0&sfkcfx=0&kkbk=0&kkbkdj=0&sfkgbcx=0&sfrxtgkcxd=0&tykczgxdcs=0&xkxnm=2019&xkxqm=12&kklxdm=10&rlkz=0&kspage='+str(num1)+'&jspage='+str(num2)+'&jxbzb='
    try:
        get_course_brief(url_index)
    except Exception as exc:
        # Stop paging on the first failure but keep the rows collected so far.
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit
        # propagate, and printing the error shows why the page failed.
        print(i, repr(exc))
        break
    num1 += 10
    num2 += 10

# Columns: course number, course name, credits, course category,
# teaching location, teaching time, remarks.
df = pd.DataFrame(course_list,columns=["课程编号","课程名称","课程学分","课程板块","教学地点","教学时间","备注"])
print(df)
Released two original articles · won praise 2 · Views 265

Guess you like

Origin blog.csdn.net/Maxiitake/article/details/105260363