Crawling educational curriculum system data
The crawler is built on requests (fetching) and re (parsing); note that it targets this particular type of educational administration system.
A screenshot of the result is shown below. #### Straight to the source code: adapt it to your own account and system to get the most out of it. The username and password have been removed.
The full, original source code follows. The regular-expression part contains a small error; readers who need more accurate results can revise the patterns and run the extraction again.
// A code block
var foo = 'bar';
// A highlighted block
# coding:utf-8
import requests
import json
import re
import pandas as pd
# Login request URL.
url = 'http://wrdvpn.zufe.edu.cn/http/77726476706e69737468656265737421fae05988692a7d567b468ca88d1b203b/jwglxt/xtgl/login_slogin.html'

# Request headers; the same dict is passed to every session.post call in this file.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    # NOTE(review): hard-coded cookie value; presumably a ticket copied from a
    # browser session that has to be refreshed before running - confirm.
    "Cookie": "wengine_vpn_ticket=665a8f93264e9d45; refresh=1",
}

# Form body of the login request: "yhm" is the account, "mm" the password.
# Both are intentionally left blank here.
data = dict(yhm="", mm="")

# Send the login request on a shared session so later requests reuse it.
# The login response itself is not inspected.
session = requests.Session()
session.post(url, data=data, headers=headers)

# Disabled alternative url_index values, kept for reference.
#url_index='http://wrdvpn.zufe.edu.cn/http/77726476706e69737468656265737421fae05988692a7d567b468ca88d1b203b/jwglxt/xtgl/index_initMenu.html'
#url_index='http://wrdvpn.zufe.edu.cn/http/77726476706e69737468656265737421fae05988692a7d567b468ca88d1b203b/jwglxt/xsxk/zzxkyzb_cxJxbWithKchZzxkYzb.html?vpn-12-o1-jwxt.zufe.edu.cn&gnmkdm=N253512&su=160103900104&rwlx=2&xkly=0&bklx_id=0&xqh_id=2&jg_id=03&zyh_id=0390&zyfx_id=wfx&njdm_id=2016&bh_id=16039001&xbm=1&xslbdm=wlb&ccdm=w&xsbj=4294967296&sfkknj=0&sfkkzy=0&sfznkx=0&zdkxms=0&sfkxq=0&sfkcfx=0&kkbk=0&kkbkdj=0&xkxnm=2019&xkxqm=12&rlkz=0&kklxdm=10&kch_id=000120030&xkkz_id=A1E3328A1E0B79B1E053A40810AC64CF&cxbj=0&fxbj=0'

# Header row for a detail table; within this file it is only referenced by a
# commented-out append in get_course_detail.
course_list_detail = [["课程板块","教学地点","教学时间","备注"]]
def get_course_detail(url_index_2):
    """Fetch one course's detail response and extract four text fields.

    Posts to ``url_index_2`` through the module-level ``session`` with the
    module-level ``headers``, removes every space, ``\\n`` and ``\\r`` from
    the decoded body, and reads the values stored under the keys ``kcgsmc``,
    ``jxdd``, ``sksj`` and ``xkbz``.

    The values are taken from a regex capture group instead of slicing the
    ``str()`` of the match list; the slicing approach produced corrupted
    text whenever a key matched more than once or the value contained a
    character that ``repr`` escapes.

    Args:
        url_index_2: Full URL of the detail endpoint, query string included.

    Returns:
        ``[kcgsmc, jxdd, sksj, xkbz]`` as strings. A key that does not occur
        yields ``""``; a key that occurs several times yields all of its
        values joined with ``"; "``.

    Side effects:
        The returned list is also stored in the module-level ``tmp_list_2``,
        which ``get_course_brief`` reads after calling ``set_kch_id``.
    """
    global tmp_list_2

    r = session.post(url_index_2, headers=headers)
    # Whitespace is removed from the whole body, so spaces inside field
    # values are dropped as well.
    result = r.content.decode().replace(' ', '').replace('\n', '').replace('\r', '')

    def field(key):
        # findall with a single capture group returns only the captured
        # values, i.e. the text between the quotes.
        return "; ".join(re.findall(r'"%s":"(.*?)"' % key, result))

    tmp_list_2 = [field(key) for key in ("kcgsmc", "jxdd", "sksj", "xkbz")]
    return tmp_list_2
def set_kch_id(kch_id):
    """Build the detail URL for ``kch_id`` and hand it to ``get_course_detail``.

    Args:
        kch_id: Value placed in the ``kch_id`` query parameter; it is
            converted with ``str()`` and inserted without URL-encoding.

    Returns:
        None. The extracted detail is made available by
        ``get_course_detail`` through the module-level ``tmp_list_2``.
    """
    # Everything up to and including "kch_id=".
    prefix = 'http://wrdvpn.zufe.edu.cn/http/77726476706e69737468656265737421fae05988692a7d567b468ca88d1b203b/jwglxt/xsxk/zzxkyzb_cxJxbWithKchZzxkYzb.html?vpn-12-o1-jwxt.zufe.edu.cn&gnmkdm=N253512&su=160103900104&rwlx=2&xkly=0&bklx_id=0&xqh_id=2&jg_id=03&zyh_id=0390&zyfx_id=wfx&njdm_id=2016&bh_id=16039001&xbm=1&xslbdm=wlb&ccdm=w&xsbj=4294967296&sfkknj=0&sfkkzy=0&sfznkx=0&zdkxms=0&sfkxq=0&sfkcfx=0&kkbk=0&kkbkdj=0&xkxnm=2019&xkxqm=12&rlkz=0&kklxdm=10&kch_id='
    # Remaining query parameters that follow the course id.
    suffix = '&xkkz_id=A1E3328A1E0B79B1E053A40810AC64CF&cxbj=0&fxbj=0'
    get_course_detail("".join((prefix, str(kch_id), suffix)))
def get_course_brief(url_index):
    """Fetch one page of the course list and append a row per record.

    Posts to ``url_index`` through the module-level ``session`` with the
    module-level ``headers``, removes every space, ``\\n`` and ``\\r`` from
    the decoded body, and splits it into records with the pattern
    ``"cxbj.*?year``. For each record the ``jxbmc``, ``kcmc``, ``xf`` and
    ``kch`` values are extracted, ``set_kch_id`` is called to load the
    matching detail, and ``[jxbmc, kcmc, xf] + tmp_list_2`` is appended to
    the module-level ``course_list``.

    Field values are read from a regex capture group instead of slicing the
    ``str()`` of the match list; the slicing approach produced corrupted
    text whenever a key matched more than once in a record or the value
    contained a character that ``repr`` escapes.

    Args:
        url_index: Full URL of the list endpoint, query string included.

    Returns:
        None. Rows are accumulated in the module-level ``course_list``.
    """
    def field(key, record):
        # First value stored under `key` in this record, or "" if absent.
        found = re.search(r'"%s":"(.*?)"' % key, record)
        return found.group(1) if found else ""

    r = session.post(url_index, headers=headers)
    # Whitespace is removed from the whole body, so spaces inside field
    # values are dropped as well.
    content = r.content.decode().replace(' ', '').replace('\n', '').replace('\r', '')
    pattern = re.compile('"cxbj.*?year', re.S)

    for result in pattern.findall(content):
        jxbmc = field("jxbmc", result)
        kcmc = field("kcmc", result)
        xf = field("xf", result)
        # NOTE(review): the value under "kch" is what gets sent as the
        # kch_id query parameter - confirm "kch" (not "kch_id") is intended.
        kch_id = field("kch", result)
        # Populates the module-level tmp_list_2 with this course's detail.
        set_kch_id(kch_id)
        course_list.append([jxbmc, kcmc, xf] + tmp_list_2)
# Values for the kspage / jspage query parameters; both advance by 10 after
# every successfully processed page.
num1 = 1
num2 = 10

# Rows appended by get_course_brief, one list per course record.
course_list = []

for i in range(6):
    url_index = 'http://wrdvpn.zufe.edu.cn/http/77726476706e69737468656265737421fae05988692a7d567b468ca88d1b203b/jwglxt/xsxk/zzxkyzb_cxZzxkYzbPartDisplay.html?vpn-12-o1-jwxt.zufe.edu.cn&gnmkdm=N253512&su=160103900104&rwlx=2&xkly=0&bklx_id=0&xqh_id=2&jg_id=03&zyh_id=0390&zyfx_id=wfx&njdm_id=2016&bh_id=16039001&xbm=1&xslbdm=wlb&ccdm=w&xsbj=4294967296&sfkknj=0&sfkkzy=0&sfznkx=0&zdkxms=0&sfkxq=0&sfkcfx=0&kkbk=0&kkbkdj=0&sfkgbcx=0&sfrxtgkcxd=0&tykczgxdcs=0&xkxnm=2019&xkxqm=12&kklxdm=10&rlkz=0&kspage=' + str(num1) + '&jspage=' + str(num2) + '&jxbzb='
    try:
        get_course_brief(url_index)
    except Exception:
        # Stop at the first page that fails and report its index. Only
        # Exception is caught so that Ctrl-C (KeyboardInterrupt) and
        # SystemExit still terminate the script instead of being swallowed.
        print(i)
        break
    num1 += 10
    num2 += 10

# Each row is [jxbmc, kcmc, xf, kcgsmc, jxdd, sksj, xkbz], in column order.
df = pd.DataFrame(course_list, columns=["课程编号","课程名称","课程学分","课程板块","教学地点","教学时间","备注"])
print(df)