# 今天,我们的普通话考试成绩出来了(以山东为例)。下午闲来无事,用 Python 写了段代码来爬取大家的成绩(已知姓名和身份证号)。方法有点暴力,敬请指点!
import urllib.request
import urllib.parse
import re
import time
# Fetch the HTML of the score-report page.
def get_html(txtName, txtIDCard, timeout=30):
    """Fetch the score-report page for one candidate.

    POSTs the query form to the hard-coded lookup URL and returns the
    response body decoded as UTF-8.

    Args:
        txtName: Candidate name, sent as the ``txtName`` form field.
        txtIDCard: ID-card number, sent as the ``txtIDCard`` form field.
        timeout: Seconds to wait on the network before giving up, so that
            one stalled request cannot hang a whole batch.

    Returns:
        The response body as a ``str``.

    Raises:
        OSError: On network failure or timeout (``urllib.error.URLError``
            is a subclass).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    url = 'http://sd.cltt.org/Web/Login/PSCP01001.aspx'
    # Field names come from the page's Form Data; only the name and the ID
    # number carry values, every other field is submitted empty.
    form = {
        'txtName': txtName,
        'txtIDCard': txtIDCard,
        'btnLogin': '查 询',
        '__VIEWSTATE': '',
        'txtStuID': '',
        # NOTE(review): this key used to be 'txtCertificateNO:' with a stray
        # trailing colon, presumably copied from the browser's form-data
        # view. Confirm the field name against the page's <form>.
        'txtCertificateNO': '',
        'txtCardNO': '',
    }
    payload = urllib.parse.urlencode(form).encode('utf-8')
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(url, payload, timeout=timeout) as response:
        return response.read().decode('utf-8')
# Parse the candidate's details out of the score-report HTML.

# A name this long (or longer) is treated as a failed extraction.
_NAME_MAX_LEN = 10

# field -> (start label, chars to skip from the start of that label,
#           end label, chars to trim before the end label).
# The skip/trim counts are fixed character offsets, presumably tuned to the
# markup between each label and its value on the result page -- they must be
# re-measured if the page layout changes.
_FIELD_WINDOWS = {
    'name': ('姓名:', 190, '证件号:', 186),
    'id': ('证件号:', 192, '准考证号:', 786),
    'level': ('等级:', 171, '证书编号:', 169),
    'score': ('最终分:', 173, '等级:', 280),
    'bookid': ('证书编号:', 174, '省份:', 280),
    'ticket': ('准考证号:', 173, '出生日期:', 171),
}


def _extract_field(html, field):
    """Return the text of *field* cut out of *html*, or None if a label is absent."""
    start_label, skip, end_label, trim = _FIELD_WINDOWS[field]
    start = html.find(start_label)
    end = html.find(end_label)
    if start == -1 or end == -1:
        # str.find signals "not found" with -1; slicing with it would
        # silently return an unrelated chunk of the page.
        return None
    begin = start + skip
    stop = end - trim
    if stop <= begin:
        # Also covers a negative stop, which would otherwise wrap around
        # and slice from the end of the string.
        return ''
    return html[begin:stop]


def get_result(html):
    """Print one candidate's result parsed from the score-report HTML.

    Extracts name, ID number, level, score, certificate number and
    admission-ticket number by locating their labels in *html* and slicing
    at fixed offsets, then prints a separator line followed by a one-line
    summary.

    Nothing is printed when the name labels are missing or the extracted
    name is ``_NAME_MAX_LEN`` characters or longer (the page did not contain
    a usable result). Any other field whose labels are missing is printed
    as an empty string.

    Args:
        html: Decoded HTML of the result page.

    Returns:
        None. The result is written to standard output.
    """
    name = _extract_field(html, 'name')
    if name is None or len(name) >= _NAME_MAX_LEN:
        return

    id_number = _extract_field(html, 'id') or ''
    level = _extract_field(html, 'level') or ''
    score = _extract_field(html, 'score') or ''
    book_id = _extract_field(html, 'bookid') or ''
    ticket = _extract_field(html, 'ticket') or ''

    print("--------------------------------------------------------------------------------------------------------------")
    print("姓名(id):%s(%s) | 等级:%s(%s分) | 证书编号:%s | 准考证号:%s"%(name,id_number,level,score,book_id,ticket))
# Entry point.
def main():
    """Look up and print the results for a hard-coded list of candidates.

    Names and ID numbers are kept in two parallel lists (they could equally
    be read from a file or prompted for; the sample is small enough to
    inline). Each pair is queried with ``get_html`` and the response is
    handed to ``get_result`` for printing. A network failure for one
    candidate is reported and the loop moves on to the next one.
    """
    names = ["张三","李四","王五","赵四","狗子","二虎"]
    ids = ["3xxxxxxxxxxxxxx2","3xxxxxxxxxxxxxx0","6xxxxxxxxxxxxxx0","3xxxxxxxxxxxxxx2","3xxxxxxxxxxxxxx1X","3xxxxxxxxxxxxxx7"]
    # zip pairs each name with its ID without index bookkeeping.
    for txtName, txtIDCard in zip(names, ids):
        try:
            html = get_html(txtName, txtIDCard)
        except OSError as err:
            # URLError and socket timeouts are OSError subclasses; one bad
            # request should not abort the remaining lookups.
            print("查询失败:%s(%s)" % (txtName, err))
            continue
        get_result(html)
#print(html)
# Run the batch lookup only when executed as a script, not when imported.
if __name__ == "__main__":
    main()