Python 模拟登录正方教务系统并爬取教务信息

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/blog635/article/details/79341962
爬虫调度器
# coding:utf-8
from pip._vendor.distlib.compat import raw_input
from spider_mb import url_manager, html_downloader, html_parser, html_outputer, xh_generate
import time


class SpiderMain(object):
    """Interactive driver: log in to the academic system and export one student's data.

    The class only wires collaborators together; fetching is delegated to the
    downloader, HTML extraction to the parser and persistence to the outputer.
    """

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.output = html_outputer.HtmlOutputer()
        self.xh = xh_generate.Xh_generate()

    def craw(self, root_url):
        """Prompt for credentials, log in at *root_url* and output the scraped records.

        Args:
            root_url: URL of the login page; it is passed unchanged to the
                downloader's ``jw_login``.

        Returns:
            None. Results are handed to the outputer and echoed with ``print``;
            a failed login is reported with ``print`` as well.
        """
        # The value typed at the first prompt is discarded; the call only
        # displays the banner text and waits for the user.
        raw_input("欢迎使用高校现代化教学管理系统")
        tbYHM = raw_input("请输入学号:")
        tbPSW = raw_input("请输入密码:")
        ddlSF = raw_input("请输入身份:")

        login_result = self.downloader.jw_login(root_url, tbYHM, tbPSW, ddlSF)
        if login_result is None:
            # The downloader reports request/parse errors itself and returns
            # None; unpacking that directly would raise an unrelated TypeError.
            print("-->登录失败:", tbYHM)
            return
        xsmainfs, session = login_result

        # A non-empty result from parser.login is treated as a login failure;
        # its first element is shown to the user.
        alert = self.parser.login(xsmainfs)
        if len(alert) != 0:
            print("-->登录失败:", tbYHM, alert[0])
            return

        xsxx, xscj, djkscjcx1, ryxk, xstop = self.downloader.jw_info(session, tbYHM)

        # Student info is written out on its own first, then reused in the
        # combined export below.
        xsxx_list = self.parser.parser_xsxx(xsxx)
        self.output.output_data(xsxx_list)

        top = self.parser.xstop(xstop)
        shang, zhong, xia = self.parser.parser_xscj(xscj)
        zhengshu = self.parser.djkscjcx1(djkscjcx1)
        xuanxiu = self.parser.ryxk(ryxk)
        self.output.output_cj(tbYHM, top, shang, zhong, xia, zhengshu, xuanxiu, xsxx_list)
        print("-->学生信息:", xsxx_list)

if __name__ == "__main__":
    # Script entry point. "域名" is the value published with this code;
    # presumably a placeholder for the real host of the school's system.
    root_url = "http://域名"
    SpiderMain().craw(root_url)

爬虫html下载器

# coding:utf-8
import requests
from spider_mb import html_parser
from requests.adapters import HTTPAdapter


class HtmlDownloader(object):
    """HTTP access layer for the academic system, built on ``requests`` sessions."""

    def __init__(self):
        # Used by jw_login to extract the __VIEWSTATE value from the login page.
        self.parser = html_parser.HtmlParser()

    def download(self, url, headers):
        """Fetch *url* with the given request *headers* and return the body text.

        Returns:
            The response text, or ``None`` when *url* is ``None`` or the
            response status code is not 200.
        """
        if url is None:
            return None
        # The session is only needed for this one request; the context
        # manager closes it so its connections are not leaked.
        with requests.session() as session:
            response = session.get(url, headers=headers)
            if response.status_code != 200:
                return None
            return response.text

    def jw_login(self, root_url, tbYHM, tbPSW, ddlSF):
        """Submit the login form at *root_url*.

        Args:
            root_url: Login page URL; it is fetched first and then posted to.
            tbYHM: Value sent as the ``tbYHM`` form field (the caller passes
                the student number).
            tbPSW: Value sent as the ``tbPSW`` form field (the password).
            ddlSF: Value sent as the ``ddlSF`` form field, GBK-encoded.

        Returns:
            ``(response_text, session)`` on success; the session is left open
            so the caller can reuse it (e.g. with :meth:`jw_info`).
            ``None`` when any exception occurs; the exception is printed and
            the session, if one was created, is closed.
        """
        session = None
        try:
            session = requests.session()
            session.mount('http://', HTTPAdapter(max_retries=120))
            html_cont = session.get(root_url, timeout=5)
            # A plain local name is used here: a ``__``-prefixed local inside
            # a class body is silently name-mangled, which obscures the code.
            viewstate = self.parser.parse(root_url, html_cont.content)
            login = {
                "__VIEWSTATE": viewstate,
                "tbYHM": tbYHM,
                "tbPSW": tbPSW,
                # Sent as GBK bytes; presumably the server decodes form data as GBK.
                "ddlSF": ddlSF.encode("gbk"),
                # NOTE(review): looks like the click position of an image
                # submit button named imgDL; confirm against the login page.
                "imgDL.x": "22",
                "imgDL.y": "16",
            }
            xsmainfs = session.post(root_url, data=login, timeout=5).text
            return xsmainfs, session
        except Exception as e:
            print("异常信息:", e)
            # Nobody receives the session on failure, so release it here.
            if session is not None:
                session.close()
            return None

    def jw_info(self, session, tbYHM):
        """Fetch the five information pages for *tbYHM* and close *session*.

        Args:
            session: Object providing ``get(url, timeout=...)`` and
                ``close()``, e.g. the session returned by :meth:`jw_login`.
            tbYHM: Student number appended to the page URLs.

        Returns:
            Tuple ``(xsxx, xscj, djkscjcx1, ryxk, xstop)`` of response texts.

        Any exception raised by a request propagates to the caller; the
        session is closed in either case.
        """
        try:
            xstop = session.get("域名", timeout=30).text
            xsxx = session.get("域名" + tbYHM + "&xh1=" + tbYHM,
                               timeout=5).text
            xscj = session.get("域名" + tbYHM, timeout=30).text
            djkscjcx1 = session.get("域名" + tbYHM, timeout=30).text
            ryxk = session.get("域名" + tbYHM, timeout=30).text
        finally:
            # Close even when a request fails, so the connection pool is released.
            session.close()
        return xsxx, xscj, djkscjcx1, ryxk, xstop

爬虫解析器

解析器涉及系统内部数据,暂不展示,实现思路可自行百度。本人使用 BeautifulSoup 库,并选用 html.parser 解析器。

猜你喜欢

转载自blog.csdn.net/blog635/article/details/79341962