引言
前面已经分享过用requests库来模拟登陆强智,这次打算用scrapy框架试一下,结果遇到的坑还真不少。
思路分析
前面已经分享过了,这里就不重复造轮子了,请出门,然后右转。传送门
这篇文章的主要目的是记录用scrapy框架模拟登陆强智教务系统遇到的问题和解决方法。
问题分析
首先,遇到的最大问题就是302重定向:Scrapy 默认会自动跟随 302 跳转,导致在回调里拿不到中间响应的 Location 头。需要在发送请求时,通过 meta 禁用自动重定向,并把 302 加入允许处理的状态码列表:
meta={'dont_redirect': True, 'handle_httpstatus_list': [302]}
然后,需要在响应头中拿到Location的URL,这里要注意,拿到的URL是bytes类型,需要转换成str类型才能使用(decode一下就好了)。
location_url = bytes.decode(response.headers.getlist('Location')[0])
最后,验证码识别的问题,这里采用的是手工识别,如果想要实现自动化,可以去网上找验证码的识别平台,直接调用API接口即可。
项目目录
代码
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for sdust project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'sdust'
SPIDER_MODULES = ['sdust.spiders']
NEWSPIDER_MODULE = 'sdust.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'sdust (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Disabled so the spider can reach the login endpoints regardless of robots.txt.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# NOTE(review): a browser-like User-Agent is set here — presumably the portal
# rejects the default Scrapy UA; confirm against the target server.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'sdust.middlewares.SdustSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'sdust.middlewares.SdustDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Project pipeline enabled at the default priority.
ITEM_PIPELINES = {
'sdust.pipelines.SdustPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
jwgl.py
# -*- coding: utf-8 -*-
import scrapy
import os
from PIL import Image
from scrapy.http import Request, FormRequest
class JwglSpider(scrapy.Spider):
    """Spider that logs in to the SDUST educational administration system.

    Flow: fetch the login page -> download the captcha image -> fetch the
    per-session encoding seed -> POST the encoded credentials plus a manually
    entered captcha -> follow two 302 redirects by hand (auto-redirect is
    disabled via meta) -> scrape the logged-in main page.
    """
    name = 'jwgl'
    allowed_domains = ['jwgl.sdust.edu.cn']
    # Browser-like headers; sent with every request of the flow.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'Referer': 'http://jwgl.sdust.edu.cn/',
        'Origin': 'http://jwgl.sdust.edu.cn'
    }

    @staticmethod
    def _location(response):
        """Return the redirect target of a 302 response as ``str``.

        Scrapy header values are ``bytes``; decode so ``Request`` accepts it.
        """
        return response.headers.getlist('Location')[0].decode()

    def start_requests(self):
        """Kick off the login flow at the portal front page."""
        start_url = 'http://jwgl.sdust.edu.cn/'
        yield Request(url=start_url, callback=self.parse_login_page, headers=self.headers)

    def parse_login_page(self, response):
        """Locate the captcha image on the login page and download it."""
        src = response.xpath('//img[@id="SafeCodeImg"]/@src').get()
        if src is None:
            # Page layout changed or the request was blocked — bail out
            # instead of crashing on ``str + None``.
            self.logger.error('captcha image not found on login page')
            return
        captcha_url = "http://jwgl.sdust.edu.cn" + src
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs('../captcha/', exist_ok=True)
        yield Request(url=captcha_url, callback=self.get_captcha, headers=self.headers)

    def get_captcha(self, response):
        """Save the captcha image, then request the encoding seed string."""
        with open('../captcha/captcha.png', 'wb') as f:
            f.write(response.body)
        # flag=sess returns "scode#sxh", the per-session encoding seed.
        yield FormRequest(url='http://jwgl.sdust.edu.cn/Logon.do?method=logon&flag=sess', callback=self.login,
                         headers=self.headers)

    def login(self, response):
        """Encode the credentials with the server seed and submit the form."""
        # Username (student id)
        username = ''
        # Password
        password = ''
        # Response body is "scode#sxh". The first 20 plaintext characters are
        # interleaved with chunks of scode whose lengths come from the digits
        # of sxh — same scheme as the site's client-side JS encoder.
        dataStr = response.text
        parts = dataStr.split("#")
        scode, sxh = parts[0], parts[1]
        code = username + "%%%" + password
        encoded = ""
        i = 0
        while i < len(code):
            if i < 20:
                step = int(sxh[i:i + 1])
                encoded += code[i:i + 1] + scode[:step]
                scode = scode[step:]
            else:
                # Past position 20 the remainder is appended verbatim.
                encoded += code[i:]
                break
            i += 1
        # Manual captcha entry: show the image and read the code from stdin.
        image = Image.open('../captcha/captcha.png')
        image.show()
        captcha = input("请输入验证码:")
        data = {
            'view': '0',
            'encoded': encoded,
            'RANDOMCODE': captcha
        }
        # Disable auto-redirect so the 302 response (with its Location header)
        # reaches parse_location instead of being followed silently.
        yield FormRequest(url='http://jwgl.sdust.edu.cn/Logon.do?method=logon', formdata=data,
                          callback=self.parse_location, headers=self.headers,
                          meta={'dont_redirect': True, 'handle_httpstatus_list': [302]})

    def parse_location(self, response):
        """Print the login error if any; otherwise follow the first redirect."""
        error = response.xpath('//font[@color="red"]/text()').get()
        if error:
            print(error)
        else:
            yield Request(url=self._location(response), callback=self.get_main_page,
                          headers=self.headers,
                          meta={'dont_redirect': True, 'handle_httpstatus_list': [302]})

    def get_main_page(self, response):
        """Follow the second redirect to the logged-in main page."""
        yield Request(url=self._location(response), headers=self.headers, callback=self.parse)

    def parse(self, response):
        """Confirm login and print the student info from the top menu bar."""
        print("登陆成功!")
        info = response.xpath('//div[@class="Nsb_top_menu_nc"]/text()').get()
        if info is not None:
            print("个人信息:" + info)
        else:
            # Guard against str + None when the menu bar is missing.
            self.logger.warning('student info bar not found on main page')