1.创建Scrapy项目:在终端(terminal)命令行中输入 `scrapy startproject douban_login`
2.编写爬虫主程序:在项目的spiders目录下新建douban_login.py,主要步骤都在这个文件里实现
程序代码如下:
import scrapy
from scrapy.spider import CrawlSpider
from urllib import request
from PIL import Image#加载验证码图片的模块
#登陆豆瓣主程序
class Douban_login(scrapy.Spider):
    """Spider that submits the Douban login form, with manual captcha entry.

    The spider fetches the login page, fills in the form fields, and, when
    the page contains a captcha image, downloads and displays it so the
    user can type the solution on the console.

    A plain ``scrapy.Spider`` is used as the base class: no crawl rules are
    defined here, and Scrapy's documentation advises against overriding
    ``parse`` on a ``CrawlSpider``.
    """

    name = 'douban_login'
    allowed_domains = ['douban.com']
    start_urls = ['https://accounts.douban.com/login']
    # URL the filled-in login form is POSTed to.
    log_in = 'https://accounts.douban.com/login'

    def parse(self, response):
        """Build the login form data from the login page and submit it.

        Args:
            response: The downloaded login page.

        Yields:
            A ``scrapy.FormRequest`` posting the form to ``self.log_in``;
            its response is handled by ``parse_login``.
        """
        # NOTE(review): the credentials below are hard-coded placeholders.
        # Replace them with your own account, and prefer loading them from
        # the environment or spider arguments over committing them.
        formdata = {
            'source': 'None',
            'redir': 'https://accounts.douban.com/login',
            'form_email': '[email protected]',
            'form_password': 'xl7187596',
            'remember': 'on',
            'login': '登录',
        }

        # The captcha image is only present on some login attempts.
        captcha_url = response.css('img#captcha_image::attr(src)').get()
        if captcha_url:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones against the page URL, so the download works either way.
            formdata['captcha-solution'] = self.captcha_img(
                response.urljoin(captcha_url)
            )
            # The hidden captcha-id field must be sent back together with
            # the solution typed by the user.
            formdata['captcha-id'] = response.xpath(
                '//input[@name="captcha-id"]/@value'
            ).get()

        yield scrapy.FormRequest(
            url=self.log_in,
            formdata=formdata,
            callback=self.parse_login,
        )

    def parse_login(self, response):
        """Report whether the login succeeded, judged by the final URL.

        Args:
            response: The response received after submitting the form.
        """
        if response.url == 'https://www.douban.com/':
            print("登录成功")
        else:
            print("登陆失败")

    def captcha_img(self, imgurl):
        """Download the captcha image, show it, and read the user's answer.

        Args:
            imgurl: Absolute URL of the captcha image.

        Returns:
            The text the user typed at the console prompt.
        """
        request.urlretrieve(imgurl, 'captcha.png')
        # Open inside a context manager so the file handle is released
        # once the viewer has been launched.
        with Image.open('captcha.png') as image:
            image.show()
        return input("请输入验证码:")
3.打开settings.py,取消DEFAULT_REQUEST_HEADERS的注释,并设置浏览器请求头(headers)
改写代码如下:
# Scrapy setting: headers sent with requests by default. The User-Agent
# entry makes the crawler identify itself as a desktop Safari browser.
DEFAULT_REQUEST_HEADERS = {
    'Accept': (
        'text/html,application/xhtml+xml,'
        'application/xml;q=0.9,*/*;q=0.8'
    ),
    'Accept-Language': 'en',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) '
        'AppleWebKit/534.50 (KHTML, like Gecko) '
        'Version/5.1 Safari/534.50'
    ),
}
4.在项目根目录下新建运行入口文件main.py
写入代码如下:
from scrapy import cmdline

# Launch the spider exactly as `scrapy crawl douban_login` would from a
# terminal. The guard keeps the crawl from starting if this module is ever
# imported instead of being run directly.
if __name__ == '__main__':
    cmdline.execute(['scrapy', 'crawl', 'douban_login'])
运行main.py,即可模拟登录豆瓣,并在弹出验证码图片后手动输入,完成图形验证码的验证