版权声明:本文为 [onefine] 原创文章,转载请注明出处: https://blog.csdn.net/jiduochou963/article/details/88360604
1、新建
> scrapy startproject spider_pjt2_zhihu
> cd spider_pjt2_zhihu
> scrapy genspider zhihu www.zhihu.com
2、spider_pjt2_zhihu/utils/ZhihuAccount.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : ZhihuAccount.py
from selenium import webdriver
import requests
from time import sleep
try:
import http.cookiejar as cookielib
except Exception as e:
print("兼容Py2.x", e)
import cookielib # 兼容Py2.x
import os
class ZhihuAccount(object):
    """Zhihu account helper: restores a saved login or performs one via Selenium.

    Entry point: ``check_login()``
        True  -> a valid logged-in session exists (cookies persisted to disk)
        False -> login could not be established
    """

    # Default chromedriver location; pass ``driver_path`` to override.
    DEFAULT_DRIVER_PATH = 'D:/selenium/chromedriver.exe'

    def __init__(self, driver_path=DEFAULT_DRIVER_PATH):
        # Browser is created lazily in login(); stays None when cookies load OK.
        self.brower = None
        self.driver_path = driver_path
        self.session = requests.session()
        # Cookie file path: <project root>/cookies/zhihu_cookie.text
        self.filename = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                                     'cookies/zhihu_cookie.text')
        self.session.cookies = cookielib.LWPCookieJar(filename=self.filename)
        self.headers = {
            'Referer': 'https://www.zhihu.com/signup?next=%2F',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Safari/537.36',
        }
        # Restore a previous session from disk; falls back to login() on failure.
        self.load_cookies()

    def login(self, username='', password=''):
        """Drive Chrome through the Zhihu login form and save the cookies.

        Prompts interactively when credentials are not supplied.
        Returns True on success, False on any failure.
        """
        if username == '' or password == '':
            username = input('输入名称:')
            password = input('输入密码:')
        self.brower = webdriver.Chrome(executable_path=self.driver_path)
        self.brower.get('https://www.zhihu.com/signup?next=%2F')
        try:
            # Switch the sign-up page to the password-login form.
            self.brower.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[2]/div[2]/span').click()
            self.brower.find_element_by_xpath('//*[@id="root"]//input[@name="username"]').send_keys(username)
            sleep(2)
            self.brower.find_element_by_xpath('//*[@id="root"]//input[@name="password"]').send_keys(password)
            # Mask navigator.webdriver so Zhihu's bot detection does not block us.
            self.brower.execute_script('Object.defineProperties(navigator,{webdriver:{get:() => false}});')
            self.brower.execute_script('window.navigator.webdriver')
            self.brower.find_element_by_xpath('//*/form/button').click()
            sleep(1)
            # Copy browser cookies into the requests session, then persist them.
            for cookie in self.brower.get_cookies():
                self.session.cookies.set_cookie(
                    cookielib.Cookie(version=0, name=cookie['name'], value=cookie['value'],
                                     port='80', port_specified=False, domain=cookie['domain'],
                                     domain_specified=True, domain_initial_dot=False,
                                     path=cookie['path'], path_specified=True,
                                     secure=cookie['secure'], rest={},
                                     expires=cookie.get('expiry'),
                                     discard=False, comment=None, comment_url=None, rfc2109=False))
            self.session.cookies.save()
            return True
        except Exception as e_login:
            print("登录失败", e_login)
            return False

    def load_cookies(self):
        """Load cookies from disk; on failure fall back to an interactive login."""
        try:
            self.session.cookies.load(ignore_discard=True)
            return True
        except Exception as e_load:
            print("zhihu_cookie未能加载", e_load)
            print("正在重新登录...")
        # First login attempt (only reached when loading from disk failed).
        if self.login():
            print("cookie成功加载")
            return True
        print("加载cookie失败")
        return False

    def check_login(self):
        """Return True when the stored session is accepted by Zhihu.

        The account-settings page answers 200 only for a logged-in user;
        a redirect means the session is stale, so we retry login once.
        Always releases the browser and the HTTP session before returning.
        """
        inbox_url = 'https://www.zhihu.com/settings/account'
        response = self.session.get(inbox_url, headers=self.headers, allow_redirects=False)
        status = True
        if response.status_code != 200:
            # Second login attempt.
            if not self.login():
                status = False
        # Clean up resources regardless of outcome.
        if self.brower:
            self.brower.quit()
        self.session.close()
        return status
spider_pjt2_zhihu/middlewares.py
from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
from spider_pjt2_zhihu.utils.ZhihuAccount import ZhihuAccount
import requests
from collections import defaultdict
from scrapy.http.cookies import CookieJar
try:
import http.cookiejar as cookielib
except Exception as e:
print("兼容Py2.x", e)
import cookielib # 兼容Py2.x
class ZhihuCookiesMiddleware(CookiesMiddleware):
    """Cookies middleware that pre-loads Zhihu login cookies into jar 'zhihu'.

    On construction it ensures a valid login exists (via ZhihuAccount),
    replays the persisted cookies against zhihu.com, and stores the result
    in ``self.jars['zhihu']`` so requests carrying
    ``meta={'cookiejar': 'zhihu'}`` are authenticated.
    """

    def __init__(self, debug=False):
        super().__init__(debug)
        self.load_zhihu_cookies()

    def load_zhihu_cookies(self):
        """Load the saved Zhihu cookies into the 'zhihu' cookie jar."""
        # Verify (or establish) a login first. ZhihuAccount closes its own
        # session in check_login(), so a fresh session is built below.
        account = ZhihuAccount()
        if not account.check_login():
            print("登录失败")
            return
        print("登录成功")
        # Replay the persisted cookies against zhihu.com with a new session.
        session = requests.session()
        try:
            session.cookies = cookielib.LWPCookieJar(filename=account.filename)
            # Already validated by ZhihuAccount, so no exception handling here.
            session.cookies.load(ignore_discard=True)
            post_url = 'https://www.zhihu.com'
            response = session.get(post_url, headers=account.headers, allow_redirects=False)
            if response.status_code == 200:
                # Pitfall: take cookies from the session, NOT from the response.
                for cookie in session.cookies:
                    self.jars['zhihu'].set_cookie(cookie)
        finally:
            # Fix: the original leaked this session; always close it.
            session.close()
spider_pjt2_zhihu/settings.py
# Cookie support must stay on for the 'cookiejar' meta key to work.
COOKIES_ENABLED = True
DOWNLOADER_MIDDLEWARES = {
    # Disable the stock cookies middleware ...
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,  # disabled
    # ... and replace it with the Zhihu-aware subclass (runs early, order 1).
    'spider_pjt2_zhihu.middlewares.ZhihuCookiesMiddleware': 1,
}
spider_pjt2_zhihu/spiders/zhihu.py
# -*- coding: utf-8 -*-
import scrapy
class ZhihuSpider(scrapy.Spider):
    """Spider that crawls zhihu.com using the pre-loaded 'zhihu' cookie jar."""
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']  # unused: start_requests is overridden

    def __init__(self, *args, **kwargs):
        # Fix: forward *args/**kwargs so Scrapy can pass spider arguments
        # (e.g. name, custom -a options) through to the base constructor.
        self.headers = {
            # 'Referer': 'https://www.zhihu.com/',  # optional, little effect
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/72.0.3626.121 Safari/537.36',
        }
        super(ZhihuSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        """Issue the initial requests bound to the 'zhihu' cookie jar."""
        for url in self.start_urls:
            yield scrapy.Request(url, headers=self.headers, meta={'cookiejar': 'zhihu'})

    def parse(self, response):
        # Dump the raw body; real parsing is added in later steps.
        print("body:\n", response.body)
测试
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : main.py
"""Debug entry point: launch the 'zhihu' spider from an IDE."""
import os
import sys

from scrapy.cmdline import execute

# Make the project root importable so Scrapy can find the package.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# execute() takes the command line split into a list of arguments.
execute(["scrapy", "crawl", "zhihu"])
进一步操作:设计数据库表
table: zhihu_question
question_id # 主键,问题编号
question_title # 问题的标题
created_time # 创建时间
updated_time # 更新时间
question_url # 问题链接
question_topics # 关键词
question_content # 问题描述
answer_num # 回答数量
comments_num # 评论数,评论问题
watch_user_num # 关注者数量
click_num # 被浏览数量
crawl_time # 初始爬取时间
crawl_update_time # 最后爬取时间
table: zhihu_answer
answer_id # 回答id, 作为主键
question_id # 问题id,作为外键
answer_url # 回答链接
author_id # zhihu ID,由于可以匿名回答,可为空
author_name # 回答者名称,显示名称
answer_content # 回答内容
voteup_count # 赞同数
comment_count # 评论数
create_time # 回答时间
updated_time # 最后更新时间
crawl_time # 初始爬取时间
crawl_update_time # 最后爬取时间
正则表达式分析:
(.*question\/(\d+))(\/|$).*
参考:
scrapy添加cookie的三种方式 https://blog.csdn.net/qq_40655579/article/details/85126064
scrapy中如何设置应用cookies https://blog.csdn.net/fuck487/article/details/84617194
scrapy在中间件携带cookie发送请求 https://blog.csdn.net/qq_42336549/article/details/80991814
Scrapy源码注解–CookiesMiddleware https://www.cnblogs.com/thunderLL/p/8060279.html