2018.8.13 登录知乎
1、配置scrapy调试
在工程目录下新建 Python 文件 main.py 用于调试项目(当然还可以使用 pdb 进行调试)
main.py
from scrapy.cmdline import execute
import sys
import os
# Put the directory containing this file on the import path, then start the
# "Buycar" spider exactly as the `scrapy crawl Buycar` command line would.
project_root = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_root)
execute(["scrapy", "crawl", "Buycar"])
2、在 settings.py 中将 ROBOTSTXT_OBEY 设置为 False
3、拿到xsrf
import requests
import http.cookiejar as cookielib
import re
from bs4 import BeautifulSoup
# Desktop Chrome User-Agent string used in the request headers.
agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/66.0.3359.139 Safari/537.36"
)

# HTTP headers passed along with the zhihu.com requests.
header = {
    "HOST": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    # The header name is written with a hyphen ("User-Agent"), not an underscore.
    "User-Agent": agent,
}
def get_xsrf():
    """Fetch the zhihu.com home page and return the ``_xsrf`` cookie value.

    Returns:
        The ``value`` of the cookie named ``_xsrf`` stored under domain
        ``.zhihu.com`` and path ``/``.

    Raises:
        AttributeError: if any level of the cookie lookup (domain, path or
            name) is missing, because the chained ``.get`` then yields None.
    """
    response = requests.get("https://www.zhihu.com", headers=header)
    # NOTE(review): this walks private attributes of the request's cookie jar
    # (domain -> path -> name); it may break across `requests` versions.
    xsrf = response.request._cookies._cookies.get('.zhihu.com').get('/').get('_xsrf').value
    # Hand the token back to the caller instead of discarding it.
    return xsrf


get_xsrf()
4、登录全部代码:
import requests
import http.cookiejar
import re
# Module-level session object used for the login POST.
session = requests.Session()
# Desktop Chrome User-Agent string used in the request headers.
agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/66.0.3359.139 Safari/537.36"
)

# HTTP headers passed along with the zhihu.com requests.
header = {
    "HOST": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    # The header name is written with a hyphen ("User-Agent"), not an underscore.
    "User-Agent": agent,
}
def get_xsrf():
    """Return the ``_xsrf`` cookie that zhihu.com sets for the home page.

    The page is fetched through the module-level ``session`` rather than a
    one-off ``requests.get`` so that the cookie is also kept on the session
    that later performs the login POST.

    Returns:
        The ``_xsrf`` cookie value for domain ``.zhihu.com`` and path ``/``.

    Raises:
        RuntimeError: if the session holds no such cookie after the request.
    """
    session.get('https://www.zhihu.com', headers=header)
    # Public cookie-jar lookup, restricted to the same domain and path the
    # original private-attribute lookup used.
    xsrf = session.cookies.get('_xsrf', domain='.zhihu.com', path='/')
    if xsrf is None:
        raise RuntimeError("no _xsrf cookie received from https://www.zhihu.com")
    return xsrf
def zhihu_login(account, password):
    """Post a phone-number login form to zhihu.com and print the outcome.

    Args:
        account: sent as the ``phone`` form field.
        password: sent as the ``password`` form field.

    The outcome is judged only by the HTTP status code: 200 prints the
    success message, anything else prints the failure message.
    """
    form = {
        '_xsrf': get_xsrf(),
        'phone': account,
        'password': password,
    }
    login_response = session.post(
        'https://www.zhihu.com/login/phone_num',
        data=form,
        headers=header,
    )
    succeeded = login_response.status_code == 200
    print('登陆成功' if succeeded else '登陆失败')


zhihu_login('18328020353','*****')
一篇关于 webdriver xpath 使用的文章:http://toolsqa.com/selenium-webdriver/choosing-effective-xpath/
https://blog.csdn.net/passionboyxie/article/details/28632965
获取属性值https://blog.csdn.net/xm_csdn/article/details/53390649
# Look up the element whose id is "tooltip".
dr = driver.find_element_by_id('tooltip')
dr.get_attribute('data-original-title') # get the tooltip's content
我们唯一确定的是文本'Profile'将始终包含在此图像的src中,因此我们可以在xpath中使用此提示,如下所示:
# Click the <img> whose src contains "Profile", located at form/button/img
# under the element with class "Login-content".
web.find_element_by_xpath(".//*[@class='Login-content']/form/button/ img [contains(@src,'Profile') ]").click()
常用查找方式
find_element_by_name
find_element_by_id
find_element_by_xpath
find_element_by_link_text
find_element_by_partial_link_text
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector
5、处理知乎验证码
https://blog.csdn.net/sinat_37202005/article/details/54406105
https://www.imooc.com/article/35586
2018.08.15改写middleware
class JSMiddleware(object):
    """Downloader middleware that loads pages in Chrome for one spider.

    Requests from the spider named "DouyuImage" are opened in a Chrome
    webdriver and the page source is returned as an ``HtmlResponse``.
    Requests from any other spider are left untouched.
    """

    def process_request(self, request, spider):
        """Load ``request.url`` in Chrome for the "DouyuImage" spider.

        Args:
            request: the request being processed; its ``url`` is loaded.
            spider: the spider issuing the request; only ``name`` is read.

        Returns:
            An ``HtmlResponse`` built from the browser's page source, or
            None when the spider is not "DouyuImage" or loading fails.
        """
        # Do not start a browser for spiders this middleware does not serve.
        if spider.name != "DouyuImage":
            return None

        web = webdriver.Chrome("E:/software/python3.6/chromedriver.exe")
        try:
            web.get(request.url)
            # Fixed wait before reading the page source.
            time.sleep(3)
            body = web.page_source
            print("访问:{0}".format(request.url))
            print("^" * 50)
            return HtmlResponse(url=web.current_url, body=body, encoding="utf-8", request=request)
        except Exception as e:
            # Best effort: report the failure and fall back to returning None.
            print(e)
            print("webdriver 失败")
            return None
        finally:
            # Always shut the browser down so Chrome processes do not pile up.
            web.quit()
2018.08.18
python 连接 mysql 的SQL语句中如果含有中文一定要用format
sql = 'select id from question where user = "{0}" and Q_title = "{1}"'.format(item['Q_user'],item['Q_title'])
一定要这样写,记住 {0}、{1} 外面是有引号的!(注意:这种字符串拼接存在 SQL 注入风险,抓取到的内容里如果含有引号会破坏语句;更稳妥的做法是使用参数化查询,把值作为参数传给 cursor.execute。)