# Reposted from https://www.cnblogs.com/DOLFAMINGO/p/9170429.html
# coding:utf-8
"""Log in to Zhihu once through Selenium, cache the session, and list answers.

The first run drives a Chrome window through the sign-in form, copies the
resulting cookies into a ``requests.Session`` and pickles that session to
``session.txt``.  Later runs reuse the pickled session instead of logging
in again.
"""
import json
import os
import sys
import time

try:
    import cPickle  # Python 2
except ImportError:  # Python 3 has no cPickle; keep the name the code uses.
    import pickle as cPickle

import requests
from lxml import etree
from selenium import webdriver

# Python 2 only; left disabled as in the original listing.
# reload(sys)
# sys.setdefaultencoding('utf-8')


class Zhihu:
    """Fetch a Zhihu page with a cached, logged-in ``requests`` session."""

    # File the pickled requests.Session is cached in (relative to the CWD).
    SESSION_FILE = 'session.txt'

    def __init__(self, homeurl, username="13060882373", password="xxxxxx"):
        """Remember the page to crawl and the credentials used to sign in.

        Args:
            homeurl: URL that ``crawl`` downloads.
            username: Text typed into the first sign-in field.  The default
                is the value that was hard-coded in the original script.
            password: Text typed into the second sign-in field.  The default
                is the placeholder from the original script.
        """
        self.homeurl = homeurl
        self.username = username
        self.password = password

    def save_session(self, session):
        """Pickle ``session`` to disk so the next run can skip the login."""
        with open(self.SESSION_FILE, 'wb') as f:
            cPickle.dump(session, f)
        print("Cookies have been written.")

    def load_session(self):
        """Return the session object previously pickled by ``save_session``.

        NOTE(review): unpickling executes code embedded in the file, so
        ``session.txt`` must only ever be a file this script wrote itself.
        """
        with open(self.SESSION_FILE, 'rb') as f:
            return cPickle.load(f)

    def GetCookies(self):
        """Sign in through a Chrome window and return the browser's cookies.

        Returns:
            Whatever ``browser.get_cookies()`` yields after the form has been
            submitted; ``get_session`` reads the ``'name'`` and ``'value'``
            entries of each item.
        """
        browser = webdriver.Chrome()
        try:
            browser.get("https://www.zhihu.com/signin")
            browser.find_element_by_xpath(
                "//main//div[2]/div[1]/form/div[1]/div[2]/div[1]/input"
            ).send_keys(self.username)
            browser.find_element_by_xpath(
                "//main//div[2]/div[1]/form/div[2]/div/div[1]/input"
            ).send_keys(self.password)
            browser.find_element_by_xpath(
                "//main//div[2]/div[1]/form/button"
            ).click()
            # Fixed pause before reading cookies; presumably this gives the
            # sign-in request time to complete -- TODO confirm it is enough.
            time.sleep(10)
            return browser.get_cookies()
        finally:
            # Always close the browser, even when an element lookup fails,
            # so no Chrome process is left behind.
            browser.quit()

    def get_session(self):
        """Return a logged-in session, creating and caching it if needed."""
        if os.path.exists(self.SESSION_FILE):
            # A cached session already exists: load it directly.
            return self.load_session()

        # No cached session yet: build one from a fresh browser login.
        s = requests.Session()
        s.headers.clear()
        for cookie in self.GetCookies():
            s.cookies.set(cookie['name'], cookie['value'])
        self.save_session(s)
        return s

    def crawl(self):
        """Download ``homeurl`` and print one line per answer item found."""
        s = self.get_session()
        html = s.get(self.homeurl).text
        html_tree = etree.HTML(html)
        items = html_tree.xpath(
            '//main//div[1]/div[2]'
            '//div[@class="ContentItem AnswerItem"]/@data-zop'
        )
        for item in items:
            # The original called eval() on this attribute.  The text comes
            # from a remote page, so eval() would run arbitrary code and also
            # fails on JSON literals such as true/false/null.  Assumes the
            # attribute holds a JSON object -- TODO confirm against the page.
            content = json.loads(item)
            authorName = content['authorName']
            title = content['title']
            print(authorName + " Answer: " + title)


if __name__ == "__main__":
    # Guarded so that importing this module does not open a browser.
    zhihu = Zhihu('https://www.zhihu.com/')
    zhihu.crawl()