Python crawler: simulating Zhihu login with Selenium to get cookies + requests.Session() + session serialization (repost)

Reposted from https://www.cnblogs.com/DOLFAMINGO/p/9170429.html

# coding:utf-8
from selenium import webdriver
import requests
import sys
import time
from lxml import etree
import cPickle
import os
# reload(sys)
# sys.setdefaultencoding('utf-8')

class Zhihu:
    def __init__(self, homeurl):
        self.homeurl = homeurl

    def save_session(self, session):    # save the session so later runs can reuse it without logging in again
        with open('session.txt', 'wb') as f:
            cPickle.dump(session, f)
            print "Cookies have been written."

    def load_session(self):             # load the previously saved session
        with open('session.txt', 'rb') as f:
            s = cPickle.load(f)
        return s

    def GetCookies(self):               # first login: simulate it with Selenium and collect the cookies
        browser = webdriver.Chrome()
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/div[1]/div[2]/div[1]/input").send_keys("13060882373")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/div[2]/div/div[1]/input").send_keys("xxxxxx")
        browser.find_element_by_xpath("//main//div[2]/div[1]/form/button").click()
        time.sleep(10)
        cookies = browser.get_cookies()
        browser.quit()
        return cookies

    def get_session(self):              # get a requests session that carries the login cookies
        s = requests.Session()
        if not os.path.exists('session.txt'):    # no saved session yet: create one and save it to a file
            s.headers.clear()
            for cookie in self.GetCookies():
                s.cookies.set(cookie['name'], cookie['value'])
            self.save_session(s)
        else:                                     # a saved session exists: load and use it directly
            s = self.load_session()
        return s

    def Crawl(self):                    # start crawling
        s = self.get_session()
        html = s.get(self.homeurl).text
        html_tree = etree.HTML(html)
        items = html_tree.xpath('//main//div[1]/div[2]//div[@class="ContentItem AnswerItem"]/@data-zop')
        for item in items:
            content = eval(item)        # the data-zop attribute holds a dict-like JSON string
            authorName = content['authorName']
            title = content['title']
            print authorName + " Answer: " + title

zhihu = Zhihu('https://www.zhihu.com/')
zhihu.Crawl()
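
The code above is Python 2 (cPickle, print statements). For readers on Python 3, the following is a minimal, untested sketch of the same cookie-transfer and serialization idea using the standard pickle module. The helper names and the cookies.pkl file name are my own, not from the original post, and it pickles only the cookie jar rather than the whole Session object.

import os
import pickle
import requests

COOKIE_FILE = 'cookies.pkl'  # illustrative file name, not from the original post

def session_from_selenium_cookies(selenium_cookies):
    # Copy cookies returned by Selenium's browser.get_cookies() into a requests.Session.
    s = requests.Session()
    for c in selenium_cookies:
        s.cookies.set(c['name'], c['value'])
    return s

def save_cookies(session):
    # Persist only the cookie jar; it is picklable and smaller than the full Session.
    with open(COOKIE_FILE, 'wb') as f:
        pickle.dump(session.cookies, f)

def load_session():
    # Rebuild a Session from the pickled cookie jar (empty session if no file exists).
    s = requests.Session()
    if os.path.exists(COOKIE_FILE):
        with open(COOKIE_FILE, 'rb') as f:
            s.cookies.update(pickle.load(f))
    return s

In this sketch you would call session_from_selenium_cookies(browser.get_cookies()) once after the Selenium login, save_cookies() to persist it, and load_session() on later runs, mirroring the get_session() logic in the original class.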

 
