Python爬虫-爬取豆瓣信息(selenium+xpath)

======================

====================================

利用selenium模拟豆瓣登录:

 1 '''
 2 利用selenium模拟登录豆瓣
 3 需要输入验证码
 4 思路:
 5 1. 保存页面成快照
 6 2. 等待用户手动输入验证码
 7 3. 继续自动执行提交等动作
 8 
 9 '''
10 
11 from selenium import webdriver
12 from selenium.webdriver.common.keys import  Keys
13 import time
14 
15 
# Imported here so this script section is self-contained; By supplies the
# locator strategies used with find_element() below.
from selenium.webdriver.common.by import By

# Log in to Douban through a real Chrome session. The captcha cannot be
# solved automatically, so the flow is:
#   1. open the login page and save a screenshot of it,
#   2. wait for the user to type the captcha shown in the screenshot,
#   3. submit the form and save the resulting page.
url = 'https://accounts.douban.com/login?alias=&redir=https%3A%2F%2Fwww.douban.com%2F&source=index_nav&error=1001'
driver = webdriver.Chrome()

# try/finally guarantees the browser is shut down even if a locator lookup,
# the prompt, or a file write below raises.
try:
    driver.get(url)

    # Fixed pause to give the login page time to render.
    time.sleep(4)

    # Save a screenshot so the user can read the captcha from it.
    driver.save_screenshot('douban_index.png')

    captcha = input("plz input youre code:")

    # Log in with the account details and the captcha typed by the user.
    # find_element(By.<strategy>, value) is used instead of the
    # find_element_by_* helpers, which were removed in Selenium 4.3.
    # NOTE(review): credentials are hard-coded and the e-mail literal looks
    # like a redacted placeholder -- supply real values before running.
    driver.find_element(By.ID, "email").send_keys("[email protected]")
    driver.find_element(By.ID, "password").send_keys("haha123456")
    driver.find_element(By.ID, "captcha_field").send_keys(captcha)

    driver.find_element(By.XPATH, "//input[@class='btn-submit']").click()

    # Fixed pause to let the post-login page load before capturing it.
    time.sleep(5)

    driver.save_screenshot("logined.png")

    # Keep a copy of the rendered page for later inspection.
    with open("douban_home.html", 'w', encoding='utf-8') as file:
        file.write(driver.page_source)
finally:
    driver.quit()

============================================

利用selenium+xpath爬取豆瓣书籍信息

 1 '''
 2 https://book.douban.com/subject_search?search_text=python&cat=1001&start=%s0
 3 使用selenium爬取页面
 4 保存内容后用xpath进行分析
 5 '''
 6 
 7 from selenium import webdriver
 8 import time
 9 from lxml import etree
10 
11 
def get_web(url):
    """Load *url* in Chrome, save the rendered page and parse it.

    Side effects: writes a screenshot to ``douban_reader.png`` and the
    rendered page source to ``douban_reader.html``, then hands that HTML
    file to ``content_parse``.

    Args:
        url: Address of the page to open.
    """
    driver = webdriver.Chrome()

    # try/finally ensures the browser is closed even when loading, saving
    # or parsing fails; previously an exception left Chrome running.
    try:
        driver.get(url)

        # Fixed pause so the page can finish rendering before it is captured.
        print('waitting for .......')
        time.sleep(20)
        print('waitting done .......')

        driver.save_screenshot('douban_reader.png')

        # Per the original author's note, the book data is not present in
        # the raw page source on Douban, so the rendered source is dumped
        # from driver.page_source first and parsed from that file.
        fn = 'douban_reader.html'
        with open(fn, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)

        content_parse(fn)
    finally:
        driver.quit()
29 
def content_parse(fn):
    """Print the title of every book found in the saved HTML file *fn*.

    Args:
        fn: Path of a UTF-8 encoded HTML file to read.
    """
    with open(fn, 'r', encoding='utf-8') as f:
        html = f.read()

    # Build an element tree so the page can be queried with XPath.
    tree = etree.HTML(html)

    # Each search result sits in a <div class="item-root">.
    for book in tree.xpath('//div[@class="item-root"]'):
        title_links = book.xpath(".//div[@class='title']/a")
        # Skip result items that carry no title link instead of failing
        # with IndexError on the first such item.
        if not title_links:
            continue
        print(title_links[0].text)
46 
47 
if __name__ == '__main__':
    # The trailing '%s0' in the address is a placeholder for the result
    # offset; it has to be filled in before the request, otherwise the
    # literal text '%s0' is sent as the 'start' parameter.
    url_template = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start=%s0'
    page = 0  # first page of results
    get_web(url_template % page)

猜你喜欢

转载自www.cnblogs.com/xuxaut-558/p/10088415.html