Logging in to Douban with Selenium and scraping movie reviews

Logging in to Douban with Selenium and scraping movie reviews:

  Originally, when writing a crawler for Douban comments, we could find a usable data endpoint directly in the browser's F12 developer tools. Douban was recently updated, however, and the data is now loaded asynchronously via JavaScript, so there is no longer a suitable endpoint to crawl directly. Instead, we use Selenium to drive a real browser.

  The Douban login page has also been restyled: the login form now lives inside a separate iframe.

So the code is as follows:

# -*- coding: utf-8 -*-
# Imports
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Create a Chrome options object
opt = webdriver.ChromeOptions()
# Run Chrome headless; works the same on Windows and Linux
opt.add_argument("--headless")
# Launch Chrome with the options.
# Bug fix: the original created a second, non-headless driver immediately
# after this one, silently discarding the headless option.
driver = webdriver.Chrome(options=opt)

# Open the Douban home page
driver.get("http://www.douban.com/")

# The login form lives inside an <iframe>; switch into the first one.
# Bug fix: the tag name is "iframe", not "iframes".
driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
# Click the "password login" tab
bottom1 = driver.find_element_by_xpath('/html/body/div[1]/div[1]/ul[1]/li[2]')
bottom1.click()

# Fill in the account name
input1 = driver.find_element_by_xpath('//*[@id="username"]')
input1.clear()
input1.send_keys("xxxxx")

# Fill in the password
input2 = driver.find_element_by_xpath('//*[@id="password"]')
input2.clear()
input2.send_keys("xxxxx")

# Submit the login form.
# Bug fix: a class-name locator may not contain spaces, so the stray
# trailing space in 'account-form-field-submit ' is removed.
bottom = driver.find_element_by_class_name('account-form-field-submit')
bottom.click()

 Then navigate to the comments page: https://movie.douban.com/subject/3882715/comments?sort=new_score

Clicking "Next" reveals that the URL changes to https://movie.douban.com/subject/3882715/comments?start=20&limit=20&sort=new_score — once we see how the start parameter changes, we can simply write a loop over it.

 

 Get the user's name

 

driver.find_element_by_xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a'.format(str(i))).text
用户的评论

driver.find_element_by_xpath('//*[@id="comments"]/div[{}]/div[2]/p/span'.format(str(i))).text
然后我们想要知道用户的居住地:
#获取用户的url然后点击url获取居住地
userInfo=driver.find_element_by_xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a'.format(str(i))).get_attribute('href')
driver.get(userInfo)
try:
    userLocation = driver.find_element_by_xpath('//*[@id="profile"]/div/div[2]/div[1]/div/a').text
    print("用户的居之地是:  ")
    print(userLocation)
except Exception as e:
    print(e)

这里要注意有些用户没有写居住地,所以必须要捕获异常

完整代码

# -*- coding:utf-8 -*-
# 导包
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

class doubanwlwz_spider():
    """Log in to Douban with a headless Chrome and scrape movie comments.

    For each of the 20 comments on the target page, prints the commenter's
    name, the comment text and (when available) the commenter's location.
    """

    def __init__(self):
        # Chrome options: run headless so this works with no display attached
        opt = webdriver.ChromeOptions()
        opt.add_argument('--headless')
        # Bug fix: the original created a second, non-headless driver right
        # after this one, which silently discarded the headless option.
        driver = webdriver.Chrome(options=opt)
        self.getInfo(driver)

    def getInfo(self, driver):
        """Log in, open the comments page, and print name/comment/location.

        driver: an already-constructed selenium WebDriver instance.
        """
        # Log in to Douban: the login form lives inside the first <iframe>
        driver.get("http://www.douban.com/")
        driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
        # Click the "password login" tab
        bottom1 = driver.find_element_by_xpath('/html/body/div[1]/div[1]/ul[1]/li[2]')
        bottom1.click()

        # Fill in account name and password
        input1 = driver.find_element_by_xpath('//*[@id="username"]')
        input1.clear()
        input1.send_keys("ZZZ2")

        input2 = driver.find_element_by_xpath('//*[@id="password"]')
        input2.clear()
        input2.send_keys("ZZZ")

        # Submit the login form.
        # Bug fix: a class-name locator may not contain spaces, so the
        # trailing space in 'account-form-field-submit ' is removed.
        bottom = driver.find_element_by_class_name('account-form-field-submit')
        bottom.click()

        # Give the login redirect a moment, then open the comments page
        time.sleep(1)
        driver.get('https://movie.douban.com/subject/3882715/comments?start=300&limit=20&sort=new_score')

        # 20 comments per page; div[i] is the i-th comment block
        for i in range(1, 21):
            # Bug fix: the original printed the user's name under the
            # "comment" label and the comment under the "name" label;
            # each label is now attached to the matching XPath.
            print("用户的名字是: ")
            print(driver.find_element_by_xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a'.format(i)).text)
            print("用户的评论是:  ")
            print(driver.find_element_by_xpath('//*[@id="comments"]/div[{}]/div[2]/p/span'.format(i)).text)

            # Follow the commenter's profile link to read their location
            userInfo = driver.find_element_by_xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a'.format(i)).get_attribute('href')
            driver.get(userInfo)
            try:
                # Some users never filled in a location, so this lookup can
                # raise NoSuchElementException — it must be caught.
                userLocation = driver.find_element_by_xpath('//*[@id="profile"]/div/div[2]/div[1]/div/a').text
                print("用户的居住地是:  ")
                print(userLocation)
            except Exception as e:
                print(e)
            # Return to the comments page for the next iteration
            driver.back()


# Guard the entry point so importing this module does not block on input()
# or launch a browser.
if __name__ == "__main__":
    # NOTE(review): pageNum is read but never used — the spider scrapes a
    # single hard-coded page; wire this value through if paging is wanted.
    pageNum = int(input("请输入您想要爬去的步行街的页数: "))
    AAA = doubanwlwz_spider()

 

  

 

Guess you like

Origin www.cnblogs.com/ZFBG/p/10992970.html