day02-selenium library

Yesterday's review:
First, the basic principles of web crawlers
    - the overall crawling workflow:
        1. Send the request
        2. Receive the response data
        3. Parse and extract the valuable data
        4. Save the data
Second, the requests library
    - GET
        url
        headers
        cookies
    - POST
        url
        headers
        cookies
        data
Third, crawling the Xiaohua site
    1. parse each index page and extract the detail-page urls
    2. extract the video url from each detail page
    3. fetch the video's binary stream and write it to a local file
Fourth, automatic GitHub login
    1. analyse the request headers and request body
        - username
        - password
        - addr
        - miscellaneous fields
    2. the token
        - extracted by parsing the login page
    3. send the request to session_url
        - headers:
            User-Agent

        - cookies:
            cookies from the login page

        - data:
            the form data
Today's content:
First, use requests to crawl the Douban Top 250 movie information
    - requested url
        https://movie.douban.com/top250

    - request method
        GET

    - request headers
        User-Agent
        Cookies
Second, the selenium library
    1. What is selenium?
        Originally an automated testing tool; it works by driving a browser to perform user-defined operations.
        A crawler essentially simulates a browser, so selenium can also be used for crawling.
    2. Why use selenium?
        advantages:
            - can execute js code
            - no need to analyse complex request/response flows
            - can drive the browser: pop-ups, scrolling and other actions
            - ***** can obtain dynamically loaded data
            - *** can get past login verification
        drawbacks:
            - low efficiency
    3. Installation and usage
        1. install the selenium library:
            pip3 install selenium
        2. a browser must be installed:
            Google Chrome or Firefox
        3. install the matching browser driver:
            http://npm.taobao.org/mirrors/chromedriver/2.38/
# Crawl Douban Top 250 movies
'''
Crawl Douban movie information:
    movie rank, movie url,
    movie name, director, starring actors,
    year / genre, rating, number of reviews,
    movie synopsis

1. Analyse the url of every index page:
    first page:
        https://movie.douban.com/top250?start=0&filter=
    second page:
        https://movie.douban.com/top250?start=25&filter=
'''
import requests
import re
# Crawler trilogy
# 1. Send the request
def get_page(url):
    """Send a GET request to *url* and return the requests Response object."""
    response = requests.get(url)
    # print(response.text)
    return response

# 2. Parse the data
def parse_index(html):
    """Extract movie fields from a Douban Top 250 index page.

    Returns a list of 9-tuples:
    (rank, url, name, director, actors, year/genre, rating, review_count, synopsis)

    NOTE(review): the pattern requires every field (including the <span
    class="inq"> quote) to be present, so movies without a one-line quote
    are silently skipped.
    """
    movie_list = re.findall(
        '<div class="item">.*?<em class="">(.*?)</em>.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>.*?<span class="rating_num" .*?>(.*?)</span>.*?<span>(.*?)人评价</span>.*?<span class="inq">(.*?)</span>',
        html,
        re.S)  # re.S: let '.' also match newlines inside the page source
    return movie_list

# 3. Save the data
def save_data(movie):
    """Format one movie tuple and append it to douban_top250.text.

    *movie* is the 9-tuple produced by parse_index():
    (rank, url, name, director, actors, year/genre, rating, review_count, synopsis)
    """
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    data = f'''
        ========== welcome to watch ==========
        movie rank: {top}
        movie url: {m_url}
        movie name: {name}
        director: {daoyan}
        starring: {actor}
        year / genre: {year_type}
        rating: {point}
        review count: {commit}
        synopsis: {desc}
        ========== welcome back next time ==========
        '''
    print(data)

    # Append ('a') so each page's movies accumulate in one file
    with open('douban_top250.text', 'a', encoding='utf-8') as f:
        f.write(data)

    print(f'movie: {name} written successfully...')

if __name__ == '__main__':
    # Ten index pages, offset by 25 movies each: start=0, 25, ..., 225
    num = 0
    for line in range(10):
        url = f'https://movie.douban.com/top250?start={num}&filter='
        num += 25
        print(url)

        # 1. Send a request to each index page
        index_res = get_page(url)

        # 2. Parse the index page for movie information
        movie_list = parse_index(index_res.text)

        for movie in movie_list:
            # print(movie)
            # 3. Save the data
            save_data(movie)

# Basic usage of the selenium library
from selenium import webdriver
from selenium.webdriver.common.by import By  # lookup strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard operations
from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait below
from selenium.webdriver.support.wait import WebDriverWait  # wait for certain page elements to load
import time

# Way 1: pass the driver executable's location explicitly
driver = webdriver.Chrome(r'path to chromedriver.exe')  # TODO: replace with the real chromedriver path

# Way 2: put chromedriver.exe into <python install dir>/Scripts
#   (the Scripts folder is already on PATH when the interpreter dir is configured)

try:
    driver.get('https://www.jd.com/')

    # Explicit-wait object: wait up to 10 seconds for an element to appear
    wait = WebDriverWait(driver, 10)

    # Find the search box by its id ('key' is JD's search input id)
    input_tag = wait.until(EC.presence_of_element_located(
        (By.ID, 'key')
    ))
    time.sleep(5)

    # Type a product name into the search box
    input_tag.send_keys('doll')

    # Press Enter to submit the search
    input_tag.send_keys(Keys.ENTER)

    time.sleep(20)
finally:
    driver.close()


# Selectors
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome(r'path to chromedriver.exe')  # TODO: replace with the real chromedriver path

try:
    # Implicit wait: must be set BEFORE get(); waits up to 10 s for any element lookup
    driver.implicitly_wait(10)

    driver.get('https://www.baidu.com/')

    # Explicit waiting happens AFTER get()
    time.sleep(5)

    '''
    =============== all selector methods ===============
    find_element_*  -> find one matching tag
    find_elements_* -> find all matching tags
    '''
    # Auto-login to Baidu — start
    # 1. find_element_by_link_text: find by exact link text
    login_link = driver.find_element_by_link_text('登录')
    login_link.click()  # click the login link

    time.sleep(1)

    # 2. find_element_by_id: find by element id
    user_login = driver.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()

    time.sleep(1)

    # 3. find_element_by_class_name
    user = driver.find_element_by_class_name('pass-text-input-userName')
    user.send_keys('*****')

    # 4. find_element_by_name
    pwd = driver.find_element_by_name('password')
    pwd.send_keys('*****')

    submit = driver.find_element_by_id('TANGRAM__PSP_10__submit')
    submit.click()
    # end

    # 5. find_element_by_partial_link_text: find by partial link text
    login_link = driver.find_element_by_partial_link_text('注册')
    login_link.click()

    # 6. find_element_by_css_selector: find by CSS selector
    #    .  -> class
    #    #  -> id
    login2_link = driver.find_element_by_css_selector('.tang-pass-footerBarULogin')
    login2_link.click()

    # 7. find_element(s)_by_tag_name: find all tags with this name
    div = driver.find_elements_by_tag_name('div')
    print(div)

    time.sleep(20)

finally:
    # Close the browser and release OS resources
    driver.close()


Recommended reading

Origin: www.cnblogs.com/2328322824chx/p/11121314.html