Python crawler, Day 02

I. Using the requests library to crawl Douban Top 250 movie information
    - requested url
        https://movie.douban.com/top250

    - request method
        GET

    - request header
        User-Agent
        Cookies

"" "" "" 
'''
Information crawled from Douban for each movie:
    movie ranking, movie name, movie url, director,
    starring actors, year / genre,
    rating, number of reviews, one-line introduction

1. Analyze the URL of every list page
first page:
https://movie.douban.com/top250?start=0&filter=
second page:
https://movie.douban.com/top250?start=25&filter=
third page:
https://movie.douban.com/top250?start=50&filter=

'''
import re

import requests
# The three steps of a crawler
# 1. Send the request
def get_page(url, headers=None, timeout=10):
    """Send a GET request to *url* and return the response object.

    Args:
        url: Address of the page to download.
        headers: Optional dict of HTTP request headers. When omitted, a
            browser-like ``User-Agent`` is sent, because the notes at the
            top of this file list User-Agent among the required headers.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawler forever.

    Returns:
        The ``requests.Response`` returned by ``requests.get``.
    """
    if headers is None:
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/74.0.3729.169 Safari/537.36'
            ),
        }
    return requests.get(url, headers=headers, timeout=timeout)


# 2. Parse the data
def parse_index(html):
    """Extract the movie records from one Douban Top 250 list page.

    Args:
        html: Text of a list page.

    Returns:
        A list with one 9-tuple of strings per matched movie, in this order:
        ranking, movie url, name, director, starring, year / genre,
        rating, number of reviews, one-line introduction.
        An empty list when nothing matches.
    """
    # Each fragment anchors on one piece of the item's markup; ``.*?`` skips
    # whatever lies between two anchors. The Chinese anchors (director:,
    # starring:, "people rated") must stay in Chinese because they are matched
    # against the page text itself.
    pattern = (
        r'<div class="item">.*?<em class="">(.*?)</em>'
        r'.*?<a href="(.*?)">'
        r'.*?<span class="title">(.*?)</span>'
        r'.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>'
        r'.*?<span class="rating_num".*?>(.*?)</span>'
        r'.*?<span>(.*?)人评价</span>'
        r'.*?<span class="inq">(.*?)</span>'
    )
    # re.S lets ``.`` cross line breaks, since one item spans many lines.
    movie_list = re.findall(pattern, html, re.S)

    return movie_list

# 3. Save the data
def save_data(movie):
    """Print one movie record and append it to ``douban_top250.txt``.

    Args:
        movie: A sequence of exactly nine strings in this order: ranking,
            movie url, name, director, starring, year / genre, rating,
            number of reviews, one-line introduction.

    Raises:
        ValueError: If *movie* does not contain exactly nine items.
    """
    (top, m_url, name, daoyan, actor, year_type,
     point, commit, desc) = movie

    # Strip all surrounding whitespace, not just newlines, so indentation
    # around the year / genre text does not end up in the output.
    year_type = year_type.strip()

    data = f'''
                ======== welcome to watch ========
                        movies ranking: {top}
                        movies url: {m_url}
                        Movie Name: {name}
                        film directors: {daoyan}
                        movie starring: {actor}
                        Year type: {year_type}
                        movie ratings: {point}
                        movie review: {commit}
                        The movie: {desc}
                ======== Please come back next time ========
                \n
                \n
                '''
    print(data)

    # Append mode: every call adds one record to the same file.
    with open('douban_top250.txt', 'a', encoding='utf-8') as f:
        f.write(data)

    print(f'Movie: {name} successfully written ...')

if __name__ == '__main__':
    # The list is paginated through the ``start`` query parameter in steps
    # of 25 (0, 25, 50, ...); ten pages are requested.
    for num in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={num}&filter='
        print(url)

        # 1. Send a request for this list page
        index_res = get_page(url)

        # 2. Parse the page for movie information
        movie_list = parse_index(index_res.text)

        # 3. Save every movie found on the page
        for movie in movie_list:
            save_data(movie)

II. The selenium request library
    1. What is selenium?
        Selenium started out as an automated testing tool. It works by
        driving a real browser to carry out predefined operations. Since a
        crawler essentially imitates a browser, selenium can be used for
        crawling as well.

    2. Why use selenium?
        Advantages:
            - it executes JavaScript code
            - there is no need to analyze the complex communication flow
            - it can perform pop-up, drop-down and other operations in the browser
            - ***** it can obtain dynamically loaded data
            - *** it can get past login authentication

        Disadvantages:
            - low execution efficiency

    3. Installation and use
        1. Install the selenium request library:
            pip3 install selenium

        2. A browser must be installed:
            Google Chrome or Firefox

        3. Install the browser driver:
            http://npm.taobao.org/mirrors/chromedriver/2.38/
            Windows:
                download the win32 driver

Basic usage of selenium

 

from selenium import webdriver  # browser driver
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard key operations
from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait  # wait for certain elements to load
import time

# Option 1: open the browser by passing the driver's path explicitly
# driver = webdriver.Chrome(r'absolute path to the driver/webdriver.exe')

# Option 2: put the driver executable into the Scripts folder of the Python
# installation and make sure that folder (and the installation directory)
# is listed in the PATH environment variable.
driver = webdriver.Chrome()

try:
    driver.get('https://www.jd.com/')

    # Explicit wait object: waits up to 10 seconds for a condition to hold.
    wait = WebDriverWait(driver, 10)

    # Wait until the element with id "key" is present in the page.
    input_tag = wait.until(EC.presence_of_element_located(
        (By.ID, 'key')
    ))

    time.sleep(5)

    # Type a product name into the input box
    input_tag.send_keys('doll')

    # Press the Enter key
    input_tag.send_keys(Keys.ENTER)

    # Keep the browser open for a while so the result can be inspected.
    time.sleep(20)

finally:
    # quit() ends the whole WebDriver session (all windows and the driver
    # process); close() would only close the current window.
    driver.quit()

 

Selenium selectors

 

from selenium import webdriver  # browser driver
from selenium.webdriver.common.by import By  # locator strategies
from selenium.webdriver.common.keys import Keys  # keyboard key operations
import time

driver = webdriver.Chrome()

try:
    # Implicit wait: must be set before get(); every element lookup then
    # waits up to 10 seconds for the element to appear.
    driver.implicitly_wait(10)

    driver.get('https://www.baidu.com/')

    # Fixed pause after get() before interacting with the page.
    time.sleep(5)

    # =============== All lookup methods ===============
    #   find_element  returns a single matching tag
    #   find_elements returns all matching tags

    # Automatic Baidu login: start
    # 1. By.LINK_TEXT: look up a link by its full text
    # NOTE(review): the translated source had 'login' here; the Chinese link
    # text is assumed because the target page is Baidu - confirm.
    login_link = driver.find_element(By.LINK_TEXT, '登录')
    login_link.click()  # click the login link

    time.sleep(1)

    # 2. By.ID: look up by id
    user_login = driver.find_element(By.ID, 'TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()

    time.sleep(1)

    # 3. By.CLASS_NAME: look up by class name
    user = driver.find_element(By.CLASS_NAME, 'pass-text-input-userName')
    user.send_keys('*****')

    # 4. By.NAME: look up by the name attribute
    pwd = driver.find_element(By.NAME, 'password')
    pwd.send_keys('*****')

    submit = driver.find_element(By.ID, 'TANGRAM__PSP_10__submit')
    submit.click()
    # Automatic Baidu login: end

    # 5. By.PARTIAL_LINK_TEXT: look up a link by part of its text
    # NOTE(review): the source showed an empty string here, presumably lost
    # in translation; '登' is an assumed fragment of the login link - confirm.
    login_link = driver.find_element(By.PARTIAL_LINK_TEXT, '登')
    login_link.click()

    # 6. By.CSS_SELECTOR: look up with a CSS selector
    #    "." selects by class, "#" selects by id
    login2_link = driver.find_element(By.CSS_SELECTOR, '.tang-pass-footerBarULogin')
    login2_link.click()

    # 7. By.TAG_NAME: find_elements returns every matching tag as a list
    div = driver.find_elements(By.TAG_NAME, 'div')
    print(div)

    # Keep the browser open for a while so the result can be inspected.
    time.sleep(20)

finally:
    # quit() ends the whole WebDriver session (all windows and the driver
    # process); close() would only close the current window.
    driver.quit()

 

Guess you like

Origin www.cnblogs.com/zyl0517/p/11122410.html