Python Crawler — Day 2

'''
Crawl Douban Top-250 movie information:
    movie ranking, movie URL, movie name,
    director, starring actors, year / genre,
    rating, number of reviews, synopsis.

Parse the URLs of all listing pages on the site.
'''
import re

import requests
# The crawler has three steps:
# 1. Send the request
def get_page(url):
    """Send a GET request to *url* and return the ``requests`` Response.

    No status checking is done here; callers read ``response.text``
    directly.  NOTE(review): a production crawler would also set a
    User-Agent header and call ``response.raise_for_status()``.
    """
    response = requests.get(url)
    return response

# 2. Parse the data
def parse_index(html):
    """Extract every movie entry from one Douban Top-250 listing page.

    Returns a list of 9-tuples, one per movie:
    (ranking, url, name, director, actors, year/genre, rating,
     review count, synopsis).

    ``re.S`` lets ``.`` cross line boundaries, since each movie's
    ``<div class="item">`` markup spans many lines.
    """
    movie_list = re.findall(
        '<div class="item">.*?<em class="">(.*?)</em>.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?导演:(.*?)主演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>.*?<span class="inq">(.*?)</span>',
        html,
        re.S,
    )
    return movie_list

# 3. Save the data
def save_data(movie):
    top , m_url, name,daoyan,actor,year_type,point,commit,desc = movie
    year_type = year_type.strip('\n')
    data = '''
            Movie Name: {}
            movies url: {}
            movies ranking: {}
          ========== ========== welcome to watch
            filmmaker: {} 
            Movie Starring: {} 
            Genre: {} 
            Movie Rating: {} 
            Movie Review: {} 
            The movie: {} 
          ========== ======= come again === 
          \ n- 
          \ n- 
          '' ' .format (Top, m_url, name, daoyan, the Actor, year_type, Point, the commit, desc)
     Print (Data) 
    with Open ( ' douban_top250.txt ' , ' A ' , encoding = ' UTF-. 8 ' ) AS F: 
        f.write (Data) 
    Print ( ' movie: {} writing success ... ' .format (name)) 

if __name__ == '__main__':
    # The Top-250 list is split over 10 pages of 25 movies each,
    # addressed by a start offset: 0, 25, 50, ...
    for page in range(10):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(page * 25)
        print(url)

        # 1. Request one listing page.
        index_res = get_page(url)

        # 2. Parse the page into movie tuples.
        movie_list = parse_index(index_res.text)

        # 3. Save every movie record.
        for movie in movie_list:
            save_data(movie)
The selenium request library

1. What is selenium?
   Selenium began as an automated-testing tool; it works by driving a real
   browser to perform scripted operations.  For crawling, it essentially
   simulates a browser, so it can be used to scrape pages.

2. Why use selenium?
   Advantages:
       - executes JavaScript
       - no need to analyse complex request/response flows
       - can drive browser interactions (pop-ups, scrolling, etc.)
       - ***** can fetch dynamically rendered data
       - *** can get past login checks

   Disadvantage: low efficiency

3. Installation and usage
   1. Install the selenium library:
          pip3 install selenium
   2. A browser must be installed (Google Chrome or Firefox).
   3. Install the matching browser driver:
          http://npm.taobao.org/mirrors/chromedriver/2.38/
"""Selenium demo: open jd.com, type a query, and press Enter."""
from selenium import webdriver  # web driver
from selenium.webdriver import ActionChains  # drag elements, e.g. slider captchas
from selenium.webdriver.common.by import By  # lookup strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard operations
from selenium.webdriver.support import expected_conditions as EC  # used with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time

# Option 1: point webdriver at an explicit chromedriver binary.
driver = webdriver.Chrome(r'C:\Users\Merliah\Downloads\chromedriver.exe')

# Option 2: drop chromedriver.exe into <python install dir>/Scripts and
# make sure that directory (and the interpreter's) is on PATH; then
# webdriver.Chrome() needs no argument.

try:
    driver.get('https://www.jd.com/')

    # Explicit wait: poll for up to 10 seconds until the element appears.
    wait = WebDriverWait(driver, 10)
    # Find the search box -- the element whose id is 'key'.
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))
    time.sleep(5)

    # Type the product name into the search box.
    input_tag.send_keys('doll')
    # Press Enter to submit the search.
    input_tag.send_keys(Keys.ENTER)
    time.sleep(20)

finally:
    # Close the browser and release its OS resources.
    driver.close()

Selenium selectors

"""Selenium demo: the find_element_* selector family (Baidu login walkthrough)."""
from selenium import webdriver  # web driver
from selenium.webdriver.common.keys import Keys  # keyboard operations
import time

driver = webdriver.Chrome()

try:
    # Implicit wait: must be set BEFORE get(); every subsequent lookup
    # waits up to 10 seconds for its element to appear.
    driver.implicitly_wait(10)

    driver.get('https://www.jd.com/')

    # An explicit pause, by contrast, happens AFTER get().
    time.sleep(5)

    '''
    =============== selector methods ===============
    find_element_*  -> the first matching tag
    find_elements_* -> all matching tags
    '''
    # Baidu auto-login walkthrough -- start
    # 1. find_element_by_link_text: locate by exact link text.
    login_link = driver.find_element_by_link_text('login')
    login_link.click()  # click "login"

    time.sleep(1)

    # 2. find_element_by_id: locate by element id.
    user_login = driver.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()

    time.sleep(1)

    # 3. find_element_by_class_name: locate by CSS class.
    user = driver.find_element_by_class_name('pass-text-input-userName')
    user.send_keys('')

    # 4. find_element_by_name: locate by the name attribute.
    pwd = driver.find_element_by_name('password')
    pwd.send_keys('')

    submit = driver.find_element_by_id('TANGRAM__PSP_10__submit')
    submit.click()
    # end

    # 5. find_element_by_partial_link_text: locate by partial link text.
    login_link = driver.find_element_by_partial_link_text('registration')
    login_link.click()

    # 6. find_element_by_css_selector: locate by CSS selector
    #    (. selects a class, # selects an id).
    login2_link = driver.find_element_by_css_selector('.tang-pass-footerBarULogin')
    login2_link.click()

    # 7. find_elements_by_tag_name: all elements with a given tag.
    divs = driver.find_elements_by_tag_name('div')
    print(divs)

    time.sleep(20)

finally:
    # Close the browser and release its OS resources.
    driver.close()

 



You may also like

Source: www.cnblogs.com/merliah/p/11119558.html