'''
Crawl the Douban (douban.com) movie Top 250.

Information collected per movie:
    rank, name, URL, director, stars, year, genre,
    rating, number of reviews, short introduction.

1. Analysis of the index-page URLs:
   first page:  https://movie.douban.com/top250?start=0&filter=
   second page: https://movie.douban.com/top250?start=25&filter=
   third page:  https://movie.douban.com/top250?start=50&filter=
'''
import re

import requests
# Crawler trilogy
# 1. send the request
def get_page(url):
    """Fetch *url* with a plain GET and return the requests Response object.

    NOTE: in the original one-liner the trailing ``return response`` was
    accidentally swallowed by the ``# print(response.text)`` comment, so the
    function returned None; the return is restored here.
    """
    response = requests.get(url)
    # print(response.text)
    return response
# 2. parse the data
# Fields extracted per movie:
#   rank, url, name, director, stars, year/genre, rating, review count, intro
# Pattern sketch (matched with re.S so '.' also spans newlines):
#   <div class="item">.*?<em class="">(rank)</em>.*?<a href="(url)">
#   .*?<span class="title">(name)</span>.*?导演: (director)主演: (stars)<br>(year/genre)</p>
#   .*?<span class="rating_num".*?>(rating)</span>.*?<span>(count)人评价</span>
#   .*?<span class="inq">(intro)</span>
def parse_index(html):
    """Extract every movie entry from one Top-250 index page.

    Parameters
    ----------
    html : str
        Raw HTML of a https://movie.douban.com/top250 index page.

    Returns
    -------
    list[tuple]
        One 9-tuple per movie:
        (rank, url, name, director, stars, year/genre, rating,
         review_count, introduction).

    The original collapsed this onto one line after the ``def`` colon, which
    is a SyntaxError; the regex itself is unchanged.
    """
    movie_list = re.findall(
        '<div class="item">.*?<em class="">(.*?)</em>.*?<a href="(.*?)">'
        '.*?<span class="title">(.*?)</span>.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>'
        '.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>'
        '.*?<span class="inq">(.*?)</span>',
        html,
        re.S,  # let '.' match newlines: each entry spans multiple lines
    )
    return movie_list
# 3. save the data
# movie fields: rank, url, name, director, stars, year/genre,
#               rating, review count, introduction
def save_data(movie):
    """Format one movie tuple and append it to ``douban_top250.txt``.

    Parameters
    ----------
    movie : tuple
        The 9-tuple produced by ``parse_index``.

    NOTE(review): the original banner/field labels were Chinese and were
    destroyed by machine translation; the text below is a reconstruction,
    not the original wording.
    """
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    # the regex capture for year/genre is wrapped in newlines — trim them
    year_type = year_type.strip('\n')
    data = f'''================================
rank:        {top}
url:         {m_url}
name:        {name}
director:    {daoyan}
stars:       {actor}
year/genre:  {year_type}
rating:      {point}
reviews:     {commit}
intro:       {desc}
================================

'''
    print(data)
    # append so successive pages accumulate in one file
    with open('douban_top250.txt', 'a', encoding='utf-8') as f:
        f.write(data)
    print(f'Movie: {name} successfully written...')
if __name__ == '__main__':
    # Build all 10 index-page URLs: start=0, 25, 50, ... 225
    num = 0
    for _ in range(10):
        url = f'https://movie.douban.com/top250?start={num}&filter='
        num += 25
        print(url)
        # 1. send a request for each index page
        index_res = get_page(url)
        # 2. parse the index page for movie information
        movie_list = parse_index(index_res.text)
        for movie in movie_list:
            # print(movie)
            # 3. save the data
            save_data(movie)
# =============== The basic use of selenium ===============
from selenium import webdriver                    # web driver
from selenium.webdriver.common.keys import Keys   # keyboard operations
import time

driver = webdriver.Chrome()
try:
    # Implicit wait: must be set BEFORE get(); every element lookup then
    # waits up to 10 seconds for the element to appear.
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')
    # Explicit wait: applied AFTER get()
    time.sleep(5)
    # =============== lookup methods ===============
    # find_element_*  -> the first matching tag
    # find_elements_* -> all matching tags
    # --- automated Baidu login demo: start ---
    # 1. find_element_by_link_text: locate by exact link text
    # NOTE(review): the link text was almost certainly Chinese ('登录')
    # before machine translation mangled this file — confirm before running.
    login_link = driver.find_element_by_link_text('login')
    login_link.click()  # click "login"
    time.sleep(1)
    # 2. find_element_by_id: locate by id
    user_login = driver.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)
    # 3. find_element_by_class_name
    user = driver.find_element_by_class_name('pass-text-input-userName')
    user.send_keys('*****')
    # 4. find_element_by_name
    pwd = driver.find_element_by_name('password')
    pwd.send_keys('*****')
    submit = driver.find_element_by_id('TANGRAM__PSP_10__submit')
    submit.click()
    # --- automated Baidu login demo: end ---
    # 5. find_element_by_partial_link_text: locate by partial link text
    # NOTE(review): 'registration' is the translated text — confirm original.
    login_link = driver.find_element_by_partial_link_text('registration')
    login_link.click()
    # 6. find_element_by_css_selector: '.' selects a class, '#' selects an id
    login2_link = driver.find_element_by_css_selector('.tang-Pass-footerBarULogin')
    login2_link.click()
    # 7. find_element_by_tag_name (plural form: returns a list)
    div = driver.find_elements_by_tag_name('div')
    print(div)
    time.sleep(20)
finally:
    # close the browser and release operating-system resources
    driver.close()