Crawling the Douban movie Top 250 + basic use of Selenium

Crawling the Douban movie Top 250

'''
Crawl Douban movie information:
    movie ranking, movie name, movie url, director,
    starring, year / genre,
    rating, number of reviews, one-line introduction

1. Analyze the URLs of all index pages
    First page:
        https://movie.douban.com/top250?start=0&filter=
    Second page:
        https://movie.douban.com/top250?start=25&filter=
    Third page:
        https://movie.douban.com/top250?start=50&filter=

'''

import requests
import re

# The three steps of a crawler
# 1. Send the request

def get_page(url):
    """Send a GET request to ``url`` and return the response.

    Args:
        url: Address of the page to fetch.

    Returns:
        The object returned by ``requests.get(url)``; callers read its
        ``.text`` attribute to get the page HTML.
    """
    response = requests.get(url)
    # The return must live inside the function body; at column 0 it is a
    # SyntaxError ("'return' outside function").
    return response

# 2. Parse the data
'''
Fields captured by the regular expression, in order:
movie ranking, movie url, movie name, director, starring,
year / genre, rating, number of reviews, one-line introduction

<div class="item">.*?<em class="">(.*?)</em>.*?<a href="(.*?)">
.*?<span class="title">(.*?)</span>.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>
.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>
.*?<span class="inq">(.*?)</span>

The same pattern on a single line:
<div class="item">.*?<em class="">(.*?)</em>.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>.*?<span class="inq">(.*?)</span>
'''

def parse_index(html):
    """Extract every movie entry from the HTML of one index page.

    Args:
        html: HTML text of an index page.

    Returns:
        A list with one 9-tuple of strings per matched
        ``<div class="item">`` block, in capture-group order: ranking,
        url, name, director, starring, year / genre, rating, number of
        reviews, one-line introduction. Empty when nothing matches.
    """
    movie_list = re.findall(
        # Adjacent literals are concatenated into one pattern.
        '<div class="item">.*?<em class="">(.*?)</em>.*?<a href="(.*?)">'
        '.*?<span class="title">(.*?)</span>'
        '.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>'
        '.*?<span class="rating_num".*?>(.*?)</span>'
        '.*?<span>(.*?)人评价</span>'
        '.*?<span class="inq">(.*?)</span>',
        html,
        re.S,  # let "." match newlines, since one entry spans many lines
    )

    # The return must live inside the function body; at column 0 it is a
    # SyntaxError ("'return' outside function").
    return movie_list

# 3. Save the data
# movie ranking, movie url, movie name, director, starring,
# year / genre, rating, number of reviews, one-line introduction

def save_data(movie):
    """Print one movie record and append it to ``douban_top250.txt``.

    Args:
        movie: A 9-item sequence in the order ranking, url, name,
            director, starring, year / genre, rating, number of reviews,
            one-line introduction.

    Raises:
        ValueError: If ``movie`` does not unpack into exactly nine items.
    """
    (top, m_url, name, daoyan, actor, year_type,
     point, commit, desc) = movie

    # Drop the whitespace / newlines surrounding the year-and-genre text.
    year_type = year_type.strip()

    data = (
        '\n'
        '======== Welcome Dear Husband watching ========\n'
        f'movies ranking: {top}\n'
        f'movies url: {m_url}\n'
        f'movie name: {name}\n'
        f'film directors: {daoyan}\n'
        f'movie starring: {actor}\n'
        f'Year type: {year_type}\n'
        f'movie rating: {point}\n'
        f'movie review: {commit}\n'
        f'The movie: {desc}\n'
        '======== Husband please come again yo ========\n'
        '\n'
        '\n'
    )
    print(data)

    # Append mode: every record is added to the end of the same file.
    with open('douban_top250.txt', 'a', encoding='utf-8') as f:
        f.write(data)

    print(f'Movie: {name} successfully written ...')
if __name__ == '__main__':
    # Build the URL of each of the 10 index pages; the ``start`` query
    # parameter advances by 25 per page (0, 25, ..., 225).
    for num in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={num}&filter='
        print(url)

        # 1. Send a request to the index page.
        index_res = get_page(url)

        # 2. Parse the index page for movie information.
        movie_list = parse_index(index_res.text)

        # 3. Save each movie record.
        for movie in movie_list:
            save_data(movie)

 

 

 Basic use of Selenium

 

from selenium import webdriver  # web driver
from selenium.webdriver.common.keys import Keys  # keyboard operations
import time

# NOTE: the find_element_by_* helpers used below are the Selenium 3 API;
# Selenium 4 replaces them with find_element(By.<STRATEGY>, value).
driver = webdriver.Chrome()

try:
    # Implicit wait: must be set before calling get().
    # Wait up to 10 seconds for any element to load.
    driver.implicitly_wait(10)

    driver.get('https://www.baidu.com/')

    # Explicit wait: called after get().
    time.sleep(5)

    # =============== all methods ===================
    # find_element_*  finds a single tag
    # find_elements_* finds all matching tags

    # Automatic Baidu login: start
    # 1. find_element_by_link_text: locate a link by its full text.
    # NOTE(review): 'login' looks machine-translated; presumably the real
    # page uses the Chinese link text -- confirm against the live page.
    login_link = driver.find_element_by_link_text('login')
    login_link.click()  # click "login"

    time.sleep(1)

    # 2. find_element_by_id: locate by id.
    user_login = driver.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()

    time.sleep(1)

    # 3. find_element_by_class_name
    user = driver.find_element_by_class_name('pass-text-input-userName')
    user.send_keys('*****')

    # 4. find_element_by_name
    pwd = driver.find_element_by_name('password')
    pwd.send_keys('*****')

    submit = driver.find_element_by_id('TANGRAM__PSP_10__submit')
    submit.click()
    # Automatic Baidu login: end

    # 5. find_element_by_partial_link_text: locate a link by part of its text.
    # NOTE(review): 'registration' looks machine-translated as well -- confirm.
    login_link = driver.find_element_by_partial_link_text('registration')
    login_link.click()

    # 6. find_element_by_css_selector: locate by CSS selector.
    #    "." selects by class, "#" selects by id.
    # NOTE(review): the class name is lower-cased here on the assumption that
    # the capital "P" was a translation artifact -- confirm against the page.
    login2_link = driver.find_element_by_css_selector('.tang-pass-footerBarULogin')
    login2_link.click()

    # 7. find_elements_by_tag_name: returns every matching element.
    div = driver.find_elements_by_tag_name('div')
    print(div)

    time.sleep(20)

finally:
    # Close the browser and release operating-system resources; quit()
    # ends the whole WebDriver session, not just the current window.
    driver.quit()

 

Guess you like

Origin www.cnblogs.com/tankyy/p/11123223.html