Python crawler, Day 2: crawling the Douban Top 250 movie list

'''
Crawl the Douban Top 250 movie list:
    movie ranking, movie URL, movie name,
    director, starring actors, year / genre,
    rating, number of ratings, one-line synopsis

Analyze the URL of every index page.

'''
import requests
import re
# The three steps of a crawler
# 1. Send a request
def get_page(url):
    """Send an HTTP GET request for ``url`` and return the response.

    Args:
        url: Address of the page to download.

    Returns:
        The object returned by ``requests.get(url)``; the caller in this
        file reads the page text from its ``.text`` attribute.
    """
    response = requests.get(url)
    return response

# 2. analysis data 
def parse_index(html):
    """Extract the movie records from one Top 250 index page.

    Args:
        html: Text of an index page.

    Returns:
        A list of 9-tuples of strings, one per movie, in page order:
        (ranking, url, name, director, starring, year/genre text, rating,
        number of ratings, quote). The quote is ``''`` when the entry has
        no ``<span class="inq">`` element.
    """
    # Entries are matched one at a time. A single pattern run over the whole
    # page lets the lazy ``.*?`` (with re.S) run past the end of an entry that
    # lacks a field, merging it with the following movie.
    item_pattern = (
        '<em class="">(.*?)</em>'
        '.*?<a href="(.*?)">'
        '.*?<span class="title">(.*?)</span>'
        '.*?导演:(.*?)主演:(.*?)<br>(.*?)</p>'
        '.*?<span class="rating_num".*?>(.*?)</span>'
        '.*?<span>(.*?)人评价</span>'
        # Not every entry carries a one-line quote, so this part is optional.
        '(?:.*?<span class="inq">(.*?)</span>)?'
    )

    movie_list = []
    # Everything before the first marker is page chrome, hence the [1:].
    for item in html.split('<div class="item">')[1:]:
        found = re.search(item_pattern, item, re.S)
        if found is None:
            # Entry is missing one of the mandatory fields; skip it.
            continue
        # An unmatched optional group is None; report it as ''.
        movie_list.append(tuple(field or '' for field in found.groups()))
    return movie_list

# 3.保存数据
def save_data(movie, path='douban_top250.txt'):
    """Print one movie record and append it to a text file.

    Args:
        movie: Sequence of nine strings in the order produced by
            ``parse_index``: ranking, url, name, director, starring,
            year/genre text, rating, number of ratings, synopsis quote.
        path: File the formatted record is appended to. Defaults to
            ``douban_top250.txt`` in the current working directory.

    Raises:
        ValueError: If ``movie`` does not contain exactly nine items.
    """
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    # The year/genre text is captured with its surrounding line breaks.
    year_type = year_type.strip('\n')
    data = '''
          ========== Welcome to watch ==========
            Movie ranking: {}
            Movie URL: {}
            Movie name: {}
            Director: {}
            Starring: {}
            Year / genre: {}
            Rating: {}
            Number of ratings: {}
            Synopsis: {}
          ========== See you next time ==========
          \n
          \n
          '''.format(top, m_url, name, daoyan, actor, year_type, point, commit, desc)
    print(data)
    # Append mode, so records from successive calls accumulate in one file.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)
    print('Movie: {} written successfully...'.format(name))

if __name__ == '__main__':
    # The list is paged through the ``start`` query parameter in steps of
    # 25; ten pages (start = 0, 25, ..., 225) are requested.
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        print(url)

        # 1. Send a request for each index page.
        index_res = get_page(url)

        # 2. Parse the index page for the movie information.
        movie_list = parse_index(index_res.text)
        for movie in movie_list:
            # 3. Save the data.
            save_data(movie)
'''
Crawl the Douban Top 250 movie list:
    movie ranking, movie URL, movie name,
    director, starring actors, year / genre,
    rating, number of ratings, one-line synopsis

Analyze the URL of every index page.

'''
import requests
import re
# The three steps of a crawler
# 1. Send a request
def get_page(url):
    """Send an HTTP GET request for ``url`` and return the response.

    Args:
        url: Address of the page to download.

    Returns:
        The object returned by ``requests.get(url)``; the caller in this
        file reads the page text from its ``.text`` attribute.
    """
    return requests.get(url)

# 2.解析数据
def parse_index(html):
    """Extract the movie records from one Top 250 index page.

    Args:
        html: Text of an index page.

    Returns:
        A list of 9-tuples of strings, one per movie, in page order:
        (ranking, url, name, director, starring, year/genre text, rating,
        number of ratings, quote). The quote is ``''`` when the entry has
        no ``<span class="inq">`` element.
    """
    # Entries are matched one at a time. A single pattern run over the whole
    # page lets the lazy ``.*?`` (with re.S) run past the end of an entry that
    # lacks a field, merging it with the following movie.
    item_pattern = (
        '<em class="">(.*?)</em>'
        '.*?<a href="(.*?)">'
        '.*?<span class="title">(.*?)</span>'
        '.*?导演:(.*?)主演:(.*?)<br>(.*?)</p>'
        '.*?<span class="rating_num".*?>(.*?)</span>'
        '.*?<span>(.*?)人评价</span>'
        # Not every entry carries a one-line quote, so this part is optional.
        '(?:.*?<span class="inq">(.*?)</span>)?'
    )

    movie_list = []
    # Everything before the first marker is page chrome, hence the [1:].
    for item in html.split('<div class="item">')[1:]:
        found = re.search(item_pattern, item, re.S)
        if found is None:
            # Entry is missing one of the mandatory fields; skip it.
            continue
        # An unmatched optional group is None; report it as ''.
        movie_list.append(tuple(field or '' for field in found.groups()))
    return movie_list

# 3.保存数据
def save_data(movie, path='douban_top250.txt'):
    """Print one movie record and append it to a text file.

    Args:
        movie: Sequence of nine strings in the order produced by
            ``parse_index``: ranking, url, name, director, starring,
            year/genre text, rating, number of ratings, synopsis quote.
        path: File the formatted record is appended to. Defaults to
            ``douban_top250.txt`` in the current working directory.

    Raises:
        ValueError: If ``movie`` does not contain exactly nine items.
    """
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    # The year/genre text is captured with its surrounding line breaks.
    year_type = year_type.strip('\n')
    # Ranking and URL come first so the nine placeholders line up with the
    # order of the arguments passed to format().
    data = '''
          ========== Welcome to watch ==========
                  Movie ranking: {}
                  Movie URL: {}
                  Movie name: {}
                  Director: {}
                  Starring: {}
                  Year / genre: {}
                  Rating: {}
                  Number of ratings: {}
                  Synopsis: {}
          ========== See you next time ==========
          \n
          \n
          '''.format(top, m_url, name, daoyan, actor, year_type, point, commit, desc)
    print(data)
    # Append mode, so records from successive calls accumulate in one file.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)
    print('Movie: {} written successfully...'.format(name))

if __name__ == '__main__':
    # The list is paged through the ``start`` query parameter in steps of
    # 25; ten pages (start = 0, 25, ..., 225) are requested.
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        print(url)

        # 1. Send a request for each index page.
        index_res = get_page(url)

        # 2. Parse the index page for the movie information.
        movie_list = parse_index(index_res.text)
        for movie in movie_list:
            # 3. Save the data.
            save_data(movie)

  

Guess you like

Origin www.cnblogs.com/Auraro997/p/11119917.html