Day 02: Crawling the Douban Top 250

'''
Crawl the Douban Top 250 movie information:
    movie ranking, movie name, movie URL, director,
    starring actors, year / genre,
    rating, number of reviews, one-line introduction

1. Analyze the URLs of all index pages
First page:  https://movie.douban.com/top250?start=0&filter=
Second page: https://movie.douban.com/top250?start=25&filter=
Third page:  https://movie.douban.com/top250?start=50&filter=

'''
import re

import requests
# The three steps of a crawler
# 1. Send the request
def get_page(url, headers=None, timeout=10):
    """Send an HTTP GET request to *url* and return the response object.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of extra HTTP headers, forwarded unchanged
            to ``requests.get``. ``None`` keeps the library defaults.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` for the page; callers read ``.text`` for
        the HTML body.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the timeout expires.
    """
    # NOTE(review): some sites reject the default python-requests User-Agent;
    # if the index pages come back empty, pass a browser-like one via
    # ``headers`` - not verified against the target site here.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response

# 2. Parse the data
# Opening tag of every movie entry on an index page.
_ITEM_MARKER = '<div class="item">'

# Fields of a single entry, in capture order. The labels are the Chinese text
# the page itself uses ("director", "starring", "people rated"); the
# machine-translated English labels could never match the downloaded HTML.
_MOVIE_FIELDS = re.compile(
    r'<em class="">(.*?)</em>'                          # ranking
    r'.*?<a href="(.*?)">'                              # movie url
    r'.*?<span class="title">(.*?)</span>'              # movie name
    r'.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>'            # director, stars, year/genre
    r'.*?<span class="rating_num".*?>(.*?)</span>'      # rating
    r'.*?<span>(.*?)人评价</span>'                       # number of reviews
    r'(?:.*?<span class="inq">(.*?)</span>)?',          # one-line intro (optional)
    re.S,  # entries span several lines, so "." must also match newlines
)


def parse_index(html):
    """Extract the movie entries from the HTML of one Top 250 index page.

    The page is cut at every ``<div class="item">`` marker and each piece is
    matched on its own. Matching per entry keeps a lazy ``.*?`` from running
    past the end of an entry: an entry without an ``inq`` span yields an
    empty introduction instead of borrowing the next entry's fields.

    Args:
        html: Full HTML text of an index page.

    Returns:
        A list of 9-tuples of strings, one per entry that matched:
        ``(ranking, url, name, director, starring, year_type, rating,
        review_count, introduction)``. Entries that do not contain all of
        the mandatory fields are skipped. An empty list is returned when
        nothing matches.
    """
    movie_list = []
    # Element 0 is whatever precedes the first entry, so it is dropped.
    for chunk in html.split(_ITEM_MARKER)[1:]:
        match = _MOVIE_FIELDS.search(chunk)
        if match is not None:
            # default='' turns a missing optional introduction into ''.
            movie_list.append(match.groups(default=''))
    return movie_list

# 3. Save the data
def save_data(movie, path='douban_top250.txt'):
    """Print one movie record and append it to a text file.

    Args:
        movie: Sequence of exactly nine strings in this order: ranking,
            movie url, movie name, director, starring, year / genre,
            rating, number of reviews, one-line introduction.
        path: File the record is appended to. It is opened in append mode
            with UTF-8 encoding, so repeated calls accumulate records.

    Raises:
        ValueError: if *movie* does not contain exactly nine items.
        OSError: if the file cannot be opened or written.
    """
    (top, m_url, name, director, actor, year_type,
     point, commit, desc) = movie

    # The year/genre field sits on its own indented line in the page source;
    # strip() drops the surrounding newlines and indentation.
    year_type = year_type.strip()

    data = f'''
                ======== welcome to watch ========
                    movies ranking: {top}
                    movies url: {m_url}
                    Movie Name: {name}
                    film directors: {director}
                    Movie Starring: {actor}
                    Year type: {year_type}
                    movie ratings: {point}
                    movie review: {commit}
                    The movie: {desc}
                ======== Thanks for watching ========
                \n
                \n
                '''
    print(data)

    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)

    print(f'Movie: {name} successfully written ...')

if __name__ == '__main__':
    # Build the URL of every index page. The list is paged through the
    # ``start`` query parameter in steps of 25 (0, 25, ..., 225), i.e. ten
    # pages in total.
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}&filter='
        print(url)

        # 1. Send a request for this index page.
        index_res = get_page(url)

        # 2. Parse the index page into movie records.
        movie_list = parse_index(index_res.text)

        for movie in movie_list:
            # 3. Save the data.
            save_data(movie)

 

Guess you like

Origin www.cnblogs.com/zaccheo/p/11118717.html