03 Get the Douban Top 250

'''
https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=

1. Send the request
2. Parse the data
3. Save the data
'''
import re

import requests
# The three steps of a crawler
# 1. Send the request
def get_page(base_url, timeout=10):
    """Fetch ``base_url`` with an HTTP GET and return the response object.

    Args:
        base_url: URL of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            crawler indefinitely.

    Returns:
        The ``requests.Response`` returned by ``requests.get``. The status
        code is not checked here; callers read ``response.text`` directly.
    """
    response = requests.get(base_url, timeout=timeout)
    return response

# 2. Parse the text
def parse_index(text):
    """Extract movie records from the HTML of one Top 250 list page.

    Args:
        text: HTML source of the page, as a string.

    Returns:
        A list with one 7-tuple of strings per matched
        ``<div class="item">`` entry, in regex-group order:
        ``(rank, url, title, director_text, rating, rating_count, quote)``.
        An empty list is returned when nothing matches.

    NOTE(review): every gap in the pattern is a lazy ``.*?`` under
    ``re.S``, so an entry that lacks one of the elements (for example the
    ``inq`` quote) lets the match run on into the following entries.
    """
    pattern = (
        r'<div class="item">.*?<em class="">(.*?)</em>'
        r'.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>'
        r'.*?导演:(.*?)</p>'
        r'.*?<span class="rating_num".*?>(.*?)</span>'
        r'.*?<span>(.*?)人评价</span>'
        r'.*?<span class="inq">(.*?)</span>'
    )
    # re.S lets "." cross newlines, since each entry spans many HTML lines.
    res = re.findall(pattern, text, re.S)
    return res

# 3. Save the data
def save_data(data):
    """Append ``data`` to the end of ``douban.txt``.

    The file is opened in append mode with UTF-8 encoding, so repeated
    calls accumulate text instead of overwriting earlier output.

    Args:
        data: Text to write.
    """
    with open('douban.txt', mode='a', encoding='utf-8') as out_file:
        out_file.write(data)


# Editor shortcut: type "main" + Enter to generate the guard below
if __name__ == '__main__':
    # Request 10 list pages; the `start` query parameter advances by 25
    # per page (0, 25, ..., 225).
    for num in range(0, 250, 25):
        base_url = f'https://movie.douban.com/top250?start={num}&filter='
        print(base_url)

        # 1. Send the request.
        response = get_page(base_url)

        # 2. Parse the text.
        movie_list = parse_index(response.text)

        # 3. Format and save the data.
        for movie in movie_list:
            # Fields, in regex-group order: ranking, URL, title,
            # director/cast/genre text, rating, number of raters, synopsis.
            v_top, v_url, v_name, v_daoyan, v_point, v_num, v_desc = movie

            # The indentation inside the literal is part of the output.
            movie_content = f'''
            movie rankings: {v_top}
            movies url: {v_url}
            movie name: {v_name}
            movie starring: {v_daoyan}
            movie ratings: {v_point}
            number of evaluators: {v_num}
            The movie: {v_desc}
            \n
            '''
            print(movie_content)

            # Append this record to the output file.
            save_data(movie_content)

 

Guess you like

Origin www.cnblogs.com/urassya/p/11093872.html