'''
Crawl the Douban Top 250 movie chart (https://movie.douban.com/top250):
for every movie, collect its ranking, detail-page url, title, director,
stars, year/genre line, rating, review count and short synopsis, print
the record and append it to douban_top250.txt.
'''
import re


# Step 1: send a request for one listing page.
def get_page(url):
    """Fetch *url* and return the requests ``Response`` object."""
    # Imported lazily so the pure-parsing helpers below stay importable
    # (and testable) on machines without the third-party requests package.
    import requests
    return requests.get(url)


# Step 2: parse one listing page.
def parse_index(html):
    """Extract every movie entry from a listing page's HTML.

    Returns a list of 9-tuples:
    (rank, url, title, director, stars, year/genre, rating,
    review count, synopsis).

    NOTE: the Chinese markers in the pattern (导演 = director,
    主演 = stars, 人评价 = "people rated") are literal text scraped
    from the Douban page markup and must not be translated.
    '''re.S''' lets ``.*?`` span the newlines inside each item block.
    """
    movie_list = re.findall(
        '<div class="item">.*?<em class="">(.*?)</em>.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?导演:(.*?)主演:(.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>.*?<span class="inq">(.*?)</span>',
        html, re.S)
    return movie_list


# Step 3: save one movie record.
def save_data(movie):
    """Pretty-print one parsed movie tuple and append it to douban_top250.txt."""
    top, m_url, name, director, actor, year_type, point, commit, desc = movie
    # The year/genre cell carries surrounding newlines from the raw HTML.
    year_type = year_type.strip('\n')
    data = '''========== welcome ==========
movie ranking: {}
movie url: {}
movie name: {}
director: {}
starring: {}
year / genre: {}
rating: {}
review count: {}
synopsis: {}
========== see you next time ==========

'''.format(top, m_url, name, director, actor, year_type, point, commit, desc)
    print(data)
    with open('douban_top250.txt', 'a', encoding='utf-8') as f:
        f.write(data)
    print('movie: {} written successfully...'.format(name))


if __name__ == '__main__':
    num = 0
    # 10 listing pages x 25 movies per page = top 250.
    for _ in range(10):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(num)
        num += 25
        print(url)
        # 1. request the listing page
        index_res = get_page(url)
        # 2. parse it into movie tuples
        movie_list = parse_index(index_res.text)
        # 3. save every movie found on the page
        for movie in movie_list:
            save_data(movie)