# Crawl the Douban Top 250 movie ranking (https://movie.douban.com/top250)
# with requests + regular expressions. Fields scraped per film:
# rank, title, country, director, score. Results are written to top250.csv.
import re
import csv
import time


class doubanTop250():
    """Scraper for the Douban Top 250 chart.

    Usage: ``doubanTop250().run()`` — fetches all 10 result pages,
    parses each with regular expressions, and saves a CSV file.
    """

    def __init__(self):
        # Per-instance accumulator. (A shared class attribute here would
        # leak scraped rows between instances.)
        self.film_list = []

    def send_request(self, url):
        """GET *url* with a browser User-Agent and return the response.

        Prints the HTTP status code as a lightweight progress indicator.
        """
        # Third-party dependency; imported lazily so the parsing/saving
        # logic stays importable without the requests package installed.
        import requests
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/75.0.3770.100 Safari/537.36'),
        }
        response = requests.get(url=url, headers=headers)
        print(response.status_code)
        return response

    def parse(self, response):
        """Extract one results page into dicts appended to ``self.film_list``.

        The page markup is assumed to yield equally long match lists for
        every field; each film becomes a dict with keys
        rank / name / country / director / score (all string values).
        """
        data = response.content.decode()
        rank = re.findall(r'<em class="">(\d+)</em>', data)
        name = re.findall(r'<img width="100" alt="(.*) src=', data)
        country = re.findall(r' / (.*) / ', data)
        director = re.findall(r'导演:(.*)', data)
        score = re.findall(r'<span class="rating_num" property="v:average">(.*)</span>', data)
        for i in range(len(rank)):
            self.film_list.append({
                'rank': rank[i],
                'name': name[i],
                'country': country[i],
                'director': director[i],
                'score': score[i],
            })

    def save_data(self):
        """Write the collected films to top250.csv (header row + data rows)."""
        if not self.film_list:
            # Nothing scraped — avoid IndexError on film_list[0] below.
            return
        # newline='' stops the csv module doubling line endings on Windows;
        # the with-block guarantees the file is closed even on error.
        with open('top250.csv', 'w', encoding='utf-8', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(self.film_list[0].keys())
            csv_writer.writerows(film.values() for film in self.film_list)

    def run(self):
        """Fetch all 10 pages (25 films each), parse them, then save the CSV."""
        base_url = "https://movie.douban.com/top250?start="
        # start must go 0, 25, ..., 225 — stop=250 so the final page
        # (films 226-250) is included; stop=225 would silently drop it.
        for start in range(0, 250, 25):
            response = self.send_request(base_url + str(start))
            self.parse(response)
            time.sleep(5)  # be polite to the server between page fetches
        self.save_data()


if __name__ == '__main__':
    # Guarded so importing this module does not trigger network requests.
    doubanTop250().run()