# coding = utf-8
"""
Douban movie crawler: fetches animation (Japanese) movie listings by era,
scrapes name / year / rating / vote count from each detail page, and saves
rows to CSV via the Save_Data helpers.
"""
import time
import requests
import os
import re
import json
import Save_Data
import logging

# Configure logging: level, timestamp format, message layout, output file.
logging.basicConfig(
    # log level
    level="ERROR",
    # log timestamp format
    datefmt="%Y-%m-%d %H:%M:%S",
    # log message layout
    format='%(asctime)s %(filename)s [line:%(lineno)d] %(message)s',
    # log output file
    filename="log_2010.txt",
    # overwrite the log file on each run
    filemode='w',
)

# Default request headers (Referer + desktop Chrome User-Agent).
headers = {
    "Referer": "https://movie.douban.com/explore",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/78.0.3904.108 Safari/537.36",
}

# Year range ("start,end") for each era bucket.
year_dict = {
    "80s": "1980,1989",
    "90s": "1990,1999",
    "2000s": "2000,2009",
    "2010s": "2010,2019",
}

# Maximum result offset available for each era (20 results per page).
max_page = {
    "80s": "376",
    "90s": "752",
    "2000s": "1342",
    "2010s": "2315",
}

# Proxy "host:port"; empty string means no proxy configured.
# NOTE(review): the my_proxy dicts built below are logged but never passed
# to requests.get(...), so proxying is effectively disabled — confirm intent.
proxy = ""


def get_doubanmovie_index(time_start, idx, headers):
    """
    Fetch one page of the Douban "new_search_subjects" JSON listing.

    :param time_start: year range string, e.g. "2010,2019"
    :param idx: result offset (multiple of 20, one page = 20 movies)
    :param headers: request header dict
    :return: raw response text, or "" on any request failure
    """
    global proxy
    my_proxy = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    logging.error(proxy)
    logging.error(my_proxy)
    # tags=动漫 (animation), countries=日本 (Japan), URL-encoded.
    url = "https://movie.douban.com/j/new_search_subjects?" \
          "sort=U&range=0,10&tags=%E5%8A%A8%E6%BC%AB&start={}" \
          "&countries=%E6%97%A5%E6%9C%AC&year_range={}".format(idx, time_start)
    try:
        logging.error(my_proxy)
        ret = requests.get(url, headers=headers, timeout=(2, 4))
        # ret.raise_for_status()  # would verify HTTP status == 200
    except Exception:
        print("Error getting movie: %s page content; url: %s" % (time_start, url))
        logging.error("Error getting movie: %s page content; url: %s" % (time_start, url))
        logging.error('')
        return ""
    return ret.text


def get_movie_list(data):
    """
    Parse a listing-page JSON payload into movie entries.

    :param data: JSON text from get_doubanmovie_index()
    :return: list of [movie_name, movie_detail_url]; [] when nothing parsed
    """
    movie_list = []
    try:
        tmp_data = json.loads(data)
        subjects_data = tmp_data["data"]
        for item in subjects_data:
            movie_list.append([item["title"], item["url"]])
    except Exception:
        print("Failed to get movie title and movie details page ~")
        logging.error("Failed to get movie name and movie details page ~")
        logging.error(data)
    if len(movie_list) == 0:
        # Log the raw payload so an empty result can be diagnosed later.
        logging.error(data)
        return []
    return movie_list


def movie_content_page(url, headers):
    """
    Fetch a movie detail page.

    :param url: movie detail page url
    :param headers: request header dict
    :return: raw HTML text, or "" on any request failure
    """
    global proxy
    my_proxy = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    logging.error(proxy)
    logging.error(my_proxy)
    try:
        logging.error(my_proxy)
        ret = requests.get(url, headers=headers, timeout=(2, 4))
        # ret.raise_for_status()
    except Exception:
        print("Failed to get movie details page, url: %s" % url)
        logging.error("Failed to get movie detail page, url: %s" % url)
        logging.error('')
        return ""
    return ret.text


def handle_movie_data(data):
    """
    Extract name, year, rating and vote count from detail-page HTML.

    :param data: full HTML of the movie detail page
    :return: [[name, year_bk, year, mark, count]] — fields fall back to ""
             when a pattern does not match
    """
    movie_info = []
    mark_pattern = re.compile(r'property="v:average">(.*?)</strong>')
    mark_and_count = re.findall(mark_pattern, data)
    count_pattern = re.compile(r'property="v:votes">(.*?)</span>')
    count = re.findall(count_pattern, data)
    movie_year_pattern = re.compile(r'"datePublished".*?:.*?"(.*?)"')
    movie_name_pattern = re.compile(r'property="v:itemreviewed">(.*?)</span>')
    movie_name = re.findall(movie_name_pattern, data)
    movie_year_bk = re.findall(r'class="year">(.*?)</span>', data)
    movie_year = re.findall(movie_year_pattern, data)
    try:
        # movie name
        _name = movie_name[0]
    except Exception:
        _name = ""
    try:
        # fallback movie year, with surrounding parentheses stripped
        _year_bk = str(movie_year_bk[0]).replace("(", "").replace(")", "")
    except Exception:
        _year_bk = ""
    try:
        _year = movie_year[0]
    except Exception:
        _year = ""
    try:
        _count = count[0]
    except Exception:
        _count = ""
    try:
        _mark = mark_and_count[0]
    except Exception:
        _mark = ""
    try:
        movie_info.append([_name, _year_bk, _year, _mark, _count])
    except Exception:
        print("err")
        # Five fields logged for diagnosis (was 4 specifiers vs 5 args: bug).
        logging.error(
            "ERROR movie_name:%s movie_year_bk:%s movie_year:%s "
            "movie_mark:%s movie_count:%s" % (
                movie_name, movie_year_bk, movie_year, mark_and_count, count))
    return movie_info


def main(year_type, count):
    """
    Crawl every listing page of one era and persist movie info to CSV.

    :param year_type: era key into year_dict / max_page, e.g. "2010s"
    :param count: maximum number of listing pages to attempt (20 movies each)
    """
    global proxy
    my_count = 1
    mk_path = 'data set' + "\\" + year_type
    # Create the era folder if it does not exist yet.
    if not os.path.exists(mk_path):
        os.makedirs(mk_path)
    # Year range for this era, e.g. "2010,2019".
    time_start = year_dict.get(year_type)
    for idx in range(count):
        page_idx = idx * 20
        # Stop once past the last available offset for this era.
        if page_idx > int(max_page.get(year_type)):
            break
        # Retry the listing page up to 5 times.
        for i in range(5):
            # Fetch the listing page for this era/offset.
            index_data = get_doubanmovie_index(time_start, page_idx, headers)
            # Parse it into [["movie name", "movie detail page url"], ...].
            movie_url_list = get_movie_list(index_data)
            if len(index_data) > 0 and len(movie_url_list) > 0:
                break
            else:
                logging.error("------------%s" % proxy)
                time.sleep(2)
        # All retries exhausted: give up on this era.
        if i > 3:
            break
        logging.error("第%d页获取成功! length_html:%d" % ((page_idx + 1), len(index_data)))
        for url_list in movie_url_list:
            logging.error(url_list)
            movie_url = url_list[1]
            # Retry the detail page up to 5 times.
            for i in range(5):
                # Fetch the movie detail page HTML.
                movie_content_data = movie_content_page(movie_url, headers)
                # Extract name / year / rating / vote count.
                movie_info_list = handle_movie_data(movie_content_data)
                if len(movie_content_data) > 0 and len(movie_info_list) > 0:
                    print('has obtained %d movie data' % my_count)
                    my_count += 1
                    break
                else:
                    logging.error("------------%s" % proxy)
            # All retries exhausted: skip the rest of this page's movies.
            if i > 3:
                break
            # Persist this movie's info row(s).
            Save_Data.save_content(mk_path, movie_info_list)
            logging.error("Page %d got successfully! url:%s" % ((page_idx + 1), url_list[1]))


if __name__ == '__main__':
    if not os.path.exists('data set'):
        os.mkdir('data set')
    # Era buckets to crawl (keys must match year_dict / max_page).
    want_get_movie = ["80s", "90s", "2000s", "2010s"]
    # Upper bound on listing pages per era, 20 movies per page.
    COUNT = 2315
    for type_name in want_get_movie:
        main(type_name, COUNT)


# ---------------------------------------------------------------------------
# NOTE(review): everything below appears to be a second source file
# (Save_Data.py, the data-storage module) concatenated onto this one. It
# should live in its own module so that `import Save_Data` above resolves;
# it is kept here because it is part of this SOURCE.
# ---------------------------------------------------------------------------
import csv


def save_actor(file_path, data):
    """
    Save movie actor rows into "<file_path>\\actor table.csv".

    :param file_path: destination folder path
    :param data: iterable of actor rows
    """
    file_name = file_path + "\\" + "actor table.csv"
    # utf-8-sig BOM keeps the CSV readable in Excel.
    with open(file_name, "w", newline="", encoding="utf-8-sig") as o_file:
        writer = csv.writer(o_file)
        for row in data:
            writer.writerow(row)


def save_content(file_path, data):
    """
    Append movie info rows into "<file_path>\\animation data.csv".

    :param file_path: destination folder path
    :param data: iterable of movie info rows
    """
    try:
        file_name = file_path + "\\" + "animation data.csv"
        with open(file_name, "a", newline="", encoding="utf-8-sig") as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)
    except Exception:
        logging.error("Failed to save data: %s" % data)