Get Douban movie details from 1980 to 2010

# coding=utf-8
'''
Douban movie crawler, supports crawling by era
'''

import time
import requests
import os
import re
import json
import Save_Data
import logging

# Define the log level and log file name
logging.basicConfig(
    # log level (the crawler reports everything through logging.error, so it passes this filter)
    level="ERROR",
    # timestamp format for log lines
    datefmt="%Y-%m-%d %H:%M:%S",
    # log line format
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
    # write the log to this file
    filename="log_2010.txt",
    # overwrite mode
    filemode='w'
)
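
With this configuration every message is written to log_2010.txt, overwritten on each run. An illustrative output line (timestamp and file name assumed, not from the original post):

# 2020-04-14 12:00:00 douban_crawler.py[line:78] Failed to get movie detail page, url: ...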

# Define the request headers
headers = {
    "Referer": "https://movie.douban.com/explore",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}

# Define the year range for each era
year_dict = {
    "80s": "1980,1989",
    "90s": "1990,1999",
    "2000s": "2000,2009",
    "2010s": "2010,2019",
}

# Maximum paging offset for each era; the crawl stops once the start offset exceeds it
max_page = {
    "80s": "376",
    "90s": "752",
    "2000s": "1342",
    "2010s": "2315",
}

# Global proxy address ("ip:port"); the request functions below read it via global
proxy = ""
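
Each listing page returns 20 movies, so main() below advances the start offset in steps of 20 and stops once it passes the era's cap. A quick sanity check of that arithmetic (illustrative only):

cap = int(max_page["80s"])
offsets = [i * 20 for i in range(2315) if i * 20 <= cap]
print(offsets[:3], offsets[-1])  # [0, 20, 40] 360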


def get_doubanmovie_index(time_start, idx, headers):
    '''
    Get one page of the movie listing
    :param time_start: year range string, e.g. "1980,1989"
    :param idx: paging offset (the start parameter of the api)
    :param headers: request header information
    :return: movie listing page data
    '''
    global proxy
    my_proxy = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    logging.error(proxy)
    logging.error(my_proxy)
    # page_idx = idx * 20
    # tags=%E5%8A%A8%E6%BC%AB is the url-encoded 动漫 (animation) and
    # countries=%E6%97%A5%E6%9C%AC is 日本 (Japan), so this crawls Japanese animation
    url = "https://movie.douban.com/j/new_search_subjects?" \
          "sort=U&range=0,10&tags=%E5%8A%A8%E6%BC%AB&start={}" \
          "&countries=%E6%97%A5%E6%9C%AC&year_range={}".format(idx, time_start)
    try:
        logging.error(my_proxy)
        ret = requests.get(url, headers=headers, timeout=(2, 4))
        # ret.raise_for_status()  # check that the http status code is 200
    except:
        print("Error getting movie %s page content; url: %s" % (time_start, url))
        logging.error("Error getting movie %s page content; url: %s" % (time_start, url))
        logging.error('')
        return ""
    return ret.text

def get_movie_list(data):
    '''
    Parse the listing page data to get each movie's name and detail page url
    :param data: movie listing page data
    :return: list of [movie name, movie detail page url]
    '''
    movie_list = []
    try:
        tmp_data = json.loads(data)
        subjects_data = tmp_data["data"]
        for i in subjects_data:
            movie_name = i["title"]
            movie_url = i["url"]
            movie_list.append([movie_name, movie_url])
    except:
        print("Failed to get movie name and movie detail page ~")
        logging.error("Failed to get movie name and movie detail page ~")
        logging.error(data)
    if len(movie_list) == 0:
        logging.error(data)
        return []
    return movie_list
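
For reference, the parser above expects the new_search_subjects endpoint to return JSON shaped like {"data": [{"title": ..., "url": ...}, ...]}. A minimal sketch with a fabricated payload (title and url are made up):

sample = json.dumps({"data": [
    {"title": "AKIRA", "url": "https://movie.douban.com/subject/0000000/"},
]})
print(get_movie_list(sample))
# -> [['AKIRA', 'https://movie.douban.com/subject/0000000/']]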

def movie_content_page(url, headers):
    '''
    Get movie detail page content
    :param url: movie detail page url
    :param headers: request header
    :return: movie detail page data
    '''
    global proxy
    my_proxy = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    logging.error(proxy)
    logging.error(my_proxy)
    try:
        logging.error(my_proxy)
        ret = requests.get(url, headers=headers, timeout=(2, 4))
        # ret.raise_for_status()
    except:
        # proxy = MyProxy.get_proxy()
        print("Failed to get movie detail page, url: %s" % url)
        logging.error("Failed to get movie detail page, url: %s" % url)
        logging.error('')
        return ""
    return ret.text
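
Note that in both request functions my_proxy is only logged and never passed to requests.get, so the proxy has no effect on the actual request; routing through it would mean adding proxies=my_proxy to the call. The commented-out MyProxy.get_proxy() suggests the proxy came from a separate module not included in the post; a hypothetical minimal stand-in:

import random

# Hypothetical stand-in for the missing MyProxy module (an assumption,
# not the original implementation); fill PROXY_POOL from any proxy source.
PROXY_POOL = ["127.0.0.1:8888"]

def get_proxy():
    """Return one 'ip:port' proxy string chosen at random from the pool."""
    return random.choice(PROXY_POOL)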

def handle_movie_data(data):
    '''
    Parse the movie detail page data and extract the name, year, rating and vote count
    :param data: full html of the movie detail page
    :return: list of [name, year, release date, rating, vote count]
    '''
    movie_info = []
    mark_pattern = re.compile('property=\"v:average\">(.*?)</strong>')
    mark_and_count = re.findall(mark_pattern,data)
    count_pattern = re.compile('property=\"v:votes\">(.*?)</span>')
    count = re.findall(count_pattern,data)
    movie_year_pattern = re.compile('\"datePublished\".*?:.*?\"(.*?)\"')
    movie_name_pattern = re.compile('property=\"v:itemreviewed\">(.*?)</span>')
    movie_name = re.findall(movie_name_pattern,data)
    movie_year_bk = re.findall('class=\"year\">(.*?)</span>',data)
    movie_year = re.findall(movie_year_pattern,data)
    try:
        # movie name
        _name = movie_name[0]
    except:
        _name = ""
    try:
        # movie year (from the "(YYYY)" next to the title)
        _year_bk = str(movie_year_bk[0]).replace("(", "").replace(")", "")
    except:
        _year_bk = ""
    try:
        _year = movie_year[0]
    except:
        _year = ""
    try:
        _count = count[0]
    except:
        _count = ""
    try:
        _mark = mark_and_count[0]
    except:
        _mark = ""
    try:
        movie_info.append([_name, _year_bk, _year, _mark, _count])
    except:
        print("err")
        logging.error("ERROR movie_name:%s movie_year_bk:%s movie_year:%s movie_mark:%s movie_count:%s" % (
            movie_name,
            movie_year_bk,
            movie_year,
            mark_and_count,
            count))
    return movie_info 
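
To see what the regexes above extract, here is a fabricated fragment mimicking the relevant bits of Douban's detail-page markup (values invented for illustration):

sample_html = (
    '<span property="v:itemreviewed">AKIRA</span>'
    '<span class="year">(1988)</span>'
    '"datePublished": "1988-07-16"'
    '<strong property="v:average">8.7</strong>'
    '<span property="v:votes">100000</span>'
)
print(handle_movie_data(sample_html))
# -> [['AKIRA', '1988', '1988-07-16', '8.7', '100000']]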

def main(year_type, count):
    global proxy
    my_count = 1
    mk_path = 'data set' + "\\" + year_type
    # If the era folder does not exist, create it
    if not os.path.exists(mk_path):
        os.makedirs(mk_path)
    # Extract the year range for this era
    time_start = year_dict.get(year_type)
    for idx in range(count):
        page_idx = idx * 20
        if page_idx > int(max_page.get(year_type)):
            break
        for i in range(5):
            # Get one page of the movie listing for this era
            index_data = get_doubanmovie_index(time_start, page_idx, headers)
            # Parse the page into ["movie name", "movie detail page url"] pairs
            movie_url_list = get_movie_list(index_data)
            if len(index_data) > 0 and len(movie_url_list) > 0:
                break
            else:
                logging.error("------------%s" % proxy)
                time.sleep(2)
            if i > 3:
                break
        logging.error("Page %d fetched successfully! length_html:%d" % (
            (page_idx + 1),
            len(index_data)
            )
        )
        for url_list in movie_url_list:
            logging.error(url_list)
            movie_url = url_list[1]
            for i in range(5):
                # Get the content of the movie detail page
                movie_content_data = movie_content_page(movie_url, headers)
                # Extract the movie fields from the detail page
                movie_info_list = handle_movie_data(movie_content_data)
                if len(movie_content_data) > 0 and len(movie_info_list) > 0:
                    print('Fetched %d movies so far' % my_count)
                    my_count += 1
                    break
                else:
                    logging.error("------------%s" % proxy)
                if i > 3:
                    break
            # Save the extracted movie data
            Save_Data.save_content(mk_path, movie_info_list)
            logging.error("Page %d fetched successfully! url:%s" % (
                (page_idx + 1),
                url_list[1]
                )
            )

if __name__ == '__main__':
    if not os.path.exists('data set'):
        os.mkdir('data set')
    # Define the list of eras to crawl
    want_get_movie = ["80s", "90s", "2000s", "2010s"]
    # Define the number of pages to crawl (20 movies per page)
    COUNT = 2315
    # Traverse the eras that need to be crawled
    for type_name in want_get_movie:
        main(type_name, COUNT)
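
Run as-is, the script creates one folder per era under 'data set', and Save_Data.save_content (shown below) appends rows to an 'animation data.csv' inside each, giving a layout like:

# data set\80s\animation data.csv
# data set\90s\animation data.csv
# data set\2000s\animation data.csv
# data set\2010s\animation data.csv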

Save_Data.py, the data storage module imported by the crawler above:

# coding=utf-8
'''
Data storage module, saves the movie data
'''

import csv 
import logging 

def save_actor(file_path, data):
    '''
    Save movie actor information (defined here but not called by the crawler above)
    :param file_path: path to save to
    :param data: movie actor information
    :return:
    '''
    # In the movie folder, save the data to "actor table.csv"
    file_name = file_path + "\\" + "actor table.csv"
    o_file = open(file_name, "w", newline="", encoding="utf-8-sig")
    f = csv.writer(o_file)
    for i in data:
        f.writerow(i)
    o_file.close()

def save_content(file_path, data):
    '''
    Append movie rows to the csv file for this era
    :param file_path: path to save to
    :param data: movie information
    :return:
    '''
    try:
        file_name = file_path + "\\" + "animation data.csv"
        f = open(file_name, "a", encoding="utf-8-sig", newline="")
        c_f = csv.writer(f)
        for i in data:
            c_f.writerow(i)
        f.close()
    except:
        logging.error("Failed to save data: %s" % data)

  


Origin www.cnblogs.com/48520-xc/p/12699060.html