# Datawhale crawler task 01

# Use requests and regular expressions to crawl the Douban Top 250 ranking.
# Required fields: rank, movie title, director, score, etc.

import csv
import re
import time

import requests
 class doubanTop250 (): 

    film_list = [] 

    # 1. Send request 
    DEF send_request (Self, URL):
         # 1.1 header request to add 
        headers = { ' the User-- Agent ' : " the Mozilla / 5.0 (the Windows 10.0 NT; Win64; x64-) AppleWebKit / 537.36 (KHTML, like the Gecko) the Chrome / 75.0.3770.100 Safari / 537.36 " }
         # 1.3 sends a request
        response = requests.get(url=url,headers=headers)
        print(response.status_code)
        return response

    #2.解析数据
    def parse(self,response):
        data = response.content.decode()
        rank  = re.findall('<em class="">(\d+)</em>', data)
        name = re.findall('<img width="100" alt="(.*) src=', data)
        country =  re.findall('&nbsp;/&nbsp;(.*)&nbsp;/&nbsp;', data)
        director = re.findall('导演:(.*)', data)
        score = re.findall('<span class="rating_num" property="v:average">(.*)</span>', data)
        for i in range(0, len(rank)):
            film_dict = {}
            film_dict['rank'] = rank[i]
            film_dict['name'] = name[i]
            film_dict['country'] = country[i]
            film_dict['director'] =Director [I] 
            film_dict [ ' Score ' ] = Score [I] 
            self.film_list.append (film_dict) 


    # 3. storing data 
    DEF save_data (Self):
         # of 0. The Create File Open 
        CSV_File = Open ( ' top250.csv ' , ' W ' , encoding = ' UTF-. 8 ' )
         # 1. Create csv writer 
        csv_writer = csv.writer (CSV_File)
         # 2. write header 
        csv_writer.writerow (self.film_list [0] .keys (
         )) # 3. write content 
        csv_list =[]
         For Film in self.film_list: 
            film_data = film.values () 
            csv_list.append (film_data) 
        csv_writer.writerows (csv_list) 
        # 4. Close the file 
        csv_file.close () 

        Pass 
    # 4. Run 
    DEF RUN (Self):
         # 1.1 target url address 
        # stitching url 
        base_url = " https://movie.douban.com/top250?start= " 
        for i in the Range (0,225,25 ): 
            FINAL_URL = base_url + str (i)
             #1. Send request, and returning the response object 
            response = self.send_request (FINAL_URL) 

            # 2. parsing response data 
            self.parse (response) 
            the time.sleep ( . 5 )
         # 3. Save data 
        self.save_data () 



doubanTop250 (). RUN ( )

 

# Origin: www.cnblogs.com/tommyngx/p/11312172.html