Day 02: Python from getting started to giving up — crawling the Douban Movie Top 250

Experimental environment: Python 3.7, Windows 10

Libraries used:

import requests
import re

 

# Get the page source
def Get_Data(Url, timeout=10):
    """Fetch ``Url`` with an HTTP GET and return the response object.

    Args:
        Url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The object returned by ``requests.get``; callers read its ``.text``.
    """
    Response = requests.get(Url, timeout=timeout)
    return Response
# Use a regular expression to pick out the required fields
def Print_Data(Res):
    """Extract one tuple per movie entry from the page markup in ``Res``.

    Each tuple holds the six captured groups in pattern order: the text of
    the ``<em>`` tag, the link ``href``, the first ``title`` span, the text
    following "导演: ", the text following "主演: " and the ``rating_num``
    span. An input with no matching entry yields an empty list.
    """
    # The pieces below concatenate to a single pattern; re.S lets ``.``
    # cross line breaks so one entry may span several lines of markup.
    item_pattern = re.compile(
        '<div class="item">.*?<em class="">(.*?)</em>'
        '.*?<a href="(.*?)">'
        '.*?<span class="title">(.*?)</span>'
        '.*?<p class="">.*?导演: (.*?)&nbsp'
        '.*?主演: (.*?)<br>'
        '.*?<span class="rating_num" property="v:average">(.*?)</span>',
        re.S,
    )
    return item_pattern.findall(Res)
# Print the record and append it to a text file
def Save_Data(file_name, file_content):
    """Print one movie record and append it to ``<file_name>.txt``.

    Args:
        file_name: Base name of the output file; any ``/`` is replaced by
            ``_`` so the name cannot point into a sub-directory.
        file_content: Sequence of exactly six values, unpacked in order as
            rank, link, name, director, performer and comment.

    Raises:
        ValueError: If ``file_content`` does not hold exactly six items.
    """
    top, url, name, director, performer, comment = file_content
    data = f'''
    ============================
    movie rankings: {top}
    movie links: {url}
    movie name: {name}
    film director: {director}
    movie starring: {performer}
    movie review: {comment}
    ============================
    \n
    '''
    print(data)
    # Append mode keeps the records written by earlier calls; the ``with``
    # block closes the file, so no explicit close() is needed.
    with open(file_name.replace('/', '_') + ".txt", "a", encoding='utf-8') as f:
        f.write(data)
# Function test: download every result page and save each movie found
Head_Agreement, Domain, Port = 'https', 'movie.douban.com', '443'
File_Name = 'top250?start='
# Start offsets 0, 25, ..., 225 are appended to the URL, one request each.
for Tmp in range(0, 226, 25):
    Url_Link = f'{Head_Agreement}://{Domain}:{Port}/{File_Name}{Tmp}'
    Res = Get_Data(Url_Link)
    Data = Print_Data(Res.text)
    for movie in Data:
        # Every record is appended to the same output file, movie.txt.
        Save_Data('movie', movie)
print('Print movie working done!')

Here is the complete code implementation

import requests
import re

def Get_Data(Url, timeout=10):
    """Fetch ``Url`` with an HTTP GET and return the response object.

    Args:
        Url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The object returned by ``requests.get``; callers read its ``.text``.
    """
    response = requests.get(Url, timeout=timeout)
    return response

def Print_Data(Res):
    """Return a list of six-field tuples, one per movie entry in ``Res``.

    The fields are the regex captures in order: ``<em>`` text, link
    ``href``, first ``title`` span, text after "导演: ", text after
    "主演: " and the ``rating_num`` span. No match gives an empty list.
    """
    # Adjacent literals join into one pattern; re.S makes ``.`` match
    # newlines so an entry can stretch over several lines.
    movie_regex = (
        '<div class="item">.*?<em class="">(.*?)</em>'
        '.*?<a href="(.*?)">'
        '.*?<span class="title">(.*?)</span>'
        '.*?<p class="">.*?导演: (.*?)&nbsp'
        '.*?主演: (.*?)<br>'
        '.*?<span class="rating_num" property="v:average">(.*?)</span>'
    )
    matches = re.findall(movie_regex, Res, flags=re.S)
    return matches

def Save_Data(file_name, file_content):
    """Print one movie record and append it to ``<file_name>.txt``.

    Args:
        file_name: Base name of the output file; any ``/`` is replaced by
            ``_`` so the name cannot point into a sub-directory.
        file_content: Sequence of exactly six values, unpacked in order as
            rank, link, name, director, performer and comment.

    Raises:
        ValueError: If ``file_content`` does not hold exactly six items.
    """
    top, url, name, director, performer, comment = file_content
    Data = f'''
    ============================
    movie rankings: {top}
    movie links: {url}
    movie name: {name}
    film director: {director}
    movie starring: {performer}
    movie review: {comment}
    ============================
    \n
    '''
    print(Data)
    # Append mode keeps the records written by earlier calls; the ``with``
    # block closes the file, so no explicit close() is needed.
    with open(file_name.replace('/', '_') + ".txt", "a", encoding='utf-8') as f:
        f.write(Data)

# Driver: download every result page and save each movie found.
Head_Agreement, Domain, Port = 'https', 'movie.douban.com', '443'
File_Name = 'top250?start='
# Start offsets 0, 25, ..., 225 are appended to the URL, one request each.
for Tmp in range(0, 226, 25):
    Url_Link = f'{Head_Agreement}://{Domain}:{Port}/{File_Name}{Tmp}'
    Res = Get_Data(Url_Link)
    Data = Print_Data(Res.text)
    for movie in Data:
        # Every record is appended to the same output file, movie.txt.
        Save_Data('movie', movie)
print('Print movie working done!')

 

Guess you like

Origin www.cnblogs.com/tankfaledeblog/p/11123331.html