Test environment: Python 3.7 on Windows 10
Libraries used:
import requests
import re
# Fetch the page source.
def Get_Data(Url, headers=None, timeout=None):
    """Send an HTTP GET request to ``Url`` and return the response.

    Args:
        Url: Address of the page to download.
        headers: Optional mapping of extra HTTP headers forwarded to
            ``requests.get``. ``None`` sends the library defaults.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            ``None`` (the default) waits indefinitely, as the plain
            ``requests.get(Url)`` call does.

    Returns:
        The response object returned by ``requests.get``; callers read the
        page source from its ``.text`` attribute.
    """
    # NOTE(review): some sites reject the default requests User-Agent --
    # pass a browser-like ``headers`` mapping if the body comes back empty
    # or as an error page; confirm against the target site.
    return requests.get(Url, headers=headers, timeout=timeout)
# Use a regular expression to find the required data.
# Compiled once at import time; re.S lets ``.`` span the newlines between tags.
_MOVIE_ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<em class="">(.*?)</em>'
    r'.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?<p class="">.*?导演: (.*?) .*?主演: (.*?)<br>'
    r'.*?<span class="rating_num" property="v:average">(.*?)</span>',
    re.S,
)


def Print_Data(Res):
    """Extract the movie records from a page of HTML source.

    Args:
        Res: The HTML source of one listing page, as a string.

    Returns:
        A list with one 6-tuple of strings per matched item, in capture-group
        order: (rank, link, title, director, starring, rating). The list is
        empty when nothing matches.
    """
    # Each item must contain the literal labels "导演: " (director) and
    # "主演: " (starring). The director group stops at the first space after
    # the label, so only the first whitespace-delimited token is captured.
    return _MOVIE_ITEM_PATTERN.findall(Res)
# Print the record and append it to a text file.
def Save_Data(file_name, file_content):
    """Print one movie record and append it to ``<file_name>.txt``.

    Args:
        file_name: Base name of the output file. Any ``/`` is replaced by
            ``_`` and ``.txt`` is appended.
        file_content: A 6-item sequence unpacked as
            (top, url, name, director, performer, comment).

    Raises:
        ValueError: If ``file_content`` does not unpack into six items.
    """
    top, url, name, director, performer, comment = file_content
    separator = '=' * 28
    Data = (
        f'{separator}\n'
        f'movie rankings: {top}\n'
        f'movie links: {url}\n'
        f'movie name: {name}\n'
        f'film directors: {director}\n'
        f'movie starring: {performer}\n'
        f'movie review: {comment}\n'
        f'{separator}\n'
    )
    print(Data)
    # Append mode so successive records accumulate in one file; the ``with``
    # block closes the file, so no explicit close() is needed.
    with open(file_name.replace('/', '_') + '.txt', 'a', encoding='utf-8') as f:
        f.write(Data)
# Driver: exercise the functions above against the Top 250 listing.
Head_Agreement = 'https'
Domain = 'movie.douban.com'
Port = '443'
File_Name = 'top250?start='

if __name__ == '__main__':
    # The listing is paged through the ``start`` query value:
    # 0, 25, ..., 225 -- ten requests in total.
    for Tmp in range(0, 226, 25):
        Url_Link = Head_Agreement + '://' + Domain + ':' + Port + '/' + File_Name + str(Tmp)
        Res = Get_Data(Url_Link)
        Data = Print_Data(Res.text)
        # Every record found on the page is printed and appended to movie.txt.
        for movie in Data:
            Save_Data('movie', movie)
    print('Print movie working done!')
The complete implementation is shown below:
import re

import requests

# Compiled once at import time; re.S lets ``.`` span the newlines between tags.
_MOVIE_ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<em class="">(.*?)</em>'
    r'.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?<p class="">.*?导演: (.*?) .*?主演: (.*?)<br>'
    r'.*?<span class="rating_num" property="v:average">(.*?)</span>',
    re.S,
)


def Get_Data(Url, headers=None, timeout=None):
    """Send an HTTP GET request to ``Url`` and return the response.

    Args:
        Url: Address of the page to download.
        headers: Optional mapping of extra HTTP headers forwarded to
            ``requests.get``. ``None`` sends the library defaults.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            ``None`` (the default) waits indefinitely.

    Returns:
        The response object returned by ``requests.get``.
    """
    # NOTE(review): some sites reject the default requests User-Agent --
    # pass a browser-like ``headers`` mapping if needed; confirm against
    # the target site.
    return requests.get(Url, headers=headers, timeout=timeout)


def Print_Data(Res):
    """Extract the movie records from a page of HTML source.

    Args:
        Res: The HTML source of one listing page, as a string.

    Returns:
        A list with one 6-tuple of strings per matched item, in capture-group
        order: (rank, link, title, director, starring, rating). The list is
        empty when nothing matches.
    """
    return _MOVIE_ITEM_PATTERN.findall(Res)


def Save_Data(file_name, file_content):
    """Print one movie record and append it to ``<file_name>.txt``.

    Args:
        file_name: Base name of the output file. Any ``/`` is replaced by
            ``_`` and ``.txt`` is appended.
        file_content: A 6-item sequence unpacked as
            (top, url, name, director, performer, comment).

    Raises:
        ValueError: If ``file_content`` does not unpack into six items.
    """
    top, url, name, director, performer, comment = file_content
    separator = '=' * 28
    Data = (
        f'{separator}\n'
        f'movie rankings: {top}\n'
        f'movie links: {url}\n'
        f'movie name: {name}\n'
        f'film directors: {director}\n'
        f'movie starring: {performer}\n'
        f'movie review: {comment}\n'
        f'{separator}\n'
    )
    print(Data)
    # Append mode so successive records accumulate in one file; the ``with``
    # block closes the file, so no explicit close() is needed.
    with open(file_name.replace('/', '_') + '.txt', 'a', encoding='utf-8') as f:
        f.write(Data)


Head_Agreement = 'https'
Domain = 'movie.douban.com'
Port = '443'
File_Name = 'top250?start='

if __name__ == '__main__':
    # The listing is paged through the ``start`` query value:
    # 0, 25, ..., 225 -- ten requests in total.
    for Tmp in range(0, 226, 25):
        Url_Link = Head_Agreement + '://' + Domain + ':' + Port + '/' + File_Name + str(Tmp)
        Res = Get_Data(Url_Link)
        Data = Print_Data(Res.text)
        # Every record found on the page is printed and appended to movie.txt.
        for movie in Data:
            Save_Data('movie', movie)
    print('Print movie working done!')