2-6-1 Application Example: Scraping Douban TOP250 Movie Information and Storing It (Python 3) - Study Notes

Copyright notice: these are self-study notes; if you quote them, please credit the blog. Thanks: https://blog.csdn.net/feng_jlin/article/details/82221492

Scrape the movie names and links and write them to a file

import urllib.request as urlrequest
from bs4 import BeautifulSoup
import time    # for pausing between requests
import random  # for randomizing the pause length

top250_url = "https://movie.douban.com/top250?start={}&filter="

with open('C:/Users/feng_jlin/Desktop/douban_250.txt', 'w') as outputfile:

    for i in range(10):  # 250 movies, 25 per page: loop over the 10 pages
        start = i * 25
        url_visit = top250_url.format(start)
        crawl_content = urlrequest.urlopen(url_visit).read()
        http_content = crawl_content.decode('utf8')
        soup = BeautifulSoup(http_content, 'html.parser')

        all_item_divs = soup.find_all(class_='item')

        for each_item_div in all_item_divs:
            pic_div = each_item_div.find(class_='pic')
            item_href = pic_div.find('a')['href']   # the movie link
            item_name = pic_div.find('img')['alt']  # the movie name

            outputfile.write('{}  {}\n'.format(item_href, item_name))
            print('{}  {}\n'.format(item_href, item_name))
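This first version imports time and random but never uses them; the intent, applied in the improved version below, is to pause for a random interval between requests so the crawler is less likely to be blocked. A minimal sketch of that pattern (the helper name polite_sleep is made up for illustration):

import random
import time

def polite_sleep(low=1, high=5):
    """Sleep a random number of seconds between requests,
    to reduce the chance of being blocked as a crawler."""
    time.sleep(random.uniform(low, high))

The improved version below also sends a browser User-Agent header, since bare urllib requests are easy for a site to reject.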

Improved to scrape detailed movie information; when a request raises an error, the cause is reported

# -*- coding:utf-8 -*-

import urllib.request as urlrequest
from bs4 import BeautifulSoup
import time    # for pausing between requests
import random  # for randomizing the pause length
import bs4     # note 1: import bs4 itself so bs4.element.Tag is available for the isinstance() check
import re      # needed for the regex fallback in the except branch

# item_href        the movie link
# item_name        the movie name
# all_attrs_divs   the lead actors
# movie_type_join  the movie genres
# score_soup_divs  the movie rating

top250_url = "https://movie.douban.com/top250?start={}&filter="  # the TOP250 list page
movie_url = "https://movie.douban.com/subject/{}/"  # the per-movie detail page

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

with open('C:/Users/feng_jlin/Desktop/douban_250.txt', 'w', encoding='utf8') as outputfile:  # open the local output file

    for i in range(10):  # 250 movies, 25 per page: loop over the 10 pages
        start = i * 25  # the start value that goes into the {} of the URL
        url_visit = top250_url.format(start)
        req_url_visit = urlrequest.Request(url=url_visit, headers=headers)
        crawl_content = urlrequest.urlopen(req_url_visit).read()  # fetch the page
        http_content = crawl_content.decode('utf8')  # decode as utf8 because of the Chinese text
        soup = BeautifulSoup(http_content, 'html.parser')  # parse the page with BeautifulSoup

        all_item_divs = soup.find_all(class_='item')  # collect every class="item" block into a list

        for each_item_div in all_item_divs:  # loop over the list items

            pic_div = each_item_div.find(class_='pic')  # find the pic block
            item_href = pic_div.find('a')['href']   # the href link inside the <a> tag
            item_name = pic_div.find('img')['alt']  # the movie name

            # item_href looks like https://movie.douban.com/subject/<ID>/ ;
            # splitting on '/' leaves the Douban ID in the fifth field
            https, blank, web, subject, doubanID, other = item_href.split('/')

            movie_url_visit = movie_url.format(doubanID)  # complete the detail-page link
            try:
                req_movie_url_visit = urlrequest.Request(url=movie_url_visit, headers=headers)
                movie_crawl_content = urlrequest.urlopen(req_movie_url_visit).read()  # fetch the detail page
                movie_http_content = movie_crawl_content.decode('utf8')  # decode as utf8 because of the Chinese text
                movie_soup = BeautifulSoup(movie_http_content, 'html.parser')  # parse with BeautifulSoup

                # lead actors: the actor block can be missing, and calling .find()
                # on the resulting None would raise, so filter empty tags with isinstance first
                all_actor_divs = movie_soup.find(class_='actor')
                if isinstance(all_actor_divs, bs4.element.Tag):
                    all_attrs_divs = all_actor_divs.find(class_='attrs').get_text()  # could split('/') into a list, but not needed here
                else:
                    all_attrs_divs = 'N/A'  # placeholder when there is no actor block

                type_soup_divs = movie_soup.find_all(property="v:genre")  # the movie genres
                movie_type = [tag.get_text() for tag in type_soup_divs]  # keep just the text of each genre tag
                movie_type_join = '/'.join(movie_type)  # join the genre list with '/'

                score_soup_divs = movie_soup.find(class_="ll rating_num").get_text()  # the movie rating

                outputfile.write('{}  {}  {}  {}  {}\n'.format(item_href, item_name, movie_type_join, score_soup_divs, all_attrs_divs))
                print('{}  {}  {}  {}  {}\n'.format(item_href, item_name, movie_type_join, score_soup_divs, all_attrs_divs))

                time_interval = random.uniform(1, 5)  # pause a random 1-5 seconds
                time.sleep(time_interval)  # wait a bit, trying to avoid Douban's anti-crawler ban
                        
            except urlrequest.HTTPError as err:  # HTTPError is re-exported by urllib.request from urllib.error
                print('Fetching {} failed: {}'.format(movie_url_visit, err))  # report the cause of the error
                # fall back: regex-extract the actors and genre from the TOP250 list page itself
                other_div = re.findall('主演: (.*?)<br/>.*?/.*?/.(.*?)\n.*?</p>', str(each_item_div), re.S)
                # regex-extract the rating from the TOP250 list page
                other_score_div = re.findall('<span class="rating_num" property="v:average">(.*?)</span>', str(each_item_div), re.S)

                outputfile.write('{}  {}  {}  {}  {}\n'.format(item_href, item_name, other_div[0][1], other_score_div[0], other_div[0][0]))
                print('{}  {}  {}  {}  {}\n'.format(item_href, item_name, other_div[0][1], other_score_div[0], other_div[0][0]))
                continue  # do not abort the whole crawl on one error; move on to the next movie

print('OK')
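A side note on the output format: the script writes two-space-separated text, and the actor string itself contains slashes and spaces, so the file is awkward to parse back. If a more robust format is wanted, Python's standard csv module handles the quoting. A minimal sketch, assuming the same five fields; the file path and sample row are placeholders:

import csv

# placeholder row in the same five-field order: link, name, genres, rating, actors
rows = [('https://movie.douban.com/subject/<ID>/', 'Some Movie',
         'Drama/Crime', '9.0', 'Actor A / Actor B')]

with open('douban_250.csv', 'w', encoding='utf8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['link', 'name', 'genres', 'rating', 'actors'])  # header row
    writer.writerows(rows)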

Improved to scrape detailed movie information; when a detail request errors, the movie's details are taken from the TOP250 list page instead, in order
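The fallback described here is the regex branch in the except block above, and it can be checked in isolation. A quick sketch against a hypothetical HTML fragment shaped like one TOP250 list item (all values below are placeholders, not real page content):

import re

# hypothetical fragment imitating one <div class="item"> block (placeholders only)
sample_item = '''<div class="item">
<p class="">
    导演: Director X   主演: Actor A / Actor B<br/>
    1994 / Country / Drama
</p>
<span class="rating_num" property="v:average">9.0</span>
</div>'''

other_div = re.findall('主演: (.*?)<br/>.*?/.*?/.(.*?)\n.*?</p>', sample_item, re.S)
other_score_div = re.findall('<span class="rating_num" property="v:average">(.*?)</span>', sample_item, re.S)

print(other_div[0][0])     # actors -> 'Actor A / Actor B'
print(other_div[0][1])     # genre  -> 'Drama'
print(other_score_div[0])  # rating -> '9.0'

Because the pattern has two capture groups, re.findall returns a list of (actors, genre) tuples; that is why the script indexes other_div[0][0] and other_div[0][1] rather than other_div[0] and other_div[1].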
