一、前言
2020-04-03 爬虫练习
每日一个爬虫小练习,学习爬虫的记得关注哦!
学习编程就像学习骑自行车一样,对新手来说最重要的是持之以恒的练习。
在《汲取地下水》这一章节中看见的一句话:“别担心自己的才华或能力不足。持之以恒地练习,才华便会有所增长”,现在想来,真是如此。
最近在学习 数据解析之BeautifulSoup4库 ,所以写这篇爬虫来练练手,巩固知识点。
BeautifulSoup4 的知识点可以参考我的博文:【爬虫学的好,基础少不了】:数据解析之BeautifulSoup4库
二、需求:
爬取豆瓣电影TOP250,并储存到本地CSV
三、技术路线:
本文用到的库:
1.requests
2.BeautifulSoup
四、爬虫示例:
import csv
import os
import time

import requests
from bs4 import BeautifulSoup
class Douban:
    """Scrape the Douban Top 250 movie list and append the results to a CSV file.

    Each listing page is fetched, every movie's detail page is followed, and
    one CSV row (name, director, screenwriter, actors, rating) is written per
    movie.
    """

    # Seconds to wait for a response before giving up on a request; without
    # a timeout a stalled connection would block the whole crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Browser-like headers sent with every request.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'referer': 'https://movie.douban.com/cinema/nowplaying/wuhan/'
        }

    def get_html(self, url):
        """Fetch *url* and return the page parsed with BeautifulSoup.

        Returns:
            The parsed document, or ``None`` (after printing a message) when
            the request fails, times out, or returns an HTTP error status.
        """
        try:
            result = requests.get(url=url, headers=self.headers,
                                  timeout=self.REQUEST_TIMEOUT)
            result.raise_for_status()  # turn 4xx/5xx responses into exceptions
        except requests.RequestException:
            # Only network/HTTP failures are handled here; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            print('链接失败!')
            return None
        return BeautifulSoup(result.text, 'lxml')

    def get_movie_url(self, html):
        """Yield the detail-page URL of every movie on a parsed listing page."""
        entries = html.find('ol', class_="grid_view").find_all('li')
        for entry in entries:
            yield entry.find('a')['href']

    def get_movie_info(self, movie_html):
        """Yield one ``(name, director, screenwriter, actors, rating)`` tuple.

        The method is a generator that yields exactly one tuple for the
        parsed detail page *movie_html*.
        """
        # Movie name: all text fragments inside <h1>, concatenated.
        movie_name = ''.join(movie_html.find('h1').stripped_strings)

        info = movie_html.find('div', id='info')

        # Director: join every text fragment instead of using `.string`,
        # which is None as soon as the span holds more than one child
        # (i.e. when several directors are listed).
        director = ''.join(info.find('span', class_='attrs').stripped_strings)

        # Screenwriter.
        # NOTE(review): the positional index assumes the director and
        # screenwriter rows are both present and come first -- confirm
        # against the live page structure.
        screenwriter_row = info.find_all('span')[3]
        screenwriter = ''.join(
            screenwriter_row.find('span', class_='attrs').stripped_strings)

        # Actors: a few movies have no cast listed; record an empty field
        # for them instead of failing the whole movie.
        actor_row = movie_html.find('span', class_='actor')
        actor_names = actor_row.find('span', class_='attrs') if actor_row is not None else None
        actors = ''.join(actor_names.stripped_strings) if actor_names is not None else ''

        # Rating.
        grade = movie_html.find('strong', class_="ll rating_num").string

        yield (movie_name, director, screenwriter, actors, grade)

    def save_movies_info(self, movie_info, f):
        """Write the first five fields of *movie_info* as one CSV row to *f*.

        The csv module is used so that fields containing commas or quotes
        are quoted correctly; a ``None`` field is written as an empty cell.
        Rows end with ``\\n``.
        """
        csv.writer(f, lineterminator='\n').writerow(movie_info[:5])
        print('保存:【{}】电影成功!'.format(movie_info[0]))

    def func(self, base_url, delay=1, out_path='./res/DoubanTop250.csv'):
        """Crawl all ten listing pages and append every movie to the CSV.

        Args:
            base_url: Listing URL template with one ``{}`` placeholder for
                the start offset (0, 25, ..., 225).
            delay: Seconds to sleep before each listing-page request, to
                avoid hitting the site too quickly.
            out_path: CSV file to append to; its directory is created when
                it does not exist yet.
        """
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # The file is opened once, before the loops, so it is not reopened
        # for every row. newline='' is what the csv module expects.
        with open(out_path, 'a', encoding='utf-8', newline='') as f:
            for x in range(0, 250, 25):
                url = base_url.format(x)  # build the listing-page URL
                time.sleep(delay)
                html = self.get_html(url)
                if html is None:
                    continue  # listing page could not be fetched
                try:
                    detail_urls = list(self.get_movie_url(html))
                except Exception as e:
                    print(e)
                    continue

                for detail_url in detail_urls:
                    # Errors are handled per movie, so one malformed detail
                    # page no longer discards the rest of its listing page.
                    try:
                        movie_html = self.get_html(detail_url)
                        if movie_html is None:
                            continue
                        for movie_info in self.get_movie_info(movie_html):
                            self.save_movies_info(movie_info, f)
                    except Exception as e:
                        print(e)  # report the error and move on
if __name__ == '__main__':
    # Script entry point: crawl the Top 250 listing, whose URL template
    # takes the start offset in its `{}` placeholder.
    Douban().func('https://movie.douban.com/top250?start={}&filter=')