环境准备
1>python环境
路径准备
爬虫的起点都是从获取种子路径(起始 url)开始:
进入某瓣排行首页:
在浏览器中按 F12 打开开发者工具,获取到加载电影信息的 url:
https://movie.douban.com/j/chart/top_list?type=6&interval_id=100%3A90&action=&start=0&limit=20
分析 url 地址,发现 start 和 limit 是分页参数,因此定制 url 模板,后续动态替换分页参数:
self.base_url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100:90&action=&start={}&limit={}'
代码实现
import urllib.request
import json
# 某瓣电影排行信息抓取
class DouBan(object):
    """Fetch pages of a Douban movie ranking chart and print each movie.

    The chart endpoint is paginated with ``start`` (offset) and ``limit``
    (page size) query parameters; one URL per page is prepared up front in
    ``self.urls`` and fetched sequentially by :meth:`download`.
    """

    def __init__(self, page_no, page_size, type):
        """Prepare the request headers and the list of page URLs.

        Args:
            page_no: Number of pages to fetch; pages ``0 .. page_no - 1``
                are requested.
            page_size: Number of movies per page (the ``limit`` parameter);
                page ``i`` starts at offset ``i * page_size``.
            type: Value substituted into the ``type`` query parameter
                (the original author's note says 6 is the erotic genre).
                The name shadows the builtin but is kept so existing
                keyword callers keep working.
        """
        self.base_url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100:90&action=&start={}&limit={}'
        # A browser User-Agent is sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
        }
        self.urls = [
            self.base_url.format(type, page_index * page_size, page_size)
            for page_index in range(page_no)
        ]

    def get_content(self, url):
        """Request ``url`` with the configured headers and return the body.

        Args:
            url: Address to fetch.

        Returns:
            The response body decoded as UTF-8 text.
        """
        request = urllib.request.Request(url=url, headers=self.headers)
        # The context manager closes the response (and its connection) even
        # if reading or decoding fails; the original never closed it.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')

    def download(self):
        """Fetch every prepared page URL in order and print its movies."""
        for url in self.urls:
            content = self.get_content(url)
            self.show_movie_info(content)

    def show_movie_info(self, content):
        """Parse a JSON array of movie objects and print one line per movie.

        Args:
            content: JSON text; each element must provide the keys
                ``title``, ``release_date``, ``score``, ``types`` and
                ``actors``.

        Raises:
            KeyError: If a movie object lacks one of the keys above.
        """
        movies = json.loads(content)
        for info in movies:
            print(f"电影名:【{info['title']}】,上映时间:{info['release_date']},豆瓣评分:{info['score']},类型:{info['types']},主演:{info['actors']} ")

    def run(self):
        """Entry point: download and print all configured pages."""
        self.download()
# 入口
if __name__ == '__main__':
    # Script entry: fetch the first 5 pages of 20 movies each and print them.
    page_no = 5
    page_size = 20
    # Named movie_type rather than `type` so the builtin is not shadowed at
    # module level. The original author's note says 6 is the erotic genre.
    movie_type = 6
    DouBan(page_no, page_size, movie_type).run()