# Crawl the Top250 movie information from Douban Movies.
import csv

import requests
from bs4 import BeautifulSoup
# HTTP request headers passed to requests.get(); the User-Agent value is a
# desktop Chrome browser identification string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.125 Safari/537.36'
    ),
}
def get_detail_urls(url):
    """Collect the detail-page links listed on one index page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list holding the ``href`` of the first ``<a>`` found in every
        ``<li>`` of the ``<ol class="grid_view">`` element, in document
        order. The list is empty when the page has no such ``<ol>``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    movie_list = soup.find('ol', class_='grid_view')
    if movie_list is None:
        # Page without the expected list: nothing to crawl here.
        return []

    detail_urls = []
    for li in movie_list.find_all('li'):
        link = li.find('a')
        # Skip items that carry no usable link rather than raising.
        if link is not None and link.get('href'):
            detail_urls.append(link['href'])
    return detail_urls
# Parse the content of a detail page
def _joined_text(tag):
    """Return the whitespace-stripped text fragments of *tag* glued together.

    Yields an empty string when *tag* is ``None`` so that a missing page
    element produces a blank field instead of an ``AttributeError``.
    """
    if tag is None:
        return ''
    return ''.join(tag.stripped_strings)


def parse_detail_page(detail_url, f):
    """Scrape one movie detail page and append its data to *f* as a CSV row.

    The row contains, in order: title, director, screenwriter, cast and
    rating. A field whose element cannot be found on the page is written as
    an empty string.

    Args:
        detail_url: Address of the movie detail page.
        f: Writable text file object that receives the row.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    resp = requests.get(detail_url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')

    # Movie title: all text inside the <h1> of the "content" container.
    content = soup.find('div', id='content')
    title = _joined_text(content.find('h1') if content is not None else None)

    director = ''
    screen_writer = ''
    info = soup.find('div', id='info')
    if info is not None:
        # Director: the first "attrs" span inside the info box.
        director = _joined_text(info.find('span', class_='attrs'))
        # Screenwriter: the "attrs" span nested in the fourth <span> of the
        # info box (positional lookup kept from the original scraper).
        spans = info.find_all('span')
        if len(spans) > 3:
            screen_writer = _joined_text(
                spans[3].find('span', class_='attrs'))

    # Cast: the "attrs" span inside the span marked with class "actor".
    actor_box = soup.find('span', class_='actor')
    actor = _joined_text(
        actor_box.find('span', class_='attrs')
        if actor_box is not None else None)

    # Rating: joined into plain text; previously the raw list was formatted,
    # which wrote values such as "['9.7']" into the file.
    score_tag = soup.find('strong', class_='ll rating_num')
    score = ''.join(score_tag.strings).strip() if score_tag is not None else ''

    # csv.writer quotes fields that contain commas or quotes, so such text no
    # longer shifts columns. '\n' is used as terminator because the caller
    # opens the file in ordinary text mode.
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow([title, director, screen_writer, actor, score])
def main():
    """Crawl every Top250 listing page and append each movie to Top250.csv.

    For each listing page the detail-page URLs are collected with
    ``get_detail_urls`` and every detail page is handed to
    ``parse_detail_page`` together with the open output file. The file is
    opened in append mode with UTF-8 encoding, so rows from earlier runs are
    kept.
    """
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    with open('Top250.csv', 'a', encoding='utf-8') as f:
        # Offsets 0, 25, ..., 225 cover entries 1-250. The previous upper
        # bound of 251 also requested start=250, one page past the last entry.
        for start in range(0, 250, 25):
            for detail_url in get_detail_urls(base_url.format(start)):
                parse_detail_page(detail_url, f)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()