[Python Web Crawler] Notes on the 150-Lecture Paid Python Web Crawler Course, Part 10: Crawling Douban Movies Top250 (Hands-On)

Crawling Top250 Information on Douban Movies

The script below walks the ten pages of Douban's Top250 list, collects each movie's detail-page URL, then parses the title, director, screenwriter, cast, and Douban rating from every detail page and appends one line per movie to Top250.csv.

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
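# Note on the header above: Douban typically refuses requests that lack a
# browser-like User-Agent (the default python-requests UA often gets HTTP 418).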

def get_detail_urls(url):
    # Collect the detail-page URL of every movie on one list page.
    resp = requests.get(url, headers=headers)
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')
    lis = soup.find('ol', class_='grid_view').find_all('li')
    detail_urls = []
    for li in lis:
        detail_url = li.find('a')['href']
        detail_urls.append(detail_url)
    return detail_urls

# Parse one detail page and write its fields to the open file handle.
def parse_detail_page(detail_url, f):
    resp = requests.get(detail_url, headers=headers)
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')

    # Movie title
    title = list(soup.find('div', id='content').find('h1').stripped_strings)
    title = ''.join(title)

    # Director
    director = list(soup.find('div', id='info').find('span', class_='attrs').stripped_strings)
    director = ''.join(director)

    # Screenwriter (indexes into the info block's spans by position, so this
    # breaks if Douban reorders the page)
    screen_writer = list(soup.find('div', id='info').find_all('span')[3].find('span', class_='attrs').stripped_strings)
    screen_writer = ''.join(screen_writer)

    # Cast
    actor = list(soup.find('span', class_='actor').find('span', class_='attrs').stripped_strings)
    actor = ''.join(actor)

    # Douban rating; join the strings so a plain value, not a Python list,
    # ends up in the output file
    score = ''.join(soup.find('strong', class_='ll rating_num').strings)

    f.write('{}, {}, {}, {}, {}\n'.format(title, director, screen_writer, actor, score))


def main():
    base_url = 'https://movie.douban.com/top250?start={}&filter='

    with open('Top250.csv', 'a', encoding='utf-8') as f:
        # Ten list pages, 25 movies per page: start = 0, 25, ..., 225.
        # (range(0, 251, 25) would also request start=250, a page past the
        # end of the list.)
        for i in range(0, 250, 25):
            url = base_url.format(i)
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                parse_detail_page(detail_url, f)

if __name__ == '__main__':
    main()
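
As written, the output is only comma-separated text, and Douban titles and cast lists frequently contain commas themselves, so the file will not parse cleanly as CSV. Below is a minimal sketch of a more robust variant, assuming the same page selectors as above: it routes the rows through Python's csv module so embedded commas get quoted, writes a header row, and sleeps briefly between requests to stay polite. The column names and the one-second delay are my own choices, not part of the course code.

import csv
import time

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}


def get_detail_urls(url):
    # Same selector as above: every <li> in the ranked list links to a detail page.
    soup = BeautifulSoup(requests.get(url, headers=HEADERS).text, 'lxml')
    return [li.find('a')['href']
            for li in soup.find('ol', class_='grid_view').find_all('li')]


def parse_detail_page(detail_url):
    # Return the five fields as a list instead of writing them directly,
    # so the caller can hand them to csv.writer.
    soup = BeautifulSoup(requests.get(detail_url, headers=HEADERS).text, 'lxml')
    info = soup.find('div', id='info')
    return [
        ''.join(soup.find('div', id='content').find('h1').stripped_strings),
        ''.join(info.find('span', class_='attrs').stripped_strings),
        ''.join(info.find_all('span')[3].find('span', class_='attrs').stripped_strings),
        ''.join(soup.find('span', class_='actor').find('span', class_='attrs').stripped_strings),
        ''.join(soup.find('strong', class_='ll rating_num').strings),
    ]


def main():
    base_url = 'https://movie.douban.com/top250?start={}&filter='
    # 'w' instead of 'a' so a rerun replaces the file rather than appending
    # duplicate rows; newline='' is what the csv module expects.
    with open('Top250.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)  # quotes any field that contains a comma
        writer.writerow(['title', 'director', 'screen_writer', 'actor', 'score'])
        for start in range(0, 250, 25):
            for detail_url in get_detail_urls(base_url.format(start)):
                writer.writerow(parse_detail_page(detail_url))
                time.sleep(1)  # small delay between requests to stay polite


if __name__ == '__main__':
    main()

Whether one second is enough of a delay depends on Douban's current rate limiting; the point is simply not to fire the ten list pages plus 250 detail requests back-to-back.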


Origin blog.csdn.net/weixin_44566432/article/details/108678780