Crawl Douban movie top250 (python3)

I found this on someone else's blog and made a few small modifications; I can no longer locate the original URL.

The code is shown below:

 

# Crawling Douban movies top250
# -*-coding:utf-8-*-

import requests
from bs4 import BeautifulSoup

DownLoad_URL = 'https://movie.douban.com/top250'


def parse_html_info(html):
    """
    Fetch a movie's detail page and return its full title.

    :param html: URL of the movie's detail page (despite the name, this
                 is a URL, not HTML — kept for backward compatibility)
    :return: the title string found at <div id="content"><h1><span>,
             or None when the page structure is missing or unexpected
    """
    page = download_page(html)
    soup = BeautifulSoup(page, "lxml")
    try:
        # The two find() calls can return None on an unexpected page,
        # so they belong inside the try as well (original raised an
        # unhandled AttributeError here).
        detail = soup.find('div', attrs={'id': 'wrapper'})
        detail = detail.find('div', attrs={'id': 'content'})
        return detail.h1.span.string
    except Exception as e:
        # BUG FIX: the original printed an undefined name `s`, which
        # raised NameError and hid the real error. Report the caught
        # exception and fall through to an implicit None return.
        print(e)
        return None


def parse_html(html):
    """
    Parse one Top-250 listing page and extract movie names with ratings.

    :param html: raw HTML of a listing page
    :return: (movie_name_list, next_url) — next_url is the absolute URL
             of the following listing page, or None on the last page
    """
    soup = BeautifulSoup(html, "lxml")
    # The ranked movies live in an <ol class="grid_view">, one <li> each.
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        # Movie title from the header ("hd") block.
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        # Detail-page URL; fetching each one is slow but yields the full title.
        next_page = detail.find('a')['href']
        # BUG FIX: `xx` was only assigned inside the `if`, so a falsy href
        # raised NameError on first use below (and a stale value leaked
        # between iterations). Initialize it per item.
        xx = None
        if next_page:
            xx = parse_html_info(next_page)
        # Rating from the body ("bd") block.
        detail = movie_li.find('div', attrs={'class': 'bd'})
        score = detail.find('span', attrs={'class': 'rating_num'}).getText()
        if xx is not None:
            text = xx + ' ' + score
        else:
            text = movie_name + ' ' + score
            print(movie_name)
        movie_name_list.append(text)
    # The "next page" anchor; it is absent on the last page.
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        return movie_name_list, DownLoad_URL + next_page['href']
    # Must still return a 2-tuple on the last page, otherwise the caller's
    # unpacking fails with "'NoneType' object is not iterable".
    return movie_name_list, None


def download_page(url):
    """
    Fetch a URL and return the raw response body.

    A browser-like User-Agent is sent because Douban rejects requests
    with the default client identifier.

    :param url: address to fetch
    :return: response body as bytes
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    response = requests.get(url, headers=request_headers)
    return response.content


def main():
    """
    Crawl every Top-250 listing page and write one "name rating" entry
    per line to movies.txt (UTF-8).
    """
    url = DownLoad_URL
    with open("movies.txt", "w", encoding='utf-8') as f:
        # parse_html returns None as the next URL after the last page,
        # which terminates the loop.
        while url:
            page = download_page(url)
            movies, url = parse_html(page)
            f.write('\n'.join(movies) + '\n')


if __name__ == "__main__":
    main()

You may also like

Origin http://10.200.1.11:23101/article/api/json?id=326950994&siteId=291194637