I saw this on someone else's blog and made a small modification; unfortunately I can no longer find the original URL. That's all.
The code is shown below:
# Crawling Douban Movies Top 250
# -*- coding: utf-8 -*-
"""Crawl the Douban Movies Top 250 list and write one '<title> <score>' line per movie to movies.txt."""
import requests
from bs4 import BeautifulSoup

DownLoad_URL = 'https://movie.douban.com/top250'


def parse_html_info(html):
    """Fetch a movie's own detail page and return its full title.

    :param html: URL of the movie's detail page (despite the name, this
        parameter receives a URL, not HTML — kept for compatibility).
    :return: the title string, or None if the page layout is unexpected.
    """
    page = download_page(html)
    soup = BeautifulSoup(page, "lxml")
    detail = soup.find('div', attrs={'id': 'wrapper'})
    try:
        detail = detail.find('div', attrs={'id': 'content'})
        return detail.h1.span.string
    except AttributeError as e:
        # BUG FIX: the original did `print (s)` with `s` undefined, raising a
        # NameError that hid the real problem. Catch only the lookup failure
        # (a missing tag yields None and then an AttributeError) and report it.
        print(e)
        return None


def parse_html(html):
    """Parse one Top-250 listing page.

    :param html: raw HTML (bytes) of a listing page.
    :return: (movie_name_list, next_url) where next_url is the absolute URL
        of the next listing page, or None on the last page.
    """
    soup = BeautifulSoup(html, "lxml")
    # The <ol class="grid_view"> holds one <li> per movie.
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        # Movie name from the header block.
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        # Follow the movie's own page for the full title.
        # (Slow — one extra request per movie — it's just for fun.)
        detail_url = detail.find('a')['href']
        # BUG FIX: the original only assigned `xx` inside the `if`, then read
        # it unconditionally — UnboundLocalError when no link exists.
        full_title = None
        if detail_url:
            full_title = parse_html_info(detail_url)
        # Rating from the body block.
        body = movie_li.find('div', attrs={'class': 'bd'})
        score = body.find('span', attrs={'class': 'rating_num'}).getText()
        if full_title is not None:
            text = full_title + ' ' + score
        else:
            text = movie_name + ' ' + score
        print(movie_name)
        movie_name_list.append(text)
    # Link to the next listing page, if any.
    # BUG FIX: on the last page the "next" <span> may be missing entirely;
    # guard so we return (list, None) instead of raising AttributeError.
    # The else branch is required: returning only the list would make the
    # caller's tuple unpacking fail with "'NoneType' object is not iterable".
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span else None
    if next_page:
        return movie_name_list, DownLoad_URL + next_page['href']
    else:
        return movie_name_list, None


def download_page(url):
    """Download *url* with a browser-like User-Agent.

    :param url: page URL to fetch.
    :return: raw response body as bytes.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    # BUG FIX: a timeout keeps a stalled connection from hanging the crawl forever.
    data = requests.get(url, headers=headers, timeout=30).content
    return data


def main():
    """Crawl every Top-250 page in sequence and save the results to movies.txt."""
    url = DownLoad_URL
    # Get the main path of proDir
    # proDir = os.path.split(os.path.realpath(__file__))[0]
    # configPath configuration file path address
    # configPath = os.path.join(proDir, "movies.txt")
    with open("movies.txt", "w", encoding='utf-8') as f:
        # parse_html returns None as the URL on the last page, ending the loop.
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            new_movies = u'{movies}\n'.format(movies='\n'.join(movies))
            f.write(new_movies)


if __name__ == "__main__":
    main()