从别人的博客上看到的，做了一点点修改；具体网址找不到了，就这样吧。
代码如下:
# Scrape the Douban Movie Top 250 list (adapted from a blog post).
# -*-coding:utf-8-*-
import requests
from bs4 import BeautifulSoup

DownLoad_URL = 'https://movie.douban.com/top250'


def parse_html_info(html):
    """Fetch a movie's detail page and extract its full title.

    :param html: URL of the movie's detail page
    :return: the title string from the page header, or None when the
        expected elements are missing
    """
    page = download_page(html)
    soup = BeautifulSoup(page, "lxml")
    try:
        # The whole lookup chain is inside the try: if any of these
        # elements is absent, .find returns None and the next attribute
        # access raises AttributeError.
        detail = soup.find('div', attrs={'id': 'wrapper'})
        detail = detail.find('div', attrs={'id': 'content'})
        return detail.h1.span.string
    except AttributeError as e:
        # Best-effort: log and fall back to the listing-page title.
        print(e)
        return None


def parse_html(html):
    """Parse one Top-250 listing page.

    :param html: HTML bytes of a listing page
    :return: (movie_name_list, next_url); next_url is None on the last page
    """
    soup = BeautifulSoup(html, "lxml")
    # <ol class="grid_view"> holds one <li> per movie.
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})

    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        # Movie title from the header ("hd") block.
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).getText()

        # Follow the link to the detail page for the full title.
        # (Slow — one extra request per movie — kept for fun.)
        detail_url = detail.find('a')['href']
        # BUGFIX: initialize so the name is always bound; the original
        # raised NameError when the href was falsy.
        full_title = None
        if detail_url:
            full_title = parse_html_info(detail_url)

        # Rating from the body ("bd") block.
        detail = movie_li.find('div', attrs={'class': 'bd'})
        score = detail.find('span', attrs={'class': 'rating_num'}).getText()

        if full_title is not None:
            text = full_title + ' ' + score
        else:
            text = movie_name + ' ' + score
        print(movie_name)
        movie_name_list.append(text)

    # The "next" span contains an <a> only when a further page exists.
    next_page = soup.find('span', attrs={'class': 'next'}).find('a')
    if next_page:
        return movie_name_list, DownLoad_URL + next_page['href']
    # Explicit two-value return on the last page; returning bare None
    # would make the caller's unpacking fail with
    # "'NoneType' object is not iterable".
    return movie_name_list, None


def download_page(url):
    """Download a page and return its raw body.

    :param url: URL to fetch
    :return: response body as bytes
    """
    headers = {
        # Pretend to be a desktop browser so Douban serves the page.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    return requests.get(url, headers=headers).content


def main():
    """Crawl every Top-250 page and write one movie per line to movies.txt."""
    url = DownLoad_URL
    with open("movies.txt", "w", encoding='utf-8') as f:
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            new_movies = u'{movies}\n'.format(movies='\n'.join(movies))
            f.write(new_movies)


if __name__ == "__main__":
    main()