A Newbie's First Scraper: Scraping Movie Information from Douban

Page being scraped:
[2016 Domestic Theatrical Release Schedule]

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib.request
from bs4 import BeautifulSoup


def get_html(url):  # fetch the raw HTML for the given URL
    result = urllib.request.urlopen(url)
    return result.read()

def get_movie_all(html):  # parse the page with BeautifulSoup and return the list of per-movie blocks
    soup = BeautifulSoup(html, 'html.parser')
    movie_list = soup.find_all('div', class_='bd doulist-subject')
    return movie_list

def get_one_movie(movie):  # extract title, rating, and abstract from a single movie block
    result = []  # holds the extracted movie information
    soup_all = BeautifulSoup(str(movie), 'html.parser')
    title = soup_all.find_all('div', class_='title')
    soup_title = BeautifulSoup(str(title[0]), 'html.parser')
    for line in soup_title.stripped_strings:  # pull the text out of the <a> tag
        result.append(line)

    # num = soup_all.find_all('span', class_='rating_nums')
    # append the movie rating
    num = soup_all.find_all('span')
    result.append(num[1].string)

    # append the abstract
    info = soup_all.find_all('div', class_='abstract')
    soup_info = BeautifulSoup(str(info[0]), 'html.parser')
    result_str = ""
    for line in soup_info.stripped_strings:  # pull the text out of the <div>
        result_str = result_str + " " + line
    result.append(result_str)
    return result  # return the collected fields

def save(text, file_name):  # append the text to file_name as UTF-8 bytes
    with open(file_name, 'ab') as f:
        texts = str.encode(text)
        f.write(texts)


if __name__ == '__main__':

    url = 'https://www.douban.com/doulist/3516235/?start=0&sort=seq&sub_type='
    html = get_html(url)
    movie_list = get_movie_all(html)
    for movie in movie_list:
        result = get_one_movie(movie)
        text = 'Title: ' + result[0] + '   ' + 'Rating: ' + result[1] + '    ' + result[2] + '\n'
        save(text, 'movie.txt')
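
A side note on the rating: the selector commented out above, the span with class rating_nums, is the sturdier choice, because taking num[1] from a bare find_all('span') breaks as soon as Douban adds or reorders a span inside that block. A minimal sketch of the class-based variant; get_rating is a hypothetical helper, not part of the original script, and it assumes the list keeps the same class names:

def get_rating(movie):  # hypothetical helper: pick the rating by class, not by position
    num = movie.find_all('span', class_='rating_nums')
    return num[0].string if num else ''  # fall back to an empty string if no rating is shown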

This only scrapes the first page (a sketch for paging through the rest follows below). I worked from this expert's code —
after all, a beginner has to start by imitating~~ Once I understood the approach, I typed it out again myself.
Taking it slow; I believe I'm not hopeless ┭┮﹏┭┮
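
To go beyond the first page, note that the doulist URL already exposes the paging knob: the start query parameter is the offset of the first item shown. A rough sketch that reuses the functions above, assuming the list advances 25 items per page; the page size and range here are guesses to adjust:

import time

if __name__ == '__main__':
    base = 'https://www.douban.com/doulist/3516235/?start={}&sort=seq&sub_type='
    for start in range(0, 100, 25):  # assumed page size of 25; widen the range to cover the whole list
        html = get_html(base.format(start))
        for movie in get_movie_all(html):
            result = get_one_movie(movie)
            text = 'Title: ' + result[0] + '   ' + 'Rating: ' + result[1] + '    ' + result[2] + '\n'
            save(text, 'movie.txt')
        time.sleep(1)  # pause briefly between pages to be polite to the server
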
Related, still to read:
Scraping the Douban Movie Top 250


Reposted from blog.csdn.net/yumi_huang/article/details/78765842