爬虫豆瓣电影

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/specialshoot/article/details/70196468

模仿http://blog.csdn.net/u011489043/article/details/63255902豆瓣读书来爬电影,主要是根据电影排名页面与详情页面进行更改

源码如下(python3):

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# https://movie.douban.com/top250
from urllib.request import urlopen
import os
import csv
import re
import io
import sys
import time
import random

topnum = 1


# 将url转化成html
# Fetch a URL and return the raw response body.
def getHtml(url):
    """Fetch `url` and return the response body as bytes, or "" on failure.

    NOTE: the failure sentinel is the *str* "" while success returns
    *bytes* -- callers must check for "" before calling .decode().

    Fix: the original never closed the urlopen response, leaking the
    underlying socket; the `with` block releases it deterministically.
    """
    try:
        with urlopen(url) as page:
            return page.read()
    except Exception as e:
        # Best-effort crawler: report and signal failure instead of crashing.
        print("failed to geturl:", e)
        return ""


# 爬电影列表
# Scrape the movie-title list from a ranking page.
def getTitle(html):
    """Return the Chinese titles found on a ranking page.

    Each movie carries two <span class="title"> entries; the second one
    (the alias row) contains a separator character and is skipped.
    Bumps the global `topnum` counter once per title kept.
    """
    global topnum
    matches = re.findall(r'<span.*?class="title">(.*?)</span>', html, re.S)
    titles = []
    for name in matches:
        # Alias rows contain a separator; keep only the primary titles.
        if name.find(" ") == -1:
            titles.append(name)
            topnum += 1
    return titles


# 通过点击图片链接进入每部电影的详情页
# Collect the per-movie detail-page links from a ranking page.
def getDetail(html):
    """Return detail-page URLs from a ranking page.

    Every movie link appears twice (poster anchor + title anchor), so
    only every other "subject" link (even index) is kept.
    """
    anchors = re.findall(r'<a href="(https.*?)".*?class="">.*?</a>', html, re.S)
    return [
        link
        for position, link in enumerate(anchors)
        if "subject" in link and position % 2 == 0
    ]


"""详情页"""


# 获取电影制片国家
# Extract the production country/region from a detail page.
def getRegion(html):
    """Return the text following the 制片国家/地区 label on a detail page."""
    pattern = r'<span class="pl">制片国家/地区.*?</span>(.*?)<br/>'
    return re.findall(pattern, html, re.S)


# 获取电影语言
# Extract the movie language from a detail page.
def getLanguage(html):
    """Return the text following the 语言 label on a detail page."""
    pattern = r'<span class="pl">语言.*?</span>(.*?)<br/>'
    return re.findall(pattern, html, re.S)


# 获取电影上映日期
# Extract the release date(s) from a detail page.
def getPublishDate(html):
    """Return the release-date strings from a detail page.

    Fix: the original pattern ended with a stray ``(.*?)`` group that
    always matched the empty string, so findall returned ``(date, '')``
    2-tuples instead of plain strings, polluting the saved CSV. The
    trailing group is removed so each match is just the date text.
    """
    publishDate = re.findall(r'<span property="v:initialReleaseDate" content=.*?>(.*?)</span>', html, re.S)
    return publishDate


# 获取电影片长
# Extract the runtime from a detail page.
def getMovieLength(html):
    """Return the runtime text (e.g. '142分钟') from a detail page."""
    pattern = r'<span.*?property="v:runtime".*?content=".*?">(.*?)</span>'
    return re.findall(pattern, html, re.S)


"""详情页结束"""


# 爬图片链接
# Collect poster image links from a ranking page.
def getImg(html):
    """Return poster image URLs, dropping site-asset links.

    URLs containing any of the blacklist substrings (scripts, styles,
    ad/static assets, icons, png sprites) are filtered out.
    """
    blacklist = ("js", "css", "dale", "icon", "png")
    candidates = re.findall(r'img.*?alt=.*?src="(https.*?)"', html, re.S)
    return [url for url in candidates
            if not any(word in url for word in blacklist)]


# 爬评分
# Extract the rating score from a page.
def getScore(html):
    """Return the average-rating strings (e.g. '9.7') found in `html`."""
    pattern = r'<span.*?class="rating_num".*?property="v:average">(.*?)</span>'
    return re.findall(pattern, html, re.S)


# 爬评价总数
# Collect the rating-count strings from a ranking page.
def getComment(html):
    """Return strings like '123456人评价' from a ranking page.

    find('评价') >= 1 keeps spans where 评价 appears after a count and
    drops spans lacking it or starting with it (index 0).
    """
    spans = re.findall(r'<span>(.*?)</span>', html, re.S)
    return [text for text in spans if text.find("评价") >= 1]


# 将获取的信息进行保存
# Write the collected movie rows to a CSV file.
def saveInfo(infoList, path='/home/han/PycharmProjects/WebScrapingWithPython/python_web/movie_scraper.csv'):
    """Write a header row plus every row of `infoList` to a CSV file.

    Args:
        infoList: list of rows, one per movie (name, score, comment
            count, image URL, region, language, release date, runtime).
        path: output file path. Generalized from a hard-coded constant
            so callers can choose a location; the default keeps the
            original behavior for backward compatibility.
    """
    # gb18030 covers all Chinese characters used in the scraped fields.
    with open(path, 'w+', newline='', encoding='gb18030') as fp:
        writer = csv.writer(fp, delimiter=',')  # comma separates each field of a record
        writer.writerow(['影  名', '评  分', '评价人数', '图片链接', '制片国家/地区', '语言', ' 上映日期 ', '片长'])
        writer.writerows(infoList)
        print('保存完毕')


# 程序开始
# 初始化
# ---------------------------------------------------------------------------
# Main script: crawl the ranking pages, then each detail page, then save CSV.
# ---------------------------------------------------------------------------
# One accumulator list per CSV column.
namesUrl = []
imagesUrl = []
scoresUrl = []
commentsUrl = []
detailsUrl = []
introductionsUrl = []
publishDatesUrl = []
regions = []
languages = []
movieLengths = []
allInfo = []

# Paging: 25 movies per ranking page.
for page in range(0, 50, 25):
    url = "https://movie.douban.com/top250?start={}&filter=&type=".format(page)
    raw = getHtml(url)  # bytes on success, "" (str) on failure
    if raw == "":
        # BUG FIX: the original called .decode() *before* this check, so a
        # failed fetch crashed with AttributeError on the str sentinel; it
        # also used extend('none'), which appends the four characters
        # 'n','o','n','e' instead of one placeholder.
        namesUrl.append('none')
        imagesUrl.append('none')
        scoresUrl.append('none')
        commentsUrl.append('none')
        introductionsUrl.append('none')
    else:
        html = raw.decode("UTF-8")
        namesUrl.extend(getTitle(html))
        imagesUrl.extend(getImg(html))
        scoresUrl.extend(getScore(html))
        commentsUrl.extend(getComment(html))
        introductionsUrl.extend(getDetail(html))

print("len namesUrl:", len(namesUrl))
print("len imagesUrl:", len(imagesUrl))
print("len scoresUrl:", len(scoresUrl))
print("len commentsUrl:", len(commentsUrl))
print("len intro:", len(introductionsUrl))

# Visit each detail page for country / language / release date / runtime.
for index, item in enumerate(introductionsUrl):
    print(item)
    # BUG FIX: fetch each detail page once -- the original called
    # getHtml(item) twice (once for the check, once for the content),
    # doubling the network traffic per movie.
    raw_detail = getHtml(item)
    if raw_detail == "":  # link unreachable
        regions.append("该链接不存在")
        languages.append("该链接不存在")
        publishDatesUrl.append("该链接不存在")
        movieLengths.append("该链接不存在")
    else:
        html_detail = raw_detail.decode("UTF-8")
        regions.append(getRegion(html_detail))
        languages.append(getLanguage(html_detail))
        publishDatesUrl.append(getPublishDate(html_detail))
        movieLengths.append(getMovieLength(html_detail))
        # Be polite to the server between detail-page requests.
        time.sleep(random.randint(1, 2))

# Assemble one row per movie. Indexing (not zip) is kept deliberately so a
# length mismatch between columns fails loudly instead of silently truncating.
for i in range(0, len(namesUrl)):
    allInfo.append([namesUrl[i], scoresUrl[i], commentsUrl[i], imagesUrl[i],
                    regions[i], languages[i], publishDatesUrl[i], movieLengths[i]])

print(len(namesUrl))
print(len(scoresUrl))
print(len(commentsUrl))
print(len(imagesUrl))
print(len(regions))
print(len(languages))
print(len(publishDatesUrl))
print(len(movieLengths))

saveInfo(allInfo)
print("Exiting Main \n 普通爬取结束时时间")
print(time.ctime(time.time()))

具体流程是从排名页面抓取电影名称,图片链接,评分,评论数和详情页链接(点击图片或影片名均可至详情页).然后遍历详情页链接,进入每一个链接详情页再抓取影片制作国家,语言,上映日期,影片长度

注意:saveInfo函数保存路径自行修改.编写代码运行时,要保证影片名,图片,评分,评论数及详情页链接数组个数相等,如果个数不相等,将每个影片信息打印下来,寻找没有显示的影片,看看是什么情况导致的.是正则表达式写的不对没有完全匹配或是其它情况,再进行修改即可.

猜你喜欢

转载自blog.csdn.net/specialshoot/article/details/70196468