Crawling Douban's upcoming films with Python/Scrapy and pushing them to yourself by scheduled email

This post isn't a Python or Scrapy tutorial; it just shares a fun little idea.

For Python basics, see a Python tutorial; for Scrapy, see the Scrapy tutorial.

For crawling highly rated Douban films, see my Python crawler starter notes on crawling Douban with Scrapy.

This post crawls the films that are about to be released.

First, a look at the pushed result. The CSS was written for QQ and WeChat on a phone; other clients will probably render it as a mess...

(Screenshot: the push viewed in the QQ app, with a QQ mailbox bound)
(Screenshot: the push viewed in WeChat, with a QQ mailbox bound)

I'll assume Scrapy is already installed (installing via Anaconda is the easiest route). Install the MySQL driver with:

pip install mysql-connector

Only the key code is pasted below; if anything is unclear, see the crawler starter notes linked above.

The table below stores the film information:

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for upcomingfilm
-- ----------------------------
DROP TABLE IF EXISTS `upcomingfilm`;
CREATE TABLE `upcomingfilm` (
  `id` bigint(255) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `date` varchar(255) DEFAULT NULL,
  `wish` int(10) DEFAULT NULL,
  `description` varchar(500) DEFAULT NULL,
  `duration` int(10) DEFAULT NULL,
  `region` varchar(100) DEFAULT NULL,
  `director` varchar(255) DEFAULT NULL,
  `actors` varchar(255) DEFAULT NULL,
  `type` varchar(100) DEFAULT NULL,
  `poster` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

items.py defines the fields of a scraped film:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Field


class RecommendItem(scrapy.Item):
    # define the fields for your item here like:

    title = Field()  # film title
    description = Field()  # synopsis
    date = Field()  # release date
    wish = Field()  # number of people who marked "want to see"
    duration = Field()  # runtime in minutes
    region = Field()  # country/region
    director = Field()  # director
    actors = Field()  # lead actors
    poster = Field()  # poster image URL
    type = Field()  # genre

Next, crawl the films due within roughly the next week. Crawling further ahead isn't useful, since new films may still be added before then. I wrote this for my own use at the time, so the comments are sparse. In the parse logic, each yielded item is handed to pipelines.py, which inserts it into the database.

# -*- coding: utf-8 -*-
import datetime

import scrapy

from recommend.items import RecommendItem


class upcomingfilms(scrapy.Spider):

    name = 'upcomingfilms'
    # URL of Douban's "coming soon" film list
    start_urls = [
        'https://movie.douban.com/coming'
    ]

    def parse(self, response):
        # Scrapy decodes the response body itself, so no manual chardet handling
        # is needed; the second <tbody> on the page holds the upcoming-film table
        films = response.css('tbody')[1].css('tr')
        # keep only films released within roughly the next week
        nextWeekTime = (datetime.datetime.now() + datetime.timedelta(days=8)).strftime('%m%d')  # date eight days out, as MMDD
        nextMonth = nextWeekTime[0:2]
        nextDay = nextWeekTime[2:4]
        for film in films:
            item = RecommendItem()
            date = film.css('td::text')[0].extract().strip()
            # dates look like "08月10日"; stop once past the eight-day cutoff
            if date[0:2] == nextMonth and int(date[3:5]) > int(nextDay):
                break

            filmtype = film.css('td::text')[3].extract()
            region = film.css('td::text')[4].extract()
            wish = film.css('td::text')[5].extract()
            title = film.css('a::text')[0].extract()
            filmDetail = film.css('a::attr(href)')[0].extract()

            item['date'] = str(date).strip()
            tempTitle = str(title).strip()
            if len(tempTitle) > 6:
                tempTitle = tempTitle[0:6]+'...'
            item['title'] = tempTitle
            item['type'] = str(filmtype).strip()
            item['region'] = str(region).strip()
            split = str(wish).strip().index('人')
            item['wish'] = int(str(wish).strip()[0:split])

            request = scrapy.Request(url=filmDetail, callback=self.parse_detail)
            request.meta['item'] = item
            yield request


    def parse_detail(self, response):
        director = response.css('#info a::text')[0].extract()
        description = response.css('#link-report span::text')[0].extract()
        actorsSpan = response.css('#info span')[8]
        actors = actorsSpan.css('a::text').extract()
        poster = response.css('#mainpic img::attr(src)')[0].extract()
        duration = response.css("#info span[property='v:runtime']")[0].css('::attr(content)')[0].extract()

        item = response.meta['item']
        item['director'] = str(director)
        desc = str(description).strip()
        if len(desc) > 110:
            desc = desc[0:110] + '...'  # ellipsize overly long synopses
        item['description'] = desc
        item['actors'] = str('/'.join(actors[0:3]))
        item['poster'] = str(poster)
        item['duration'] = int(duration)

        return item
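The wish-count parsing and title truncation in the spider can be exercised on their own; here is a minimal standalone sketch (the helper names and sample inputs are made up for illustration):

```python
def parse_wish(text):
    # "12345人想看" -> 12345: everything before the '人' character is the count
    s = text.strip()
    return int(s[:s.index('人')])

def truncate(s, limit):
    # ellipsize strings longer than the limit, as the spider does for titles
    return s if len(s) <= limit else s[:limit] + '...'

print(parse_wish(' 12345人想看 '))      # 12345
print(truncate('海王大战哥斯拉之王', 6))  # 海王大战哥斯...
```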






pipelines.py saves the film information to the database:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import mysql.connector


class RecommendPipeline(object):

    def __init__(self):
        # fill in your own database credentials
        self.conn = mysql.connector.connect(host='', port=3306, user='root', password='',
                                            database='', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        title = item.get('title')
        description = item.get('description')
        date = item.get('date')
        wish = item.get('wish')
        duration = item.get('duration')
        region = item.get('region')
        director = item.get('director')
        actors = item.get('actors')
        poster = item.get('poster')
        filmtype = item.get('type')

        insert_sql = ("insert into upcomingfilm(title, description, date, wish, duration, region, "
                      "director, actors, poster, type) "
                      "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(insert_sql, (title, description, date, wish, duration, region, director, actors, poster, filmtype))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
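As the generated comment above reminds you, the pipeline only runs once it is registered in the project's settings.py. A minimal sketch (the module path follows the `recommend` package seen in the spider's imports; 300 is an arbitrary mid-range priority):

```python
# settings.py (fragment) -- register the pipeline so yielded items reach it
ITEM_PIPELINES = {
    'recommend.pipelines.RecommendPipeline': 300,
}
```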

Finally, sending the mail. The most valuable thing in this post is the CSS I designed for it; everything else can be found online, and someone has surely written an upcoming-film crawler before. Remember to request an SMTP authorization code for your mailbox first. Sending takes three steps:

1. Query the database for films whose "want to see" count is above 5000 — I don't want to sit through bad films; tune this threshold to taste.
2. Fill the data into the HTML template and send the mail. I only send to myself, but group sending works the same way. (If scheduling this on your own machine is a hassle and you'd like the weekly list, leave your email address in the comments.)
3. Empty the table so each run starts from fresh data.

For something this transient a database isn't really needed — keeping the films in a list and sorting it would do just as well; I was simply too lazy to look that up.
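The in-memory alternative mentioned above is only a few lines; a sketch with made-up data (the field names mirror the item fields):

```python
films = [
    {'title': 'A', 'wish': 12000},
    {'title': 'B', 'wish': 300},
    {'title': 'C', 'wish': 8000},
]
# keep films with more than 5000 "want to see" votes, most wanted first
picks = sorted((f for f in films if f['wish'] > 5000),
               key=lambda f: f['wish'], reverse=True)
print([f['title'] for f in picks])  # ['A', 'C']
```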

# -*- coding: utf-8 -*-
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr

import mysql.connector
# fill in your own database credentials
conn = mysql.connector.connect(host='', port=3306, user='root', password='',
                               database='', charset='utf8')
cursor = conn.cursor()
cursor.execute('select * from upcomingfilm where wish > 5000 order by wish desc')
values = cursor.fetchall()
print(values)


head = '''
<html>
<head>
<meta charset="utf-8">
<title>即将上映</title>
</head>
<body>
'''
content='''
<div style="width: 100%;box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19);border-radius: 20px;display: inline-flex;background: #1DC7EA;margin-top: 10px;margin-bottom: 10px;">
<img src='{0}' style='height: 260px;width: 50%;border-top-left-radius: 20px;border-bottom-left-radius: 20px;display: inline-block;'>
		<div style='display: block;margin-left: 10px;color: white;'>
			<div style='color: yellow;font-size: 22px;'>
			{1}
			</div>
			<div style='font-size: 13px;'><span>导演:</span><span>
			{2}
			</span></div>
			<div style='font-size: 13px;'><span>主演:</span><span>
			{3}
			</span></div>
			<div style='font-size: 13px;'><span>类型:</span><span>
			{4}
			</span></div>
			<div style='font-size: 13px;'><span>
			{5}
			</span><span> | </span><span>
			{6}
			</span><span>分钟</span></div>
			<div style='font-size: 13px;'><span>上映日期:</span><span style='color: lawngreen'>
			{7}
			</span></div>
			<div style='font-size: 13px;'><span>想看人数:</span><span style='color: yellow'>
			{8}
			</span></div>
			<div style='font-size: 10px;display: inline-block; text-overflow:ellipsis;overflow: hidden;height: 99px;'><span>简介:</span><span>
			{9}
			</span></div>
		</div>
</div>
'''

for value in values:
    # column order: id, title, date, wish, description, duration, region,
    # director, actors, type, poster
    card = content.format(value[10], value[1], value[7], value[8], value[9],
                          value[6], value[5], value[2], value[3], value[4])
    head += card



foot='''
</body>
</html>
'''

head+=foot
print(head)

my_sender='[email protected]'    # 发件人邮箱账号
my_pass = ''              # 发件人邮箱密码(当时申请smtp给的口令)
my_user='[email protected]'      # 收件人邮箱账号,我这边发送给自己


def mail(content):
    ret = True
    try:
        msg=MIMEText(content, 'html', 'utf-8')
        msg['From'] = formataddr(["细肥尸丁", my_sender])  # sender display name and address
        msg['To'] = formataddr(["粪鸡", my_user])  # recipient display name and address
        msg['Subject'] = "即将上映"  # mail subject ("Coming soon")

        server = smtplib.SMTP_SSL("smtp.qq.com", 465)  # QQ Mail's SMTP server, SSL on port 465
        server.login(my_sender, my_pass)  # log in with the sender address and SMTP code
        server.sendmail(my_sender, [my_user, ], msg.as_string())  # sender, recipient list, message
        server.quit()  # close the connection
    except Exception:  # any failure above makes the function report False
        ret = False
    return ret


mail(head)

cursor.execute('delete from upcomingfilm')
conn.commit()
cursor.close()
conn.close()

On Linux, schedule the crawl and the push with crontab (note: use absolute paths); a sample:

4 9 * * 1,3,5 cd /root/python/recommend && /root/anaconda3/bin/scrapy crawl upcomingfilms
5 9 * * 1,3,5 cd /root/python/recommend/recommend && /root/anaconda3/bin/python send_mail.py
On Windows, use Task Scheduler instead.
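The Windows equivalent of the crontab lines above can be set up from the command line with schtasks; a sketch (the paths here are assumptions — adjust to your install):

```shell
schtasks /Create /TN "douban-crawl" /SC WEEKLY /D MON,WED,FRI /ST 09:04 ^
    /TR "cmd /c cd /d C:\python\recommend && scrapy crawl upcomingfilms"
schtasks /Create /TN "douban-mail" /SC WEEKLY /D MON,WED,FRI /ST 09:05 ^
    /TR "cmd /c cd /d C:\python\recommend\recommend && python send_mail.py"
```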

Even with the recommendations I may not have the money to see them in theaters — but no matter, a few days after release I can usually find them online, haha.

Anyway, back to honest Java work... I only picked up a bit of Python, Scrapy, and Linux because I was too lazy to check what's coming out, and ended up writing this thing to push it to myself.

The complete project can be downloaded from https://download.csdn.net/download/qq_37518622/10588266 — just change the database config.

Reposted from blog.csdn.net/qq_37518622/article/details/81479204