[Python] [爬虫] 8.批量政府网站的招投标、中标信息爬取和推送的自动化爬虫——数据推送模块

目录

1.Intro

2.Source

(1)dataPusher

(2)dataPusher_HTML


1.Intro

文件名:dataPusher.py、dataPusher_HTML.py

模块名:数据推送模块

引用库:

smtplib email pyExcelerator
sys time datetime

自定义引用文件:dataDisposer、Console_Color、configManager

功能:从数据库中获取数据生成HTML文件,更新推送标识,格式化邮件地址,发送邮件。

2.Source

(1) dataPusher

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# Author  : YSW
# Time    : 2018/6/6 14:05
# File    : dataPusher.py
# Version : 1.0
# Describe: 数据推送模块(旧版本推送方式)
# Update  :
'''

'''
    smtplib模块主要负责发送邮件:
        是一个发送邮件的动作,连接邮箱服务器,登录邮箱,发送邮件(有发件人,收信人,邮件内容)。
    
    email模块主要负责构造邮件:
        指的是邮箱页面显示的一些构造,如发件人,收件人,主题,正文,附件等。
    
    xlwt模块:
        操作excel
    
    pyExcelerator模块:
        操作excel,写入excel较为方便
    
'''
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
from email import encoders
from email.mime.base import MIMEBase
from email.utils import parseaddr, formataddr
import time
from pyExcelerator import *

class DataWrite(object):
    """Write crawled records into Excel (.xls) files.

    Two workbook objects are created once per instance: one for the full
    data set and one for the keyword-filtered ("parse") data set.  Sheets are
    added to the same workbook object on every call, so each save writes all
    sheets added to that workbook so far.
    """

    # Text placed between the timestamp and the ".xls" extension for each
    # supported ``logic_file_type`` value.
    _FILE_SUFFIXES = {
        0: "",
        1: "[keyword]",
        2: "_ZB",
        3: "_ZB[keyword]",
    }

    def __init__(self):
        """Create the two workbooks that sheets are later added to."""
        print("[*] 正在初始化数据写入模块")
        self.excel_Workbook = Workbook()
        self.excel_Workbook_parse = Workbook()

    def excel_name(self, logic_file_type):
        '''
        Build the output file path from the current local time.

        The file name is ``YYYYMMDD_HHMMSS`` (e.g. ``20180619_161819``)
        followed by a suffix selected by ``logic_file_type`` and ``.xls``,
        placed under ``.\\history_file``.

        :param logic_file_type: 0 plain, 1 keyword, 2 "_ZB", 3 "_ZB" keyword
        :return: the file path, or an empty string for any other value
        '''
        print("[+] 正在创建文件名称")
        current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
        suffix = self._FILE_SUFFIXES.get(logic_file_type)
        if suffix is None:
            file_name = ""
        else:
            file_name = r".\history_file\{0}{1}.xls".format(current_time, suffix)

        print("[+] 创建成功")
        return file_name

    def excel_header(self, row, excel_sheet, excel_head_data, excel_sheet_name):
        '''
        Write the column titles into one row of a sheet.

        :param row: index of the row that receives the titles
        :param excel_sheet: sheet object exposing ``write(row, col, value)``
        :param excel_head_data: list of column titles
        :param excel_sheet_name: sheet name (used for logging only)
        :return: True on success, False if any write raised
        '''
        print("[*] 正在写入标题,表名:{0}".format(excel_sheet_name))
        try:
            for column_index, title in enumerate(excel_head_data):
                excel_sheet.write(row, column_index, title)
            print("[+] 写入标题成功")
            return True
        except Exception as e:
            # str(e) instead of e.message: the latter is empty for many
            # built-in exceptions and does not exist on Python 3.
            print("[-] 写入标题失败")
            print("ERROR: " + str(e))
            return False

    def _write_sheet(self, workbook, excel_sheet_name, excel_head_data,
                     excel_data, logic_file_type):
        '''
        Add one sheet to ``workbook``, fill it and save the workbook.

        Shared implementation of ``excel_write`` and ``excel_write_parse``.

        :return: the generated file path; it is returned even when writing
                 failed, in which case the file may not exist
        '''
        excel_name = self.excel_name(logic_file_type)
        try:
            print("[*] 正在写入文件")
            excel_sheet = workbook.add_sheet(excel_sheet_name)

            if not self.excel_header(0, excel_sheet, excel_head_data, excel_sheet_name):
                # Nothing was saved, so do not report success.
                print("[-] 写入文件失败")
                return excel_name

            # Row 0 holds the titles; data rows start at 1.  Each record is
            # indexed by column title so cells line up with the header.
            for row_index, data in enumerate(excel_data, 1):
                for column_index, item in enumerate(excel_head_data):
                    excel_sheet.write(row_index, column_index, data[item])
            workbook.save(excel_name)
            print("[+] 写入文件成功")
        except Exception as e:
            print("[-] 写入文件失败")
            print("ERROR: " + str(e))
        return excel_name

    def excel_write(self, excel_sheet_name, excel_head_data, excel_data, logic_file_type):
        '''
        Write records into a new sheet of the main workbook and save it.

        :param excel_sheet_name: name of the sheet to create
        :param excel_head_data: list of column titles (also record keys)
        :param excel_data: iterable of records, each indexable by title
        :param logic_file_type: file-name variant, see ``excel_name``
        :return: path of the generated Excel file
        '''
        return self._write_sheet(self.excel_Workbook, excel_sheet_name,
                                 excel_head_data, excel_data, logic_file_type)

    def excel_write_parse(self, excel_sheet_name, excel_head_data, excel_data, logic_file_type):
        '''
        Write filtered records into a new sheet of the "parse" workbook.

        :param excel_sheet_name: name of the sheet to create
        :param excel_head_data: list of column titles (also record keys)
        :param excel_data: iterable of records, each indexable by title
        :param logic_file_type: file-name variant, see ``excel_name``
        :return: path of the generated Excel file
        '''
        return self._write_sheet(self.excel_Workbook_parse, excel_sheet_name,
                                 excel_head_data, excel_data, logic_file_type)

class DataSend(object):
    """Build the notification e-mail and send it through an SSL SMTP server."""

    def __init__(self):
        print("[*] 正在初始化数据推送模块")

    def format_address(self, address):
        '''
        Format a "Name <addr>" string for use in a mail header.

        The display name is encoded as a UTF-8 header so that non-ASCII
        names survive transport.

        :param address: mail address, optionally with a display name
        :return: the formatted address string
        '''
        print("[+] 正在格式化邮件地址")
        name, addr = parseaddr(address)
        print("[+] 格式化完成")
        return formataddr((Header(name, 'utf-8').encode(), addr))

    @staticmethod
    def _attachment_name(file_path):
        '''Return the last component of a path split on "\\" or "/".'''
        path = str(file_path)
        return path[max(path.rfind('\\'), path.rfind('/')) + 1:]

    def send_mail(self, body, attachment):
        '''
        Send the mail with an optional list of file attachments.

        :param body: plain-text mail body
        :param attachment: iterable of file paths to attach; may be empty or None
        :return: True when the mail was sent, False when authorisation or
                 sending failed
        :raises: connection and login errors raised by smtplib propagate to
                 the caller, as do errors opening an attachment file
        '''
        print("[+] 开始发送邮件...")
        # SMTP server to send through
        smtp_server = 'smtp.qq.com'
        # sender account / SMTP service password
        from_mail = '发送方邮箱地址'
        mail_pass = '邮箱SMTP服务密码'
        # recipient
        to_mail = '接收方邮箱地址'

        # The multipart container represents the mail itself.
        msg = MIMEMultipart()

        # Header() encodes the non-ASCII text.  format_address already
        # returns a header-safe string, so no extra encode() is applied.
        msg['From'] = self.format_address('爬虫机器人 <%s>' % from_mail)
        msg['To'] = to_mail
        msg['Subject'] = Header('今日份的招投标信息', 'utf-8').encode()

        # 'plain' marks the body as plain text
        msg.attach(MIMEText(body, 'plain', 'utf-8'))

        for file_path in attachment or []:
            display_name = self._attachment_name(file_path)
            with open(file_path, 'rb') as attached_file:
                payload = attached_file.read()

            # The first two MIMEBase arguments are the MIME maintype and
            # subtype, not the file name; a generic binary type is used.
            mime = MIMEBase('application', 'octet-stream', filename=display_name)
            # The filename parameter is the name shown by the mail client.
            mime.add_header('Content-Disposition', 'attachment',
                            filename=display_name)
            mime.set_payload(payload)
            encoders.encode_base64(mime)
            msg.attach(mime)

        print("[+] 正在连接 SMTP 服务器")
        email = smtplib.SMTP_SSL(smtp_server, 465)
        print("[+] 连接成功")
        try:
            print("[+] 正在授权 SMTP 服务")
            login_code = email.login(from_mail, mail_pass)
            # 235 is the reply code checked for a successful login; compare
            # by value, not identity.
            if login_code[0] != 235:
                print("[-] 授权失败")
                return False
            print("[+] 授权成功")

            try:
                # as_string() serialises the whole message
                print("[+] 正在发送邮件")
                email.sendmail(from_mail, to_mail, msg.as_string())
            except Exception as e:
                print("[-] 发送失败")
                print("ERROR: " + str(e))
                return False
            print("[+] 发送成功")
            return True
        finally:
            # Always release the connection, including on failure paths.
            try:
                email.quit()
            except smtplib.SMTPException:
                email.close()

(2) dataPusher_HTML

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# Author  : YSW
# Time    : 2018/8/14 14:05
# File    : dataPusher_HTML.py
# Version : 1.0
# Describe: 数据推送模块(HTML版)
# Update  :
'''

import sys
import time
from Lib import Console_Color
import configManager
import dataDisposer
import datetime
# Python 2 only: reload sys to get setdefaultencoding back, then make
# implicit str/unicode conversions use UTF-8 for the whole process.
reload(sys)
sys.setdefaultencoding('utf-8')

# Keyword cache: html_write_keywords appends the raw content of the keyword
# file here and splits the first entry into individual keywords.
KEY_WORD = []
# Indexed by table name to get the title field used for keyword matching.
TABLE_TITLE = configManager.table_title
# Indexed by table name in delete_data; rows are removed through it.
TENDER = dataDisposer.tenderDB

# Database handle: indexed by table name in get_data for find/update.
TENDER_TABLE = dataDisposer.DataOperate.dataOperate()

# Time: DATE is read once at import, and TODAY_TIME is midnight of that day.
# NOTE(review): both are fixed at import time, so a process that keeps
# running past midnight would still query from the old day — confirm the
# module is loaded fresh for each run.
DATE = dataDisposer.current_time()
TODAY_TIME = datetime.datetime(DATE.year, DATE.month, DATE.day, 0, 0, 0)


class HTML_Content(object):
    """Render today's records from the database into one HTML report file."""

    # Fields that may hold a record's headline, in lookup order.
    _TITLE_FIELDS = (u"工程名称", u"公告标题", u"公告名称")

    # Text placed between the timestamp and ".html" for each supported
    # ``logic_file_type`` value.
    _FILE_SUFFIXES = {
        0: "",
        1: "[keyword]",
        2: "_ZB",
        3: "_ZB[keyword]",
    }

    def __init__(self):
        Console_Color.print_color("[*] 正在初始化HTML数据写入模块")

    def get_data(self, table_name):
        '''
        Fetch today's records of one table and flag unpushed rows as pushed.

        :param table_name: name of the table
        :return: list of records published at or after TODAY_TIME
        '''
        tenderTable = TENDER_TABLE[table_name]
        # Today's records; the "pushed" flag is deliberately not part of
        # the filter, so already-pushed rows of today are returned as well.
        list_data = list(tenderTable.find(
            {
                '发布时间': {"$gte": TODAY_TIME},
            })
        )
        # upsert must stay off: with upsert enabled, a run in which no row
        # matches would insert a new document holding only the flag.
        tenderTable.update(
            {'推送': False},
            {'$set': {'推送': True}},
            multi=True,
            upsert=False
        )
        return list_data

    def delete_data(self, table_name):
        '''
        Remove rows whose link field is None.

        :param table_name: name of the table
        '''
        sheet = TENDER[table_name]
        sheet.remove({"链接": None})

    def current_time(self):
        '''Return the current local time as ``YYYY-MM-DD HH:MM:SS``.'''
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    def html_name(self, logic_file_type):
        '''
        Build the output file path from the current local time.

        The file name is ``YYYYMMDD_HHMMSS`` (e.g. ``20180619_161819``)
        followed by a suffix selected by ``logic_file_type`` and ``.html``,
        placed under ``.\\history_file``.

        :param logic_file_type: 0 plain, 1 keyword, 2 "_ZB", 3 "_ZB" keyword
        :return: the file path, or an empty string for any other value
        '''
        Console_Color.print_color("[+] 正在创建文件名称")
        current_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
        suffix = self._FILE_SUFFIXES.get(logic_file_type)
        if suffix is None:
            file_name = ""
        else:
            file_name = r".\history_file\{0}{1}.html".format(current_time, suffix)
        Console_Color.print_color("[+] 创建成功")
        return file_name

    @staticmethod
    def _escape_html(text):
        '''Return ``text`` formatted as a string with & < > " escaped.'''
        text = "{0}".format(text)
        # "&" must be replaced first so the entities added below survive.
        for raw, entity in (("&", "&amp;"), ("<", "&lt;"),
                            (">", "&gt;"), ('"', "&quot;")):
            text = text.replace(raw, entity)
        return text

    @staticmethod
    def _parse_keywords(raw_text):
        '''Split keyword-file content into stripped, non-empty keywords.'''
        return [line.strip() for line in raw_text.split('\n') if line.strip()]

    @staticmethod
    def _filter_by_keywords(records, title_field, keywords):
        '''Return the records whose ``title_field`` contains any keyword.'''
        return [record for record in records
                if any(key in record[title_field] for key in keywords)]

    def _load_keywords(self):
        '''Read the keyword file and return its keywords as a list.'''
        with open(r".\keyword_file\keyword.txt", 'r') as f:
            raw_text = f.read()
        # The raw content is still recorded in the module-level cache.
        if raw_text not in KEY_WORD:
            KEY_WORD.append(raw_text)
        return self._parse_keywords(raw_text)

    def __html_1(self, title, name):
        '''
        First part of the page: document head, heading and push time.

        :param title: page title, e.g. "招投标信息"
        :param name: heading shown on the page, e.g. "今日份的招投标文件"
        :return: the opening HTML, up to and including the first rule
        '''
        desc = "推送时间:{0}".format(self.current_time())
        html1 = """
        <html>
        <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
        <title>{0}</title>
        </head>
        <body bgcolor="white">
        <Center><H2>{1}</h2></Center>
        <p align="center">{2}</p>
        <Hr width="100%">
        <BR>
        """.format(title, name, desc)
        return html1

    def __html_content_header(self, current_website_name):
        '''
        Heading that separates the records of one website from the next.

        :param current_website_name: name of the website
        :return: HTML fragment with a rule and the website name
        '''
        Console_Color.print_color("[+] 创建网站标题头")
        html_header = """
        <hr width="100%" style="margin-top:-5px;border:3px solid blue;"/>
        <h3>{0}</h3>
        """.format(current_website_name)
        return html_header

    def __html_a(self, url, time_parse, name, dict_data):
        '''
        Render one record: a linked headline plus one line per extra field.

        All values come from crawled pages, so they are HTML-escaped before
        being placed into the markup.

        :param url: link to the record's page
        :param time_parse: publication time
        :param name: headline text
        :param dict_data: remaining fields of the record
        :return: HTML fragment for the record
        '''
        Console_Color.print_color("[+] 写入主要内容: {0}".format(name))
        html_a = """
        <Hr width="100%">
        ├─<a>[{1}] #### </a><a href="{0}" target="_blank">{2}</a><br>
        """.format(self._escape_html(url), self._escape_html(time_parse),
                   self._escape_html(name))
        detail_rows = []
        for key, value in dict_data.items():
            detail_rows.append("""
            ├───────<a>{0}</a><br>
            """.format(self._escape_html("{0}: {1}".format(key, value))))

        return html_a + "".join(detail_rows) + "<Hr width='100%'>"

    # Fixed
    def __html2(self):
        '''
        Second part of the page: the closing tags.

        :return: the closing HTML
        '''
        html2 = """
        </body>
        </html>
        """
        return html2

    def html_content_func(self, list_data, current_website_name):
        '''
        Render all records of one website.

        :param list_data: list of records
        :param current_website_name: name of the website
        :return: HTML fragment: website heading followed by every record
        :raises KeyError: if a record lacks the link, the publication time,
                          or every one of the headline fields
        '''
        print("[*] 正在写入网页数据")
        parts = [self.__html_content_header(current_website_name)]
        for data in list_data:
            # Work on a copy so the caller's records are left untouched.
            fields = data.copy()
            url = str(fields.pop(u"链接"))

            for title_field in self._TITLE_FIELDS:
                if title_field in fields:
                    project_name = str(fields.pop(title_field))
                    break
            else:
                raise KeyError(self._TITLE_FIELDS[-1])

            time_parse = str(fields.pop(u"发布时间"))
            # Bookkeeping fields are not shown; tolerate their absence.
            fields.pop(u"_id", None)
            fields.pop(u"推送", None)
            parts.append(self.__html_a(url, time_parse, project_name, fields) + '\n')
        Console_Color.print_color("[+] 写入完成")
        return "".join(parts)

    def html_engine(self, title, name, html_content):
        '''
        Assemble the complete page.

        :param title: page title, e.g. "招投标信息"
        :param name: heading shown on the page, e.g. "今日份的招投标文件"
        :param html_content: already rendered content of all websites
        :return: the complete HTML document
        '''
        Console_Color.print_color("[*] 正在生成HTML页面")
        html = "\n".join([self.__html_1(title, name), html_content, self.__html2()])
        Console_Color.print_color("[+] 生成成功")
        return html

    def _write_report(self, title, name, dict_html_data_name, logic_file_type,
                      select=None):
        '''
        Shared implementation of ``html_write`` and ``html_write_keywords``.

        :param select: optional ``select(table_name, records)`` callable
                       returning the records to keep for that table
        :return: path of the written file, or '' when there was no content
        '''
        html_file_name = self.html_name(logic_file_type)
        sections = []
        for table_name, current_website_name in dict_html_data_name.items():
            self.delete_data(table_name)
            list_data = self.get_data(table_name)
            if select is not None and list_data:
                list_data = select(table_name, list_data)
            if not list_data:
                continue
            sections.append(self.html_content_func(list_data, current_website_name))
        if not sections:
            return ''
        html = self.html_engine(title, name, "".join(sections))
        with open(html_file_name, "w") as f:
            f.write(html)
        return html_file_name

    def html_write(self, title, name, dict_html_data_name, logic_file_type):
        '''
        Write the HTML report for all of today's records.

        :param title: page title
        :param name: heading shown on the page
        :param dict_html_data_name: dict mapping table name to website name
        :param logic_file_type: file-name variant, see ``html_name``
        :return: path of the HTML file, or '' when there was nothing to write
        '''
        return self._write_report(title, name, dict_html_data_name, logic_file_type)

    def html_write_keywords(self, title, name, dict_html_data_name, logic_file_type):
        '''
        Write the HTML report for today's records that match a keyword.

        Keywords are read once from the keyword file, one per line; blank
        lines are ignored, because an empty keyword is a substring of every
        title and would disable the filter.  With no keywords at all,
        nothing matches and '' is returned.

        :param title: page title
        :param name: heading shown on the page
        :param dict_html_data_name: dict mapping table name to website name
        :param logic_file_type: file-name variant, see ``html_name``
        :return: path of the HTML file, or '' when there was nothing to write
        '''
        # The file is only needed when there is at least one table.
        keywords = self._load_keywords() if dict_html_data_name else []

        def select(table_name, records):
            # Match against the title field configured for this table.
            return self._filter_by_keywords(records, TABLE_TITLE[table_name], keywords)

        return self._write_report(title, name, dict_html_data_name,
                                  logic_file_type, select)

猜你喜欢

转载自blog.csdn.net/weixin_42015762/article/details/83862285