1.Intro

文件名：pageResolver.py
模块名：网页解析器
引用库：
re	lxml	datetime	sys
retry	random	urllib2
自定义引用文件：configManager
功能：解析网页源代码，获得相应的数据，以字典形式存储行记录，最后返回包含字典对象的列表。
2.Source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# Author  : YSW
# Time    : 2018/6/6 14:04
# File    : pageResolver.py
# Version : 1.1
# Describe: 网页解析器
# Update  :
        1.增加了中标网页的解析方法
'''

import re
from lxml import etree
import datetime
import sys
from retry import retry
import configManager
import random
import urllib2
# Force the interpreter's default string encoding to UTF-8 so that Chinese
# text is not garbled by implicit str/unicode conversions.
# NOTE(review): reload(sys) and sys.setdefaultencoding are Python 2 only.
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    # reload must come first; presumably it restores setdefaultencoding,
    # which is normally removed from sys at startup -- confirm.
    reload(sys)
    sys.setdefaultencoding(defaultencoding)

# Request headers used by Resolver.get_url. The User-Agent is picked at random
# from configManager.headers once, when this module is imported, and the same
# value is then reused for every request.
HEADERS = {
    "User-Agent": random.choice(configManager.headers)
}

class Resolver(object):
    def time_parse(self, currentTime):
        '''
        Convert a date string into a datetime object.
        :param currentTime: date as a string in 'YYYY-MM-DD' form
        :return: the corresponding datetime.datetime (time part is midnight)
        '''
        return datetime.datetime.strptime(currentTime, '%Y-%m-%d')

    #### 招投标数据 ####

    @retry(tries=3, delay=2)
    def resovler_ynsggzxxt(self, html, page_num):
        '''
        Parser for the tender listing of the Yunnan Public Resource Trading
        Center electronic service system.

        Every <tr> of the table with id "data_tab" that has <td> cells is
        turned into one record.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts, one per table row
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        # All rows of the listing table
        node_list = text.xpath("//div/table[@id='data_tab']/tbody/tr")

        # Removes every whitespace character from a field
        strParse = re.compile(r"\s")

        for node in node_list:
            cells = node.xpath("./td")
            # Rows without <td> cells (presumably the header row) carry no data
            if not cells:
                continue

            # Project number
            projectNumber = cells[1].text

            # Announcement title, whitespace removed
            title = strParse.sub("", node.xpath("./td/a")[0].text)

            # Publication date
            start_time = self.time_parse(cells[3].text)

            # Closing date
            end_time = self.time_parse(cells[4].text)

            # Status, whitespace removed. The cell text is None when the cell
            # holds only a child element, so fall back to "" instead of
            # letting re raise a TypeError.
            status = strParse.sub("", cells[5].text or "")

            # An empty status means the value lives in a nested <i> tag.
            # Truthiness is used because "is" compares identity and fails for
            # an empty unicode string.
            if not status:
                status = strParse.sub("", node.xpath("./td/i")[0].text or "")

            # Absolute link to the announcement
            href = "https://www.ynggzyxx.gov.cn" + str(node.xpath("./td/a/@href")[0])

            resolveResult.append({
                "项目编号": projectNumber,
                "公告标题": title,
                "发布时间": start_time,
                "截止时间": end_time,
                "状态": status,
                "链接": href,
                "推送": False
            })
        return resolveResult

    @retry(tries=3, delay=2)
    def resovler_ynsggzzw(self, html, page_num):
        '''
        Parser for the tender listing of the Yunnan Public Resource Trading
        Center website.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts, one per table row
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        # All rows of the listing table
        node_list = text.xpath("//table[@id='data_tab']/tbody/tr")

        # Removes every whitespace character from a field
        strParse = re.compile(r"\s")

        for node in node_list:
            # Evaluate the cell list once per row instead of once per field
            cells = node.xpath("./td")
            # Rows without <td> cells (presumably the header row) carry no data
            if not cells:
                continue

            # Project number
            projectNumber = cells[1].text

            # Absolute link to the announcement
            href = "https://www.ynggzyxx.gov.cn" + str(node.xpath("./td/a/@href")[0])

            # Publication date
            start_time = self.time_parse(cells[3].text)

            # Announcement title, whitespace removed
            title = strParse.sub("", node.xpath("./td/a")[0].text)

            resolveResult.append({
                "项目编号": projectNumber,
                "公告标题": title,
                "发布时间": start_time,
                "链接": href,
                "推送": False
            })
        return resolveResult

    @retry(tries=3, delay=2)
    def resovler_kmsgg(self, html, page_num):
        '''
        Parser for the listing of the Kunming Public Resource Trading Center
        website.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts; rows with a missing field are left out
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr")

        # Row 0 is skipped (presumably the header) and at most 15 rows are
        # read. Slicing, unlike indexing over range(1, 16), does not raise
        # IndexError when a page holds fewer rows.
        for node in node_list[1:16]:
            cells = node.xpath("./td")

            # Number
            num = cells[1].text

            # Project name, taken from a row attribute
            project_name = (node.xpath("./@field_bdmcggbt")[0]).encode('utf8')

            # Absolute link to the announcement
            href = "https://www.kmggzy.com/Jyweb/" + str(node.xpath("./td/a/@href")[0])

            # Start date (stays None when the cell is empty)
            start_time = None
            startTime = cells[3].text
            if startTime is not None:
                start_time = self.time_parse(startTime)

            # End date (stays None when the cell is empty)
            end_time = None
            endTime = cells[4].text
            if endTime is not None:
                end_time = self.time_parse(endTime)

            # Status (stays None when the cell is empty)
            status = None
            statusText = cells[5].text
            if statusText is not None:
                status = statusText.encode('utf8')

            # Only complete rows are stored
            if num and project_name and start_time and end_time and status is not None:
                resolveResult.append({
                    "编号": num,
                    "工程名称": project_name,
                    "发布时间": start_time,
                    "结束时间": end_time,
                    "状态": status,
                    "链接": href,
                    "推送": False
                })
        return resolveResult

    @retry(tries=3, delay=2)
    def resovler_kmsgg_gc(self, html, page_num):
        '''
        Parser for the listing of the Kunming Public Resource Trading Center
        website (same table layout as resovler_kmsgg).
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts; rows with a missing field are left out
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr")

        # Skip row 0 (presumably the header) and read at most 15 rows; a
        # slice tolerates pages that contain fewer rows, where fixed indices
        # would raise IndexError.
        for node in node_list[1:16]:
            cells = node.xpath("./td")

            # Number
            num = cells[1].text

            # Project name, taken from a row attribute
            project_name = (node.xpath("./@field_bdmcggbt")[0]).encode('utf8')

            # Absolute link to the announcement
            href = "https://www.kmggzy.com/Jyweb/" + str(node.xpath("./td/a/@href")[0])

            # Start date (stays None when the cell is empty)
            start_time = None
            startTime = cells[3].text
            if startTime is not None:
                start_time = self.time_parse(startTime)

            # End date (stays None when the cell is empty)
            end_time = None
            endTime = cells[4].text
            if endTime is not None:
                end_time = self.time_parse(endTime)

            # Status (stays None when the cell is empty)
            status = None
            statusText = cells[5].text
            if statusText is not None:
                status = statusText.encode('utf8')

            # Only complete rows are stored
            if num and project_name and start_time and end_time and status is not None:
                resolveResult.append({
                    "编号": num,
                    "工程名称": project_name,
                    "发布时间": start_time,
                    "结束时间": end_time,
                    "状态": status,
                    "链接": href,
                    "推送": False
                })
        return resolveResult

    @retry(tries=3, delay=2)
    def resovler_ynsggzxxt_zf(self, html, page_num):
        '''
        Parser for the government-procurement listing of the Yunnan Public
        Resource Trading Center electronic service system.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts, one per table row
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        # All rows of the listing table
        node_list = text.xpath("//div/table[@id='data_tab']/tbody/tr")

        # Removes every whitespace character from a field
        strParse = re.compile(r"\s")

        for node in node_list:
            cells = node.xpath("./td")
            # Rows without <td> cells (presumably the header row) carry no data
            if not cells:
                continue

            # Project number
            projectNumber = cells[1].text

            # Announcement title, whitespace removed
            title = strParse.sub("", node.xpath("./td/a")[0].text)

            # Publication date
            start_time = self.time_parse(cells[3].text)

            # Closing date
            end_time = self.time_parse(cells[4].text)

            # Status, whitespace removed; "or ''" guards against a cell whose
            # text is None because it only contains a child element.
            status = strParse.sub("", cells[5].text or "")

            # Empty status: read it from the nested <i> tag instead. Uses
            # truthiness because an identity test against "" is unreliable.
            if not status:
                status = strParse.sub("", node.xpath("./td/i")[0].text or "")

            # Absolute link to the announcement
            href = "https://www.ynggzyxx.gov.cn" + str(node.xpath("./td/a/@href")[0])

            resolveResult.append({
                "项目编号": projectNumber,
                "公告标题": title,
                "发布时间": start_time,
                "截止时间": end_time,
                "状态": status,
                "链接": href,
                "推送": False
            })

        return resolveResult

    @retry(tries=3, delay=2)
    def resovler_ynszfcgw(self, html, page_num):
        '''
        Parser for the listing of the Yunnan Government Procurement website.

        Rows are located through their data-row-id attribute (0 to 9).
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts; rows with a missing field are left out
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        records = []
        tree = etree.HTML(html)
        link_template = "http://www.yngp.com/newbulletin_zz.do?method=preinsertgomodify&operator_state=1&flag=view&bulletin_id={0}"

        for row_id in range(0, 10):
            for row in tree.xpath("//tr[@data-row-id='{0}']".format(row_id)):
                # The anchor text is split at the first full-width colon:
                # number before it, project name after it
                anchor_text = row.xpath('./td')[0].xpath('./a')[0].text
                colon_at = anchor_text.find('：')
                num = anchor_text[:colon_at]
                project_name = anchor_text[colon_at + 1:]

                # Administrative area
                area = row.xpath('./td')[2].text

                # Publication date, None when the cell is empty
                raw_date = row.xpath('./td')[3].text
                time_push = self.time_parse(raw_date) if raw_date is not None else None

                # The bulletin id on the anchor is what the detail URL needs
                cursor = row.xpath('./td')[0].xpath('./a/@data-bulletin_id')[0]
                href = link_template.format(cursor)

                # Incomplete rows are dropped
                if not (num and project_name and area and href and time_push is not None):
                    continue

                records.append({
                    "编号": num,
                    "工程名称": project_name,
                    "发布时间": time_push,
                    "区划": area,
                    "链接": href,
                    "推送": False
                })
        return records

    #### 中标数据 ####
    @retry(tries=3, delay=2)
    def get_url(self, url, proxy_dict):
        '''
        Download a page through a proxy and return the response body.
        :param url: address to fetch
        :param proxy_dict: dict with the keys 'ip', 'port' and 'protocol'
        :return: whatever response.read() returns for the request
        '''
        proxyIP = proxy_dict['ip']
        proxyPort = proxy_dict['port']
        proxyProtocol = proxy_dict['protocol']
        proxy_handler = urllib2.ProxyHandler({proxyProtocol: "{0}:{1}".format(proxyIP, proxyPort)})

        # NOTE: install_opener replaces the process-wide default opener, so
        # this proxy also applies to later urllib2.urlopen calls elsewhere.
        opener_proxy = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener_proxy)

        request = urllib2.Request(url=url, headers=HEADERS)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails
            response.close()

    @retry(tries=3, delay=2)  # 70%
    def resovler_ynsggzxxt_gc_zb(self, html, page_num, proxy_dict):
        '''
        Parser for award announcements (construction projects) of the Yunnan
        Public Resource Trading information site.

        For every listing row the detail page is downloaded with get_url and
        a series of layout-specific sub-parsers is tried in a fixed order
        until one of them yields a winning bidder.
        :param html: source of the listing page
        :param page_num: page number, used only in the progress message
        :param proxy_dict: proxy settings handed on to get_url
        :return: list of dicts, one per listing row
        '''
        def resolve_pp_0(html):
            '''
            Detail-page parser 0: table rows under div.con whose first cell
            is the winner label or contains the award-price label.
            :param html: detail page source
            :return: (winner, price); ('', 0.0) when no label matched,
                     (None, 0.0) when parsing raised
            '''
            try:
                people = ''
                price = 0.0
                text = etree.HTML(html)
                node_second_list = text.xpath("//div[@class='con']//tr")
                for node_second in node_second_list:
                    if "中标人：" == node_second.xpath("./td")[0].text:
                        people = node_second.xpath("./td")[1].xpath('./b//span')[0].text
                    if "中标价" in node_second.xpath("./td")[0].text:
                        totalCount = node_second.xpath("./td")[1].xpath('./b//span')[0].text
                        # Keeps digits only.
                        # NOTE(review): this also removes a decimal point -- confirm intended.
                        price = float(re.sub(r"\D", "", totalCount))
                return people, price
            except Exception:
                return None, 0.0

        def resolve_pp_1(html):
            '''
            Detail-page parser 1
            eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=7befec50-6cf1-49b1-a5ec-b3b1cf6d3ab2&isOther=false
            :param html: detail page source
            :return: (winner, price); (None, 0.0) when parsing raised
            '''
            try:
                people = ''
                price = 0.0
                text = etree.HTML(html)
                xpathPattern = "//div[@class='w1200s']//table"
                node_list = text.xpath(xpathPattern)[0]
                # Only the 8th child of the first table is inspected
                for index, node in enumerate(node_list):
                    if index == 7:
                        people = node.xpath('./td//tr')[1].xpath('./td')[1].text
                        price_tmp = node.xpath('./td//tr')[1].xpath('./td')[6].text
                        # NOTE(review): price is never taken from price_tmp, so
                        # this parser always reports 0.0 -- looks unfinished.
                        if price_tmp == 0 or price_tmp == '/':
                            price = float(0.0)
                return people, price
            except Exception:
                return None, 0.0

        def resolve_pp_2(html):
            '''
            Detail-page parser 2
            eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=2ab5a6f5-30e2-4599-846b-22597815e3dd&isOther=false
            :param html: detail page source
            :return: (winner, price); (None, 0.0) when parsing raised
            '''
            try:
                people = ''
                price = 0.0
                text = etree.HTML(html)
                xpathPattern = "//div[@class='w1200s']//div[@class='detail_contect']//p"
                node_list = text.xpath(xpathPattern)
                for node in node_list:
                    if "第一中标候选人" in node.text:
                        people_tmp = str(node.text).strip()
                        # Take what follows the colon; the +3 presumably skips
                        # the three UTF-8 bytes of the full-width colon -- confirm
                        people = people_tmp[people_tmp.find('：') + 3:]
                    elif "投标报价" in node.text:
                        price_tmp = node.xpath('./span')[0].text
                        price = float(price_tmp)
                return people, price
            except Exception:
                return None, 0.0

        def resolve_pp_3(html):
            '''
            Detail-page parser 3
            eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=e145f187-b9d9-4573-b4b0-f5c4c66ddbdb&isOther=false
            :param html: detail page source
            :return: (winner, price); (None, 0.0) when parsing raised
            '''
            try:
                people = ''
                price = 0.0
                text = etree.HTML(html)
                xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//tr"
                node_list = text.xpath(xpathPattern)
                for node in node_list:
                    # Winner: the span following the label span
                    tmp = node.xpath('./td//span')[0].text
                    if "第一中标候选人" == tmp:
                        people = node.xpath('./td//span')[1].text

                    # Price: in rows with more than three cells, when any cell
                    # carries the price label the value is read from the 4th cell
                    node_td = node.xpath('./td')
                    if len(node_td) > 3:
                        for no in node_td:
                            if len(no.xpath('./span')) > 0 and "中标价（万元）" == no.xpath('./span')[0].text:
                                price = float(node_td[3].xpath('./span')[0].text)
                return people, price
            except Exception:
                return None, 0.0

        def resolve_pp_4(html):
            '''
            Detail-page parser 4 (fixed position: 13th table row)
            eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=562df3b5-207a-4f2e-b3f7-3b29736ae191&isOther=false
            :param html: detail page source
            :return: (winner, price); (None, 0.0) when parsing raised
            '''
            try:
                text = etree.HTML(html)
                xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//tr"
                node_list = text.xpath(xpathPattern)
                node = node_list[12]

                people_td = node.xpath('./td')[1]
                people = people_td.xpath('./p/span')[0].text

                price_td = node.xpath('./td')[2]
                price_tmp = price_td.xpath('./p/span')[0].text
                price = float(price_tmp)

                return people, price
            except Exception:
                return None, 0.0

        def resolve_pp_5(html):
            '''
            Detail-page parser 5 (fixed position: 2nd row of the first table)
            eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=61a3019b-33cb-44ba-a193-20c5d7f38543&isOther=false
            :param html: detail page source
            :return: (winner, price); (None, 0.0) when parsing raised
            '''
            try:
                text = etree.HTML(html)
                xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//table"
                node_list = text.xpath(xpathPattern)
                tr_list = node_list[0].xpath('./tbody//tr')
                # Indexing an element selects its child elements
                td_list = tr_list[1]
                people_td = td_list[2]
                people = people_td.xpath('./p/b/span')[0].text

                price_td = td_list[4]
                price_tmp = price_td.xpath('./p/b/span')[0].text
                price = float(price_tmp)

                return people, price
            except Exception:
                return None, 0.0

        def resolve_pp_6(html):
            '''
            Detail-page parser 6 (table nested in a colspan=4 cell)
            eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=e8cc5564-4664-4d45-aabd-2690a3366e2b&isOther=false
            :param html: detail page source
            :return: (winner, price); (None, 0.0) when parsing raised
            '''
            try:
                text = etree.HTML(html)
                xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//table//td[@colspan='4']//tr"
                node_list = text.xpath(xpathPattern)

                people = node_list[1].xpath('./td')[1].text

                price_tmp = node_list[1].xpath('./td')[4].text
                price = float(price_tmp)

                return people, price
            except Exception:
                return None, 0.0

        def resolve_pp_7(html):
            '''
            Detail-page parser 7 (fixed position: 10th table row, no price)
            eg: https://www.ynggzyxx.gov.cn/jyxx/jsgcZbjggsDetail?guid=2a7c021d-db9d-4dc5-8294-39083501dd9f&isOther=false
            :param html: detail page source
            :return: (winner, 0.0); (None, 0.0) when parsing raised
            '''
            try:
                text = etree.HTML(html)
                xpathPattern = "//div[@class='w1200s']//div[@class='page_contect bai_bg']//table//tr"
                node_list = text.xpath(xpathPattern)
                people = node_list[9].xpath('./td')[1].xpath('./p/span')[0].text
                return people, 0.0
            except Exception:
                return None, 0.0

        # Order in which the detail-page layouts are tried
        sub_parsers = (
            resolve_pp_0, resolve_pp_2, resolve_pp_1, resolve_pp_3,
            resolve_pp_4, resolve_pp_5, resolve_pp_6, resolve_pp_7,
        )

        print("[+] 正在解析第{0}页信息".format(page_num))

        # Collected row records
        resolveResult = []

        # Parse the listing page into an element tree
        text = etree.HTML(html)

        xpathPattern = "//div/table[@id='data_tab']/tbody/tr"
        node_list = text.xpath(xpathPattern)

        for node in node_list:
            # Rows without <td> cells carry no data
            if len(node.xpath("./td")) == 0:
                continue

            project_name = node.xpath("./td//a")[0].text
            project_name_parse = project_name.replace('\n', '').replace(u'\t', '').replace(u' ', '')
            startTime = node.xpath("./td")[2].text
            start_time = self.time_parse(startTime)

            href = "https://www.ynggzyxx.gov.cn" + node.xpath('./td//a//@href')[0]

            html_second = self.get_url(href, proxy_dict)

            # Try each layout until one produces a winner. A sub-parser
            # signals "nothing found" with either '' or None, so both are
            # treated alike here; testing for only one of them would skip
            # parsers depending on how the previous one failed.
            people, price = None, 0.0
            for sub_parser in sub_parsers:
                people, price = sub_parser(html_second)
                if people:
                    break

            resolveResult.append({
                "公告名称": project_name_parse,
                "发布时间": start_time,
                "链接": href,
                "中标公司": people,
                "中标价格": price,
                "推送": False
            })
        return resolveResult

    @retry(tries=3, delay=2)  # Done
    def resovler_ynsggzxxt_zf_zb(self, html, page_num):
        '''
        Parser for award results (government procurement) of the Yunnan
        Public Resource Trading information site.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts, one per table row
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))

        records = []
        tree = etree.HTML(html)

        for row in tree.xpath("//div/table[@id='data_tab']/tbody/tr"):
            # Rows without <td> cells carry no data
            if len(row.xpath("./td")) == 0:
                continue

            # Announcement name with newlines, tabs and spaces removed
            raw_name = row.xpath("./td//a")[0].text
            clean_name = raw_name.replace('\n', '').replace(u'\t', '').replace(u' ', '')

            # Publication date
            published = self.time_parse(row.xpath("./td")[2].text)

            # Absolute link to the announcement
            link = "https://www.ynggzyxx.gov.cn" + row.xpath('./td//a//@href')[0]

            records.append({
                "公告名称": clean_name,
                "发布时间": published,
                "链接": link,
                "推送": False
            })
        return records


    @retry(tries=3, delay=2)  # Done
    def resovler_ynsggzzw_gc_zb(self, html, page_num, proxy_dict):
        '''
        Parser for award results (construction projects) of the Yunnan Public
        Resource Trading Center.

        For every listing row the detail page is downloaded with get_url to
        read the winning bidder and the award price.
        :param html: source of the listing page
        :param page_num: page number, used only in the progress message
        :param proxy_dict: proxy settings handed on to get_url
        :return: list of dicts, one per listing row
        '''
        def resolve_pp_1(html):
            '''
            Detail-page parser 1
            eg: https://www.ynggzy.com/jyxx/jsgcZbjggsDetail?guid=fbd514af-5716-4e30-bc1d-b42892986f85&isOther=false
            :param html: detail page source
            :return: (winner, price text); ('', '') when no label matched,
                     (None, '') when parsing raised
            '''
            try:
                people = ''
                price = ''
                text = etree.HTML(html)
                node_second_list = text.xpath("//div[@class='con']//tr")
                for node_second in node_second_list:
                    if "中标人：" == node_second.xpath("./td")[0].text:
                        people = node_second.xpath("./td")[1].xpath('./b//span')[0].text
                    if "中标价" in node_second.xpath("./td")[0].text:
                        # The price is kept as the raw cell text
                        price = node_second.xpath("./td")[1].xpath('./b//span')[0].text
                return people, price
            except Exception:
                # Narrower than a bare except, so KeyboardInterrupt and
                # SystemExit are no longer swallowed
                return None, ''

        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the listing page into an element tree
        text = etree.HTML(html)
        node_list = text.xpath("//div/table[@id='data_tab']/tbody/tr")

        # Removes every whitespace character from a field
        strParse = re.compile(r"\s")

        for node in node_list:
            cells = node.xpath("./td")
            # Rows without <td> cells carry no data
            if not cells:
                continue

            # Announcement title, whitespace removed
            title = strParse.sub("", cells[1].xpath("./a")[0].text)

            # Publication date
            start_time = self.time_parse(cells[2].text)

            # Absolute link to the detail page
            href = "https://www.ynggzy.com" + str(node.xpath("./td/a/@href")[0])
            html_second = self.get_url(href, proxy_dict)
            people, price = resolve_pp_1(html_second)

            resolveResult.append({
                "公告标题": title,
                "发布时间": start_time,
                "链接": href,
                "中标公司": people,
                "中标价格": price,
                "推送": False
            })
        return resolveResult

    @retry(tries=3, delay=2)  # Done
    def resovler_ynsggzzw_zf_zb(self, html, page_num):
        '''
        Parser for result notices (government procurement) of the Yunnan
        Public Resource Trading Center.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts, one per table row
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        records = []
        tree = etree.HTML(html)
        rows = tree.xpath("//div/table[@id='data_tab']/tbody/tr")

        # Pattern that removes every whitespace character
        whitespace = re.compile("\s")

        for row in rows:
            # Rows without <td> cells carry no data
            if len(row.xpath("./td")) == 0:
                continue

            # Announcement title, whitespace removed
            anchor = row.xpath("./td")[1].xpath("./a")[0]
            title = whitespace.sub("", anchor.text)

            # Publication date
            published = self.time_parse(row.xpath("./td")[2].text)

            # Absolute link to the announcement
            link = "https://www.ynggzy.com" + str(row.xpath("./td/a/@href")[0])

            records.append({
                "公告标题": title,
                "发布时间": published,
                "链接": link,
                "推送": False
            })
        return records


    @retry(tries=3, delay=2)  # Done
    def resolver_kmsgg_gc_zb(self, html, page_num):
        '''
        Parser for award-result notices (construction projects) of the
        Kunming Public Resource Trading platform.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts; rows with a missing field are left out
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr")

        # Row 0 is skipped (presumably the header) and at most 15 rows are
        # read. Slicing, unlike indexing over range(1, 16), does not raise
        # IndexError when a page holds fewer rows.
        for node in node_list[1:16]:
            cells = node.xpath("./td")

            # Number
            num = cells[1].text

            # Project name, taken from a row attribute
            project_name = (node.xpath("./@field_bdmcggbt")[0]).encode('utf8')

            # Absolute link to the announcement
            href = "https://www.kmggzy.com/Jyweb/" + str(node.xpath("./td/a/@href")[0])

            # Publication date (stays None when the cell is empty)
            start_time = None
            startTime = cells[3].text
            if startTime is not None:
                start_time = self.time_parse(startTime)

            # Only complete rows are stored
            if num and project_name and start_time is not None:
                resolveResult.append({
                    "编号": num,
                    "工程名称": project_name,
                    "发布时间": start_time,
                    "链接": href,
                    "推送": False
                })
        return resolveResult

    @retry(tries=3, delay=2)  # Done
    def resolver_kmsgg_zf_zb(self, html, page_num):
        '''
        Parser for result notices (government procurement) of the Kunming
        Public Resource Trading platform.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts; rows with a missing field are left out
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr")

        # Skip row 0 (presumably the header) and read at most 15 rows; a
        # slice tolerates pages with fewer rows, where fixed indices would
        # raise IndexError.
        for node in node_list[1:16]:
            cells = node.xpath("./td")

            # Number
            num = cells[1].text

            # Project name, taken from a row attribute
            project_name = (node.xpath("./@field_bdmcggbt")[0]).encode('utf8')

            # Absolute link to the announcement
            href = "https://www.kmggzy.com/Jyweb/" + str(node.xpath("./td/a/@href")[0])

            # Publication date (stays None when the cell is empty)
            start_time = None
            startTime = cells[3].text
            if startTime is not None:
                start_time = self.time_parse(startTime)

            # Only complete rows are stored
            if num and project_name and start_time is not None:
                resolveResult.append({
                    "编号": num,
                    "工程名称": project_name,
                    "发布时间": start_time,
                    "链接": href,
                    "推送": False
                })
        return resolveResult

    @retry(tries=3, delay=2)  # Done
    def resolver_kmsgg_gc_by(self, html, page_num):
        '''
        Parser for addendum notices (construction projects) of the Kunming
        Public Resource Trading platform.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts; rows with a missing field are left out
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr")

        # Row 0 is skipped (presumably the header) and at most 15 rows are
        # read. The slice keeps a short page from raising IndexError, which
        # indexing over range(1, 16) would do.
        for node in node_list[1:16]:
            cells = node.xpath("./td")

            # Number
            num = cells[1].text

            # Project name, taken from a row attribute
            project_name = (node.xpath("./@field_bdmcggbt")[0]).encode('utf8')

            # Absolute link to the announcement
            href = "https://www.kmggzy.com/Jyweb/" + str(node.xpath("./td/a/@href")[0])

            # Publication date (stays None when the cell is empty)
            start_time = None
            startTime = cells[3].text
            if startTime is not None:
                start_time = self.time_parse(startTime)

            # Only complete rows are stored
            if num and project_name and start_time is not None:
                resolveResult.append({
                    "编号": num,
                    "工程名称": project_name,
                    "发布时间": start_time,
                    "链接": href,
                    "推送": False
                })
        return resolveResult

    @retry(tries=3, delay=2)  # Done
    def resolver_kmsgg_zf_by(self, html, page_num):
        '''
        Parser for addendum notices (government procurement) of the Kunming
        Public Resource Trading platform.
        :param html: page source
        :param page_num: page number, used only in the progress message
        :return: list of dicts; rows with a missing field are left out
        '''
        print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []

        # Parse the source into an element tree
        text = etree.HTML(html)

        node_list = text.xpath("//div[@class='zb_from']/table/tbody/tr")

        # Skip row 0 (presumably the header) and read at most 15 rows.
        # Slicing never raises IndexError on a page with fewer rows, unlike
        # indexing over range(1, 16).
        for node in node_list[1:16]:
            cells = node.xpath("./td")

            # Number
            num = cells[1].text

            # Project name, taken from a row attribute
            project_name = (node.xpath("./@field_bdmcggbt")[0]).encode('utf8')

            # Absolute link to the announcement
            href = "https://www.kmggzy.com/Jyweb/" + str(node.xpath("./td/a/@href")[0])

            # Publication date (stays None when the cell is empty)
            start_time = None
            startTime = cells[3].text
            if startTime is not None:
                start_time = self.time_parse(startTime)

            # Only complete rows are stored
            if num and project_name and start_time is not None:
                resolveResult.append({
                    "编号": num,
                    "工程名称": project_name,
                    "发布时间": start_time,
                    "链接": href,
                    "推送": False
                })
        return resolveResult


    @retry(tries=3, delay=2)  # Done
    def resolver_ynszfcgw_cg(self, html, page_num, driver_second):
        '''
        Parser for procurement results of the Yunnan Government Procurement
        website.

        Rows are located through their data-row-id attribute (0 to 9); for
        each row the detail page is opened to read the winner and the price.
        :param html: source of the listing page
        :param page_num: page number; the progress message is printed only
                         when it is not 0
        :param driver_second: browser driver used to open detail pages
                              (presumably a Selenium WebDriver -- confirm)
        :return: list of dicts, one per listing row
        '''
        def resolver_pp_1(url_second):
            '''
            Detail-page parser: reads the "value" attribute of the elements
            with id winSupply (winner) and winMoney (price).
            :param url_second: address of the detail page
            :return: (winner, price text with the unit appended);
                     (None, '') when the lookup raised
            '''
            try:
                driver_second.get(url_second)
                people = driver_second.find_element_by_id('winSupply').get_attribute('value')
                price_tmp = driver_second.find_element_by_id('winMoney').get_attribute('value')
                price = price_tmp + "万元"
                return people, price
            except Exception:
                # Narrower than a bare except, so KeyboardInterrupt and
                # SystemExit are no longer swallowed
                return None, ''

        if page_num != 0:
            print("[+] 正在解析第{0}页信息".format(page_num))
        # Collected row records
        resolveResult = []
        text = etree.HTML(html)
        for i in range(0, 10):
            node_list = text.xpath("//tr[@data-row-id='{0}']".format(i))

            for node in node_list:
                # Anchor text is split at the first full-width colon:
                # number before it, project name after it
                text_total = node.xpath('./td')[0].xpath('./a')[0].text
                separator = text_total.find('：')

                # Number
                num = text_total[:separator]

                # Project name
                project_name = text_total[separator + 1:]

                # Administrative area
                area = node.xpath('./td')[2].text

                # Publication date (stays None when the cell is empty)
                time_push = None
                timePush = node.xpath('./td')[3].text
                if timePush is not None:
                    time_push = self.time_parse(timePush)

                # The bulletin id on the anchor is what the detail URL needs
                cursor = node.xpath('./td')[0].xpath('./a/@data-bulletin_id')[0]

                href = "http://www.yngp.com/newbulletin_zz.do?method=preinsertgomodify&operator_state=1&flag=view&bulletin_id={0}".format(
                    cursor)

                people, price = resolver_pp_1(href)

                # Unlike the tender parsers, rows are stored even when a
                # field is missing
                resolveResult.append({
                    "编号": num,
                    "工程名称": project_name,
                    "区划": area,
                    "发布时间": time_push,
                    "中标公司": people,
                    "中标价格": price,
                    "链接": href,
                    "推送": False
                })
        return resolveResult
[Python] [爬虫] 6.批量政府网站的招投标、中标信息爬取和推送的自动化爬虫——网页解析器

1.Intro

2.Source

猜你喜欢