[Python] [爬虫] 2.批量政府网站的招投标、中标信息爬取和推送的自动化爬虫——验证模块

目录

1.Intro

2.Source


1.Intro

文件名:authentication.py

模块名:验证模块

引用库:

urllib2 requests pymongo socket
gc retry spiderData(自定义库)  

自定义引用文件:spiderData,其中包含一个 HTTP 状态码字典:键为状态码(字符串),值为该状态码对应的描述信息。由于当时没有找到现成可用的状态码描述映射,所以自己写了一个字典来保存状态码与状态信息的键值对,涵盖了常见的网页错误状态码及其错误信息:

# Mapping of HTTP status code (as a string) to its reason phrase.
# Keys are strings because callers look codes up via str(code).
# Covers redirect (3xx), client-error (4xx) and server-error (5xx) codes,
# plus the non-standard 600 entry kept from the original table.
httpStatusCode = {
    # 3xx: redirection
    "300": "Multiple Choices",
    "301": "Moved Permanently",
    "302": "Moved Temporarily",  # HTTP/1.0 phrase; later RFCs call it "Found"
    "303": "See Other",
    "304": "Not Modified",
    "305": "Use Proxy",
    "306": "Switch Proxy",
    "307": "Temporary Redirect",
    "308": "Permanent Redirect",
    # 4xx: client errors
    "400": "Bad Request",
    "401": "Unauthorized",
    "402": "Payment Required",
    "403": "Forbidden",
    "404": "Not Found",
    "405": "Method Not Allowed",
    "406": "Not Acceptable",
    "407": "Proxy Authentication Required",
    "408": "Request Timeout",
    "409": "Conflict",
    "410": "Gone",
    "411": "Length Required",
    "412": "Precondition Failed",
    "413": "Request Entity Too Large",
    "414": "Request-URI Too Long",
    "415": "Unsupported Media Type",
    "416": "Requested Range Not Satisfiable",
    "417": "Expectation Failed",
    "421": "Too many connections",
    "422": "Unprocessable Entity",
    "423": "Locked",
    "424": "Failed Dependency",
    "425": "Unordered Collection",
    "426": "Upgrade Required",
    "428": "Precondition Required",
    "429": "Too Many Requests",
    "431": "Request Header Fields Too Large",
    "449": "Retry With",
    "451": "Unavailable For Legal Reasons",
    # 5xx: server errors
    "500": "Internal Server Error",
    "501": "Not Implemented",
    "502": "Bad Gateway",
    "503": "Service Unavailable",
    "504": "Gateway Timeout",
    "505": "HTTP Version Not Supported",
    "506": "Variant Also Negotiates",
    "507": "Insufficient Storage",
    "508": "Loop Detected",
    "509": "Bandwidth Limit Exceeded",
    "510": "Not Extended",
    "511": "Network Authentication Required",
    # Non-standard entry retained from the original table
    "600": "Unparseable Response Headers",
}

功能:用于验证MongoDB数据库连接状态、网页连通性(HTTP状态码)、代理IP可用性。


2.Source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# Author  : YSW
# Time    : 2018/6/6 14:01
# File    : authentication.py
# Version : 1.1
# Describe: 验证模块
# Update  :
        1.新增了retry库,可多次尝试网站连通性,直到连接超时。
'''

import urllib2
import requests
import socket
import spiderData
import pymongo
import gc
from retry import retry

class Authentication(object):
    """Pre-flight checks for the crawler.

    Verifies MongoDB connectivity, HTTP reachability of a URL, and the
    usability of a proxy. Every check prints its progress to stdout.
    """

    def __init__(self, headers):
        """Store the HTTP request headers used by the HTTP and proxy checks.

        :param headers: dict of HTTP headers sent with each verification request
        """
        print("[*] 初始化验证模块")
        self.headers = headers

    def dataBaseVerify(self, dbParams):
        """Verify that the MongoDB server is reachable.

        :param dbParams: connection parameters; ``dbParams["userName"]`` is
            passed to ``MongoClient`` as the host and ``dbParams["port"]``
            as the port
        :return: True if the server answers a ping, otherwise False
        """
        print("[+] 正在验证 MongoDB 数据库连接状态")
        client = None
        try:
            userName = dbParams["userName"]
            port = dbParams["port"]
            client = pymongo.MongoClient(userName, port)
            # MongoClient connects lazily in PyMongo 3+, so constructing it
            # proves nothing; force a round trip to the server.
            client.admin.command("ping")
            print("[+] 数据库验证通过")
            return True
        except Exception as e:
            print("[+] 数据库验证失败")
            # str(e) rather than the deprecated (and often empty) e.message
            print("ERROR: " + str(e))
            return False
        finally:
            # Release the connection opened only for this check.
            if client is not None:
                client.close()

    @retry(tries=5, delay=2)
    def httpCodeVerify(self, url):
        """Verify that ``url`` responds without an HTTP error status.

        An ``HTTPError`` is caught here and reported as False, so it is not
        retried. Any other exception propagates to the ``retry`` decorator
        (tries=5, delay=2).

        :param url: URL to request using ``self.headers``
        :return: True if the request succeeds, False on an HTTP error status
        """
        print("[+] 正在验证 HTTP 状态码:{0}".format(url))
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            # Only reachability matters; close the response right away.
            response.close()
            print("[+] HTTP 验证通过:{0}".format(url))
            return True
        except urllib2.HTTPError as e:
            print("[+] HTTP 验证失败:{0}".format(url))
            # .get() so a code missing from the table cannot raise KeyError
            reason = spiderData.httpStatusCode.get(str(e.code), "Unknown Status Code")
            print("ERROR: " + str(e.code) + " " + reason)
            return False

    def proxyVerify(self, url, protocol, ip, port):
        """Check whether a proxy can be used to fetch ``url``.

        :param url: URL requested through the proxy
        :param protocol: proxy protocol; used both as the scheme of the proxy
            URL and as the key of the ``proxies`` mapping
        :param ip: proxy IP address
        :param port: proxy port
        :return: True if the response status is 2xx, otherwise False
        """
        check_url = url
        proxy_url = "{0}://{1}:{2}".format(protocol, ip, port)
        print("[+] 正在验证代理 IP 可用性")
        socket_timeout = 30
        # Kept for backward compatibility: sets the process-wide default
        # socket timeout.
        socket.setdefaulttimeout(socket_timeout)
        try:
            # NOTE(review): only ``protocol`` is mapped, so if the scheme of
            # ``url`` differs the request may bypass the proxy. Confirm that
            # callers pass a matching URL.
            proxy_dict = {
                protocol: proxy_url
            }
            # requests ignores the default socket timeout when no ``timeout``
            # is given, so pass it explicitly or a dead proxy hangs forever.
            response = requests.get(
                check_url,
                proxies=proxy_dict,
                headers=self.headers,
                timeout=socket_timeout,
            )
            code = response.status_code
            print(str(code))
            if 200 <= code < 300:
                print("[+] 可用的代理IP和端口: {0}:{1}:{2}".format(protocol, ip, port))
                print("[+] 验证通过")
                return True
            print("[-] 不可用的代理IP和端口: {0}:{1}:{2}".format(protocol, ip, port))
            return False
        except Exception as e:
            print("[-] 不可用的代理IP和端口: {0}:{1}:{2}".format(protocol, ip, port))
            # str(e) rather than the deprecated (and often empty) e.message
            print("ERROR: " + str(e))
            return False
        finally:
            gc.collect()

猜你喜欢

转载自blog.csdn.net/weixin_42015762/article/details/83860202