python3从www.weather.com.cn 获取城市AQI数据

环境:

mysql+python3.5+pycharm

sql脚本:

CREATE TABLE `AQIHH` (
  `CITYCODE` VARCHAR(12) NOT NULL,
  `DATATIME` DATETIME NOT NULL,
  `AQI` VARCHAR(20) DEFAULT NULL,
  `PM25` VARCHAR(20) DEFAULT NULL,
  `PM10` VARCHAR(20) DEFAULT NULL,
  `SO2` VARCHAR(20) DEFAULT NULL,
  `NO2` VARCHAR(20) DEFAULT NULL,
  `CO` VARCHAR(20) DEFAULT NULL,
  `O3` VARCHAR(20) DEFAULT NULL,
  `UPDATETIME` DATETIME DEFAULT NULL,
  `FS` VARCHAR(20) DEFAULT NULL,
  `XDSD` VARCHAR(20) DEFAULT NULL,
  `JSL` VARCHAR(20) DEFAULT NULL,
  `FX` VARCHAR(20) DEFAULT NULL,
  PRIMARY KEY (`CODE`,`DATATIME`)
) ENGINE=INNODB DEFAULT CHARSET=utf8mb4;

CREATE TABLE `t_ods_wea_city` (
  `citycode` VARCHAR(20) DEFAULT NULL,
  `cityname` VARCHAR(50) DEFAULT NULL
) ENGINE=INNODB DEFAULT CHARSET=utf8mb4;

-- 这里只添加了2个城市的
INSERT INTO t_ods_wea_city(citycode,cityname) VALUES('101010100','北京'),('101020100','上海');

直接上详细代码把:

# encoding=utf-8
# import requests 这是python2里的
import json
import time
# from urllib.request import urlopen 另一种用法
import urllib.request
import urllib
import datetime
import random
import socket
import http

# 正则
import re

# db
import pymysql

db = pymysql.connect(host="192.168.xx.xx", port=3306, user="root", passwd='123456', db="r_db", charset="utf8")


def requests(citycode):
    #citycode = 101280601
    # 请求地址http://d1.weather.com.cn/aqi_all/101280601.html?_=1576479057467,后边的一串数字是时间戳可以随机或者确定的不影响,这里用time.time()
    # unixtime-> format time >>> #转换成localtime ->   time_local = time.localtime(time_now) #转换成新的时间格式(2016-05-09 18:59:20) dt = time.strftime("%Y-%m-%d %H:%M:%S",time_local)
    time_now = str(int(time.time()))

    url = "http://d1.weather.com.cn/aqi_all/" + str(citycode) + ".html?_="+time_now
    # 请求头,不加获取不到数据
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        "Referer": "http://www.weather.com.cn/air/?city=101280601",
    }
    #  超时,取随机数是因为防止被网站认定为网络爬虫
    timeout = random.choice(range(80, 180))
    # 请求对象
    req = urllib.request.Request(url, headers=headers)
    # 这里使用request对象打开
    # res = urllib.request.urlopen(req,timeout=timeout).read()
    #res = urllib.request.urlopen(req,timeout=0.001).read()
    #timeout = 0.001
    while True:
        try:
            #  获取请求数据
            res = urllib.request.urlopen(req, timeout=timeout).read()
            break
        except socket.timeout as e:
            print('error_2:', e)
            # test timeout case
            # timeout = random.choice(range(80, 180))
            time.sleep(random.choice(range(8, 15)))
        except socket.error as e:
            print('error_3:', e)
            # timeout = random.choice(range(80, 180))
            time.sleep(random.choice(range(20, 60)))
        # except http.client.BadStatusLine as e:
        #     print('5:', e)
        #     time.sleep(random.choice(range(30, 80)))
        # except http.client.BadStatusLine as e:
        #     print('6:', e)
        #     time.sleep(random.choice(range(5, 15)))
    return res


def parse(res):
    # p1 = re.compile(r'[(](.*?)[)]', re.S)  # 最小匹配
    # 使用正则取出第一个方括号中的内容
    p2 = re.compile(r'[[](.*)[]]', re.S)  # 贪婪匹配
    json_str = re.findall(p2, res.decode('utf-8'))
    # json反序列化(字符串转成字典)方法   loads:无文件操作  load: 读文件+反序列化
    # json序列化(字典转成字符串)方法:   dumps:无文件操作  dump:序列化+写入文件
    try:
        json_str1 = json.loads('[' + json_str[0] + ']')
    except Exception as e:
        print (e)
        return None
    return json_str1


def save(list,citycode):
    # 计数 100条提交一次 cur_cnt 当前计数
    commit_cnt = 10
    cur_cnt = 0
    count = len(list)

    # 作用 1,过滤指定内容,2 映射为想要的字串
    subkey = {'time': 'HH24', 't1': 'AQI', 't3': 'PM25', 't4': 'PM10', 't5': 'CO', 't6': 'NO2', 't7': 'O3', 't9': 'SO2',
              't11': 'FS', 't12': 'XDSD', 't13': 'JSL', 't14': 'FX'}  # 最后分别是风速,相对湿度,降水量,风向
    # 使用cursor()方法获取游标对象
    cursor = db.cursor()
    # 判断是否是今天
    vtatusv = 1
    # 当天昨天
    nday = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')
    yday = datetime.datetime.now().strftime('%Y-%m-%d')

    # 批量插入的list
    dv_list = []
    # 插入的sql
    sql = "replace into AQIHH(AREACODE,DATATIME,AQI,PM25,PM10,SO2,NO2,CO,O3,FS,XDSD,JSL,FX)" \
          " values(%s,%s,%s,%s,%s ,%s,%s,%s,%s,%s,%s,%s,%s)"
    for dict in list:
        # 这里用到了 列表推导式(for表达式)
        subdict = {value: dict[key] for key, value in subkey.items()}  # 提取原字典中部分键值对,并替换key为中文
        hh = subdict['HH24']
        # print(subdict.keys())
        if subdict['HH24'] == '00':
            vtatusv = 0
        if vtatusv == 0:
            datatime = yday + ' ' + hh + ':00:00'
        else:
            datatime = nday + ' ' + hh + ':00:00'
        #print(subdict)
        # print(datatime)
        aqi_status = len(subdict['AQI'].strip())
        #如果有无效的计数减去1
        if aqi_status == 0:
            count -= 1
        # 判断如果有AQI信息则执行入库
        # if len(subdict['AQI'].strip()) != 0:
        if len(hh) !=0 :
            try:  # ,SO2,NO2,CO,O3,FS,XDSD,JSL,FX

                # 计数 100条提交一次
                cur_cnt += 1
                if len(subdict['AQI'].strip()) != 0:
                    AQI = (subdict['AQI'])
                else:
                    AQI = None
                if len(subdict['PM25'].strip()) != 0:
                    PM25 = (subdict['PM25'])
                else:
                    PM25 = None
                if len(subdict['PM10'].strip()) != 0:
                    PM10 = (subdict['PM10'])
                else:
                    PM10 = None
                if len(subdict['SO2'].strip()) != 0:
                    SO2 = (subdict['SO2'])
                    # SO2 = int(subdict['SO2'])
                else:
                    SO2 = None
                if len(subdict['NO2'].strip()) != 0:
                    NO2 = (subdict['NO2'])
                else:
                    NO2 = None
                if len(subdict['CO'].strip()) != 0:
                    CO = (subdict['CO'])
                else:
                    CO = None
                if len(subdict['O3'].strip()) != 0:
                    O3 = (subdict['O3'])
                else:
                    O3 = None
                if len(subdict['FS'].strip()) != 0:
                    FS = subdict['FS']
                else:
                    FS = None
                if len(subdict['XDSD'].strip()) != 0:
                    XDSD = (subdict['XDSD'])
                else:
                    XDSD = None
                if len(subdict['JSL'].strip()) != 0:
                    JSL = (subdict['JSL'])
                else:
                    JSL = None
                if len(subdict['FX'].strip()) != 0:
                    FX = subdict['FX'].strip()
                else:
                    FX = None
                dv_list.append((citycode, datatime,AQI, PM25, PM10, SO2, NO2, CO, O3 ,FS,XDSD, JSL, FX))
                if cur_cnt%commit_cnt == 0 or cur_cnt == count:
                    #val = [(citycode, datatime, int(subdict['AQI']), PM25, PM10, SO2, NO2, CO, O3 ,FS,None, None, None)]
                    cursor.executemany(sql, dv_list)
                    # 提交到数据库执行
                    db.commit()

                    # 清零技术
                    dv_list.clear()
                # if datatime=='2019-12-19 09:00:00':
                #     cursor.execute(
                #         "replace into AQIHH(AREACODE,DATATIME,AQI,PM25,PM10,SO2,NO2,CO,O3,FS,XDSD,JSL,FX)"
                #         " values(%s,'%s',%d,%d,%d ,%d,%d,%f,%d,%s,%d,%f,'%s')" % (
                #             citycode, datatime, int(subdict['AQI']),PM25, PM10, SO2, NO2, CO, O3,FS,XDSD,JSL,FX))
            except Exception as e:
                # 如果发生错误
                db.rollback()
                print(e)
                print('----------------------------------')

    # 最后一批处理
    try:
        cursor.executemany(sql, dv_list)
        # 提交到数据库执行
        db.commit()

        # 清零技术
        dv_list.clear()
    except Exception as e:
        # 如果发生错误
        db.rollback()
        print(e)

if __name__ == '__main__':
    citycode_list = []
    r_cursor = db.cursor()
    r_cursor.execute('SELECT citycode FROM  t_ods_wea_city   limit 1')
    #rows = r_cursor.execute(None, {'ln': 9}) 101090401 101010100 and citycode="101090401"
    rows = r_cursor.fetchall()
    for r in rows:
        citycode_list.append(r[0])
    r_cursor.close()
    #citycode_list = ['101280601','101010100']
    for citycode in citycode_list:
        print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+' ########'+citycode+':start!!!#######')
        # 获取数据
        res = requests(citycode)
        #解析
        pas = parse(res)
        # 入库
        if pas != None:
            save(pas, citycode)
        else:
            print(citycode+':解析失败')
        print(time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))+' ########' + citycode + ':end!!!#######')
        # 休息1s
        #time.sleep(3.1)
发布了522 篇原创文章 · 获赞 87 · 访问量 75万+

猜你喜欢

转载自blog.csdn.net/jc_benben/article/details/103680515
今日推荐