Scraping air quality data with Python

1. Download chromedriver.exe

Download address: http://chromedriver.storage.googleapis.com/index.html

No installation is needed after downloading; just save the .exe somewhere on disk.

The driver version must match your installed Chrome version; for a matching table, see this post: https://blog.csdn.net/mmayanshuo/article/details/78962398
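To confirm the match from Python, a minimal sketch (it assumes the driver was saved as C:\chromedriver.exe, the path the script below uses):

import subprocess

out = subprocess.run([r'C:\chromedriver.exe', '--version'],
                     capture_output=True, text=True)
print(out.stdout)  # the major version printed here must match your Chrome's major version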

2. Code

# -*- coding: utf-8 -*-

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request  # the bare `import urllib` does not reliably expose urllib.request
import time
import random
import requests


# fetch a list of free proxy IPs from xicidaili's proxy table
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)  # column 1 is the IP, column 2 the port
    return ip_list


# pick one proxy IP at random and wrap it in a proxies dict
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies


#========================================proxy IP==============================================
url = 'http://www.xicidaili.com/nn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
ip_list = get_ip_list(url, headers=headers)
proxies = get_random_ip(ip_list)
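# Note (not in the original post): the dict built above has the form
# {'http': 'http://1.2.3.4:8080'}, which is the proxy format accepted both by
# urllib's ProxyHandler (used below) and by requests.get(url, proxies=proxies).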


#=====================================get the outer <a> tags (one link per city)======================================
url_host='https://www.aqistudy.cn/historydata/'
httpproxy_handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(httpproxy_handler)
req = urllib.request.Request(url_host)
html = opener.open(req)  # open via the proxy opener; plain urllib.request.urlopen would bypass the proxy
html_code = html.read()
plain_text = str(html_code, 'utf-8')
soup = BeautifulSoup(plain_text, 'lxml')
soups_div = soup.find_all('div', {'class': 'all'})
soups_a = soups_div.pop().find_all('a')  # one <a> per city
#=====================================get the inner <a> tags (one link per month)======================================
url_host2='https://www.aqistudy.cn/historydata/daydata.php?city=%E9%83%91%E5%B7%9E&month=201909'
html2=requests.get(url_host2)
html2.encoding='utf-8'
html_code2=html2.text
soup2 = BeautifulSoup(html_code2, 'lxml')
soups_div2=soup2.find_all('ul',{'class':'unstyled1'})
soups_a2=soups_div2.pop().find_all('a')
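
# Note (a sketch, not in the original post): the %-escapes in url_host2 are just the
# URL-encoded UTF-8 bytes of the city name 郑州, so the same URL can be built for any city:
from urllib.parse import quote
print(quote('郑州'))  # prints %E9%83%91%E5%B7%9E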


#=====================================loop over the pages and scrape the data============================================
def aqi_get(url_host, citys, months):
    chromedriver = r'C:\chromedriver.exe'  # path where chromedriver.exe was saved
    driver = webdriver.Chrome(executable_path=chromedriver)
    air_quality = pd.DataFrame(columns=['city','month','AQI','level','PM25','PM10','SO2','CO','NO2','O3_8h'])
    for city in citys:
        for month in months:
            # build the URL only when the city/month pair actually appears on the site,
            # otherwise url1 would silently keep its value from the previous iteration
            url1 = None
            for hh in soups_a:
                for mm in soups_a2:
                    if hh.text == city and mm.text == month:
                        url1 = url_host + 'daydata.php?city=' + city + '&month=' + month[0:4] + month[5:7]
            if url1 is None:
                continue
            driver.get(url1)
            print(url1)
            html2 = driver.page_source
            bf1 = BeautifulSoup(html2, 'lxml')
            result = bf1.find_all('tr')
            # the table is rendered by JavaScript, so keep re-reading the page until rows appear
            while len(result) < 10:
                time.sleep(1)  # brief pause so this is not a tight busy loop
                html2 = driver.page_source
                bf1 = BeautifulSoup(html2, 'lxml')
                result = bf1.find_all('tr')
            for row in result[1:]:  # skip the header row
                td = row.find_all('td')
                ss = [tt.text for tt in td]  # date, AQI, level, PM2.5, PM10, SO2, CO, NO2, O3_8h
                sss = [city] + ss  # note: the 'month' column actually receives each day's date
                s = pd.Series(sss, index=air_quality.columns)
                air_quality = air_quality.append(s, ignore_index=True)  # pandas < 2.0; see the note below
            # optionally pause between pages to avoid anti-scraping measures
#            time.sleep(8)
    driver.close()
    return air_quality
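# Note (a sketch, not part of the original post): selenium's explicit waits could
# replace the busy while-loop inside aqi_get, blocking until the table has rendered:
#     from selenium.webdriver.support.ui import WebDriverWait
#     WebDriverWait(driver, 30).until(
#         lambda d: len(BeautifulSoup(d.page_source, 'lxml').find_all('tr')) >= 10)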



#citys=['郑州', '平顶山', '洛阳','安阳']
months=['2019年07月','2019年08月','2019年09月']
citys=['郑州','开封','许昌','洛阳','平顶山','三门峡','南阳','信阳','安阳','濮阳','商丘','鹤壁','焦作','驻马店','周口','新乡','漯河','济源']
now = time.strftime("%Y-%m-%d-%H-%M-%S",time.localtime(time.time()))
air_quality=aqi_get(url_host,citys,months)
air_quality.to_excel(r"D:\air_quality_day" + now + ".xls")  # the original passed ',' as a third argument, which to_excel would take as sheet_name
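
A compatibility note: DataFrame.append, used inside aqi_get, was removed in pandas 2.0. On current pandas the usual pattern is to collect plain rows in a list and build the frame once at the end; a minimal sketch with a made-up example row:

import pandas as pd

columns = ['city','month','AQI','level','PM25','PM10','SO2','CO','NO2','O3_8h']
rows = []
rows.append(['郑州', '2019-09-01', '50', '优', '20', '45', '6', '0.6', '18', '90'])  # made-up example row
air_quality = pd.DataFrame(rows, columns=columns)  # build the frame once instead of appending row by row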

Reposted from blog.csdn.net/u010916338/article/details/101303347