python - 记录一次爬虫练习
这次练习的目标是自由抓取 www.pm25x.com 上的数据,实现查看各个城市的 PM2.5 实时数据。没有用到 BeautifulSoup,也没有用 XPath,只用了 Python 内置的 re 库,数据的抓取是基于正则表达式实现的。
正则表达式还是很强大的,学好正则表达式很有用。而且大部分编程语言都支持正则表达式。python的正则表达式的库是用 c 实现的,速度非常快。
使用前需要安装的库:
pip install requests #用于抓取网页源码
pip install prettytable #用来打印输出表格的
源码:
# coding=utf-8
import requests
import re
from prettytable import PrettyTable
# Pad a string to a fixed cell width so mixed Chinese/ASCII columns line up.
def aligns(string, length=20):
    """Return *string* padded with trailing spaces to *length* characters.

    Half-width ASCII characters are converted to their full-width
    (CJK-cell-wide) counterparts so they occupy the same display width
    as Chinese characters.  Prints an error and returns None when the
    string is already longer than *length*.
    """
    difference = length - len(string)  # number of pad spaces needed
    if difference < 0:
        print('错误:限定的对齐长度小于字符串长度!')
        return None
    converted = []
    for ch in string:
        code = ord(ch)
        if code == 32:
            # The full-width counterpart of the ASCII space is the
            # ideographic space U+3000, NOT 32+65248 (U+FF00 is unassigned).
            converted.append(chr(12288))
        elif 33 <= code <= 126:
            # Printable half-width ASCII maps to full-width by a fixed
            # offset of 65248 (U+FF01..U+FF5E); control chars are excluded.
            converted.append(chr(code + 65248))
        else:
            # Already full-width / non-ASCII: keep as-is.
            converted.append(ch)
    # Conversion is applied even when difference == 0 so exact-length
    # strings align consistently with padded ones.
    return ''.join(converted) + ' ' * difference
# Auto-tabulate a list: numbered cells, fixed width, `column` cells per row.
def tabulator(inputlist, column, length=20):
    """Print *inputlist* as a numbered table with *column* cells per row.

    Each cell is '<index>.' followed by the item padded to *length* via
    aligns().  Full rows are printed as they fill up; a trailing partial
    row is printed once after the last item.  Returns None (prints only).
    """
    remaining = len(inputlist)  # renamed: the original shadowed builtin sum()
    row = ''
    cells = 0
    for index, item in enumerate(inputlist, start=1):
        row += str(index) + '.' + aligns(item, length)
        cells += 1
        remaining -= 1
        if cells >= column:
            # Row is full: flush it and start a fresh one.
            print(row)
            row = ''
            cells = 0
        elif remaining <= 0:
            # Last item landed in a partial row: flush it exactly once.
            print(row)
# Target site
print("...开始初始化...")
url = "http://www.pm25x.com"
print("设置目标网站为:http://www.pm25x.com")
# Send an HTTP GET request for the site's front page
print("正在向目标网站发送 HTTP GET请求...")
gethtml = requests.get(url)
# Report the HTTP status.  NOTE(review): the script keeps going even on
# failure, so a non-200 response will crash later at html_search.group().
if gethtml.status_code == 200:
    print("目标服务器已响应\n")
else:
    print("HTTP请求失败!\n")
# Force UTF-8 decoding so the Chinese city names render correctly
gethtml.encoding = 'utf-8'
# Cut out the <dl class="citylist"> region that holds the whole city index
html_search = re.search(r'<dl\sclass="citylist">[\s\S]+</dl>',gethtml.text)
# Letters accepted as a city's initial: a-z followed by A-Z
alphabet = [chr(i) for i in range(97,123)]  # lowercase letters
alphabet.extend([chr(i) for i in range(65,91)])  # uppercase letters
# Main program loop: repeatedly prompt for a city and show its air-quality data
while True:
    # Keep prompting until the user types a single ASCII letter
    while True:
        firstletter = input("请输入要查询的城市的首字母:")
        if firstletter in alphabet:
            break
        else:
            print("输入有误!请重新输入。")
    # Slice out the <dt>X.</dt>...<dd>...</dd> section for that initial letter
    city_search = re.search('<dt>'+firstletter.upper()+'\.</dt>[\s\S]+?<dd>[\s\S]+?</dd>',html_search.group())
    # City display names for the menu
    city = re.findall(r'<a\shref="/city/.*?">(.*?)</a>.*?',city_search.group())
    # Relative links, parallel to `city` by index
    city_link = re.findall(r'<a\shref="(.*?)">.*?</a>.*?',city_search.group())
    # Print the numbered city menu: 5 cities per row, 6-character cells
    tabulator(city,5,6)
    # Keep prompting until the user picks a valid menu number
    while True:
        try:
            number = int(input("请输入城市对应的数字:"))
            numlist = [i for i in range(1,len(city)+1)]  # valid choices 1..len(city)
            if number in numlist:
                break
            else:
                print('输入数字不在对应范围 !请重新输入。')
        except:
            # int() raised on non-numeric input
            print('输入有误!请重新输入。')
    # Build the chosen city's page URL (menu numbers are 1-based)
    city_url = url+city_link[number-1]
    # Fetch the city page
    print("正在向目标网站发送 HTTP GET请求...")
    gethtml = requests.get(city_url)
    # Report the HTTP status (script continues even on failure)
    if gethtml.status_code == 200:
        print("目标服务器已响应\n")
    else:
        print("HTTP请求失败!\n")
    # Current AQI value
    aqi_search = re.search(r'<div\sclass="aqivalue">(.*?)</div>.*?',gethtml.text)
    aqi = aqi_search.group(1)
    # Air-quality grade text (e.g. good/moderate levels)
    grade_search = re.search(r'<div\sclass="aqileveltext">(.*?)</div>.*?',gethtml.text)
    grade = grade_search.group(1)
    # Table title plus the data-update time embedded in the same match
    rtitle_search = re.search(r'<div class="thd">(.*?)</div>[\s\S]+?<div class="ut" id="utip">今天(.*?)的数据</div>',gethtml.text)
    rtitle = rtitle_search.group(1)
    # Data-update time.  NOTE(review): `time` would shadow the stdlib
    # module of the same name if it were ever imported here.
    time = rtitle_search.group(2)
    # Slice out the pollutant data table
    tbody_search = re.search(r'<table cellspacing=0 cellpadding=0>[\s\S]+?</table>.*?',gethtml.text)
    tbody = tbody_search.group()
    # Header cells from the table's first row
    header_search = re.search(r'<table cellspacing=0 cellpadding=0>[\s\S]+?<tr>[\s\S]+?</tr>',tbody)
    header = re.findall('<th.*?>(.*?)</th>',header_search.group())
    # Data rows: (name, value, span text, extra column, trend-class suffix)
    row_findall = re.findall(r'<tr><td>(.*?)</td><td>(.*?)</td><td class=\'.*?\'><span>(.*?)</span></td><td>(.*?)</td><td><em class="t_(.*?)"> </em></td></tr>',tbody)
    # Health advice: bold headline plus the following paragraph
    advise_search = re.search('<p class="bold">(.*?)</p>[\s\S]+?<p>(.*?)</p>',gethtml.text)
    advise = [advise_search.group(1),advise_search.group(2)]
    # --- Output section ---
    print('-------------------------------------------')
    print(city[number-1]+'实时AQI指数:'+aqi+' '+grade+'\n')
    # Table title
    print(rtitle)
    # Render the pollutant table with PrettyTable
    table = PrettyTable(header)
    for i in row_findall:
        table.add_row(i)
    print(table)
    # Health advice
    print(advise[0]+advise[1]+'\n')
    # Data-update timestamp
    print('数据更新时间:'+ time)
    print('-------------------------------------------')
    print('\n')