python - 记录一次爬虫练习
这次练习的目标是自由抓取 www.pm25x.com 上的数据,实现查看各个城市的 PM2.5 实时数据。没有用到 BeautifulSoup,也没有用 XPath,只用了 Python 内置的 re 库,数据的抓取是基于正则表达式实现的。
正则表达式还是很强大的,学好正则表达式很有用。而且大部分编程语言都支持正则表达式。python的正则表达式的库是用 c 实现的,速度非常快。
使用前需要安装的库:
pip install requests #用于抓取网页源码
pip install prettytable #用来打印输出表格的
源码:
# coding=utf-8
import requests
import re
from prettytable import PrettyTable
# Pad a string to a fixed cell width so mixed Chinese/ASCII columns line up.
def aligns(string, length=20):
    """Return *string* padded with trailing spaces to *length* characters.

    Half-width ASCII characters are converted to their full-width
    (CJK-cell-wide) counterparts so they occupy the same display width
    as Chinese characters.  Prints an error and returns None when the
    string is already longer than *length*.
    """
    difference = length - len(string)  # number of pad spaces needed
    if difference < 0:
        print('错误:限定的对齐长度小于字符串长度!')
        return None
    converted = []
    for ch in string:
        code = ord(ch)
        if code == 32:
            # The full-width counterpart of the ASCII space is the
            # ideographic space U+3000, NOT 32+65248 (U+FF00 is unassigned).
            converted.append(chr(12288))
        elif 33 <= code <= 126:
            # Printable half-width ASCII maps to full-width by a fixed
            # offset of 65248 (U+FF01..U+FF5E); control chars are excluded.
            converted.append(chr(code + 65248))
        else:
            # Already full-width / non-ASCII: keep as-is.
            converted.append(ch)
    # Conversion is applied even when difference == 0 so exact-length
    # strings align consistently with padded ones.
    return ''.join(converted) + ' ' * difference
# Auto-tabulate a list: numbered cells, fixed width, `column` cells per row.
def tabulator(inputlist, column, length=20):
    """Print *inputlist* as a numbered table with *column* cells per row.

    Each cell is '<index>.' followed by the item padded to *length* via
    aligns().  Full rows are printed as they fill up; a trailing partial
    row is printed once after the last item.  Returns None (prints only).
    """
    remaining = len(inputlist)  # renamed: the original shadowed builtin sum()
    row = ''
    cells = 0
    for index, item in enumerate(inputlist, start=1):
        row += str(index) + '.' + aligns(item, length)
        cells += 1
        remaining -= 1
        if cells >= column:
            # Row is full: flush it and start a fresh one.
            print(row)
            row = ''
            cells = 0
        elif remaining <= 0:
            # Last item landed in a partial row: flush it exactly once.
            print(row)
# Target site
print("...开始初始化...")
url = "http://www.pm25x.com"
print("设置目标网站为:http://www.pm25x.com")
# Send an HTTP GET request for the site's front page
print("正在向目标网站发送 HTTP GET请求...")
gethtml = requests.get(url)
# Report the HTTP status.  NOTE(review): the script keeps going even on
# failure, so a non-200 response will crash later at html_search.group().
if gethtml.status_code == 200:
    print("目标服务器已响应\n")
else:
    print("HTTP请求失败!\n")
# Force UTF-8 decoding so the Chinese city names render correctly
gethtml.encoding = 'utf-8'
# Cut out the <dl class="citylist"> region that holds the whole city index
html_search = re.search(r'<dl\sclass="citylist">[\s\S]+</dl>',gethtml.text)
# Letters accepted as a city's initial: a-z followed by A-Z
alphabet = [chr(i) for i in range(97,123)]  # lowercase letters
alphabet.extend([chr(i) for i in range(65,91)])  # uppercase letters
# Main program loop: repeatedly prompt for a city and show its air-quality data
while True:
    # Keep prompting until the user types a single ASCII letter
    while True:
        firstletter = input("请输入要查询的城市的首字母:")
        if firstletter in alphabet:
            break
        else:
            print("输入有误!请重新输入。")
    # Slice out the <dt>X.</dt>...<dd>...</dd> section for that initial letter
    city_search = re.search('<dt>'+firstletter.upper()+'\.</dt>[\s\S]+?<dd>[\s\S]+?</dd>',html_search.group())
    # City display names for the menu
    city = re.findall(r'<a\shref="/city/.*?">(.*?)</a>.*?',city_search.group())
    # Relative links, parallel to `city` by index
    city_link = re.findall(r'<a\shref="(.*?)">.*?</a>.*?',city_search.group())
    # Print the numbered city menu: 5 cities per row, 6-character cells
    tabulator(city,5,6)
    # Keep prompting until the user picks a valid menu number
    while True:
        try:
            number = int(input("请输入城市对应的数字:"))
            numlist = [i for i in range(1,len(city)+1)]  # valid choices 1..len(city)
            if number in numlist:
                break
            else:
                print('输入数字不在对应范围 !请重新输入。')
        except:
            # int() raised on non-numeric input
            print('输入有误!请重新输入。')
    # Build the chosen city's page URL (menu numbers are 1-based)
    city_url = url+city_link[number-1]
    # Fetch the city page
    print("正在向目标网站发送 HTTP GET请求...")
    gethtml = requests.get(city_url)
    # Report the HTTP status (script continues even on failure)
    if gethtml.status_code == 200:
        print("目标服务器已响应\n")
    else:
        print("HTTP请求失败!\n")
    # Current AQI value
    aqi_search = re.search(r'<div\sclass="aqivalue">(.*?)</div>.*?',gethtml.text)
    aqi = aqi_search.group(1)
    # Air-quality grade text (e.g. good/moderate levels)
    grade_search = re.search(r'<div\sclass="aqileveltext">(.*?)</div>.*?',gethtml.text)
    grade = grade_search.group(1)
    # Table title plus the data-update time embedded in the same match
    rtitle_search = re.search(r'<div class="thd">(.*?)</div>[\s\S]+?<div class="ut" id="utip">今天(.*?)的数据</div>',gethtml.text)
    rtitle = rtitle_search.group(1)
    # Data-update time.  NOTE(review): `time` would shadow the stdlib
    # module of the same name if it were ever imported here.
    time = rtitle_search.group(2)
    # Slice out the pollutant data table
    tbody_search = re.search(r'<table cellspacing=0 cellpadding=0>[\s\S]+?</table>.*?',gethtml.text)
    tbody = tbody_search.group()
    # Header cells from the table's first row
    header_search = re.search(r'<table cellspacing=0 cellpadding=0>[\s\S]+?<tr>[\s\S]+?</tr>',tbody)
    header = re.findall('<th.*?>(.*?)</th>',header_search.group())
    # Data rows: (name, value, span text, extra column, trend-class suffix)
    row_findall = re.findall(r'<tr><td>(.*?)</td><td>(.*?)</td><td class=\'.*?\'><span>(.*?)</span></td><td>(.*?)</td><td><em class="t_(.*?)"> </em></td></tr>',tbody)
    # Health advice: bold headline plus the following paragraph
    advise_search = re.search('<p class="bold">(.*?)</p>[\s\S]+?<p>(.*?)</p>',gethtml.text)
    advise = [advise_search.group(1),advise_search.group(2)]
    # --- Output section ---
    print('-------------------------------------------')
    print(city[number-1]+'实时AQI指数:'+aqi+' '+grade+'\n')
    # Table title
    print(rtitle)
    # Render the pollutant table with PrettyTable
    table = PrettyTable(header)
    for i in row_findall:
        table.add_row(i)
    print(table)
    # Health advice
    print(advise[0]+advise[1]+'\n')
    # Data-update timestamp
    print('数据更新时间:'+ time)
    print('-------------------------------------------')
    print('\n')