天气网站如下:
http://www.weather.com.cn/weather/101281001.shtml
爬取各天气信息:
"""
思路分析:
①div/ul/li,获得7天天气预报的所有信息; --lxml、requests、XPath --list类型
②li下的所有数据进行提取数据; --对上述的7条数据进行处理,提取数据 XPath
③保存文件。 --文件操作、json模块。
"""
首先,试着将目标网页的源码爬取下来:
def parse_url(url, header):
    """Fetch *url* with the given request headers and return the page body as UTF-8 text."""
    resp = requests.get(url, headers=header)
    # Decode the raw bytes explicitly rather than relying on
    # response.text's guessed encoding (which can be wrong for this site).
    return resp.content.decode("utf-8")
def main():
    """Step 1: download the raw HTML of the 7-day forecast page and show it."""
    # Target page and a browser-like request header (the site rejects
    # the default requests User-Agent).
    http_url = "http://www.weather.com.cn/weather/10128100101A.shtml"
    headers = {
        "User-Agent": "Mozi424/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/524.36 (KHTML, like Gecko) Chrome/72.0.3617.111 Safari/414.36"}
    # Fetch and dump the page source to confirm the crawl works.
    page_html = parse_url(http_url, headers)
    print(page_html)
发现成功输出网页的源码,说明爬取成功。
利用xpath爬取天气预报的关键信息,获得7天天气信息,并保存到file下的weatherbyzj.json下,完整代码如下:
# coding:utf-8
import json
import os

import lxml.html
import requests
"""
思路分析:
①div/ul/li,获得7天天气预报的所有信息; --lxml、requests、XPath --list类型
②li下的所有数据进行提取数据; --对上述的7条数据进行处理,提取数据 XPath
③保存文件。 --文件操作、json模块。
"""
def parse_url(url, header):
    """Fetch *url* with the given request headers and return the page body as UTF-8 text."""
    resp = requests.get(url, headers=header)
    # Decode the raw bytes explicitly rather than relying on
    # response.text's guessed encoding (which can be wrong for this site).
    return resp.content.decode("utf-8")
def get_weather_datas(html_content):
    """Extract the 7-day forecast entries from the weather-page HTML.

    Parameters:
        html_content: full HTML text of the forecast page.

    Returns:
        list[dict]: one dict per day with keys "date", "weather",
        "tem_down", and "tem_up" (the latter only when the page
        provides a high temperature for that day).
    """
    metree = lxml.html.etree
    # Build an element tree from the raw HTML.
    parser = metree.HTML(html_content, metree.HTMLParser())
    # Each <li> under the "7d" forecast <ul> holds one day's data.
    li_list = parser.xpath("//div[@class='c7d']/ul[@class='t clearfix']/li")
    data = []
    for element in li_list:
        item = {}
        item["date"] = element.xpath("./h1/text()")[0]
        item["weather"] = element.xpath("./p[@class='wea']/text()")[0]
        item["tem_down"] = element.xpath("./p[@class='tem']/i/text()")[0]
        # The current day often has no high temperature (<span> missing),
        # so only record "tem_up" when the node exists.
        # Fixed: `if not tem_up == []` -> truthiness test, and reuse the
        # already-evaluated XPath result instead of querying twice.
        tem_up = element.xpath("./p[@class='tem']/span/text()")
        if tem_up:
            item["tem_up"] = tem_up[0]
        data.append(item)
    return data
def save_weather_file(datas):
    """Serialize *datas* to JSON and write it to ./file/weatherbyzj.json.

    Parameters:
        datas: list of per-day weather dicts produced by get_weather_datas.

    Fixed: the original crashed with FileNotFoundError when the ./file
    directory did not exist; it is now created on demand.
    """
    # Keep Chinese characters readable in the output file and
    # pretty-print for easy inspection.
    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    # Make sure the target directory exists before opening the file.
    os.makedirs("./file", exist_ok=True)
    with open("./file/weatherbyzj.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("数据保存成功,美滋滋.")
def main():
    """Crawl the 7-day forecast, extract the daily fields, and save them as JSON."""
    # Target page and a browser-like request header (the site rejects
    # the default requests User-Agent).
    http_url = "http://www.weather.com.cn/weather/10128100101A.shtml"
    headers = {
        "User-Agent": "Mozi424/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/524.36 (KHTML, like Gecko) Chrome/72.0.3617.111 Safari/414.36"}
    # Step 1: download the page HTML.
    page_html = parse_url(http_url, headers)
    # Step 2: pull the per-day fields out of the <li> elements.
    forecast = get_weather_datas(page_html)
    # Step 3: persist the result to disk as JSON.
    save_weather_file(forecast)
# Script entry point: run the crawler only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
最后成功爬取到json数据,如下所示:
[
{
"date": "21日(今天)",
"weather": "阴",
"tem_down": "25℃"
},
{
"date": "22日(明天)",
"weather": "小雨转阴",
"tem_down": "25℃",
"tem_up": "29℃"
},
{
"date": "23日(后天)",
"weather": "多云转晴",
"tem_down": "25℃",
"tem_up": "31℃"
},
{
"date": "24日(周三)",
"weather": "多云转晴",
"tem_down": "25℃",
"tem_up": "31℃"
},
{
"date": "25日(周四)",
"weather": "晴转小雨",
"tem_down": "25℃",
"tem_up": "31℃"
},
{
"date": "26日(周五)",
"weather": "阴",
"tem_down": "25℃",
"tem_up": "31℃"
},
{
"date": "27日(周六)",
"weather": "小雨",
"tem_down": "25℃",
"tem_up": "31℃"
}
]