import urllib.request as urlrequest
# Forecast page on forecast.weather.gov for the lat/lon given in the query string.
weather_url = 'https://forecast.weather.gov/MapClick.php?lat=37.77492773500046&lon=-122.41941932299972'
# urlopen() only returns a response object; read() is what yields the page
# body as bytes. The with-block closes the connection once the body is read.
with urlrequest.urlopen(weather_url) as response:
    web_page = response.read()
print(web_page)
from bs4 import BeautifulSoup
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(web_page, 'html.parser')

# soup.find() looks an element up by id, class, and so on.
# Original author's note: the text of the '-body' element still contains
# JavaScript code nested inside that div, and get_text() cannot strip it out,
# so the enclosing element without the script has to be used instead.
forecast_body = soup.find(id='seven-day-forecast-body')
print(forecast_body.get_text())

# Original author's note: the '-container' element's text has no JavaScript.
forecast_container = soup.find(id='seven-day-forecast-container')
print(forecast_container.get_text())
# prettify() renders the element's markup in a re-indented, readable form.
print(forecast_container.prettify())

forecast_text = forecast_container.get_text()
# split() cuts a string into a list on the given separator. Original author's
# note: the result has many empty entries, and the weather description runs
# straight into the temperature, which is not pleasant to read.
# NOTE: as a bare expression statement the list is neither stored nor printed;
# it is only displayed when run in an interactive session.
forecast_text.split('\n')
#输出结果:
['',
'',
'Overnight',
'PatchyDrizzleLow: 57 °F',
'',
'Wednesday',
'PatchyDrizzle thenMostly SunnyHigh: 65 °F',
'',
'WednesdayNight',
'Mostly CloudyLow: 57 °F',
'',
'Thursday',
'Partly SunnyHigh: 65 °F',
'',
'ThursdayNight',
'Mostly CloudyLow: 55 °F',
'',
'Friday',
'Partly SunnyHigh: 65 °F',
'',
'FridayNight',
'Mostly CloudyLow: 54 °F',
'',
'Saturday',
'Mostly SunnyHigh: 64 °F',
'',
'SaturdayNight',
'Mostly CloudyLow: 54 °F']
# Narrow the search to the element with id 'seven-day-forecast-container'.
soup_forecast = soup.find(id='seven-day-forecast-container')
# Collect every element inside the container whose class includes 'temp'.
# NOTE: the result is not stored or printed; it is only displayed when run
# in an interactive session.
soup_forecast.find_all(class_='temp') # a class attribute may hold several space-separated values
#输出结果:
[<p class="temp temp-low">Low: 57 °F</p>,
<p class="temp temp-high">High: 65 °F</p>,
<p class="temp temp-low">Low: 57 °F</p>,
<p class="temp temp-high">High: 65 °F</p>,
<p class="temp temp-low">Low: 55 °F</p>,
<p class="temp temp-high">High: 65 °F</p>,
<p class="temp temp-low">Low: 54 °F</p>,
<p class="temp temp-high">High: 64 °F</p>,
<p class="temp temp-low">Low: 54 °F</p>]
# Restrict the lookups to the seven-day forecast container element.
soup_forecast = soup.find(id='seven-day-forecast-container')
# find_all() returns every element inside the container with the given class.
date_list = soup_forecast.find_all(class_='period-name')
desc_list = soup_forecast.find_all(class_='short-desc')
temp_list = soup_forecast.find_all(class_='temp')
# Walk the three lists in lockstep instead of indexing over a hard-coded
# range(9): zip() stops at the shortest list, so a page that yields fewer
# forecast periods no longer raises IndexError.
for date_tag, desc_tag, temp_tag in zip(date_list, desc_list, temp_list):
    date = date_tag.get_text()
    desc = desc_tag.get_text()
    temp = temp_tag.get_text()
    print("{} {} {}".format(date, desc, temp))
#输出结果:
Overnight PatchyDrizzle Low: 57 °F
Wednesday PatchyDrizzle thenMostly Sunny High: 65 °F
WednesdayNight Mostly Cloudy Low: 57 °F
Thursday Partly Sunny High: 65 °F
ThursdayNight Mostly Cloudy Low: 55 °F
Friday Partly Sunny High: 65 °F
FridayNight Mostly Cloudy Low: 54 °F
Saturday Mostly Sunny High: 64 °F
SaturdayNight Mostly Cloudy Low: 54 °F
完整代码
# Import the required modules: urllib.request and BeautifulSoup (from bs4).
import urllib.request as urlrequest
from bs4 import BeautifulSoup
# Download the page to scrape with urllib. HTTPS is used so the URL matches
# the one at the top of this file; the with-block closes the connection once
# the body has been read.
weather_url = 'https://forecast.weather.gov/MapClick.php?lat=37.77492773500046&lon=-122.41941932299972'
with urlrequest.urlopen(weather_url) as response:
    web_page = response.read()
# Parse the HTML with BeautifulSoup and pick out the forecast container.
soup = BeautifulSoup(web_page, 'html.parser')
soup_forecast = soup.find(id='seven-day-forecast-container')
# Pull out the pieces of interest: period names, short descriptions, temperatures.
date_list = soup_forecast.find_all(class_='period-name')
desc_list = soup_forecast.find_all(class_='short-desc')
temp_list = soup_forecast.find_all(class_='temp')
# Print one line per forecast period. zip() walks the three lists in lockstep
# and stops at the shortest, so a page with fewer than nine periods does not
# raise IndexError the way a hard-coded range(9) would.
for date_tag, desc_tag, temp_tag in zip(date_list, desc_list, temp_list):
    date = date_tag.get_text()
    desc = desc_tag.get_text()
    temp = temp_tag.get_text()
    print("{} {} {}".format(date, desc, temp))