2-4-1 基于 HTML 的爬虫,Python(BeautifulSoup)实现(版本:py3)——学习笔记

import urllib.request as urlrequest

from bs4 import BeautifulSoup

# Forecast page on forecast.weather.gov for lat=37.7749..., lon=-122.4194...
weather_url = 'https://forecast.weather.gov/MapClick.php?lat=37.77492773500046&lon=-122.41941932299972'

# urlopen() by itself only returns a response object; read() is what yields
# the page content (as bytes). The with-block closes the HTTP response once
# the body has been read instead of leaving it open.
with urlrequest.urlopen(weather_url) as response:
    web_page = response.read()
print(web_page)

# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(web_page, 'html.parser')

# soup.find() looks up an element by id, class, and so on.

# get_text() returns the text content, but for this div the author observed
# that it also contains JavaScript code, because the script sits inside the
# div. get_text() cannot strip it; a different div without the script is needed.
forecast_body = soup.find(id='seven-day-forecast-body')
print(forecast_body.get_text())

# The container div's text does not include the JavaScript code.
forecast_container = soup.find(id='seven-day-forecast-container')
container_text = forecast_container.get_text()
print(container_text)

# prettify() renders the markup in a nicely indented form.
print(forecast_container.prettify())

# split() is the usual string method for cutting a string into a list on a
# given separator. The result here has many empty entries, and the weather
# description and temperature are glued together, which is not pretty.
forecast_text = container_text
forecast_text.split('\n')


#输出结果:
['',
 '',
 'Overnight',
 'PatchyDrizzleLow: 57 °F',
 '',
 'Wednesday',
 'PatchyDrizzle thenMostly SunnyHigh: 65 °F',
 '',
 'WednesdayNight',
 'Mostly CloudyLow: 57 °F',
 '',
 'Thursday',
 'Partly SunnyHigh: 65 °F',
 '',
 'ThursdayNight',
 'Mostly CloudyLow: 55 °F',
 '',
 'Friday',
 'Partly SunnyHigh: 65 °F',
 '',
 'FridayNight',
 'Mostly CloudyLow: 54 °F',
 '',
 'Saturday',
 'Mostly SunnyHigh: 64 °F',
 '',
 'SaturdayNight',
 'Mostly CloudyLow: 54 °F']
# Narrow the search to the seven-day forecast container.
soup_forecast = soup.find(id='seven-day-forecast-container')
# The matched tags carry several class values separated by spaces
# (e.g. "temp temp-low"); class_='temp' matches on the 'temp' value.
soup_forecast.find_all(class_='temp') 


#输出结果:
[<p class="temp temp-low">Low: 57 °F</p>,
 <p class="temp temp-high">High: 65 °F</p>,
 <p class="temp temp-low">Low: 57 °F</p>,
 <p class="temp temp-high">High: 65 °F</p>,
 <p class="temp temp-low">Low: 55 °F</p>,
 <p class="temp temp-high">High: 65 °F</p>,
 <p class="temp temp-low">Low: 54 °F</p>,
 <p class="temp temp-high">High: 64 °F</p>,
 <p class="temp temp-low">Low: 54 °F</p>]
# Collect the period names, short descriptions and temperatures from the
# seven-day forecast container; find_all() returns every tag with the class.
soup_forecast = soup.find(id='seven-day-forecast-container')
date_list = soup_forecast.find_all(class_='period-name')
desc_list = soup_forecast.find_all(class_='short-desc')
temp_list = soup_forecast.find_all(class_='temp')

# Walk the three lists in lockstep. zip() stops at the shortest list, so the
# loop works for however many forecast periods the page contains instead of
# assuming exactly nine (which raised IndexError on shorter pages).
for date_tag, desc_tag, temp_tag in zip(date_list, desc_list, temp_list):
    date = date_tag.get_text()
    desc = desc_tag.get_text()
    temp = temp_tag.get_text()
    print("{}    {}    {}".format(date, desc, temp))


#输出结果:
Overnight    PatchyDrizzle    Low: 57 °F
Wednesday    PatchyDrizzle thenMostly Sunny    High: 65 °F
WednesdayNight    Mostly Cloudy    Low: 57 °F
Thursday    Partly Sunny    High: 65 °F
ThursdayNight    Mostly Cloudy    Low: 55 °F
Friday    Partly Sunny    High: 65 °F
FridayNight    Mostly Cloudy    Low: 54 °F
Saturday    Mostly Sunny    High: 64 °F
SaturdayNight    Mostly Cloudy    Low: 54 °F

完整代码

# Import the required modules: urllib.request and BeautifulSoup.
import urllib.request as urlrequest
from bs4 import BeautifulSoup

# Download the page to scrape with urllib. HTTPS is used to match the URL in
# the step-by-step snippet, and the with-block closes the response after the
# body has been read.
weather_url = 'https://forecast.weather.gov/MapClick.php?lat=37.77492773500046&lon=-122.41941932299972'
with urlrequest.urlopen(weather_url) as response:
    web_page = response.read()

# Parse the HTML with BeautifulSoup and pick out the forecast container.
soup = BeautifulSoup(web_page, 'html.parser')
soup_forecast = soup.find(id='seven-day-forecast-container')

# Extract the pieces of interest: period names, descriptions, temperatures.
date_list = soup_forecast.find_all(class_='period-name')
desc_list = soup_forecast.find_all(class_='short-desc')
temp_list = soup_forecast.find_all(class_='temp')

# Print one row per forecast period. zip() iterates the three lists together
# and stops at the shortest, so the loop does not depend on the page having
# exactly nine periods.
for date_tag, desc_tag, temp_tag in zip(date_list, desc_list, temp_list):
    date = date_tag.get_text()
    desc = desc_tag.get_text()
    temp = temp_tag.get_text()
    print("{}    {}    {}".format(date, desc, temp))

猜你喜欢

转载自blog.csdn.net/feng_jlin/article/details/82184716
今日推荐