R-指定城市天气爬取

业务需求:爬取指定城市天气

操作:基于R语言爬取数据,并存储到SQL Server数据库中;目前尚未提供网页展示

爬取结果示例:

R源码:

# install.packages("rvest")
library(rvest)
# Step 1: define the inputs and helper functions.
# Forecast page URLs to scrape. Every page lives under the same
# weather.com.cn path and differs only in its numeric station code, so the
# vector is assembled from the codes (order matters: it is the scrape order).
city.urls <- sprintf(
  "http://www.weather.com.cn/weather/%s.shtml",
  c(
    "101190101", "101190601", "101190401", "101190301", "101190201",
    "101191101", "101191201", "101190501", "101190701", "101190901",
    "101191301", "101190801", "101191001",
    "101220101", "101221101", "101220601", "101220201", "101220401",
    "101221201", "101221501", "101220501", "101220301", "101221301",
    "101221401",
    "101210101", "101210401", "101211101", "101210507", "101210901",
    "101210201",
    "101180101", "101181501", "101181101",
    "101270101",
    "101290101"
  )
)
             # 定义爬取单个城市天气数据函数
             read.weather<-function(city.url){
               web<-read_html(city.url,encoding = "utf8")
               city<-web%>%
                 html_nodes("div")%>%
                 html_nodes("div")%>%
                 html_nodes("div")%>%
                 html_nodes("div.crumbs.fl")%>%
                 html_nodes("a")%>%
                 html_text()
               city<-paste(city[1],city[2],sep = "")
               weather<-web%>%
                 html_nodes("div")%>%
                 html_nodes("ul.t.clearfix")%>%
                 html_nodes("li")%>%
                 html_text()%>%
                 strsplit(split="[\n]+")%>%
                 as.data.frame(fix.empty.names = FALSE,row.names=c("无","date","weather","temperature","wind"))%>%
                 t()
               weather<-weather[,c(-1,-2)]
               date<-c(Sys.Date(),Sys.Date()+1,Sys.Date()+2,Sys.Date()+3,Sys.Date()+4,Sys.Date()+5,Sys.Date()+6)
               weather<-data.frame(city,date,weather)
               return(weather)
             }
             # 定义爬取国内所有城市天气数据
             rbind.weather<-function(city.urls){
               for(city.url in city.urls){
                 if(!exists("weathers")){
                   weathers<-read.weather(city.url)
                 }else{
                   weather0<-read.weather(city.url)
                   weathers<-rbind(weathers,weather0)
                 }
               }
               return(weathers)
             }
             # Step 3: scrape the current forecast data.
             # NOTE(review): the original numbering jumps from step 1 to step 3.
             # Run the scraper over every configured city page.
             weathers<-rbind.weather(city.urls)
             # Optional CSV export, left disabled:
             #write.csv(weathers,"allcity_weathers_7days.csv",row.names=F)
# Persist the scraped forecasts in the database.
# Connect through the ODBC data source named "yyy".
library(RODBC)
# odbcDataSources()  # lists the available ODBC data sources
# NOTE(review): credentials are hard-coded in source; consider moving them to
# environment variables or to the DSN configuration.
channel <- odbcConnect("yyy", uid = "sa", pwd = "Passw0rd")
# odbcConnect() reports failure by returning -1 with a warning rather than
# raising an error, so verify that a real connection handle came back.
if (!inherits(channel, "RODBC")) {
  stop("Could not open ODBC connection to data source 'yyy'", call. = FALSE)
}
# da <- sqlQuery(channel, "select top 2 * from dbo.Persons")
#   (example query; the result comes back as a data.frame)
# Append the rows to table "weather". `finally` guarantees the connection is
# closed even when sqlSave() fails, which the plain sequence did not.
invisible(tryCatch(
  sqlSave(
    channel, weathers,
    tablename = "weather", append = TRUE,
    rownames = FALSE, colnames = FALSE, safer = TRUE
  ),
  finally = close(channel)
))

猜你喜欢

转载自blog.csdn.net/java_fresh_man/article/details/75642388