# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u012429555/article/details/84453750
# Install the required packages only when they are missing, so re-running the
# script does not trigger a fresh download/installation every time.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
library(Hmisc)
library(mice)

# Mean-imputed copy of the life-expectancy column, built with impute() and
# `mean` as the fill function. The result is stored in `data_1`; it is not
# referenced again in the visible part of this script.
data_1 <- impute(ma317projectdata_2$X2012SP.DYN.LE00.IN, mean)

# Fill the missing values in place: every NA in the life-expectancy column is
# replaced by the mean of the observed (non-NA) values.
# `TRUE` is used instead of `T` because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
ma317projectdata_2$X2012SP.DYN.LE00.IN[
  is.na(ma317projectdata_2$X2012SP.DYN.LE00.IN)
] <- mean(ma317projectdata_2$X2012SP.DYN.LE00.IN, na.rm = TRUE)

# Work on a plain data.frame from here on.
mydata <- as.data.frame(ma317projectdata_2)
# Relationship check between the variables.
# Note: cov() returns the covariance matrix, whose entries depend on the scale
# of each variable; cor(mydata) would give scale-free correlations instead.
cov(mydata)
# Output recorded by the original author:
# X2012SH.XPD.PCAP X2012SL.UEM.1524.ZS X2012GC.BAL.CASH.GD.ZS X2012SP.DYN.LE00.IN
# X2012SH.XPD.PCAP 3.006310e+06 40.538378 -9.2014918 7767.9251172
# X2012SL.UEM.1524.ZS 4.053838e+01 111.615457 -6.6161710 17.1944126
# X2012GC.BAL.CASH.GD.ZS -9.201492e+00 -6.616171 16.1245419 -0.6002212
# X2012SP.DYN.LE00.IN 7.767925e+03 17.194413 -0.6002212 74.9118626

# Build the model: regress life expectancy on every other column of `mydata`.
# The response is written as a bare column name so that it is looked up through
# the `data` argument; writing `mydata$...` in the formula would tie the
# response to the global object regardless of what is passed as `data`.
mydata_Linear_regression <- lm(X2012SP.DYN.LE00.IN ~ ., data = mydata)

# Model evaluation: print the fitted coefficients.
mydata_Linear_regression
# Call:
# lm(formula = X2012SP.DYN.LE00.IN ~ ., data = mydata)
#
# Coefficients:
# (Intercept) X2012SH.XPD.PCAP X2012SL.UEM.1524.ZS X2012GC.BAL.CASH.GD.ZS
# 64.914619 0.002582 0.154758 0.027749

# Full summary: residuals, coefficient table, R-squared and F-test.
summary(mydata_Linear_regression)
# Call:
# lm(formula = X2012SP.DYN.LE00.IN ~ ., data = mydata)
#
# Residuals:
# Min 1Q Median 3Q Max
# -24.074 -3.137 1.337 4.715 14.328
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 6.491e+01 9.806e-01 66.198 < 2e-16 ***
# X2012SH.XPD.PCAP 2.582e-03 2.662e-04 9.697 < 2e-16 ***
# X2012SL.UEM.1524.ZS 1.548e-01 4.424e-02 3.498 0.000556 ***
# X2012GC.BAL.CASH.GD.ZS 2.775e-02 1.164e-01 0.238 0.811755
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 7.27 on 245 degrees of freedom
# Multiple R-squared: 0.303, Adjusted R-squared: 0.2945
# F-statistic: 35.51 on 3 and 245 DF, p-value: < 2.2e-16
library(car)

# Scatter-plot matrix of every pair of columns in `mydata`.
# NOTE(review): the `spread` argument may not be accepted in this form by every
# release of car -- confirm against the installed version.
scatterplotMatrix(mydata, spread = FALSE)
# Figure 1: pairwise linear relationships between the variables

# Drawback: although it may find a good model, there is no guarantee that the
# model is the best one, because not every possible model has been evaluated.
# NOTE(review): the method this drawback refers to is not named in the script;
# presumably stepwise selection -- confirm.
#
# All-subsets regression: every possible model is examined.
# It is available through regsubsets() in the leaps package.
# See the regsubsets documentation for how to read the resulting plot.

# Install leaps only when it is missing, rather than on every run.
if (!requireNamespace("leaps", quietly = TRUE)) {
  install.packages("leaps")
}
library(leaps)

# Keep the 3 best models of each size (nbest = 3). The response is given as a
# bare column name so it is resolved through `data`, not the global object.
leaps <- regsubsets(X2012SP.DYN.LE00.IN ~ ., data = mydata, nbest = 3)

# Rank the candidate models by adjusted R-squared.
plot(leaps, scale = "adjr2")
# Figure 2: regsubsets plot
# Estimated coefficients of the fitted model.
coef(mydata_Linear_regression)
# coef(mydata_Linear_regression)
# (Intercept) X2012SH.XPD.PCAP X2012SL.UEM.1524.ZS X2012GC.BAL.CASH.GD.ZS
# 64.914619203 0.002581872 0.154757591 0.027748915

# Author's conclusion: with these methods we can clearly see that life
# expectancy is strongly related to X2012SL.UEM.1524.ZS and only weakly related
# to the other factors.
# NOTE(review): raw coefficient sizes depend on each variable's units; the
# summary() output recorded earlier in this script lists X2012SH.XPD.PCAP with
# the largest t value (9.697), so this conclusion should be double-checked.

# Regression diagnostics: confidence intervals for the coefficients
# (2.5 % / 97.5 % bounds, as shown in the recorded output).
confint(mydata_Linear_regression)
# confint(mydata_Linear_regression)
# 2.5 % 97.5 %
# (Intercept) 62.983122459 66.846115947
# X2012SH.XPD.PCAP 0.002057446 0.003106298
# X2012SL.UEM.1524.ZS 0.067624047 0.241891135
# X2012GC.BAL.CASH.GD.ZS -0.201497827 0.256995658

# Flag outliers on a Q-Q plot, labelling points by the row names of `mydata`.
# `TRUE` replaces `T`, which is an ordinary variable and can be reassigned.
# NOTE(review): `id.method` may not be accepted in this form by every release
# of car -- confirm against the installed version.
qqPlot(
  mydata_Linear_regression,
  labels = row.names(mydata),
  id.method = "identify",
  simulate = TRUE
)
# Figure 3: some data points can be discarded
# Prediction step: `test` is the data you want predictions for. It is not
# created anywhere in the visible part of this script, so it must exist in the
# session before this line runs.
p <- predict(object = mydata_Linear_regression, newdata = test)

# Export the `mydata` data frame to CSV.
# NOTE(review): the predictions in `p` are not written anywhere, and the target
# is an absolute path on one specific machine -- confirm this is intended.
write.csv(x = mydata, file = "C:/Users/wwq/Documents/data_pre.csv")

# Show the current working directory.
getwd()