Spark MLlib Home Price Data Analysis

The fields in each record are: mlsNum: Double (MLS listing number), city: String (city), sqFt: Double (square feet), bedrooms: Double (number of bedrooms), bathrooms: Double (number of bathrooms), garage: Double (garage), age: Double (age of the home), acres: Double (lot size in acres), and price: Double (home price).

Data description

4424109|Apple Valley|1634.0|2|2|2|33|0.04|119900.0
4404211|Rosemount|13837.0|4|6|4|17|14.46|3500000.0
4339082|Burnsville|9040.0|4|6|8|12|0.74|2690000.0
4362154|Lakeville|6114.0|7|5|12|25|14.83|1649000.0
4388419|Lakeville|6546.0|5|5|11|38|5.28|1575000.0
4188305|Rosemount|1246.0|4|1|2|143|56.28|1295000.0
4350149|Eagan|8699.0|5|6|7|28|2.62|1195000.0
4409729|Rosemount|6190.0|7|7|7|22|4.128|1195000.0
4408821|Lakeville|5032.0|5|5|3|9|1.1|1125000.0
4342395|Lakeville|4412.0|4|5|4|9|0.924|1100000.0
4361031|Lakeville|5451.0|5|5|2|22|23.83|975000.0
4424555|Apple Valley|8539.0|5|6|6|20|2.399|975000.0
4416412|Rosemount|4910.0|5|4|3|29|7.99|799000.0
4420237|Apple Valley|5000.0|4|4|3|14|0.77|796000.0
4392412|Eagan|7000.0|4|5|3|21|1.65|789900.0
4432729|Rosemount|6300.0|5|5|3|22|4.724|789000.0
4349895|Lakeville|5001.0|4|4|6|13|2.62|778500.0
4376726|Burnsville|5138.0|4|5|3|24|1.83|749900.0

Spark MLlib data analysis and feature construction

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.mllib.feature.{StandardScalerModel, StandardScaler}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LinearRegressionWithSGD, LabeledPoint}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkContext, SparkConf}

case class Home(mlsNum: Double, city: String, sqFt: Double, bedrooms: Double, bathrooms: Double,
                garage: Double, age: Double, acres: Double, price: Double)

object HomePriceRecommender extends Serializable {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("Home Price Recommender"))
    // val base = "hdfs:///user/root/homeprice.data"
    val base = "/Users/zhao-chj/develop/IdeaProjects/SparkTest/src/main/scala/com/test/spark/ASparkMllibBase/base21/moviesRec_1/SparkDataProcess/data/HomePrice/homeprice.data"
    val homeData = sc.textFile(base)

    val parsed = homeData.map(line => parse(line))

    // look at some statistics of the data
    val priceStats = Statistics.colStats(parsed.map(home => Vectors.dense(home.price)))
    println("Price mean: " + priceStats.mean)
    println("Price max: " + priceStats.max)
    println("Price min: " + priceStats.min)

    // filter out anomalous data
    val filtered = parsed.filter(home => home.price > 100000.0 && home.price < 400000.0 && home.sqFt > 1000.0)
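    // (the price window and square-footage floor are choices specific to this dataset)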

    // see how correlated price and square feet are
    val corr = Statistics.corr(filtered.map(home => home.price), filtered.map(home => home.sqFt))
    println("Price and square feet corr: " + corr)

    // convert to labeled data for MLlib
    val labelData = filtered.map { home =>
      LabeledPoint(home.price, Vectors.dense(home.age, home.bathrooms,
        home.bedrooms, home.garage, home.sqFt))
    }.cache()
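    // city (a categorical value), mlsNum (an identifier), and acres are not
    // included in the feature vector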

    // scale features to 0 mean and common variance
    val scaler = new StandardScaler(withMean = true, withStd = true).fit(labelData.map(x => x.features))

    println("Scaler mean: " + scaler.mean.toArray.mkString(","))
    // println("Scaler std: " + scaler.std.toArray.mkString(","))

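    // apply the fitted scaler to every feature vector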
    val scaledData = labelData.map { data =>
      LabeledPoint(data.label, scaler.transform(Vectors.dense(data.features.toArray)))
    }

    val numIterations = 1000
    val stepSize = 0.2
    // set up the linear regression model and ensure it finds the intercept
    val linearReg = new LinearRegressionWithSGD()
    linearReg.setIntercept(true)
    linearReg.optimizer
      .setNumIterations(numIterations)
      .setStepSize(stepSize)

    // run linear regression
    val model = linearReg.run(scaledData)

println("Model: " + model)

    // determine how well the model predicts the trained data's home prices
    val valuesAndPreds = scaledData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }

    val squaredErrors = valuesAndPreds.map {
      case (v, p) => math.pow(v - p, 2)
    }

    // Mean Square Error
    val MSE = squaredErrors.reduce((a, b) => a + b) / squaredErrors.count()

    println("Mean Square Error: " + MSE)

    // persist the model and scaler (here to the local filesystem; swap in an
    // hdfs:// path to store them on HDFS)
    sc.parallelize(Seq(model), 1).saveAsObjectFile("/Users/zhao-chj/develop/IdeaProjects/SparkTest/src/main/scala/com/test/spark/ASparkMllibBase/base21/moviesRec_1/SparkDataProcess/data/HomePrice/linReg.model")
    sc.parallelize(Seq(scaler), 1).saveAsObjectFile("/Users/zhao-chj/develop/IdeaProjects/SparkTest/src/main/scala/com/test/spark/ASparkMllibBase/base21/moviesRec_1/SparkDataProcess/data/HomePrice/scaler.model")
  }

  // parse a pipe-delimited home price record into the Home case class
  def parse(line: String) = {
    val split = line.split('|')
    val mlsNum = split(0).toDouble
    val city = split(1)
    val sqFt = split(2).toDouble
    val bedrooms = split(3).toDouble
    val bathrooms = split(4).toDouble
    val garage = split(5).toDouble
    val age = split(6).toDouble
    val acres = split(7).toDouble
    val price = split(8).toDouble
    Home(mlsNum, city, sqFt, bedrooms, bathrooms, garage, age, acres, price)
  }
}
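
To reuse the persisted artifacts, they can be read back with sc.objectFile and applied to a new listing. A minimal sketch, assuming the same SparkContext setup as above; the candidate feature values are made up for illustration, and the placeholder paths must match the ones passed to saveAsObjectFile:

import org.apache.spark.mllib.feature.StandardScalerModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LinearRegressionModel

// each object was saved as a single-element RDD, so first() recovers it
val savedModel = sc.objectFile[LinearRegressionModel]("<path>/linReg.model").first()
val savedScaler = sc.objectFile[StandardScalerModel]("<path>/scaler.model").first()

// feature order must match training: age, bathrooms, bedrooms, garage, sqFt
val candidate = Vectors.dense(15.0, 3.0, 4.0, 2.0, 2500.0)
println("Predicted price: " + savedModel.predict(savedScaler.transform(candidate)))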

Reposted from blog.csdn.net/qq_38483094/article/details/99750902