版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Katherine_hsr/article/details/80988994
本文记录了用pyspark构建一个简单的模型的过程。
1. 读取数据集
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark import SparkContext, SparkConf
# Application settings shared by the SparkContext and the SparkSession.
conf = SparkConf().setAppName("Spark_mllearn_example").setMaster("local")
# Build the session directly from `conf` so the master/app name are declared
# only once; the previous empty `.config("", "")` placeholder carried no
# setting and has been dropped.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for code that relies on it; it is the context owned by
# the session rather than a separately constructed one.
sc = spark.sparkContext
# Directory that holds the example data set.
dpath = '/Users/huoshirui/Desktop/Spark/'
# header=True: the first CSV row supplies the column names. No schema
# inference is requested here; the numeric columns are cast explicitly later.
df = spark.read.csv(dpath + 'spark_mllearn_test.csv', header=True)
数据集如下图:
2. 将数据集转换成可以用于模型使用的features/label的形式
# Cast columns c2 .. c16 to double, one withColumn call per column, in the
# same order as before (c2 first, c16 last).
for numeric_col in ['c%d' % idx for idx in range(2, 17)]:
    df = df.withColumn(numeric_col, df[numeric_col].cast('double'))
# Feature columns: c2 .. c16 except c4, which is used as the label source.
input_col = ['c%d' % idx for idx in range(2, 17) if idx != 4]
# Index c4 into a numeric "label" column.
# NOTE(review): StringIndexer orders indices by descending frequency by
# default, so label 0.0 is the most frequent c4 value -- confirm this matches
# the intended positive/negative class.
stringIndexer = StringIndexer(inputCol="c4", outputCol="label")
# Pack the feature columns into a single vector column named "features".
vecAssembler = VectorAssembler(inputCols=input_col, outputCol="features")
# Run both stages as one pipeline: assemble features, then index the label.
pipeline = Pipeline(stages=[vecAssembler, stringIndexer])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
以下为转换后的数据集的结果:
3. 划分训练集和测试集
这里将数据中的70%作为训练集,剩下的30%作为测试集
# 70/30 random split; the fixed seed keeps the split repeatable.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=123)
# Report the size of each partition (each count triggers a Spark job).
train_rows = trainingData.count()
print("Training Dataset Count: " + str(train_rows))
test_rows = testData.count()
print("Test Dataset Count: " + str(test_rows))
4. 模型训练
- 首先使用LogisticRegression模型训练
# Train a logistic regression model on the training partition.
# featuresCol/labelCol are spelled out with their default values so the
# dependency on the "features" and "label" columns is visible here.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)
lrModel = lr.fit(trainingData)
用训练好的模型在测试集上预测
# Score the held-out test partition with the fitted model.
prediction = lrModel.transform(testData)
# Area under the ROC curve; the arguments below are the evaluator's defaults,
# written out for readability.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator.evaluate(prediction)
使用十折交叉验证
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Hyper-parameter grid for cross-validation: 3 x 3 = 9 candidate settings.
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.1, 0.3, 0.5])  # regularization parameter
        .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])  # elastic-net mix (ridge = 0)
        .build())
# One evaluator serves both model selection inside the cross-validator and
# the final test-set score, so it is created once.
evaluator = BinaryClassificationEvaluator()
# 10-fold cross-validation. The seed fixes the fold assignment so repeated
# runs pick the same best model; 123 matches the seed of the train/test split.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=10,
                    seed=123)
cvModel = cv.fit(trainingData)
# cvModel applies the best model found during the search.
predictions = cvModel.transform(testData)
# Evaluate the best model on the held-out test partition.
evaluator.evaluate(predictions)
- 使用随机森林训练
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Build the random forest estimator. numTrees, maxDepth and maxBins set here
# are the starting values; the grid below overrides them during the search.
rf = RandomForestClassifier(numTrees=3, maxDepth=10, maxBins=30, labelCol="label", seed=123)
# Hyper-parameter grid: 3 x 4 x 3 = 36 candidate settings.
grid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [1, 3, 5])
        .addGrid(rf.maxDepth, [3, 5, 7, 10])
        .addGrid(rf.maxBins, [20, 30, 40])
        .build())
evaluator = BinaryClassificationEvaluator()
# 10-fold cross-validation. The forest itself is seeded above; seeding the
# cross-validator as well makes the fold assignment, and therefore the
# selected model, reproducible between runs.
cv = CrossValidator(estimator=rf,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=10,
                    seed=123)
cvModel_rf = cv.fit(trainingData)
# Predict on the test partition with the best model and compute the ROC score.
predictions = cvModel_rf.transform(testData)
evaluator.evaluate(predictions)