直接上代码,希望能帮到你
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import MinMaxScaler as MinMaxScalerSpark, OneHotEncoder, StringIndexer, VectorAssembler
from sklearn import metrics
假数据
# 假数据
# Toy training set: five categorical columns, two numeric features
# (adfrom, battery) and a binary label `y`.
# NOTE(review): `ss` is assumed to be an existing SparkSession — defined elsewhere.
train_rows = [
    ('123456', '小米', '1', '1', '你好啊', 10, 899, 1),
    ('2222', '小爱', '1', '1', 'hello', 8, 677, 1),
    ('1111', '小布', '1', '2', '啦啦啦', 6, 239, 0),
    ('1111', '小度', '2', '2', '我是小布', 1, 566, 0),
    ('2222', '小布', '2', '2', '我是小度', 3, 332, 0),
    ('23456', '小布', '2', '2', '我是小布', 9, 900, 0),
    ('123456', '小度', '3', '2', '小度小度', 10, 452, 0),
    ('10', '小爱', '3', '2', '小爱小爱', 8, 129, 1),
    ('10', '小爱', '3', '2', '小爱小爱', 6, 399, 1),
    ('1111', '小艺', '3', '1', '小艺小艺', 9, 299, 0),
    ('10', '小爱', '2', '1', '小爱小爱', 9, 501, 1),
    ('10', '小爱', '2', '1', '小爱小爱', 10, 692, 1),
]
train_columns = ['appkey', 'appname', 'brand', 'channel', 'hardware', 'adfrom', 'battery', 'y']
df_train = ss.createDataFrame(train_rows, train_columns)
df_train.show()
+------+-------+-----+-------+--------+------+-------+---+
|appkey|appname|brand|channel|hardware|adfrom|battery| y|
+------+-------+-----+-------+--------+------+-------+---+
|123456| 小米| 1| 1| 你好啊| 10| 899| 1|
| 2222| 小爱| 1| 1| hello| 8| 677| 1|
| 1111| 小布| 1| 2| 啦啦啦| 6| 239| 0|
| 1111| 小度| 2| 2|我是小布| 1| 566| 0|
| 2222| 小布| 2| 2|我是小度| 3| 332| 0|
| 23456| 小布| 2| 2|我是小布| 9| 900| 0|
|123456| 小度| 3| 2|小度小度| 10| 452| 0|
| 10| 小爱| 3| 2|小爱小爱| 8| 129| 1|
| 10| 小爱| 3| 2|小爱小爱| 6| 399| 1|
| 1111| 小艺| 3| 1|小艺小艺| 9| 299| 0|
| 10| 小爱| 2| 1|小爱小爱| 9| 501| 1|
| 10| 小爱| 2| 1|小爱小爱| 10| 692| 1|
+------+-------+-----+-------+--------+------+-------+---+
编码
# One-hot encode the categorical columns.
enc_cols = ['appkey', 'appname', 'brand', 'channel', 'hardware']
enc_cols_ind = ['{}_ind'.format(s) for s in enc_cols]   # StringIndexer outputs
enc_cols_val = ['{}_val'.format(s) for s in enc_cols]   # OneHotEncoder outputs
# Encode every column in a single pass.
# handleInvalid="keep" on BOTH stages: without it on StringIndexer, calling
# transform on new data that contains an unseen category value raises an
# error (this is the "new value at transform time" problem the reference
# link at the bottom of this post discusses).  With "keep", unseen values
# are mapped to an extra bucket; the indices of values seen during fit are
# unchanged, so the training output below is identical.
stringindexer = StringIndexer(inputCols=enc_cols, outputCols=enc_cols_ind).setHandleInvalid("keep")
onehotenc = OneHotEncoder(inputCols=enc_cols_ind, outputCols=enc_cols_val).setHandleInvalid("keep")
pipeline = Pipeline(stages=[stringindexer, onehotenc])
pipeline_fit = pipeline.fit(df_train)
df_train = pipeline_fit.transform(df_train)
df_train.select(*enc_cols_val + ['adfrom', 'battery', 'y']).show()
+-------------+-------------+-------------+-------------+-------------+------+-------+---+
| appkey_val| appname_val| brand_val| channel_val| hardware_val|adfrom|battery| y|
+-------------+-------------+-------------+-------------+-------------+------+-------+---+
|(5,[2],[1.0])|(5,[3],[1.0])|(3,[2],[1.0])|(2,[1],[1.0])|(8,[3],[1.0])| 10| 899| 1|
|(5,[3],[1.0])|(5,[0],[1.0])|(3,[2],[1.0])|(2,[1],[1.0])|(8,[2],[1.0])| 8| 677| 1|
|(5,[1],[1.0])|(5,[1],[1.0])|(3,[2],[1.0])|(2,[0],[1.0])|(8,[4],[1.0])| 6| 239| 0|
|(5,[1],[1.0])|(5,[2],[1.0])|(3,[0],[1.0])|(2,[0],[1.0])|(8,[1],[1.0])| 1| 566| 0|
|(5,[3],[1.0])|(5,[1],[1.0])|(3,[0],[1.0])|(2,[0],[1.0])|(8,[7],[1.0])| 3| 332| 0|
|(5,[4],[1.0])|(5,[1],[1.0])|(3,[0],[1.0])|(2,[0],[1.0])|(8,[1],[1.0])| 9| 900| 0|
|(5,[2],[1.0])|(5,[2],[1.0])|(3,[1],[1.0])|(2,[0],[1.0])|(8,[5],[1.0])| 10| 452| 0|
|(5,[0],[1.0])|(5,[0],[1.0])|(3,[1],[1.0])|(2,[0],[1.0])|(8,[0],[1.0])| 8| 129| 1|
|(5,[0],[1.0])|(5,[0],[1.0])|(3,[1],[1.0])|(2,[0],[1.0])|(8,[0],[1.0])| 6| 399| 1|
|(5,[1],[1.0])|(5,[4],[1.0])|(3,[1],[1.0])|(2,[1],[1.0])|(8,[6],[1.0])| 9| 299| 0|
|(5,[0],[1.0])|(5,[0],[1.0])|(3,[0],[1.0])|(2,[1],[1.0])|(8,[0],[1.0])| 9| 501| 1|
|(5,[0],[1.0])|(5,[0],[1.0])|(3,[0],[1.0])|(2,[1],[1.0])|(8,[0],[1.0])| 10| 692| 1|
+-------------+-------------+-------------+-------------+-------------+------+-------+---+
训练
# Training: assemble the encoded + numeric columns into a single `features`
# vector, then fit a gradient-boosted-tree classifier on label `y`.
x_columns = enc_cols_val + ['adfrom', 'battery']
vacal = VectorAssembler(inputCols=x_columns, outputCol='features', handleInvalid='keep')
gbdt = GBTClassifier(featuresCol='features', labelCol='y', predictionCol='prediction')
model = Pipeline(stages=[vacal, gbdt]).fit(df_train)
评估
# Evaluation (on the training set itself — these scores measure fit, not
# generalization).
df_train_eval = model.transform(df_train)
# df_train_eval.select(*['features', 'rawPrediction', 'probability', 'prediction']).show(1, truncate=False)
df_train_eval.select(*['y', 'rawPrediction', 'probability', 'prediction']).show(truncate=False)


def _spark_metric(metric_name):
    # Evaluate one Spark multiclass metric on df_train_eval.  Hoisted into a
    # helper because the original repeated the evaluator construction five
    # times with only `metricName` changing.
    return MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='y', metricName=metric_name
    ).evaluate(df_train_eval)


# Spark-side evaluation.
f1_score = _spark_metric('f1')
acc_score = _spark_metric('accuracy')
loss = _spark_metric('logLoss')  # logLoss reads the default 'probability' column
precision = _spark_metric('weightedPrecision')
recall = _spark_metric('weightedRecall')
print('f1: ', f1_score)
print('accuracy: ', acc_score)
print('loss: ', loss)
print('precision: ', precision)
print('recall: ', recall)

# sklearn-side evaluation on the collected predictions.
df_train_eval = df_train_eval.toPandas()
y = list(df_train_eval['y'])
y_pre = list(df_train_eval['prediction'])
y_predprob = [x[1] for x in list(df_train_eval['probability'])]  # P(y=1) per row
precision_score = metrics.precision_score(y, y_pre)
recall_score = metrics.recall_score(y, y_pre)
accuracy_score = metrics.accuracy_score(y, y_pre)
f1_score = metrics.f1_score(y, y_pre)  # NOTE: overwrites the Spark f1 above, as the original did
auc_score = metrics.roc_auc_score(y, y_predprob)  # AUC needs probabilities, not labels
print("精确率:", precision_score)
print("召回率:", recall_score)
print("准确率:", accuracy_score)
print("F1分数:", f1_score)
print("auc分数:", auc_score)
+---+----------------------------------------+-----------------------------------------+----------+
|y |rawPrediction |probability |prediction|
+---+----------------------------------------+-----------------------------------------+----------+
|1 |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0 |
|1 |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0 |
|0 |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0 |
|0 |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0 |
|0 |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0 |
|0 |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0 |
|0 |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0 |
|1 |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0 |
|1 |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0 |
|0 |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0 |
|1 |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0 |
|1 |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0 |
+---+----------------------------------------+-----------------------------------------+----------+
f1: 1.0
accuracy: 1.0
loss: 0.04462768680949278
precision: 1.0
recall: 1.0
[1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1]
[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0]
[0.9563534785727068, 0.9563534785727068, 0.043646521427293306, 0.043646521427293306, 0.043646521427293306, 0.043646521427293306, 0.043646521427293306, 0.9563534785727068, 0.9563534785727068, 0.043646521427293306, 0.9563534785727068, 0.9563534785727068]
精确率: 1.0
召回率: 1.0
准确率: 1.0
F1分数: 1.0
auc分数: 1.0
保存
# Persist the fitted encoder pipeline.
pipeline_fit.write().overwrite().save('path1')
# Persist the trained model pipeline (assembler + GBT).
model.write().overwrite().save('path2')
# Reload.  BUG FIX: PipelineModel.load() requires the save path — the
# original called it with no argument, which raises a TypeError.
encode = PipelineModel.load('path1')
model = PipelineModel.load('path2')
# Score new data.  `data` is raw input with the same schema as the original
# df_train (defined elsewhere — TODO confirm).  It must be run through the
# encoder pipeline first so the *_val columns exist for the assembler;
# the original loaded `encode` but never applied it.
pre = model.transform(encode.transform(data))
pre = pre.select(['features', 'y', 'probability', 'prediction'])
pyspark 编码,评估 transform时遇到新值的解决办法:
https://blog.csdn.net/qq_42363032/article/details/123295085?spm=1001.2014.3001.5501
pyspark mllib 二分类是以softmax作为输出的解释:
https://blog.csdn.net/qq_42363032/article/details/123331865?spm=1001.2014.3001.5501