pyspark mllib 编码、训练、评估流程示例

直接上代码,希望能帮到你

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import MinMaxScaler as MinMaxScalerSpark, VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from sklearn import metrics

假数据

# Toy dataset: ad-style records with a binary label column `y`.
# NOTE(review): `ss` is assumed to be an existing SparkSession — not shown here.
_columns = ['appkey', 'appname', 'brand', 'channel', 'hardware', 'adfrom', 'battery', 'y']
_rows = [
    ('123456', '小米', '1', '1', '你好啊', 10, 899, 1),
    ('2222', '小爱', '1', '1', 'hello', 8, 677, 1),
    ('1111', '小布', '1', '2', '啦啦啦', 6, 239, 0),
    ('1111', '小度', '2', '2', '我是小布', 1, 566, 0),
    ('2222', '小布', '2', '2', '我是小度', 3, 332, 0),
    ('23456', '小布', '2', '2', '我是小布', 9, 900, 0),
    ('123456', '小度', '3', '2', '小度小度', 10, 452, 0),
    ('10', '小爱', '3', '2', '小爱小爱', 8, 129, 1),
    ('10', '小爱', '3', '2', '小爱小爱', 6, 399, 1),
    ('1111', '小艺', '3', '1', '小艺小艺', 9, 299, 0),
    ('10', '小爱', '2', '1', '小爱小爱', 9, 501, 1),
    ('10', '小爱', '2', '1', '小爱小爱', 10, 692, 1),
]
df_train = ss.createDataFrame(_rows, _columns)
df_train.show()
+------+-------+-----+-------+--------+------+-------+---+
|appkey|appname|brand|channel|hardware|adfrom|battery|  y|
+------+-------+-----+-------+--------+------+-------+---+
|123456|   小米|    1|      1|  你好啊|    10|    899|  1|
|  2222|   小爱|    1|      1|   hello|     8|    677|  1|
|  1111|   小布|    1|      2|  啦啦啦|     6|    239|  0|
|  1111|   小度|    2|      2|我是小布|     1|    566|  0|
|  2222|   小布|    2|      2|我是小度|     3|    332|  0|
| 23456|   小布|    2|      2|我是小布|     9|    900|  0|
|123456|   小度|    3|      2|小度小度|    10|    452|  0|
|    10|   小爱|    3|      2|小爱小爱|     8|    129|  1|
|    10|   小爱|    3|      2|小爱小爱|     6|    399|  1|
|  1111|   小艺|    3|      1|小艺小艺|     9|    299|  0|
|    10|   小爱|    2|      1|小爱小爱|     9|    501|  1|
|    10|   小爱|    2|      1|小爱小爱|    10|    692|  1|
+------+-------+-----+-------+--------+------+-------+---+

编码

# 编码
enc_cols = ['appkey', 'appname', 'brand', 'channel', 'hardware']
enc_cols_ind = ['{}_ind'.format(s) for s in enc_cols]
enc_cols_val = ['{}_val'.format(s) for s in enc_cols]

# 一次性编码
stringindexer = StringIndexer(inputCols=enc_cols, outputCols=enc_cols_ind)
onehotenc = OneHotEncoder(inputCols=enc_cols_ind, outputCols=enc_cols_val).setHandleInvalid("keep")

pipeline = Pipeline(stages=[stringindexer, onehotenc])
pipeline_fit = pipeline.fit(df_train)

df_train = pipeline_fit.transform(df_train)

df_train.select(*enc_cols_val + ['adfrom', 'battery', 'y']).show()

# # 单独编码
# for i in range(len(enc_cols)):
#     indexeenc = StringIndexer(inputCol=enc_cols[i], outputCol=enc_cols_ind[i])
#     onehotenc = OneHotEncoder(inputCol=enc_cols_ind[i], outputCol=enc_cols_val[i], dropLast=False)
# 
#     pipeline = Pipeline(stages=[indexeenc, onehotenc])
#     pipeline_fit = pipeline.fit(df_train)
# 
#     df_train = pipeline_fit.transform(df_train)
# 
# df_train.select(*enc_cols_val + ['adfrom', 'battery', 'y']).show()
+-------------+-------------+-------------+-------------+-------------+------+-------+---+
|   appkey_val|  appname_val|    brand_val|  channel_val| hardware_val|adfrom|battery|  y|
+-------------+-------------+-------------+-------------+-------------+------+-------+---+
|(5,[2],[1.0])|(5,[3],[1.0])|(3,[2],[1.0])|(2,[1],[1.0])|(8,[3],[1.0])|    10|    899|  1|
|(5,[3],[1.0])|(5,[0],[1.0])|(3,[2],[1.0])|(2,[1],[1.0])|(8,[2],[1.0])|     8|    677|  1|
|(5,[1],[1.0])|(5,[1],[1.0])|(3,[2],[1.0])|(2,[0],[1.0])|(8,[4],[1.0])|     6|    239|  0|
|(5,[1],[1.0])|(5,[2],[1.0])|(3,[0],[1.0])|(2,[0],[1.0])|(8,[1],[1.0])|     1|    566|  0|
|(5,[3],[1.0])|(5,[1],[1.0])|(3,[0],[1.0])|(2,[0],[1.0])|(8,[7],[1.0])|     3|    332|  0|
|(5,[4],[1.0])|(5,[1],[1.0])|(3,[0],[1.0])|(2,[0],[1.0])|(8,[1],[1.0])|     9|    900|  0|
|(5,[2],[1.0])|(5,[2],[1.0])|(3,[1],[1.0])|(2,[0],[1.0])|(8,[5],[1.0])|    10|    452|  0|
|(5,[0],[1.0])|(5,[0],[1.0])|(3,[1],[1.0])|(2,[0],[1.0])|(8,[0],[1.0])|     8|    129|  1|
|(5,[0],[1.0])|(5,[0],[1.0])|(3,[1],[1.0])|(2,[0],[1.0])|(8,[0],[1.0])|     6|    399|  1|
|(5,[1],[1.0])|(5,[4],[1.0])|(3,[1],[1.0])|(2,[1],[1.0])|(8,[6],[1.0])|     9|    299|  0|
|(5,[0],[1.0])|(5,[0],[1.0])|(3,[0],[1.0])|(2,[1],[1.0])|(8,[0],[1.0])|     9|    501|  1|
|(5,[0],[1.0])|(5,[0],[1.0])|(3,[0],[1.0])|(2,[1],[1.0])|(8,[0],[1.0])|    10|    692|  1|
+-------------+-------------+-------------+-------------+-------------+------+-------+---+

训练

# Training: assemble the one-hot vectors plus numeric features into a single
# `features` vector, then fit a gradient-boosted tree classifier on label `y`.
x_columns = enc_cols_val + ['adfrom', 'battery']

assembler = VectorAssembler(inputCols=x_columns, outputCol='features',
                            handleInvalid="keep")
classifier = GBTClassifier(featuresCol="features", labelCol="y",
                           predictionCol="prediction")

model = Pipeline(stages=[assembler, classifier]).fit(df_train)

评估

# Evaluation on the training set (demo only — no held-out split here).
df_train_eval = model.transform(df_train)
# df_train_eval.select(*['features', 'rawPrediction', 'probability', 'prediction']).show(1, truncate=False)
df_train_eval.select(*['y', 'rawPrediction', 'probability', 'prediction']).show(truncate=False)

# Spark-side metrics: one evaluator per metric, driven by a (label, metric) table.
for label, metric_name in [('f1', 'f1'),
                           ('accuracy', 'accuracy'),
                           ('loss', 'logLoss'),
                           ('precision', 'weightedPrecision'),
                           ('recall', 'weightedRecall')]:
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='y',
                                                  metricName=metric_name)
    print(label + ': ', evaluator.evaluate(df_train_eval))

# sklearn-side metrics: pull the predictions to the driver as pandas first.
pdf = df_train_eval.toPandas()
y = list(pdf['y'])
y_pre = list(pdf['prediction'])
# probability is a 2-element vector [P(y=0), P(y=1)]; take the positive class.
y_predprob = [p[1] for p in list(pdf['probability'])]

print("精确率:", metrics.precision_score(y, y_pre))       # precision
print("召回率:", metrics.recall_score(y, y_pre))          # recall
print("准确率:", metrics.accuracy_score(y, y_pre))        # accuracy
print("F1分数:", metrics.f1_score(y, y_pre))              # F1
print("auc分数:", metrics.roc_auc_score(y, y_predprob))   # AUC
+---+----------------------------------------+-----------------------------------------+----------+
|y  |rawPrediction                           |probability                              |prediction|
+---+----------------------------------------+-----------------------------------------+----------+
|1  |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0       |
|1  |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0       |
|0  |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0       |
|0  |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0       |
|0  |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0       |
|0  |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0       |
|0  |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0       |
|1  |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0       |
|1  |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0       |
|0  |[1.5435020027249835,-1.5435020027249835]|[0.9563534785727067,0.043646521427293306]|0.0       |
|1  |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0       |
|1  |[-1.5435020027249835,1.5435020027249835]|[0.04364652142729318,0.9563534785727068] |1.0       |
+---+----------------------------------------+-----------------------------------------+----------+

f1:  1.0
accuracy:  1.0
loss:  0.04462768680949278
precision:  1.0
recall:  1.0

[1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1]
[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0]
[0.9563534785727068, 0.9563534785727068, 0.043646521427293306, 0.043646521427293306, 0.043646521427293306, 0.043646521427293306, 0.043646521427293306, 0.9563534785727068, 0.9563534785727068, 0.043646521427293306, 0.9563534785727068, 0.9563534785727068]
精确率: 1.0
召回率: 1.0
准确率: 1.0
F1分数: 1.0
auc分数: 1.0

保存

# Persist the fitted encoder pipeline (StringIndexer + OneHotEncoder).
pipeline_fit.write().overwrite().save('path1')
# Persist the fitted model pipeline (VectorAssembler + GBTClassifier).
model.write().overwrite().save('path2')

# Reload both pipelines. BUG FIX: PipelineModel.load() requires the save path
# as an argument — the original called it with no arguments, which raises.
encode = PipelineModel.load('path1')
model = PipelineModel.load('path2')
# BUG FIX: new data must be run through the encoder first so the *_val one-hot
# columns exist before the model's VectorAssembler tries to assemble them.
# NOTE(review): `data` is a placeholder DataFrame with the raw input schema.
pre = model.transform(encode.transform(data))
pre = pre.select(['features', 'y', 'probability', 'prediction'])

pyspark 编码,评估 transform时遇到新值的解决办法:
https://blog.csdn.net/qq_42363032/article/details/123295085?spm=1001.2014.3001.5501

pyspark mllib 二分类是以softmax作为输出的解释:
https://blog.csdn.net/qq_42363032/article/details/123331865?spm=1001.2014.3001.5501

猜你喜欢

转载自blog.csdn.net/qq_42363032/article/details/123346025
今日推荐