[Spark] SparkML Basics (Classification | Clustering Models)

3. Classification Models

3.1 Continuing the steps from Task 5, assume Type 1 is the label and label-encode it.

# encoding=utf-8

from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Task 6: SparkML basics: classification models
spark = SparkSession.builder.appName('pyspark').getOrCreate()
spark.sparkContext.addFile("https://cdn.coggle.club/Pokemon.csv")
path = "file://"+SparkFiles.get("Pokemon.csv")
df = spark.read.csv(path=path, header=True, inferSchema=True)
df = df.withColumnRenamed('Sp. Atk', 'SpAtk')
df = df.withColumnRenamed('Sp. Def', 'SpDef')
df = df.withColumnRenamed('Type 1', 'Type1')
df = df.withColumnRenamed('Type 2', 'Type2')
df = df.withColumn("Legendary", col("Legendary").cast('string'))
# df.show()

# Step 1: Continuing from Task 5, treat Type 1 as the label and label-encode it
indexer = StringIndexer(inputCol="Type1", outputCol="Type1_idx")
df = indexer.fit(df).transform(df)
# df.show()
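To sanity-check the encoding, the fitted StringIndexerModel can be inspected. A minimal sketch, not in the original post:

# Labels are ordered by descending frequency, so index 0.0 corresponds to
# the most common Type1 value.
indexer_model = indexer.fit(df)
print(indexer_model.labels)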

3.2 Import suitable evaluation metrics for the label and explain the reason for the choice.

# Step 2: Import suitable evaluation metrics for the label; why these choices?
# Accuracy, Precision, Recall: Type1 is a multi-class label with imbalanced
# classes, so accuracy alone can hide weak minority classes, while (weighted)
# precision and recall expose per-class behavior.
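A minimal sketch of instantiating these metrics with MulticlassClassificationEvaluator (all four metric names are built into Spark; the column names match those used in step 3 below):

metrics = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
evaluators = {m: MulticlassClassificationEvaluator(labelCol="Type1_idx",
                                                   predictionCol="prediction",
                                                   metricName=m)
              for m in metrics}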

3.3 Choose at least three classification methods and complete training.

# Step 3: Choose at least three classification methods and complete training
# encode categorical features
# in_cols = ["Name", "Type2", "Generation", "Legendary"]
# out_cols = ["Name_idx", "Type2_idx", "Generation_idx", "Legendary_idx"]
in_cols = ["Type2", "Generation", "Legendary"]
out_cols = ["Type2_idx", "Generation_idx", "Legendary_idx"]
indexer = StringIndexer(inputCols=in_cols, outputCols=out_cols, handleInvalid="skip")
df = indexer.fit(df).transform(df)
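# Note (optional, not in the original): StringIndexer yields ordinal indices,
# which tree models handle natively but which impose an artificial ordering on
# the categories. If that matters for a model, a one-hot step could be added:
# from pyspark.ml.feature import OneHotEncoder
# ohe = OneHotEncoder(inputCols=out_cols, outputCols=[c + "_ohe" for c in out_cols])
# df = ohe.fit(df).transform(df)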

# encode numerical features
columns_to_scale = ["Total", "HP", "Attack", "Defense", "SpAtk", "SpDef", "Speed"]
assemblers, scalers = list(), list()
for c in columns_to_scale:  # use "c", not "col", to avoid shadowing pyspark.sql.functions.col
    vec = VectorAssembler(inputCols=[c], outputCol=c + "_vec")
    assemblers.append(vec)
    sc = MinMaxScaler(inputCol=c + "_vec", outputCol=c + "_scl")
    scalers.append(sc)
pipeline = Pipeline(stages=assemblers + scalers)
df = pipeline.fit(df).transform(df)
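# Note (alternative sketch, an assumption rather than the original approach):
# the seven numeric columns could be assembled once and scaled in a single
# MinMaxScaler stage, at the cost of losing the per-column "_scl" names used below:
# num_vec = VectorAssembler(inputCols=columns_to_scale, outputCol="num_vec")
# num_scl = MinMaxScaler(inputCol="num_vec", outputCol="num_scl")
# df = Pipeline(stages=[num_vec, num_scl]).fit(df).transform(df)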

# encode all features into vectors
# cols = ["Name_idx", "Type2_idx", "Generation_idx", "Legendary_idx",
#         "Total_scl", "HP_scl", "Attack_scl", "Defense_scl", "SpAtk_scl", "SpDef_scl", "Speed_scl"]
cols = ["Type2_idx", "Generation_idx", "Legendary_idx",
        "Total_scl", "HP_scl", "Attack_scl", "Defense_scl", "SpAtk_scl", "SpDef_scl", "Speed_scl"]
assembler = VectorAssembler(inputCols=cols, outputCol="feature")
df = assembler.transform(df)
# df.show()

train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)
evaluator = MulticlassClassificationEvaluator(
    labelCol="Type1_idx",
    predictionCol="prediction",
    metricName="accuracy")

models = {
    "Decision Tree": DecisionTreeClassifier(labelCol="Type1_idx", featuresCol="feature", predictionCol="prediction"),
    "Random Forest": RandomForestClassifier(labelCol="Type1_idx", featuresCol="feature", predictionCol="prediction"),
    "Naive Bayes": NaiveBayes(labelCol="Type1_idx", featuresCol="feature", predictionCol="prediction"),
}

for name, cls in models.items():
    predictions = cls.fit(train).transform(test)
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy of %s is %.4f" % (name, accuracy))

4. Clustering Models

4.1 Continuing the steps from Task 5, assume Type 1 is the label and label-encode it.

# encoding=utf-8

from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Task 7: SparkML basics: clustering models
spark = SparkSession.builder.appName('pyspark').getOrCreate()
spark.sparkContext.addFile("https://cdn.coggle.club/Pokemon.csv")
path = "file://"+SparkFiles.get("Pokemon.csv")
df = spark.read.csv(path=path, header=True, inferSchema=True)
df = df.withColumnRenamed('Sp. Atk', 'SpAtk')
df = df.withColumnRenamed('Sp. Def', 'SpDef')
df = df.withColumnRenamed('Type 1', 'Type1')
df = df.withColumnRenamed('Type 2', 'Type2')
df = df.withColumn("Legendary", col("Legendary").cast('string'))

# Step 1: Continuing from Task 5, treat Type 1 as the label and label-encode it
indexer = StringIndexer(inputCol="Type1", outputCol="Type1_idx")
df = indexer.fit(df).transform(df)

4.2 Use k-means to cluster the Pokémon, and use the elbow method to choose an appropriate number of clusters.

# Step 2: Cluster the Pokémon with k-means; use the elbow method to choose a suitable number of clusters
# encode categorical features
in_cols = ["Type2", "Generation", "Legendary"]
out_cols = ["Type2_idx", "Generation_idx", "Legendary_idx"]
indexer = StringIndexer(inputCols=in_cols, outputCols=out_cols, handleInvalid="skip")
df = indexer.fit(df).transform(df)

# encode numerical features
columns_to_scale = ["Total", "HP", "Attack", "Defense", "SpAtk", "SpDef", "Speed"]
assemblers, scalers = list(), list()
for c in columns_to_scale:  # use "c", not "col", to avoid shadowing pyspark.sql.functions.col
    vec = VectorAssembler(inputCols=[c], outputCol=c + "_vec")
    assemblers.append(vec)
    sc = MinMaxScaler(inputCol=c + "_vec", outputCol=c + "_scl")
    scalers.append(sc)
pipeline = Pipeline(stages=assemblers + scalers)
df = pipeline.fit(df).transform(df)

# encode all features into vectors
cols = ["Type2_idx", "Generation_idx", "Legendary_idx",
        "Total_scl", "HP_scl", "Attack_scl", "Defense_scl", "SpAtk_scl", "SpDef_scl", "Speed_scl"]
assembler = VectorAssembler(inputCols=cols, outputCol="feature")
df = assembler.transform(df)
# df.show()

train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)
evaluator = MulticlassClassificationEvaluator(
    labelCol="Type1_idx",
    predictionCol="prediction",
    metricName="accuracy")

num_of_type1 = df.select("Type1").distinct().count()
for k in range(2, num_of_type1+1):
    cluster = KMeans(featuresCol="feature", predictionCol="prediction", k=k, seed=42)
    model = cluster.fit(train)
    prediction = model.transform(test)
    prediction = prediction.withColumn("prediction", prediction.prediction.cast(DoubleType()))
    cost = model.summary.trainingCost
    # Note: cluster ids are arbitrary, so "accuracy" against Type1_idx is only a
    # rough diagnostic; the elbow is read from where the cost curve flattens.
    accuracy = evaluator.evaluate(prediction)
    print("Accuracy of k=%d is %.4f, with cost %.4f" % (k, accuracy, cost))


Reprinted from: blog.csdn.net/qq_35812205/article/details/123907055