Processing continuous values
0. binarizer / binarization
from __future__ import print_function
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer

spark = SparkSession\
    .builder\
    .appName("BinarizerExample")\
    .getOrCreate()

continuousDataFrame = spark.createDataFrame([
    (0, 1.1),
    (1, 8.5),
    (2, 5.2)
], ["id", "feature"])

# Values greater than the threshold become 1.0, the rest become 0.0
binarizer = Binarizer(threshold=5.1, inputCol="feature", outputCol="binarized_feature")

binarizedDataFrame = binarizer.transform(continuousDataFrame)

print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

spark.stop()
result:
Binarizer output with Threshold = 5.100000
+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    1.1|              0.0|
|  1|    8.5|              1.0|
|  2|    5.2|              1.0|
+---+-------+-----------------+
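Binarizer maps values strictly greater than the threshold to 1.0 and everything else to 0.0. The following is only an illustrative sketch (not part of the original example, and it would have to run before the spark.stop() above) that reproduces the same rule with a plain column expression:

from pyspark.sql import functions as F

# 1.0 when feature > threshold, else 0.0 -- the same rule Binarizer applies
manual = continuousDataFrame.withColumn(
    "binarized_manual",
    F.when(F.col("feature") > 5.1, 1.0).otherwise(0.0))
manual.show()  # should match the Binarizer output above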
1. bucketizer / discretization by given boundaries
from __future__ import print_function
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession\
    .builder\
    .appName("BucketizerExample")\
    .getOrCreate()

# -float("inf") is negative infinity, float("inf") is positive infinity
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])

# Bucket the values according to the given split boundaries
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
bucketedData.show()

spark.stop()
result:
Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+
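The splits above cover the whole real line, so every value lands in some bucket. If the column contained NaN, the handleInvalid parameter (available in recent Spark versions) decides what happens. A minimal sketch, assuming such a case and run before spark.stop():

# "keep" puts NaN rows into an extra bucket; "skip" drops them;
# "error" (the default) raises an exception.
nanData = spark.createDataFrame([(float("nan"),), (0.2,)], ["features"])
nan_bucketizer = Bucketizer(splits=splits,
                            inputCol="features",
                            outputCol="bucketedFeatures",
                            handleInvalid="keep")
nan_bucketizer.transform(nanData).show()  # NaN goes to the extra bucket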
2. quantile_discretizer / discretization by quantiles
from __future__ import print_function
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("QuantileDiscretizerExample") \
    .getOrCreate()

data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2), (5, 9.2), (6, 14.4)]
df = spark.createDataFrame(data, ["id", "hour"])
# With so little data, use a single partition so the quantiles are computed exactly;
# a large dataset can stay on multiple partitions.
df = df.repartition(1)

# Discretize into 3 buckets by quantile
discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")

result = discretizer.fit(df).transform(df)
result.show()

spark.stop()
result:
+---+----+------+
| id|hour|result|
+---+----+------+
|  0|18.0|   2.0|
|  1|19.0|   2.0|
|  2| 8.0|   1.0|
|  3| 5.0|   0.0|
|  4| 2.2|   0.0|
|  5| 9.2|   1.0|
|  6|14.4|   2.0|
+---+----+------+
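fit() is what actually computes the quantile boundaries, and it returns a fitted bucketizer model whose splits can be reused on new data. A minimal sketch with made-up rows (run before spark.stop()):

model = discretizer.fit(df)      # learns the 3 quantile boundaries from df
newData = spark.createDataFrame([(7, 6.5), (8, 16.0)], ["id", "hour"])
model.transform(newData).show()  # the same boundaries are applied to unseen rows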
3. max_abs_scaler / amplitude scaling by the maximum absolute value
from __future__ import print_function
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("MaxAbsScalerExample") \
    .getOrCreate()

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -8.0]),),  # Vectors.dense builds a dense vector
    (1, Vectors.dense([2.0, 1.0, -4.0]),),
    (2, Vectors.dense([4.0, 10.0, 8.0]),)
], ["id", "features"])

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Compute the maximum absolute value of each column, which is used for scaling.
# fit and transform are written separately because the fitted model can also be
# used to transform a test set.
scalerModel = scaler.fit(dataFrame)

# Rescale each feature to the range [-1, 1]
scaledData = scalerModel.transform(dataFrame)
scaledData.select("features", "scaledFeatures").show()

spark.stop()
result:
+--------------+----------------+
|      features|  scaledFeatures|
+--------------+----------------+
|[1.0,0.1,-8.0]|[0.25,0.01,-1.0]|
|[2.0,1.0,-4.0]|  [0.5,0.1,-0.5]|
|[4.0,10.0,8.0]|   [1.0,1.0,1.0]|
+--------------+----------------+
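The comment in the code above already hints at why fit and transform are kept separate: the fitted scalerModel stores the per-column maximum absolute values (4.0, 10.0 and 8.0 here) and can reuse them on a test set. A minimal sketch with a made-up test row (run before spark.stop()):

testData = spark.createDataFrame([
    (3, Vectors.dense([2.0, -5.0, 4.0]),)   # hypothetical unseen row
], ["id", "features"])
# Each column is divided by the maximum absolute value learned from the training data
scalerModel.transform(testData).select("features", "scaledFeatures").show()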