1. Spark version of LR
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
# NOTE: `spark` is not created anywhere in this snippet; it is assumed to be an
# existing SparkSession (e.g. the one a PySpark shell/notebook provides) — confirm.
# NOTE(review): newer Spark releases appear to use the key
# "spark.sql.execution.arrow.pyspark.enabled" instead — verify for your version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
import sklearn.datasets as datasets
import numpy as np
import time
from sklearn.linear_model import LogisticRegression as LR
def normalize(x):
    """Min-max scale ``x`` into the range [0, 1].

    The minimum and maximum are taken over *all* elements of ``x`` (no axis
    argument), so a 2-D feature matrix is scaled by a single global range
    rather than column by column.

    Args:
        x: Array-like of numbers (a NumPy array or nested lists).

    Returns:
        A NumPy array with the same shape as ``x``. If every element of
        ``x`` is equal, an all-zero float array is returned instead of
        dividing by zero.
    """
    x = np.asarray(x)
    lo = np.min(x)
    span = np.max(x) - lo
    if span == 0:
        # Constant input: the range is empty, so avoid 0/0 -> NaN.
        return np.zeros_like(x, dtype=float)
    return (x - lo) / span
# Input data: synthetic blobs with 10 centers and 10 features.
X, y = datasets.make_blobs(
    n_samples=1000000, centers=10, n_features=10, random_state=0
)
# Normalize (min-max over the whole matrix, see normalize()).
X_norm = normalize(X)
# The first 80% of the rows are used for training, the rest for testing.
split = int(len(X_norm) * 0.8)
X_train, X_test = X_norm[:split], X_norm[split:]
y_train, y_test = y[:split], y[split:]
# Column vector so the labels can be stacked in front of the features.
y_train = y_train.reshape(-1, 1)
# Spark DataFrames: training rows are (label, features), test rows are (features,).
df = np.concatenate([y_train, X_train], axis=1)
train_df = ((int(rec[0]), Vectors.dense(rec[1:])) for rec in df)
spark_train = spark.createDataFrame(train_df, schema=["label", "features"])
test_df = ((Vectors.dense(rec),) for rec in X_test)
spark_test = spark.createDataFrame(test_df, schema=["features"])
Spark LR:
# Train the Spark logistic-regression model and time the whole run
# (fit + predict + accuracy computation).
st = time.time()
lr = LogisticRegression()  # defaults; maxIter=10, regParam=0.001 could be set here
pipeline = Pipeline(stages=[lr])
model = pipeline.fit(spark_train)
prediction = model.transform(spark_test)
# Compute accuracy on the driver.
selected = prediction.select("prediction")
# NOTE(review): this relies on collect() returning rows in the same order as
# X_test / y_test — confirm for your Spark setup.
count = 0
for i, row in enumerate(selected.collect()):
    # Read the predicted class explicitly instead of comparing the whole Row
    # object with the label.
    pred = row["prediction"]
    if pred == y_test[i]:
        count += 1
print("acc:{:.4f}".format(count/len(y_test)))
et = time.time()
print("used:{:.4f}".format(et-st))
Operation result:
About 30 seconds. Because this only runs Spark on a single machine, it is effectively local (standalone) mode; on a distributed cluster it may be faster.
acc:1.0000
used:30.2634
sklearn LR:
# Start the wall-clock timer for the scikit-learn run (reuses the name `st`).
st = time.time()
def accuracy(pred, true):
    """Return the fraction of positions where ``pred`` equals ``true``.

    Args:
        pred: Indexable sequence of predicted labels.
        true: Indexable sequence of reference labels; it is indexed at every
            position of ``pred``.

    Returns:
        ``matches / len(pred)`` as a float, or ``0.0`` when ``pred`` is empty
        (instead of dividing by zero).

    Raises:
        IndexError: If ``true`` is shorter than ``pred``.
    """
    if len(pred) == 0:
        return 0.0
    # Index into `true` (rather than zip) so a too-short `true` still fails loudly.
    hits = sum(1 for i, p in enumerate(pred) if p == true[i])
    return hits / len(pred)
# Model 2: scikit-learn logistic regression with default settings.
clf_lr = LR()
# y_train was reshaped to a column vector for the Spark DataFrame; flatten it
# back to 1-D labels before fitting.
clf_lr.fit(X_train, y_train.ravel())
y_pred2 = clf_lr.predict(X_test)
print("acc2", accuracy(y_pred2, y_test))
# The elapsed time covers fit + predict + accuracy computation.
et = time.time()
print("used:{:.4f}".format(et-st))
operation result:
acc2 1.0
used:58.3716
When the dataset is small, Spark takes longer because of its communication overhead; only after the data was increased to 1 million samples did Spark's advantage begin to show (about 30 s versus about 58 s for sklearn in the runs above).
The conclusion is that when the amount of data is large enough, the model trained under the Spark framework runs faster than the ordinary single-machine implementation.
reference: