Spark MLlib 下的逻辑回归二元分类
训练模型
导入必要的包
import numpy as np
import pyspark
from matplotlib import pyplot as plt
from pyspark. mllib. classification import LogisticRegressionWithSGD
from pyspark. mllib. feature import StandardScaler
from pyspark. mllib. regression import LabeledPoint
from pyspark. mllib. evaluation import BinaryClassificationMetrics
演示回归曲线
x = np. linspace( 0 , 1000 , 100 )
y_t = x* 4 + 5
y_r = y_t + np. random. randint( - 1000 , 1000 , 100 )
plt. plot( x, y_t, ls= "-" , c= "r" )
plt. scatter( x, y_r)
初始化spark的上下文对象
sc = pyspark. SparkContext( master= "local[*]" , appName= "StumbleuponAnalysis" )
准备为数据
def extract_features ( fields, categories_dict, end) :
category_id = categories_dict[ fields[ 3 ] ]
category_features = np. zeros( len ( categories_dict) )
category_features[ category_id] = 1
numerical_features = [ 0.0 if f== "?" else float ( f) for f in fields[ 4 : end] ]
return np. concatenate( ( category_features, numerical_features) )
def parpare_data ( sc, scale) :
raw_lines_and_header = sc. textFile( "file:/home/zh123/.jupyter/workspace/stumbleupon/train.tsv" )
header_line = raw_lines_and_header. first( )
raw_non_header_data = raw_lines_and_header. filter ( lambda l: l!= header_line)
raw_non_quot_lines = raw_non_header_data. map ( lambda l: l. replace( "\"" , "" ) )
raw_data = raw_non_quot_lines. map ( lambda l: l. split( "\t" ) )
print ( "数据长度:" , raw_data. count( ) )
categories_dict = raw_data. map ( lambda field: field[ 3 ] ) . distinct( ) . zipWithIndex( ) . collectAsMap( )
label_rdd = raw_data. map ( lambda fields: float ( fields[ - 1 ] ) )
feature_rdd = raw_data. map ( lambda fields: extract_features( fields, categories_dict, len ( fields) - 1 ) )
std_scaler = StandardScaler( withMean= True , withStd= True ) . fit( feature_rdd)
scaler_features = std_scaler. transform( feature_rdd)
label_point = label_rdd. zip ( scaler_features)
label_point_rdd = label_point. map ( lambda r: LabeledPoint( r[ 0 ] , r[ 1 ] ) )
return label_point_rdd. randomSplit( scale) , categories_dict
模型评估
定义评估模型AUC值的函数
def evaluate_model ( model, validation_data) :
predict = model. predict( validation_data. map ( lambda p: p. features) ) . map ( lambda x: float ( x) )
predict_and_label = predict. zip ( validation_data. map ( lambda p: p. label) )
metrics = BinaryClassificationMetrics( predict_and_label)
return metrics. areaUnderROC
定义综合模型评估函数
import time
def train_evaluate_model ( train_data, validation_data, numIterations, stepSize, miniBatchFraction) :
start_time = time. time( )
model = LogisticRegressionWithSGD. train( train_data, numIterations, stepSize, miniBatchFraction)
duration = time. time( ) - start_time
AUC = evaluate_model( model, validation_data)
return ( model, AUC, duration, numIterations, stepSize, miniBatchFraction)
定义评估参数的函数
import pandas as pd
def evaluate_parameter ( train_data, validation_data, numIterationsList, stepSizeList, miniBatchFractionList) :
metrics = [ ]
columns = [ "Model" , "AUC" , "Duration" , "numIterations" , "stepSize" , "miniBatchFraction" ]
for numIterations in numIterationsList:
for stepSize in stepSizeList:
for miniBatchFraction in miniBatchFractionList:
metrics. append( train_evaluate_model( train_data, validation_data, numIterations, stepSize, miniBatchFraction) )
if ( len ( numIterationsList) > 1 ) :
return pd. DataFrame( metrics, index= numIterationsList, columns= columns)
elif ( len ( stepSizeList) > 1 ) :
return pd. DataFrame( metrics, index= stepSizeList, columns= columns)
elif ( len ( miniBatchFractionList) > 1 ) :
return pd. DataFrame( metrics, index= miniBatchFractionList, columns= columns)
else :
return pd. DataFrame( metrics, index= [ 0 ] , columns= columns)
获取训练数据,验证数据,测试数据
( ( train_data, validation_data, test_data) , categories_dict) = parpare_data( sc, scale= [ 8 , 1 , 1 ] )
train_data. persist( )
validation_data. persist( )
test_data. persist( )
数据长度: 7395
PythonRDD[4739] at RDD at PythonRDD.scala:52
评估 numIterations参数影响
训练模型并获取评估参数表
evaluate_table = evaluate_parameter( train_data, validation_data, [ i for i in range ( 1 , 50 , 5 ) ] , [ 10 ] , [ 1 ] )
evaluate_table
Model
AUC
Duration
numIterations
stepSize
miniBatchFraction
1
(weights=[0.6677226910837364,-0.69951944405741...
0.664205
0.542155
1
10
1
6
(weights=[0.28810190368216665,-0.3890579409906...
0.603375
0.149749
6
10
1
11
(weights=[0.2982103093226861,-0.30009276222335...
0.637453
0.186136
11
10
1
16
(weights=[0.2590246366263148,-0.27478234116180...
0.690569
0.213902
16
10
1
21
(weights=[0.25133027462275814,-0.2542369719546...
0.696628
0.267709
21
10
1
26
(weights=[0.24840617513903634,-0.2527605271207...
0.697719
0.317076
26
10
1
31
(weights=[0.2480626698782132,-0.25281749529624...
0.693588
0.355656
31
10
1
36
(weights=[0.24788753296317756,-0.2530393653347...
0.693588
0.488446
36
10
1
41
(weights=[0.24788753296317756,-0.2530393653347...
0.693588
0.362525
41
10
1
46
(weights=[0.24788753296317756,-0.2530393653347...
0.693588
0.378403
46
10
1
根据评估参数表绘制图像
fig = plt. figure( )
ax = fig. add_subplot( 111 )
ax. bar( evaluate_table. index, evaluate_table[ "AUC" ] , color= "c" , tick_label= evaluate_table. index, label= "AUC" , width= 4 )
ax. set_ylim( 0.6 , 0.7 )
ax2 = ax. twinx( )
ax2. plot( evaluate_table. index, evaluate_table[ "Duration" ] , c= "r" , label= "Duration" , marker= "o" )
ax. grid( )
fig. legend( loc= 1 , bbox_to_anchor= ( 1 , 1 ) , bbox_transform= ax. transAxes)
评估 stepSize 参数的影响
训练模型并获取评估参数表
evaluate_table = evaluate_parameter( train_data, validation_data, [ 26 ] , [ i for i in range ( 10 , 200 , 15 ) ] , [ 1 ] )
evaluate_table
Model
AUC
Duration
numIterations
stepSize
miniBatchFraction
10
(weights=[0.24840617513903634,-0.2527605271207...
0.697719
0.306683
26
10
1
25
(weights=[0.40103746760777653,-0.4924966686183...
0.591412
0.305612
26
25
1
40
(weights=[0.5409425093445586,-0.77344879343874...
0.564893
0.311465
26
40
1
55
(weights=[0.6844234097438462,-1.09699570420703...
0.559457
0.418840
26
55
1
70
(weights=[0.8379207450635585,-1.43000712772985...
0.557723
0.299107
26
70
1
85
(weights=[1.0323510305921046,-1.76105166506314...
0.571635
0.288278
26
85
1
100
(weights=[1.313234120315815,-2.091223074965485...
0.590554
0.304034
26
100
1
115
(weights=[1.5106494358271485,-2.37554034126727...
0.590554
0.288630
26
115
1
130
(weights=[1.6808460801490464,-2.64560901166279...
0.586638
0.323949
26
130
1
145
(weights=[1.846760000240688,-2.914826089181457...
0.585547
0.307586
26
145
1
160
(weights=[2.0073226982616266,-3.18046915476317...
0.581202
0.305315
26
160
1
175
(weights=[2.1580796544605683,-3.43464112632351...
0.570992
0.295500
26
175
1
190
(weights=[2.295776697917227,-3.674935300385708...
0.565770
0.337451
26
190
1
根据评估参数表绘制图像
fig = plt. figure( )
ax = fig. add_subplot( 111 )
ax. bar( evaluate_table. index, evaluate_table[ "AUC" ] , color= "c" , tick_label= evaluate_table. index, label= "AUC" , width= 6 )
ax. set_ylim( 0.6 , 0.7 )
ax2 = ax. twinx( )
ax2. plot( evaluate_table. index, evaluate_table[ "Duration" ] , c= "r" , label= "Duration" , marker= "o" )
fig. legend( loc= 1 , bbox_to_anchor= ( 1 , 1 ) , bbox_transform= ax. transAxes)
评估miniBatchFraction 参数影响
训练模型并获取评估参数表
evaluate_table = evaluate_parameter( train_data, validation_data, [ 26 ] , [ 10 ] , np. linspace( 0.1 , 1 , 5 ) )
evaluate_table
Model
AUC
Duration
numIterations
stepSize
miniBatchFraction
0.100
(weights=[0.22432239986157868,-0.2165393087222...
0.682073
0.293671
26
10
0.100
0.325
(weights=[0.25329319340814027,-0.2708727029103...
0.702727
0.273905
26
10
0.325
0.550
(weights=[0.24474754141432709,-0.2484500877818...
0.693803
0.276777
26
10
0.550
0.775
(weights=[0.25171480871609914,-0.2515106513891...
0.702064
0.292244
26
10
0.775
1.000
(weights=[0.24840617513903634,-0.2527605271207...
0.697719
0.280513
26
10
1.000
根据评估参数表绘制图像
fig = plt. figure( )
ax = fig. add_subplot( 111 )
ax. bar( evaluate_table. index, evaluate_table[ "AUC" ] , color= "c" , tick_label= evaluate_table. index, label= "AUC" , width= 0.1 )
ax. set_ylim( 0.6 , 0.75 )
ax2 = ax. twinx( )
ax2. plot( evaluate_table. index, evaluate_table[ "Duration" ] , c= "r" , label= "Duration" , marker= "o" )
fig. legend( loc= 1 , bbox_to_anchor= ( 1 , 1 ) , bbox_transform= ax. transAxes)
测试模型
导入测试集
def loadTestData ( sc) :
raw_lines_and_header = sc. textFile( "file:/home/zh123/.jupyter/workspace/stumbleupon/test.tsv" )
header_line = raw_lines_and_header. first( )
raw_non_header_data = raw_lines_and_header. filter ( lambda l: l!= header_line)
raw_non_quot_lines = raw_non_header_data. map ( lambda l: l. replace( "\"" , "" ) )
raw_data = raw_non_quot_lines. map ( lambda l: l. split( "\t" ) )
print ( "数据长度:" , raw_data. count( ) )
web_url_rdd = raw_data. map ( lambda fields: fields[ 0 ] )
feature_rdd = raw_data. map ( lambda fields: extract_features( fields, categories_dict, len ( fields) ) )
std_scaler = StandardScaler( withMean= True , withStd= True ) . fit( feature_rdd)
scaler_features = std_scaler. transform( feature_rdd)
test_point_rdd = web_url_rdd. zip ( scaler_features)
return test_point_rdd
test_file_data = loadTestData( sc)
test_file_data. first( )
数据长度: 3171
('http://www.lynnskitchenadventures.com/2009/04/homemade-enchilada-sauce.html',
DenseVector([-0.355, -0.2496, -0.7015, -0.3917, -0.1041, -0.2274, -0.21, -0.059, -0.1056, 0.0, 0.0, 2.3909, -0.2594, -0.1983, 0.1364, -0.021, -0.3888, 0.3429, -0.4867, -0.3604, -0.3208, 0.342, 0.0, 0.2093, -0.1513, -0.1, -0.0436, 0.7933, 0.7491, -0.7269, -0.2042, -0.0052, -0.2303, -0.5689, 0.406, -0.2558]))
加载最终的模型
model = evaluate_table[ evaluate_table. AUC == evaluate_table. AUC. max ( ) ] . Model. values[ 0 ]
使用模型进行预测
for f in test_file_data. randomSplit( [ 10 , 3171 - 10 ] ) [ 0 ] . collect( ) :
print ( f[ 0 ] , bool ( model. predict( f[ 1 ] ) ) )
http://www.youbeauty.com/body-fitness/dressing-for-your-body-type?page=2 False
http://www.couponingncooking.com/2012/03/super-easy-whole-chicken-in-crock-pot.html True
http://www.rsvlts.com/2012/08/04/inside-the-london-olympics-week-one-62-high-quality-photos/ False
http://backtoherroots.com/2011/08/04/90-second-nutella-chocolate-cake/ True
http://cathlincooks.blogspot.com/ True
http://www.cheapcooking.com/articles/healthy-school-lunch-ideas.htm True
http://www.ted.com/index.php/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen.html False
http://www.break.com/index/hot-girls-risky-business-fail.html True
http://www.salon.com/2010/04/03/toasted_peeps_brulee_recipe/ True
http://www.joepastry.com/category/pastry/charlotte/ True
http://www.behance.net/leon_farrant/frame/2878481 True
http://www.wimp.com/pageturner/ False