Machine Learning Data Preprocessing

Environment: Windows 10 (64-bit), Python 3.6, PyCharm 2018.1.1
# Binarization
from sklearn.preprocessing import Binarizer
X = [
    [1,2,3,4,5],
    [5,4,3,2,1],
    [3,3,3,3,3],
    [1,1,1,1,1]
]
print("before transform:",X)
binarizer = Binarizer(threshold=2.5)
print('after transform:',binarizer.transform(X))
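# A minimal cross-check of what Binarizer computes (a sketch assuming numpy is
# available): values strictly greater than the threshold map to 1, the rest to 0.
import numpy as np
print('numpy equivalent:', (np.asarray(X) > 2.5).astype(float))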
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
X = [
    [1,2,3,4,5],
    [5,4,3,2,1],
    [3,3,3,3,3],
    [1,1,1,1,1]
]
print("before transform:",X)
encoder = OneHotEncoder(sparse=False)
encoder.fit(X)
print('active_features_:',encoder.active_features_)
print('feature_indices_:',encoder.feature_indices_)
print('n_values_:',encoder.n_values_)
print('after transform:',encoder.transform([[1,2,3,4,5]]))
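# Note: active_features_, feature_indices_ and n_values_ belong to the old
# OneHotEncoder API and were removed in scikit-learn 0.23. A hedged sketch of
# the replacement API (assuming scikit-learn >= 0.20), kept commented out so
# this script still runs on the 2018-era environment above:
# encoder = OneHotEncoder()   # returns a sparse matrix by default
# encoder.fit(X)
# print('categories_:', encoder.categories_)
# print('after transform:', encoder.transform([[1,2,3,4,5]]).toarray())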
# Standardization
# MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
X = [
    [1,2,3,4,5],
    [5,4,3,2,1],
    [3,3,3,3,3],
    [1,1,1,1,1]
]
print("before transform:",X)
scaler = MinMaxScaler(feature_range=(0,2))
scaler.fit(X)
print('min_is:',scaler.min_)
print('scale_is:',scaler.scale_)
print('data_max_is:',scaler.data_max_)
print('data_min_is:',scaler.data_min_)
print('data_range_is:',scaler.data_range_)
print('after transform:',scaler.transform(X))
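# A quick sanity check of the formula MinMaxScaler applies (a sketch assuming
# numpy): X_scaled = (X - data_min_) / data_range_ * (max - min) + min.
import numpy as np
X_manual = (np.asarray(X, dtype=float) - scaler.data_min_) / scaler.data_range_ * 2.0
print('manual result matches:', np.allclose(X_manual, scaler.transform(X)))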
#MaxAbsScaler
from sklearn.preprocessing import MaxAbsScaler
X = [
    [1,5,1,2,10],
    [2,6,3,2,7],
    [3,7,5,6,4],
    [4,8,7,8,1]
]
print("before transform:",X)
scaler = MaxAbsScaler()
scaler.fit(X)
print("scale_is:",scaler.scale_)
print("max_abs_is:",scaler.max_abs_)
print('after transform:',scaler.transform(X))
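# Equivalent manual computation (a sketch assuming numpy): MaxAbsScaler divides
# each column by its maximum absolute value, mapping values into [-1, 1].
import numpy as np
print('manual result matches:', np.allclose(np.asarray(X, dtype=float) / scaler.max_abs_, scaler.transform(X)))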
#StandardScaler
from sklearn.preprocessing import StandardScaler
X = [
    [1,5,1,2,10],
    [2,6,3,2,7],
    [3,7,5,6,4],
    [4,8,7,8,1]
]
print("before transform:",X)
scaler = StandardScaler()
scaler.fit(X)
print('scale_is:',scaler.scale_)
print('mean_is:',scaler.mean_)
print('var_is:',scaler.var_)
print('after transform:',scaler.transform(X))
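# Equivalent manual computation (a sketch assuming numpy): StandardScaler
# subtracts the per-column mean and divides by the per-column standard deviation.
import numpy as np
X_manual = (np.asarray(X, dtype=float) - scaler.mean_) / scaler.scale_
print('manual result matches:', np.allclose(X_manual, scaler.transform(X)))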
# Normalization
from sklearn.preprocessing import Normalizer
X = [
    [1,2,3,4,5],
    [5,4,3,2,1],
    [1,3,5,2,4],
    [2,4,1,3,5]
]
print("before transform:",X)
normalizer = Normalizer(norm='l2')
print('after transform:',normalizer.transform(X))
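# Equivalent manual computation (a sketch assuming numpy): with norm='l2' every
# row is divided by its Euclidean length, so each row ends up with unit L2 norm.
import numpy as np
X_arr = np.asarray(X, dtype=float)
print('manual result matches:', np.allclose(X_arr / np.linalg.norm(X_arr, axis=1, keepdims=True), normalizer.transform(X)))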
# Filter-based feature selection
from sklearn.feature_selection import VarianceThreshold
X = [
    [100,1,2,3],
    [100,4,5,6],
    [100,7,8,9],
    [101,11,12,13]
]
selector = VarianceThreshold(1)
selector.fit(X)
print("variances is %s:"%selector.variances_)
print('After transform is %s:'%selector.transform(X))
print('The surport is %s'%selector.get_support(True))
print('After reverse transform is %s:'%selector.inverse_transform(selector.transform(X)))
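# Cross-check (a sketch assuming numpy): variances_ is the population variance
# of each column; columns whose variance is at or below the threshold (here 1)
# are dropped, which is why the near-constant first column disappears.
import numpy as np
print('numpy variances:', np.var(X, axis=0))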
# Univariate feature selection
# SelectKBest: keeps the k features with the highest scores on the chosen statistic
# SelectPercentile: keeps the top k percent of features by score (a SelectPercentile
# sketch follows the SelectKBest example below)
from sklearn.feature_selection import SelectKBest,f_classif
X = [
    [1,2,3,4,5],
    [5,4,3,2,1],
    [3,3,3,3,3],
    [1,1,1,1,1]
]
y = [0,1,0,1]
print("before transform:",X)
selector = SelectKBest(score_func=f_classif,k=3)
# there are 5 features in total; keep the 3 with the best f_classif scores
selector.fit(X,y)
print('scores_:',selector.scores_)
print('pvalues_:',selector.pvalues_)
print('selected index:',selector.get_support(True))
print('after transform:',selector.transform(X))
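# SelectPercentile is mentioned above but not demonstrated; a minimal sketch
# (assuming the same X, y and f_classif) that keeps the top 60% of features:
from sklearn.feature_selection import SelectPercentile
selector_p = SelectPercentile(score_func=f_classif, percentile=60)
selector_p.fit(X, y)
print('selected index (percentile):', selector_p.get_support(True))
print('after transform (percentile):', selector_p.transform(X))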
# Wrapper-based feature selection
# scikit-learn provides the RFE class for wrapper-based feature selection. RFE selects
# features via an externally supplied estimator that must assign weights to features:
# first, the estimator is trained on the initial feature set with the initial weights;
# then, the feature with the smallest learned weight is removed, forming a new feature
# set, and the estimator is retrained on it; this repeats until the desired number of
# features remains.
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
estimator = LinearSVC()
selector = RFE(estimator=estimator,n_features_to_select=2)
selector.fit(X,y)
print('N_features %s'%selector.n_features_)
print('Support is %s'%selector.support_)
print('Ranking %s'%selector.ranking_)
# Feature selection does not necessarily improve predictive performance.
# Below, with only two selected features, test accuracy drops slightly.
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
## load the data
iris = load_iris()
X,y = iris.data,iris.target
## feature selection
estimator = LinearSVC()
selector = RFE(estimator=estimator,n_features_to_select=2)
X_t = selector.fit_transform(X,y)
## split into training and test sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0,stratify=y)
X_train_t,X_test_t,y_train_t,y_test_t = train_test_split(X_t,y,test_size=0.25,random_state=0,stratify=y)
## train and evaluate
clf = LinearSVC()
clf_t = LinearSVC()
clf.fit(X_train,y_train)
clf_t.fit(X_train_t,y_train_t)
print('Original DataSet:test score=%s'%(clf.score(X_test,y_test)))
print('Selected DataSet: test score=%s'%(clf_t.score(X_test_t,y_test_t)))
# RFECV
# runs cross-validation internally to find the optimal number of features to keep
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
estimator = LinearSVC()
selector = RFECV(estimator=estimator,cv=3)
selector.fit(X,y)
print('N_features %s'%selector.n_features_)
print('Support is %s'%selector.support_)
print('Ranking %s'%selector.ranking_)
print('Grid Scores %s'%selector.grid_scores_)
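# Note: grid_scores_ was removed in scikit-learn 1.2. On newer releases the
# cross-validation scores live in cv_results_ instead (a hedged sketch, kept
# commented out for the 2018-era environment above):
# print('Mean CV scores %s' % selector.cv_results_['mean_test_score'])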
# Embedded feature selection
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import  LinearSVC
from sklearn.datasets import load_digits
digits = load_digits()
X = digits.data
y = digits.target
estimator = LinearSVC(penalty='l1',dual=False)
selector = SelectFromModel(estimator=estimator,threshold='mean')
selector.fit(X,y)
X_new = selector.transform(X)
print('After transform, shape is %s'%str(X_new.shape))
print('Threshold %s'%selector.threshold_)
print('Support is %s'%selector.get_support(indices=True))
# The code below illustrates how the alpha (Lasso) and C (LinearSVC) parameters
# relate to sparsity. Sparsity is measured by the number of zero weights:
# the more zeros, the sparser the model.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.datasets import load_digits,load_diabetes
from sklearn.linear_model import Lasso
def test_Lasso(*data):
    X,y = data
    alphas = np.logspace(-2,2)
    zeros = []
    for alpha in alphas:
        regr = Lasso(alpha=alpha)
        regr.fit(X,y)
        num = 0
        for ele in regr.coef_:
            if abs(ele) < 1e-5:num+=1
        zeros.append(num)
    #### plot
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.plot(alphas,zeros)
    ax.set_xlabel(r'$\alpha$')
    ax.set_xscale('log')
    ax.set_ylim(0,X.shape[1]+1)
    ax.set_ylabel('zeros in coef')
    ax.set_title('Sparsity In Lasso')
    plt.show()
def test_LinearSVC(*data):
    X,y = data
    Cs = np.logspace(-2,2)
    zeros = []
    for C in Cs:
        clf = LinearSVC(C = C,penalty='l1',dual=False)
        clf.fit(X,y)
        ## count the zero coefficients ##
        num = 0
        for row in clf.coef_:
            for ele in row:
                if abs(ele) < 1e-5:num+=1
        zeros.append(num)
    ### plot
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.plot(Cs,zeros)
    ax.set_xlabel("C")
    ax.set_xscale("log")
    ax.set_ylabel("zeros in coef")
    ax.set_title("Sparsity In SVM")
    plt.show()
if __name__=='__main__':
    data = load_diabetes()
    test_Lasso(data.data,data.target)
    data = load_digits()
    test_LinearSVC(data.data,data.target)
# Estimator pipeline
from sklearn.svm import LinearSVC
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
def test_Pipeline(data):
    X_train,X_test,y_train,y_test=data
    # intermediate pipeline steps must be transformers, so the L1-penalized
    # LinearSVC is wrapped in SelectFromModel to act as a feature-selection step
    steps=[('LinearSVC',SelectFromModel(LinearSVC(C=1,penalty='l1',dual=False))),('LogisticRegression',LogisticRegression(C=1))]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train,y_train)
    print('Name steps:',pipeline.named_steps)
    print('Pipeline Score:',pipeline.score(X_test,y_test))
if __name__=='__main__':
    data = load_digits()
    X = data.data
    y = data.target
    test_Pipeline(train_test_split(X,y,test_size=0.25,random_state=0,stratify=y))
# Dictionary learning
from sklearn.decomposition import DictionaryLearning
X = [
    [1,2,3,4,5],
    [6,7,8,9,10],
    [10,9,8,7,6],
    [5,4,3,2,1]
]
print('before transform:',X)
dct = DictionaryLearning(n_components=3)
dct.fit(X)
print('components is :',dct.components_)
print('after transform:',dct.transform(X))
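# A quick reconstruction check (a sketch assuming numpy): the sparse codes
# multiplied by the learned dictionary give the model's approximation of X.
import numpy as np
print('reconstruction:', np.dot(dct.transform(X), dct.components_))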

Reposted from blog.csdn.net/dingming001/article/details/81268112