数据挖掘算法原理与实践：数据预处理（第一关-第六关）

第1关：标准化

# -*- coding: utf-8 -*-

from sklearn.preprocessing import scale,MaxAbsScaler,MinMaxScaler

#实现数据预处理方法
def Preprocessing(x,y):
    """Rescale ``x`` with the scaling method named by ``y``.

    Args:
        x (ndarray): data to be processed.
        y (str): name of the method to apply:
            'z_score' -> sklearn ``scale``
            'minmax'  -> ``MinMaxScaler``
            'maxabs'  -> ``MaxAbsScaler``

    Returns:
        The transformed data, or ``None`` when ``y`` is none of the
        names listed above.
    """
    #********* Begin *********#
    if y == 'z_score':
        return scale(x)
    if y == 'minmax':
        return MinMaxScaler().fit_transform(x)
    if y == 'maxabs':
        return MaxAbsScaler().fit_transform(x)
    # Unrecognised method name: nothing is computed.
    return None
    #********* End *********#

第2关：非线性转换

# -*- coding: utf-8 -*-
from sklearn.preprocessing import QuantileTransformer

#实现非线性转换方法
def non_linear_transformation(x,y):
    """Apply a quantile-based non-linear transformation to ``x``.

    Args:
        x (ndarray): data to be processed.
        y (int): 0 uses a ``QuantileTransformer`` with its default output
            distribution (mapping to a uniform distribution);
            1 uses ``output_distribution='normal'`` (mapping to a
            Gaussian distribution).

    Returns:
        The transformed data, or ``None`` when ``y`` is neither 0 nor 1.
    """
    #********* Begin *********#
    # Both variants share the fixed seed 666 so results are reproducible.
    if y == 0:
        quantile = QuantileTransformer(random_state=666)
    elif y == 1:
        quantile = QuantileTransformer(output_distribution='normal',random_state=666)
    else:
        return None
    return quantile.fit_transform(x)
    #********* End *********#

第3关：归一化

# -*- coding: utf-8 -*-

from sklearn.preprocessing import normalize

#实现数据归一化方法
def normalization(x,y):
    """Normalize ``x`` with sklearn ``normalize`` using the norm chosen by ``y``.

    Args:
        x (ndarray): data to be processed.
        y (str | int): 'l1' (or the integer 1) selects the l1 norm;
            'l2' (or the integer 2) selects the l2 norm.
            The integer forms are kept for backward compatibility with
            callers that relied on the previous behaviour.

    Returns:
        The normalized data, or ``None`` when ``y`` selects neither norm.
    """
    #********* Begin *********#
    # The documented contract passes the norm as a string, while earlier
    # versions of this function only matched the integers 1 and 2.
    # Accept both spellings so either kind of caller gets a result.
    if y in ('l1', 1):
        return normalize(x,norm='l1')
    if y in ('l2', 2):
        return normalize(x,norm='l2')
    return None
    #********* End *********#

第4关：离散值编码

# -*- coding: utf-8 -*-
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

def onehot_label(label):
    """One-hot encode a list of labels.

    input:  label(list): labels to be processed
    output: ndarray: dense one-hot encoded labels, one row per input label
    """
    #********* Begin *********#
    # Step 1: map every distinct label to an integer code.
    codes = LabelEncoder().fit_transform(label)
    # Step 2: arrange the codes as a single column (2-D) before handing
    # them to OneHotEncoder.
    column = np.array(codes).reshape((len(codes), 1))
    # Step 3: one-hot encode and convert the encoder output to a dense array.
    encoded = OneHotEncoder().fit_transform(column)
    return encoded.toarray()
    #********* End *********#

第5关：生成多项式特征

# -*- coding: utf-8 -*-
from sklearn.preprocessing import PolynomialFeatures

def polyfeaturs(x,y):
    """Generate degree-2 polynomial features from ``x``.

    Args:
        x (ndarray): features to be processed.
        y (int): 0 builds the features with ``PolynomialFeatures(2)``;
            1 builds them with ``interaction_only=True``, i.e. only the
            interactions between features.

    Returns:
        The generated feature matrix, or ``None`` when ``y`` is neither
        0 nor 1.
    """
    #********* Begin *********#
    if y == 0:
        # Full degree-2 polynomial features.
        return PolynomialFeatures(2).fit_transform(x)
    if y == 1:
        # Degree-2 features restricted to feature interactions.
        return PolynomialFeatures(degree=2, interaction_only=True).fit_transform(x)
    return None
    #********* End *********#

第6关：估算缺失值

# -*- coding: utf-8 -*-
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20 has no sklearn.impute module
    SimpleImputer = None
try:
    from sklearn.preprocessing import Imputer
except ImportError:  # Imputer was removed in scikit-learn 0.22
    Imputer = None

def imp(x,y):
    """Fill in missing (NaN) values of ``x`` column by column.

    Args:
        x (ndarray): data to be processed.
        y (str): imputation strategy:
            'mean'          -> replace missing values with the column mean
            'median'        -> replace missing values with the column median
            'most_frequent' -> replace missing values with the most
                               frequent value of the column

    Returns:
        The imputed data, or ``None`` when ``y`` is not one of the three
        strategies above.
    """
    #********* Begin *********#
    if y not in ('mean', 'median', 'most_frequent'):
        return None
    if SimpleImputer is not None:
        # Modern scikit-learn: SimpleImputer replaces the removed Imputer
        # class; it always works per column and marks missing entries
        # with a real NaN rather than the string 'NaN'.
        imputer = SimpleImputer(missing_values=float('nan'), strategy=y)
    else:
        # Legacy scikit-learn that only ships the old Imputer class.
        imputer = Imputer(missing_values='NaN', strategy=y, axis=0)
    return imputer.fit_transform(x)
    #********* End *********#

数据挖掘算法原理与实践：数据预处理（第一关-第五关）

猜你喜欢