Basics of Machine Learning: "Feature Engineering (3) - Feature Preprocessing"

1. What is feature preprocessing?

The process of converting feature data into feature data more suitable for the algorithm model through some conversion functions

Before processing, the feature values ​​are numerical values. After processing, feature scaling is performed.

1. Contents:
Dimensionless transformation of numerical data:
normalization and
standardization

2. Feature preprocessing API
sklearn.preprocessing

3. Why do we need to normalize/standardize
features? The units or sizes of features are very different, or the variance of a feature is several orders of magnitude larger than other features, which can easily affect (dominated) the target results, making some algorithms unable to learn. to other features
because many data dimensions are inconsistent

It can be seen that the mileage value is relatively large and the time consumption is relatively small.
When using the KNN algorithm to calculate the Euclidean distance formula, the final result is dominated by the mileage and no other features can be learned
(72993 - 35948) ^ 2 + (10.14 - 6.83 ) ^ 2 + (1.03 - 1.21) ^ 2

We need to use some methods for dimensionless transformation to convert data of different specifications into the same specification.

4. What is the Euclidean distance formula?

5. Dimensionless processing.
The reason why dimensionless processing is required is because the dimensions are not unified, resulting in very large mileage values. In order to make features equally important, normalization/standardization is required

2. Normalization

1. Definition:
Map the data to between (default is [0,1]) by transforming the original data

2, official

3、API函数
sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1)...)

4.
MinMaxScaler.fit_transform
( X
)

5. Normalize the data in dating.txt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import jieba
import pandas as pd

def datasets_demo():
    """
    sklearn数据集使用
    """
    #获取数据集
    iris = load_iris()
    print("鸢尾花数据集:\n", iris)
    print("查看数据集描述:\n", iris["DESCR"])
    print("查看特征值的名字:\n", iris.feature_names)
    print("查看特征值几行几列:\n", iris.data.shape)
   
    #数据集的划分
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print("训练集的特征值:\n", x_train, x_train.shape)

    return None
 
def dict_demo():
    """
    字典特征抽取
    """
    data = [{'city': '北京','temperature':100},{'city': '上海','temperature':60},{'city': '深圳','temperature':30}]
    # 1、实例化一个转换器类
    transfer = DictVectorizer(sparse=False)

    # 2、调用fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    print("特征名字:\n", transfer.get_feature_names())

    return None

def count_demo():
    """
    文本特征抽取
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # 1、实例化一个转换器类
    transfer = CountVectorizer()
    # 2、调用fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("特征名字:\n", transfer.get_feature_names()) 

    return None

def count_chinese_demo():
    """
    中文文本特征抽取
    """
    data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]
    # 1、实例化一个转换器类
    transfer = CountVectorizer()
    
    # 2、调用fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray());
    print("特征名字:\n", transfer.get_feature_names())
    
    return None

def cut_word(text):
    """
    进行中文分词
    """
    return " ".join(list(jieba.cut(text)))  #返回一个分词生成器对象,强转成list,再join转成字符串

def count_chinese_demo2():
    """
    中文文本特征抽取,自动分词
    """
    # 1、将中文文本进行分词
    data = ["今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
        "我们看到的从很远星系来的光是在几百万年前之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
        "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    print(data_new)
    # 2、实例化一个转换器类
    transfer = CountVectorizer()
    # 3、调用fit_transform()
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("特征名字:\n", transfer.get_feature_names())
    return None

def tfidf_demo():
    """
    用tf-idf的方法进行文本特征抽取
    """
    # 1、将中文文本进行分词
    data = ["今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
        "我们看到的从很远星系来的光是在几百万年前之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
        "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    print(data_new)
    # 2、实例化一个转换器类
    transfer = TfidfVectorizer()
    # 3、调用fit_transform()
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("特征名字:\n", transfer.get_feature_names())
    return None

def minmax_demo():
    """
    归一化
    """
    # 1、获取数据
    data = pd.read_csv("dating.txt")
    #print("data:\n", data)
    data = data.iloc[:, 0:3] #行都要,列取前3列
    print("data:\n", data)
    # 2、实例化一个转换器
    transfer = MinMaxScaler()
    # 3、调用fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None

if __name__ == "__main__":
    # 代码1:sklearn数据集使用
    #datasets_demo()
    # 代码2:字典特征抽取
    #dict_demo()
    # 代码3:文本特征抽取
    #count_demo()
    # 代码4:中文文本特征抽取
    #count_chinese_demo()
    # 代码5:中文文本特征抽取,自动分词
    #count_chinese_demo2()
    # 代码6: 测试jieba库中文分词
    #print(cut_word("我爱北京天安门"))
    # 代码7:用tf-idf的方法进行文本特征抽取
    #tfidf_demo()
    # 代码8:归一化
    minmax_demo()

operation result:

data:
      milage     liters  consumtime
0     40920   8.326976    0.953952
1     14488   7.153469    1.673904
2     26052   1.441871    0.805124
3     75136  13.147394    0.428964
4     38344   1.669788    0.134296
..      ...        ...         ...
984   11145   3.410627    0.631838
985   68846   9.974715    0.669787
986   26575  10.650102    0.866627
987   48111   9.134528    0.728045
988   43757   7.882601    1.332446

[989 rows x 3 columns]
data_new:
 [[0.44832535 0.39805139 0.56233353]
 [0.15873259 0.34195467 0.98724416]
 [0.28542943 0.06892523 0.47449629]
 ...
 [0.29115949 0.50910294 0.51079493]
 [0.52711097 0.43665451 0.4290048 ]
 [0.47940793 0.3768091  0.78571804]]

3. Standardization

1. What are the disadvantages of normalization?
The normalization result is calculated based on the minimum and maximum values. If there are many outliers in the data, the maximum and minimum values ​​are very easily affected by the outliers, so
this method is robust ( Robustness) is poor and only suitable for traditional accurate small data scenarios

2. Definition:
Transform the original data to a range with a mean of 0 and a standard deviation of 1.

3, official

Subtract the mean of the column from the data and divide by the standard deviation

4. What is standard deviation?

5. After processing by API function
sklearn.preprocessing.StandardScaler()
, for each column, all data are clustered around the mean of 0 and the standard deviation of 1

6. StandardScaler.fit_transform(X)
X: Data in numpy array format [n_samples, n_features]
Return value: array with the same shape after conversion

7. Standardize the data in dating.txt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import jieba
import pandas as pd

def datasets_demo():
    """
    sklearn数据集使用
    """
    #获取数据集
    iris = load_iris()
    print("鸢尾花数据集:\n", iris)
    print("查看数据集描述:\n", iris["DESCR"])
    print("查看特征值的名字:\n", iris.feature_names)
    print("查看特征值几行几列:\n", iris.data.shape)
   
    #数据集的划分
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
    print("训练集的特征值:\n", x_train, x_train.shape)

    return None
 
def dict_demo():
    """
    字典特征抽取
    """
    data = [{'city': '北京','temperature':100},{'city': '上海','temperature':60},{'city': '深圳','temperature':30}]
    # 1、实例化一个转换器类
    transfer = DictVectorizer(sparse=False)

    # 2、调用fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    print("特征名字:\n", transfer.get_feature_names())

    return None

def count_demo():
    """
    文本特征抽取
    """
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # 1、实例化一个转换器类
    transfer = CountVectorizer()
    # 2、调用fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray())
    print("特征名字:\n", transfer.get_feature_names()) 

    return None

def count_chinese_demo():
    """
    中文文本特征抽取
    """
    data = ["我 爱 北京 天安门", "天安门 上 太阳 升"]
    # 1、实例化一个转换器类
    transfer = CountVectorizer()
    
    # 2、调用fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new.toarray());
    print("特征名字:\n", transfer.get_feature_names())
    
    return None

def cut_word(text):
    """
    进行中文分词
    """
    return " ".join(list(jieba.cut(text)))  #返回一个分词生成器对象,强转成list,再join转成字符串

def count_chinese_demo2():
    """
    中文文本特征抽取,自动分词
    """
    # 1、将中文文本进行分词
    data = ["今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
        "我们看到的从很远星系来的光是在几百万年前之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
        "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    print(data_new)
    # 2、实例化一个转换器类
    transfer = CountVectorizer()
    # 3、调用fit_transform()
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("特征名字:\n", transfer.get_feature_names())
    return None

def tfidf_demo():
    """
    用tf-idf的方法进行文本特征抽取
    """
    # 1、将中文文本进行分词
    data = ["今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所以每个人不要放弃今天。",
        "我们看到的从很远星系来的光是在几百万年前之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
        "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]

    data_new = []
    for sent in data:
        data_new.append(cut_word(sent))
    print(data_new)
    # 2、实例化一个转换器类
    transfer = TfidfVectorizer()
    # 3、调用fit_transform()
    data_final = transfer.fit_transform(data_new)
    print("data_final:\n", data_final.toarray())
    print("特征名字:\n", transfer.get_feature_names())
    return None

def minmax_demo():
    """
    归一化
    """
    # 1、获取数据
    data = pd.read_csv("dating.txt")
    #print("data:\n", data)
    data = data.iloc[:, 0:3] #行都要,列取前3列
    print("data:\n", data)
    # 2、实例化一个转换器
    transfer = MinMaxScaler()
    # 3、调用fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None

def stand_demo():
    """
    标准化
    """
    # 1、获取数据
    data = pd.read_csv("dating.txt")
    #print("data:\n", data)
    data = data.iloc[:, 0:3] #行都要,列取前3列
    print("data:\n", data)
    # 2、实例化一个转换器
    transfer = StandardScaler()
    # 3、调用fit_transform
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None

if __name__ == "__main__":
    # 代码1:sklearn数据集使用
    #datasets_demo()
    # 代码2:字典特征抽取
    #dict_demo()
    # 代码3:文本特征抽取
    #count_demo()
    # 代码4:中文文本特征抽取
    #count_chinese_demo()
    # 代码5:中文文本特征抽取,自动分词
    #count_chinese_demo2()
    # 代码6: 测试jieba库中文分词
    #print(cut_word("我爱北京天安门"))
    # 代码7:用tf-idf的方法进行文本特征抽取
    #tfidf_demo()
    # 代码8:归一化
    #minmax_demo()
    # 代码9:标准化
    stand_demo()

operation result:

data:
      milage     liters  consumtime
0     40920   8.326976    0.953952
1     14488   7.153469    1.673904
2     26052   1.441871    0.805124
3     75136  13.147394    0.428964
4     38344   1.669788    0.134296
..      ...        ...         ...
984   11145   3.410627    0.631838
985   68846   9.974715    0.669787
986   26575  10.650102    0.866627
987   48111   9.134528    0.728045
988   43757   7.882601    1.332446

[989 rows x 3 columns]
data_new:
 [[ 0.33984938  0.42024644  0.2460588 ]
 [-0.86581884  0.14356328  1.69344575]
 [-0.338339   -1.2030865  -0.05314407]
 ...
 [-0.31448289  0.96798056  0.07050117]
 [ 0.66785937  0.61064669 -0.2081032 ]
 [ 0.46925618  0.31547409  1.00698075]]

8. Summary of standardization:
It is relatively stable when there are enough samples and is suitable for modern noisy big data scenarios.
 

Guess you like

Origin blog.csdn.net/csj50/article/details/132108346