机器学习sklearn

sklearn相关模块导入

1 from sklearn.feature_extraction import DictVectorizer
2 from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
3 from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer
4 from sklearn.feature_selection import VarianceThreshold
5 from sklearn.decomposition import PCA
6 import jieba
7 import numpy as np

一、字典数据抽取

 1 def dictvec():
 2     """
 3     字典数据抽取
 4     :return: None
 5     """
 6     dict = DictVectorizer(sparse=False)
 7     # 调用ift_transform
 8     data = dict.fit_transform([{"city": "北京", "temperature": 100}])
 9     print(dict.get_feature_names())
10     print(dict.inverse_transform(data))
11     print(data)
12 
13     return None
View Code

二、对文本进行特征值化

  1、英文

 1 def countvec():
 2     """
 3     对文本进行特征值化
 4     :return:None
 5     """
 6     cv = CountVectorizer()
 7     data = cv.fit_transform(["life is short i like python", "life is too long, i dislike python"])
 8     print(cv.get_feature_names())
 9     print(data.toarray())
10     return None
View Code

  2、中文

 1 def cutword():
 2     """
 3     中文特征值化分词
 4     :return:None
 5     """
 6     con1 = jieba.cut("这是一个什么样的时代,这是一个以互联网时代为代表的时代\n")
 7     con2 = jieba.cut("看到这些我们都想到了什么,什么才能让我们想起不该想起的东西")
 8     # 转换成列表
 9     # content1 = list(con1)
10     # content2 = list(con2)
11     # 转换成字符串
12     c1 = " ".join(con1)
13     c2 = " ".join(con2)
14     print(c1,c2)
15     return c1, c2
16 
17 
def hanzivec():
    """Count-vectorize the pre-segmented Chinese text from cutword().

    :return: None
    """
    text_a, text_b = cutword()
    counter = CountVectorizer()
    matrix = counter.fit_transform([text_a, text_b])
    print(counter.get_feature_names())
    print(matrix.toarray())

    return None
31 
32 
def tfidfvec():
    """TF-IDF-vectorize the pre-segmented Chinese text from cutword().

    :return: None
    """
    text_a, text_b = cutword()
    transformer = TfidfVectorizer()
    weighted = transformer.fit_transform([text_a, text_b])
    print(transformer.get_feature_names())
    print(weighted.toarray())

    return None
View Code

三、归一化计算

def mm():
    """Min-max scale sample data into the range [4, 5].

    :return: None
    """
    samples = [[60, 2, 40], [90, 4, 30], [75, 6, 50]]
    scaler = MinMaxScaler(feature_range=(4, 5))
    scaled = scaler.fit_transform(samples)
    print(scaled)
View Code

四、标准化计算

def ss():
    """Standardize sample data to zero mean and unit variance per column.

    :return: None
    """
    samples = [[1, -1, 4], [2, 1, 0], [9, 2, 3]]
    scaler = StandardScaler()
    standardized = scaler.fit_transform(samples)
    print(standardized)
View Code

五、缺失值处理

def im():
    """Fill missing values (NaN) with the per-column mean.

    :return: None
    """
    # NOTE(review): sklearn.preprocessing.Imputer was removed in
    # scikit-learn 0.22; newer code should use sklearn.impute.SimpleImputer.
    imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
    samples = [[1, 2], [np.nan, 3], [7, 6]]
    filled = imputer.fit_transform(samples)
    print(filled)
View Code

六、特征选择-删除低方差的特征

def var():
    """Feature selection: drop columns whose variance is below the threshold.

    :return: None
    """
    samples = [[0, 3, 5, 4], [0, 2, 9, 4], [0, 8, 3, 4], [0, 8, 1, 4]]
    # threshold=0.0 removes constant columns only (here columns 0 and 3).
    selector = VarianceThreshold(threshold=0.0)
    selected = selector.fit_transform(samples)
    print(selected)
View Code

七、数据降维处理

def pca():
    """Reduce dimensionality, keeping components that explain 90% of variance.

    :return: None
    """
    samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [7, 8, 9]]
    # A float n_components is interpreted as the variance ratio to retain.
    reducer = PCA(n_components=0.9)
    reduced = reducer.fit_transform(samples)
    print(reduced)
View Code

猜你喜欢

转载自www.cnblogs.com/returnes/p/10525013.html