sklearn相关模块导入
1 from sklearn.feature_extraction import DictVectorizer 2 from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer 3 from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer 4 from sklearn.feature_selection import VarianceThreshold 5 from sklearn.decomposition import PCA 6 import jieba 7 import numpy as np
一、字典数据抽取
def dictvec():
    """
    Extract features from dictionary data with ``DictVectorizer``.

    Fits a vectorizer (``sparse=False``) on a single sample record and prints
    the feature names, the inverse-transformed records and the encoded data.

    :return: None
    """
    # Named ``vectorizer`` rather than ``dict`` so the builtin is not shadowed.
    vectorizer = DictVectorizer(sparse=False)
    # Call fit_transform on the sample records.
    data = vectorizer.fit_transform([{"city": "北京", "temperature": 100}])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old method on older installs.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    print(feature_names)
    print(vectorizer.inverse_transform(data))
    print(data)

    return None
二、对文本进行特征值化
1、英文
def countvec():
    """
    Turn English text into count features with ``CountVectorizer``.

    Fits the vectorizer on two short sentences and prints the learned feature
    names followed by the document-term counts as a dense array.

    :return: None
    """
    cv = CountVectorizer()
    data = cv.fit_transform(
        ["life is short i like python", "life is too long, i dislike python"]
    )

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old method on older installs.
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()

    print(feature_names)
    print(data.toarray())
    return None
2、中文
def _feature_names(vectorizer):
    """Return the fitted vectorizer's feature names as a plain list."""
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old method on older installs.
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


def cutword():
    """
    Segment two Chinese sentences with jieba.

    The tokens of each sentence are joined with single spaces so that the
    text vectorizers can split them on whitespace. Both strings are printed.

    :return: tuple ``(c1, c2)`` of the two space-joined strings
    """
    con1 = jieba.cut("这是一个什么样的时代,这是一个以互联网时代为代表的时代\n")
    con2 = jieba.cut("看到这些我们都想到了什么,什么才能让我们想起不该想起的东西")
    # Join the token iterables into whitespace-separated strings.
    c1 = " ".join(con1)
    c2 = " ".join(con2)
    print(c1, c2)
    return c1, c2


def hanzivec():
    """
    Turn Chinese text into count features with ``CountVectorizer``.

    Uses the segmented sentences from ``cutword`` and prints the feature
    names followed by the document-term counts as a dense array.

    :return: None
    """
    c1, c2 = cutword()
    cv = CountVectorizer()
    data = cv.fit_transform([c1, c2])
    print(_feature_names(cv))
    print(data.toarray())

    return None


def tfidfvec():
    """
    Turn Chinese text into TF-IDF features with ``TfidfVectorizer``.

    Uses the segmented sentences from ``cutword`` and prints the feature
    names followed by the TF-IDF matrix as a dense array.

    :return: None
    """
    c1, c2 = cutword()
    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2])
    print(_feature_names(tf))
    print(data.toarray())

    return None
三、归一化计算
def mm():
    """
    Min-max normalisation demo.

    Fits a ``MinMaxScaler`` with ``feature_range=(4, 5)`` on a small sample
    matrix and prints the transformed data.

    :return: None
    """
    samples = [[60, 2, 40], [90, 4, 30], [75, 6, 50]]
    scaler = MinMaxScaler(feature_range=(4, 5))
    print(scaler.fit_transform(samples))
四、标准化计算
def ss():
    """
    Standardisation demo.

    Fits a ``StandardScaler`` on a small sample matrix and prints the
    transformed data.

    :return: None
    """
    samples = [[1, -1, 4], [2, 1, 0], [9, 2, 3]]
    scaler = StandardScaler()
    print(scaler.fit_transform(samples))
五、缺失值处理
def im():
    """
    Missing-value handling demo.

    Replaces the ``NaN`` entry of a small sample matrix using the "mean"
    strategy and prints the completed data.

    :return: None
    """
    # ``sklearn.preprocessing.Imputer`` was deprecated in scikit-learn 0.20 and
    # removed in 0.22; ``sklearn.impute.SimpleImputer`` is its replacement.
    # Prefer the replacement and only fall back to the legacy class when the
    # installed scikit-learn is too old to provide it.
    try:
        from sklearn.impute import SimpleImputer
    except ImportError:
        imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
    else:
        # SimpleImputer has no ``axis`` argument: it always imputes per column,
        # which is what ``axis=0`` selected on the legacy class.
        imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

    data = imputer.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
    print(data)
六、特征选择-删除低方差的特征
def var():
    """
    Feature selection demo: drop low-variance features.

    Fits a ``VarianceThreshold`` with ``threshold=0.0`` on a small sample
    matrix and prints the features that remain.

    :return: None
    """
    samples = [[0, 3, 5, 4], [0, 2, 9, 4], [0, 8, 3, 4], [0, 8, 1, 4]]
    selector = VarianceThreshold(threshold=0.0)
    print(selector.fit_transform(samples))
七、数据降维处理
def pca():
    """
    Dimensionality reduction demo.

    Fits a ``PCA`` with ``n_components=0.9`` on a small sample matrix and
    prints the transformed data.

    :return: None
    """
    samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [7, 8, 9]]
    reducer = PCA(n_components=0.9)
    print(reducer.fit_transform(samples))