Feature Extraction
DictVectorizer performs feature extraction and vectorization on data stored as dictionaries.
Because categorical features cannot be represented numerically as-is, new features are generated by combining the original feature name with each category value, quantized as 0/1 binary indicators; numerical features are comparatively easy to convert, and in general the original values are kept unchanged.
from sklearn.feature_extraction import DictVectorizer
measurements = [{'city': 'Dubai', 'temperature': 33.}, {'city': 'London', 'temperature': 12.}, {'city': 'San Fransisco', 'temperature': 18.}]
vec = DictVectorizer()
print(vec.fit_transform(measurements).toarray())
print(vec.get_feature_names())  # renamed to get_feature_names_out() in sklearn >= 1.0
#out[]:
# [[ 1. 0. 0. 33.]
# [ 0. 1. 0. 12.]
# [ 0. 0. 1. 18.]]
# ['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']
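By default fit_transform returns a SciPy sparse matrix (hence the toarray() call above), and transform silently maps category values it did not see during fitting to all-zero columns instead of raising an error. A minimal sketch, using a made-up 'Moscow' record:
# 'city=Moscow' was not in the fitted vocabulary, so all three city columns stay 0
print(vec.transform([{'city': 'Moscow', 'temperature': 5.}]).toarray())
#out[]:
# [[ 0. 0. 0. 5.]]
# pass sparse=False to the constructor to get dense NumPy arrays without toarray():
# vec = DictVectorizer(sparse=False)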
CountVectorizer and TfidfVectorizer
For text data that is not stored in any special structure, such as a list of raw strings, the most common representation is the bag-of-words model: the order in which words appear is ignored, and every word that occurs in the training text is treated as a separate feature column. The set of distinct words forms the vocabulary (Vocabulary), and each training document is mapped to a feature vector over this high-dimensional vocabulary.
The feature vectors can be computed with CountVectorizer or TfidfVectorizer.
CountVectorizer only considers how often each term occurs within the given training text (its term frequency).
TfidfVectorizer weighs the term frequency by the inverse document frequency, which is based on the reciprocal of the number of documents containing the term; this suppresses the influence that corpus-wide common words have on the classification decision.
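Before the full 20newsgroups experiment below, a minimal sketch on a made-up two-sentence corpus makes the difference concrete: both vectorizers share one vocabulary, but TfidfVectorizer down-weights terms that occur in every document.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
corpus = ['the cat sat on the mat', 'the dog ate the cat']
cv = CountVectorizer()
print(cv.fit_transform(corpus).toarray())  # raw term counts per document
print(cv.get_feature_names_out())          # shared vocabulary (get_feature_names() before sklearn 1.0)
tv = TfidfVectorizer()
# 'the' occurs in both documents, so its idf is minimal; terms unique to one
# document ('dog', 'sat', ...) are weighted up relative to their raw counts
print(tv.fit_transform(corpus).toarray())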
Naive Bayes classification performance test on text features vectorized with CountVectorizer and TfidfVectorizer, without removing stop words
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)
count_vec = CountVectorizer()
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)
tfidf_vec = TfidfVectorizer()
X_tfidf_train = tfidf_vec.fit_transform(X_train)
X_tfidf_test = tfidf_vec.transform(X_test)
mnb_count = MultinomialNB()
mnb_count.fit(X_count_train, y_train)
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_tfidf_train, y_train)
print('The accuracy of classifying 20newsgroups using Naive Bayes (CountVectorizer without filtering stopwords):', mnb_count.score(X_count_test, y_test))
y_count_predict = mnb_count.predict(X_count_test)
print(classification_report(y_test, y_count_predict, target_names=news.target_names))
print('The accuracy of classifying 20newsgroups using Naive Bayes (TfidfVectorizer without filtering stopwords):', mnb_tfidf.score(X_tfidf_test, y_test))
y_tfidf_predict = mnb_tfidf.predict(X_tfidf_test)
print(classification_report(y_test, y_tfidf_predict, target_names=news.target_names))
#out[]:
# The accuracy of classifying 20newsgroups using Naive Bayes (CountVectorizer without filtering stopwords): 0.8397707979626485
# precision recall f1-score support
#
# alt.atheism 0.86 0.86 0.86 201
# comp.graphics 0.59 0.86 0.70 250
# comp.os.ms-windows.misc 0.89 0.10 0.17 248
# comp.sys.ibm.pc.hardware 0.60 0.88 0.72 240
# comp.sys.mac.hardware 0.93 0.78 0.85 242
# comp.windows.x 0.82 0.84 0.83 263
# misc.forsale 0.91 0.70 0.79 257
# rec.autos 0.89 0.89 0.89 238
# rec.motorcycles 0.98 0.92 0.95 276
# rec.sport.baseball 0.98 0.91 0.95 251
# rec.sport.hockey 0.93 0.99 0.96 233
# sci.crypt 0.86 0.98 0.91 238
# sci.electronics 0.85 0.88 0.86 249
# sci.med 0.92 0.94 0.93 245
# sci.space 0.89 0.96 0.92 221
# soc.religion.christian 0.78 0.96 0.86 232
# talk.politics.guns 0.88 0.96 0.92 251
# talk.politics.mideast 0.90 0.98 0.94 231
# talk.politics.misc 0.79 0.89 0.84 188
# talk.religion.misc 0.93 0.44 0.60 158
#
# avg / total 0.86 0.84 0.82 4712
#
# The accuracy of classifying 20newsgroups using Naive Bayes (TfidfVectorizer without filtering stopwords): 0.8463497453310697
# precision recall f1-score support
#
# alt.atheism 0.84 0.67 0.75 201
# comp.graphics 0.85 0.74 0.79 250
# comp.os.ms-windows.misc 0.82 0.85 0.83 248
# comp.sys.ibm.pc.hardware 0.76 0.88 0.82 240
# comp.sys.mac.hardware 0.94 0.84 0.89 242
# comp.windows.x 0.96 0.84 0.89 263
# misc.forsale 0.93 0.69 0.79 257
# rec.autos 0.84 0.92 0.88 238
# rec.motorcycles 0.98 0.92 0.95 276
# rec.sport.baseball 0.96 0.91 0.94 251
# rec.sport.hockey 0.88 0.99 0.93 233
# sci.crypt 0.73 0.98 0.83 238
# sci.electronics 0.91 0.83 0.87 249
# sci.med 0.97 0.92 0.95 245
# sci.space 0.89 0.96 0.93 221
# soc.religion.christian 0.51 0.97 0.67 232
# talk.politics.guns 0.83 0.96 0.89 251
# talk.politics.mideast 0.92 0.97 0.95 231
# talk.politics.misc 0.98 0.62 0.76 188
# talk.religion.misc 0.93 0.16 0.28 158
#
# avg / total 0.87 0.85 0.84 4712
Naive Bayes classification performance test on text features vectorized with CountVectorizer and TfidfVectorizer, with stop words removed
Only the following two lines of the code above need to change; after rerunning, the accuracy improves noticeably.
count_vec = CountVectorizer(analyzer='word', stop_words='english')
tfidf_vec = TfidfVectorizer(analyzer='word', stop_words='english')
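To see what this change actually does, you can compare the vocabulary sizes with and without the built-in English stop-word list. A minimal sketch (count_vec_plain and count_vec_sw are illustrative names; the exact sizes depend on the sklearn version):
count_vec_plain = CountVectorizer()
count_vec_plain.fit(X_train)
count_vec_sw = CountVectorizer(analyzer='word', stop_words='english')
count_vec_sw.fit(X_train)
# the built-in list drops roughly 300 very common English words ('the', 'of', 'and', ...)
print(len(count_vec_plain.vocabulary_), '->', len(count_vec_sw.vocabulary_))
print(sorted(count_vec_sw.get_stop_words())[:5])  # a few of the removed words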
Feature Selection
Using the Titanic dataset, we improve the prediction performance of a decision tree step by step through feature selection.
import pandas as pd
import numpy as np
import pylab as pl
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn import feature_selection
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')  # note: this URL may no longer be reachable; substitute a local copy of titanic.txt if needed
# separate the features from the prediction target
y = titanic['survived']
X = titanic.drop(['row.names', 'name', 'survived'], axis=1)
X['age'] = X['age'].fillna(X['age'].mean())  # fill missing ages with the mean age
X = X.fillna('UNKNOWN')                      # fill remaining missing values with a placeholder
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
# vectorize the categorical features
vec = DictVectorizer()
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
print(len(vec.feature_names_))
#out[]:474
# predict with a decision tree using all features, and evaluate its performance
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
print(dt.score(X_test, y_test))
# use cross-validation to screen features at fixed percentile intervals, and plot how performance varies with the selection ratio
percentiles = np.array(range(1, 100, 2))
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    results = np.append(results, scores.mean())
# find the feature-selection percentile that gives the best performance
opt = np.where(results == results.max())[0][0]
print('Optimal percentile of features: %d' % percentiles[opt])
#out[]:Optimal percentile of features: 7
pl.plot(percentiles, results)
pl.xlabel('percentiles of features')
pl.ylabel('accuracy')
pl.show()
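SelectPercentile(chi2) ranks each column by its chi-squared statistic against the survival label; the same scores can be computed directly with feature_selection.chi2 to inspect the ranking. A minimal sketch (top and idx are illustrative names):
chi2_scores, p_values = feature_selection.chi2(X_train, y_train)
top = np.argsort(chi2_scores)[::-1][:10]  # indices of the ten strongest features
for idx in top:
    print(vec.feature_names_[idx], chi2_scores[idx])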
# refit the same model on the optimally filtered features and evaluate it on the test set
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
print(dt.score(X_test_fs, y_test))
#out[]:0.8571428571428571
As the results show, the model that uses only the top 7% of the feature dimensions outperforms the model that uses all of them.
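Which columns make the 7% cut can be read off the fitted selector with get_support(). A minimal sketch (the exact names depend on the random split; the sample names in the comment are illustrative):
mask = fs.get_support()                  # boolean mask over the 474 columns
selected = np.array(vec.feature_names_)[mask]
print(len(selected))                     # roughly 7% of 474, i.e. about 33 features
print(selected[:10])                     # e.g. entries like 'sex=female' or 'pclass=1st'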