# Dataset from the JulyEdu (七月在线) practice exercises.
import random

import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words features
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF features
from sklearn.model_selection import train_test_split  # train/test split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC  # used by the SVM section below; was missing from the original imports
'''
Load the data
'''
# All five corpora follow the same read-CSV-then-drop-missing-rows pattern;
# factor it out instead of repeating it five times.
_DATA_DIR = "H:/NLP_project/NLP_project/data"

def _read_news(filename):
    """Read one news CSV from the data directory and drop rows with any NaN field."""
    return pd.read_csv(_DATA_DIR + "/" + filename).dropna()

df_technology = _read_news("technology_news.csv")
df_car = _read_news("car_news.csv")
df_entertainment = _read_news("entertainment_news.csv")
df_military = _read_news("military_news.csv")
df_sports = _read_news("sports_news.csv")
'''
Data preprocessing
'''
# Take ~20,000 documents per class so the five classes stay balanced.
technology = df_technology.content.values.tolist()[1000:21000]
car = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[:20000]
military = df_military.content.values.tolist()[:20000]
sports = df_sports.content.values.tolist()[:20000]
# Stopword list: one word per line; quoting=3 (csv.QUOTE_NONE) keeps quote
# characters literal so they are read as ordinary text.
stopwords = pd.read_csv('H:/NLP_project/NLP_project/data/stopwords.txt',
                        index_col=False, quoting=3, sep="\t", names=['stopword'])
# A set gives O(1) membership tests; the original ndarray was scanned O(n)
# for every single token during tokenization.
stopwords = set(stopwords['stopword'].values)
def preprocess_text(content_lines, sentences, category):
    """Tokenize raw documents and append labelled samples to `sentences`.

    content_lines: iterable of raw text documents.
    sentences: list mutated in place; receives ("tok1 tok2 ...", category) tuples.
    category: label attached to every document from this corpus.
    Returns `sentences` for convenience (same list object).
    """
    for line in content_lines:
        # Single pass instead of two chained filters. NOTE: this drops tokens
        # of length <= 1 (the original comment claimed "< 1", which was wrong)
        # — single characters are mostly punctuation/function words in Chinese.
        words = [w for w in jieba.lcut(line)
                 if len(w) > 1 and w not in stopwords]
        sentences.append((" ".join(words), category))
    return sentences
# Build the labelled corpus: one (tokenized_text, label) tuple per document.
sentences = []
corpora = [
    (technology, 'technology'),
    (car, 'car'),
    (entertainment, 'entertainment'),
    (military, 'military'),
    (sports, 'sports'),
]
for corpus, label in corpora:
    preprocess_text(corpus, sentences, label)
random.shuffle(sentences)  # NOTE(review): unseeded — shuffle order differs between runs
x, y = zip(*sentences)  # separate the texts (x) from their labels (y)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)
# Extract features with CountVectorizer, then train a MultinomialNB classifier.
# Bag-of-words features: word-level n-grams for n = 1..4, vocabulary capped
# at the 20,000 most frequent terms. Fit on the training split only.
vec = CountVectorizer(analyzer='word', ngram_range=(1, 4), max_features=20000)
vec.fit(x_train)
# Multinomial Naive Bayes on the bag-of-words counts.
# Observed test accuracy: ~0.87424.
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))
# Extract features with TF-IDF instead:
# Same Naive Bayes model, but on TF-IDF-weighted n-gram features.
# Observed test accuracy: ~0.8755.
vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), max_features=20000)
vec.fit(x_train)  # learn vocabulary and IDF weights from the training split
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))
# Switch model: an SVM reaches 0.8851 but takes noticeably longer to train and evaluate.
# Linear-kernel SVM on TF-IDF features. Observed test accuracy: ~0.8851
# (slower than Naive Bayes). SVC was referenced here but never imported in
# the original script; it must come from sklearn.svm (see imports).
vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 4), max_features=20000)
vec.fit(x_train)  # learn vocabulary and IDF weights from the training split
classifier = SVC(kernel='linear')
classifier.fit(vec.transform(x_train), y_train)
print(classifier.score(vec.transform(x_test), y_test))
# Cross-validation can also be added during training to improve the accuracy
# estimate. Since there are 5 classes, use *stratified* K-fold so each fold
# keeps the class distribution roughly balanced. Reference code:
# sklearn.cross_validation was removed in scikit-learn 0.20; StratifiedKFold
# now lives in sklearn.model_selection and uses the n_splits/.split() API.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

def stratifiedkfold_cv(x, y, clf_class, shuffle=True, n_folds=5, **kwargs):
    """Out-of-fold predictions via stratified K-fold cross-validation.

    x: indexable feature matrix (e.g. a sparse matrix from a vectorizer).
    y: 1-D numpy array of labels.
    clf_class: classifier CLASS (not an instance); instantiated once per fold
        with **kwargs.
    n_folds: number of stratified folds.
    Returns an array aligned with y where every row is predicted exactly once,
    by the fold that held it out.
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=shuffle)
    # The original used y_pred = y[:], but slicing an ndarray returns a VIEW,
    # so writing predictions into it would overwrite the true labels.
    y_pred = y.copy()
    for train_index, test_index in skf.split(x, y):
        X_train, X_test = x[train_index], x[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
# Pass the classifier CLASS, not an instance: stratifiedkfold_cv calls
# clf_class(**kwargs) once per fold, and a MultinomialNB *instance* is not
# callable (the original NB = MultinomialNB() raised TypeError inside the CV loop).
NB = MultinomialNB
print(precision_score(y, stratifiedkfold_cv(vec.transform(x), np.array(y), NB), average='macro'))