1.读取
import csv

# 1. Load the SMS corpus: a tab-separated file where each row is
# "<label>\t<message text>" (label is ham/spam).
file_path = r'C:\Users\we\Desktop\SMSSpamCollection'
sms_data = []   # preprocessed token lists, one per message
sms_label = []  # ham / spam labels, parallel to sms_data
# `with` guarantees the file is closed even if a row raises mid-loop
# (the original called close() manually, leaking the handle on error).
with open(file_path, 'r', encoding='utf-8') as sms:
    csv_reader = csv.reader(sms, delimiter='\t')
    for line in csv_reader:
        sms_label.append(line[0])                 # column 0: label
        sms_data.append(preprocessing(line[1]))   # column 1: message body
2.数据预处理
def cixing(word):
    """Map a Penn Treebank POS tag to the WordNet POS constant for lemmatization.

    Falls back to NOUN for any unrecognized tag, which matches
    WordNetLemmatizer's own default part of speech.
    """
    if word.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif word.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif word.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif word.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN


def preprocessing(text):
    """Tokenize *text*, drop English stopwords, then POS-tag and lemmatize.

    Returns a list of lemmatized tokens.
    """
    # Sentence-split first, then word-tokenize each sentence.
    tokens = [word
              for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # Stopword removal: a set gives O(1) membership tests; the original
    # used the raw list, making the filter O(len(stopwords)) per token.
    stops = set(stopwords.words("english"))
    filtered = [tok for tok in tokens if tok not in stops]
    # POS-tag the filtered tokens and lemmatize each one guided by its tag;
    # iterating (token, tag) pairs avoids the original's index bookkeeping.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(tok, pos=cixing(tag))
            for tok, tag in nltk.pos_tag(filtered)]
3.数据划分—训练集和测试集数据划分
from sklearn.model_selection import train_test_split

# 3. Split into train/test (80/20). stratify=sms_label preserves the
# ham/spam ratio in both subsets; random_state=0 makes the shuffle
# reproducible. (A broken duplicate call that referenced the undefined
# names `data`, `target` and `y_train` was removed — it would raise
# NameError before the real split ever ran.)
x_train, x_test, y_train, y_test = train_test_split(sms_data, sms_label, test_size=0.2, random_state=0,
                                                    stratify=sms_label)
print('原数据长度:', len(sms_data), '\n训练数据长度:', len(x_train), '\n测试数据长度:', len(x_test))
4.文本特征提取
可选的文本特征提取工具：
sklearn.feature_extraction.text.CountVectorizer
sklearn.feature_extraction.text.TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer()  # TF-IDF vectorizer; fitted on the training texts later (see the vectorization step)
观察邮件与向量的关系
向量还原为邮件
5.模型选择
from sklearn.naive_bayes import GaussianNB      # Gaussian NB: for continuous, normally-distributed features
from sklearn.naive_bayes import MultinomialNB   # Multinomial NB: for discrete count/frequency features (word counts, TF-IDF)
说明为什么选择这个模型?
选择 MultinomialNB（多项式朴素贝叶斯），因为它适合离散的计数/频率型特征（如词频、TF-IDF 向量），正符合文本分类的特征形式；而 GaussianNB 假设特征服从连续的正态分布，不适合稀疏的文本向量。
# Vectorization: turn the token lists into TF-IDF feature matrices.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf2 = TfidfVectorizer()
# preprocessing() returns a list of tokens per message, but TfidfVectorizer
# expects raw strings (it calls .lower() on each document) — the original
# would crash with "'list' object has no attribute 'lower'". Join the
# tokens back into space-separated text first.
train_texts = [' '.join(doc) for doc in x_train]
test_texts = [' '.join(doc) for doc in x_test]
# Fit the vocabulary on the training set only, then reuse it for the test
# set — fitting on test data would leak information.
X_train = tfidf2.fit_transform(train_texts)
X_test = tfidf2.transform(test_texts)
# Read .shape from the sparse matrices directly; the original called
# .toarray() just to get the shape, materializing the full dense matrix.
print('X_train矩阵长度:', X_train.shape, '\nX_test矩阵长度:', X_test.shape)
print('邮件以及向量关系矩阵:\n', X_train.toarray())
print('词汇表:\n', tfidf2.vocabulary_)
6.模型评价:混淆矩阵,分类报告
from sklearn.metrics import confusion_matrix

# 5. Evaluation: confusion matrix of true vs. predicted labels.
# Bind the result to a new name — the original assigned it to
# `confusion_matrix`, shadowing the imported function and breaking any
# later call to it.
cm = confusion_matrix(y_test, y_predict)
说明混淆矩阵的含义
from sklearn.metrics import classification_report
说明准确率、精确率、召回率、F值分别代表的意义
7.比较与总结
如果用CountVectorizer进行文本特征生成,与TfidfVectorizer相比,效果如何?