代码:
from gensim import corpora, models

# Load the previously built gensim dictionary (token -> integer id).
dictionary = corpora.Dictionary.load("corpus.dict")

# Each line of the corpus file is: <label> <token1> <token2> ...
# Use a context manager so the file handle is always closed
# (the original opened the file and never closed it).
corpusX, corpusY = [], []
with open("prepare_corpus.txt", "rb") as f:
    for line in f:
        words = line.decode("utf-8").strip().split()
        corpusY.append(words[0])   # first token is the class label
        corpusX.append(words[1:])  # remaining tokens are the document

# Dataset size sanity check.
print("===" * 20)
print(len(corpusX), len(corpusY))
运行结果:
============================================================ 12833 12833
代码:
# There are six classes in total.
print("===" * 20)
# Sort the unique labels so the label -> index mapping is deterministic
# across runs: iterating a plain set gives an arbitrary, hash-dependent
# order, which would silently invalidate a previously saved model.
mapY = sorted(set(corpusY))
print("/".join(mapY))
print(mapY[0])

# Build an integer index for each label.
print("===" * 20)
indexY = {cls: i for i, cls in enumerate(mapY)}
for k, v in indexY.items():
    print("%s %d" % (k, v))

# Encode every document's label as its integer index.
y = [indexY[x] for x in corpusY]
运行结果:
============================================================ 国际/体育/财经/军事/社会/国内 国际 ============================================================ 国际 0 体育 1 财经 2 军事 3 社会 4 国内 5
代码:
# Build a bag-of-words feature vector for every news document.
# allow_update=False: the vocabulary was fixed when the dictionary was
# built; allow_update=True (as the original had) mutates the loaded
# dictionary while vectorizing, silently growing the feature space and
# making the vectors inconsistent with any previously saved model.
# (return_missing=False is the default and was dropped.)
X = [dictionary.doc2bow(doc, allow_update=False) for doc in corpusX]
print(X[0][:20])
print("===" * 20)
print(len(X))
运行结果:
[(90, 1), (480, 2), (1069, 1), (1193, 1), (1276, 1), (1406, 1), (1626, 1), (2180, 1), (2444, 2), (2666, 4), (2895, 1), (2957, 4), (3020, 1), (3023, 1), (3039, 2), (3102, 5), (3108, 1), (3133, 1), (3672, 1), (3804, 1)] ============================================================ 12833
代码:
# Text classification with scikit-learn: hold out 20% of the data for testing.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# Report the size of each split.
sizes = (len(X_train), len(X_test), len(y_train), len(y_test))
print(*sizes)
运行结果:
10266 2567 10266 2567
代码:
# The BoW vectors are sparse; convert them to a scipy CSC matrix for sklearn.
from sklearn.svm import SVC

# Instantiate the classifier with default hyperparameters.
clf = SVC()
print(clf)
print("===" * 20)

import gensim

# Pass num_terms explicitly: without it, corpus2csc infers the dimension
# from the largest term id actually present in X_train, which can differ
# from the full dictionary size and break prediction on test matrices
# built later with num_terms=len(dictionary).
scipy_csc_matrix = gensim.matutils.corpus2csc(X_train, num_terms=len(dictionary))

# corpus2csc returns (num_terms, num_docs); sklearn wants samples as rows,
# hence the transpose.
clf.fit(scipy_csc_matrix.T, y_train)
运行结果:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) ============================================================ Out[5]: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
代码:
length_dict = len(dictionary)

# Densify one held-out document and predict its class.
numpy_matrix = gensim.matutils.corpus2dense(X_test[24:25], num_terms=length_dict)
pred = clf.predict(numpy_matrix.T)
print(pred)

print("===" * 20)
# Compare against the matching held-out label: X_test[24] pairs with
# y_test[24], NOT y[24]. The original printed y[24:25] — labels from the
# unshuffled dataset — so any agreement it showed was coincidental.
print(y_test[24:25])
运行结果:
[1] ============================================================ [1]
代码:
# Evaluate the classifier on the entire held-out test set.
from sklearn.metrics import classification_report

dense_test = gensim.matutils.corpus2dense(X_test, num_terms=length_dict)
# corpus2dense returns (num_terms, num_docs); show the shape before transposing.
print(dense_test.shape)

# Predict with samples as rows, then print a per-class precision/recall report.
y_pred = clf.predict(dense_test.T)
print(classification_report(y_test, y_pred))
运行结果:
(123780, 2567) precision recall f1-score support 0 0.75 0.16 0.26 306 1 0.44 0.99 0.61 794 2 1.00 0.03 0.05 178 3 0.94 0.34 0.50 353 4 0.84 0.07 0.12 323 5 0.60 0.57 0.59 613 avg / total 0.68 0.52 0.45 2567
代码:
# Persist the trained classifier to disk with pickle.
import pickle as pkl

with open("corpus.clf_model", "wb") as model_file:
    pkl.dump(clf, model_file)
代码:
# Reload the pickled classifier. Use a context manager so the handle is
# closed (the original leaked the open file object). NOTE: pickle.load is
# only safe here because the file was produced by this same script —
# never unpickle untrusted data.
with open("corpus.clf_model", "rb") as fr:
    clf_new = pkl.load(fr)
print(clf_new)
运行结果:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)