Text Classification

Code:

from gensim import corpora, models

# Load the dictionary built during preprocessing
dictionary = corpora.Dictionary.load("corpus.dict")


f = open("prepare_corpus.txt", "rb")
corpusX, corpusY = [], []
for line in f:
    # Each line: a label followed by whitespace-separated tokens
    words = line.decode("utf-8").strip().split()
    corpusY.append(words[0])
    corpusX.append(words[1:])
f.close()
# Dataset size
print("===" * 20)
print(len(corpusX), len(corpusY))

Output:

============================================================
12833 12833
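
The dictionary and the prepared corpus come from an earlier preprocessing step that is not shown in this post. A minimal sketch of how corpus.dict could have been produced from the same prepare_corpus.txt (assuming each line is a label followed by whitespace-separated tokens):

from gensim import corpora

docs = []
with open("prepare_corpus.txt", "rb") as f:
    for line in f:
        # Drop the leading label, keep only the tokens
        docs.append(line.decode("utf-8").strip().split()[1:])

dictionary = corpora.Dictionary(docs)  # build the token -> id mapping
dictionary.save("corpus.dict")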

代码:

# 总共六个类别
print("===" * 20)
mapY = list(set(corpusY))
print("/".join(mapY))
print(mapY[0])

# 为标签建立索引
print("===" * 20)
indexY = {}
for cls in mapY:
    indexY[cls] = len(indexY)
for k, v in indexY.items():
    print("%s %d" % (k, v))
    
y = [indexY[x] for x in corpusY]

Output:

============================================================
国际/体育/财经/军事/社会/国内
国际
============================================================
国际 0
体育 1
财经 2
军事 3
社会 4
国内 5

Code:

# Build bag-of-words feature vectors for each news article
X = [dictionary.doc2bow(doc, allow_update=True, return_missing=False) for doc in corpusX]
print(X[0][:20])
print("===" * 20)
print(len(X))

Output:

[(90, 1), (480, 2), (1069, 1), (1193, 1), (1276, 1), (1406, 1), (1626, 1), (2180, 1), (2444, 2), (2666, 4), (2895, 1), (2957, 4), (3020, 1), (3023, 1), (3039, 2), (3102, 5), (3108, 1), (3133, 1), (3672, 1), (3804, 1)]
============================================================
12833
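
Each document is now a list of (token_id, count) pairs; for example, (480, 2) means the token with id 480 appears twice in the first article. A small sketch (not in the original post) for checking which words those ids stand for:

# Map the first few (token_id, count) pairs of the first document back to words
for token_id, count in X[0][:5]:
    print(dictionary[token_id], count)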

Code:

# Text classification with sklearn

# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
# Check the split sizes
print(len(X_train), len(X_test), len(y_train), len(y_test))

Output:

10266 2567 10266 2567
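
The split above is purely random; since the six classes are not evenly represented (see the support column in the report further down), a stratified split is an optional variant, not used in the original post, that keeps the class proportions identical in both sets:

# Optional: stratify on the labels so train/test keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666, stratify=y)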

Code:

# The bag-of-words data is sparse
from sklearn.svm import SVC

# Instantiate the classifier with default parameters
clf = SVC()
print(clf)

print("===" * 20)
import gensim
# corpus2csc returns a (num_terms, num_docs) sparse matrix; pass num_terms explicitly
# so the training matrix has the same number of feature columns as the test matrix
scipy_csc_matrix = gensim.matutils.corpus2csc(X_train, num_terms=len(dictionary))
# Train the model (transpose to sklearn's expected (n_samples, n_features) layout)
clf.fit(scipy_csc_matrix.T, y_train)

Output:

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
============================================================
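
corpus2csc lays the corpus out as a terms × documents matrix, which is why it is transposed before fit; a quick sanity check of the orientation (not part of the original post):

# The sparse matrix is (num_terms, num_docs); sklearn expects (n_samples, n_features)
print(scipy_csc_matrix.shape)    # (len(dictionary), len(X_train))
print(scipy_csc_matrix.T.shape)  # (len(X_train), len(dictionary))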

Code:

length_dict = len(dictionary)
# Densify a single test document (corpus2dense also returns terms x docs)
numpy_matrix = gensim.matutils.corpus2dense(X_test[24:25], num_terms=length_dict)

# Predict its class
pred = clf.predict(numpy_matrix.T)
print(pred)
# The prediction matches the label, haha (it took a while to find one that matched)
# Note: strictly speaking, the ground truth for X_test[24] is y_test[24:25], not y[24:25]
print("===" * 20)
print(y[24:25])

Output:

[1]
============================================================
[1]
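
pred holds the integer class id; since indexY was filled in mapY order, mapY itself works as the inverse mapping, so the id can be turned back into a readable category name (a small addition, not in the original post):

# Map the predicted class id back to the category name (id 1 is 体育 in the index printed above)
print(mapY[pred[0]])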

代码:

# 测试集
numpy_matrix = gensim.matutils.corpus2dense(X_test, num_terms=length_dict)
# shape
print(numpy_matrix.shape)
# 预测
y_pred = clf.predict(numpy_matrix.T)


from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

Output:

(123780, 2567)
             precision    recall  f1-score   support

          0       0.75      0.16      0.26       306
          1       0.44      0.99      0.61       794
          2       1.00      0.03      0.05       178
          3       0.94      0.34      0.50       353
          4       0.84      0.07      0.12       323
          5       0.60      0.57      0.59       613

avg / total       0.68      0.52      0.45      2567
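
Per-class recall is very uneven (class 2 is almost never predicted), so the report is worth reading class by class; for a single overall number, accuracy can also be computed (not done in the original post):

from sklearn.metrics import accuracy_score

# Overall fraction of correctly classified test documents
print(accuracy_score(y_test, y_pred))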

Code:

# Save the trained model
import pickle as pkl
with open("corpus.clf_model", "wb") as fw:
    pkl.dump(clf, fw)
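
The classifier alone is not enough to label new text in a later session; the dictionary and the label mapping are also needed. A minimal sketch for persisting them too (the corpus.label_index file name is made up for this example):

# Persist the (possibly updated) dictionary and the label mapping alongside the model
dictionary.save("corpus.dict")
with open("corpus.label_index", "wb") as fw:  # hypothetical file name
    pkl.dump((mapY, indexY), fw)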

Code:

# Load the model back
fr = open("corpus.clf_model", "rb")
clf_new = pkl.load(fr)
fr.close()
print(clf_new)

Output:

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
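
As a closing sketch (not part of the original post), this is how the reloaded model could classify a brand-new article, assuming the text has already been segmented into a word list the same way prepare_corpus.txt was:

# A hypothetical, already-tokenized new article
new_doc = ["世界杯", "小组赛", "进球"]
# Words not in the dictionary are simply ignored by doc2bow
bow = dictionary.doc2bow(new_doc)
vec = gensim.matutils.corpus2dense([bow], num_terms=len(dictionary))
pred = clf_new.predict(vec.T)
print(mapY[pred[0]])  # back to the category name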


Reposted from blog.csdn.net/wangsiji_buaa/article/details/80279810