使用 sklearn 的 LinearSVC 实现短文本二分类

代码

# coding: utf-8
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import KFold
import csv
import numpy as np
# Hard-coded Windows paths: csv_file is the labelled CSV read by load_dataset(),
# test_file is the CSV whose documents are scored (passed to predict_data()).
csv_file = r"F:\train.csv"
test_file = r"F:\test.csv"

def load_dataset(path=None):
    """Load the labelled training CSV into a DataFrame.

    The document text is read from the second column of each row and the
    label from the third column. Only rows whose label is exactly
    ``'Negative'`` or ``'Positive'`` are kept; every other row (other label
    values, blank lines, rows with fewer than three columns) is skipped.

    Args:
        path: CSV file to read. Defaults to the module-level ``csv_file``.

    Returns:
        A DataFrame with columns ``['label', 'document']`` holding the kept
        rows in file order (empty if nothing matched).
    """
    if path is None:
        path = csv_file
    wanted_labels = ('Negative', 'Positive')
    # errors='ignore' silently drops bytes that are not valid gb18030;
    # newline='' lets the csv module handle line endings itself.
    with open(path, 'r', encoding='gb18030', errors='ignore', newline='') as handle:
        rows = [
            (record[2], record[1])
            for record in csv.reader(handle)
            # The length guard keeps blank/truncated rows from raising IndexError.
            if len(record) > 2 and record[2] in wanted_labels
        ]
    return pd.DataFrame(data=rows, columns=['label', 'document'])
def plot_labels_rel_documents(df):
    """Show a bar chart of how many documents each label has.

    Args:
        df: DataFrame with ``label`` and ``document`` columns.
    """
    # Open an 8x6 inch figure first, then draw the per-label counts.
    plt.figure(figsize=(8, 6))
    documents_per_label = df.groupby('label')['document'].count()
    documents_per_label.plot.bar(ylim=0)
    plt.show()
def preprocess_df(df_train):
    """Drop rows without a document and add an integer ``label_id`` column.

    Args:
        df_train: Two-column DataFrame (label, document). It is not modified.

    Returns:
        A new DataFrame with columns ``['label', 'document', 'label_id']``,
        where ``label_id`` is the integer code ``factorize()`` assigns to each
        distinct label in order of first appearance.

    Raises:
        ValueError: if the input does not have exactly two columns.
    """
    # Work on an explicit copy: adding a column to a filtered slice is
    # chained assignment and triggers pandas' SettingWithCopyWarning.
    cleaned = df_train[pd.notnull(df_train['document'])].copy()
    cleaned.columns = ['label', 'document']
    cleaned['label_id'] = cleaned['label'].factorize()[0]
    return cleaned

def train_lsvc_classifier(df_train, test):
    """Fit a character n-gram TF-IDF + LinearSVC model and score ``test``.

    Args:
        df_train: DataFrame with a ``document`` text column and a
            ``label_id`` column used as the training target.
        test: DataFrame with a ``document`` column holding the texts to score.

    Returns:
        dict with the keys
            ``'df'``         dense TF-IDF matrix of the training documents,
            ``'df_label'``   ``df_train.label_id``,
            ``'y_test'``     true labels of the held-out split,
            ``'y_pred'``     predicted labels of the held-out split,
            ``'test_pred'``  output of ``_predict_proba_lr`` for ``test``,
            ``'pred'``       predicted labels for every training document,
            ``'df_train'``   the ``df_train`` argument itself.
    """
    # min_df=1 keeps every n-gram, exactly like the former integer min_df=0,
    # but is also accepted by scikit-learn releases that validate parameters
    # (an integer min_df must be >= 1 there).
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, max_df=0.8, norm='l2',
                            encoding='utf-8', ngram_range=(1, 4),
                            analyzer='char')

    # NOTE(review): the matrices are densified because the dense array is part
    # of the returned dict ('df'); with 1-4 char n-grams this can be very large.
    features = tfidf.fit_transform(df_train.document).toarray()
    labels = df_train.label_id
    test_features = tfidf.transform(test.document).toarray()

    # Randomly hold out a tiny fraction (test_size=0.0001) of the training set.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.0001, random_state=0)

    model = LinearSVC()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Predictions on the full training matrix (includes rows the model was fit on).
    pred = model.predict(features)
    # NOTE(review): _predict_proba_lr is a private scikit-learn helper and may
    # change between releases; presumably it maps decision scores to [0, 1] —
    # confirm before treating the output as calibrated probabilities.
    test_pred = model._predict_proba_lr(test_features)
    print(test_pred)

    return {
        'df': features,
        'df_label': labels,
        'y_test': y_test,
        'y_pred': y_pred,
        'test_pred': test_pred,
        'pred': pred,
        'df_train': df_train,
    }

def predict_data(csv_file):
    """Read the documents to classify from a CSV file.

    The text is taken from the second column of every non-empty row; no row
    is treated as a header. Blank lines are skipped.

    Args:
        csv_file: Path of the CSV file to read (decoded as gb18030).

    Returns:
        A DataFrame with a single ``document`` column, one row per CSV record.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
    """
    # errors='ignore' silently drops bytes that are not valid gb18030;
    # newline='' lets the csv module handle line endings itself.
    with open(csv_file, 'r', encoding='gb18030', errors='ignore', newline='') as handle:
        # csv.reader yields [] for blank lines; skip them instead of crashing.
        documents = [record[1] for record in csv.reader(handle) if record]
    return pd.DataFrame(data=documents, columns=['document'])
def classification_report(y_test, y_pred, df_train):
    """Summarise predictions as a text report plus a ROC AUC score.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        df_train: DataFrame whose distinct ``label`` values are used as the
            class names in the report.

    Returns:
        Tuple ``(report, auc)`` from ``metrics.classification_report`` and
        ``metrics.roc_auc_score`` respectively.
    """
    class_names = df_train['label'].unique()
    report = metrics.classification_report(y_test, y_pred, target_names=class_names)
    auc = metrics.roc_auc_score(y_test, y_pred)
    return report, auc


if __name__ == '__main__':
    # Load and prepare the labelled data, then read the documents to score.
    train = load_dataset()
    preprocessed_df = preprocess_df(train)
    # Optional: uncomment to inspect the label distribution.
    # plot_labels_rel_documents(preprocessed_df)
    test = predict_data(test_file)
    classifier = train_lsvc_classifier(preprocessed_df, test)

    # NOTE(review): this AUC is computed from predictions on the full training
    # set, so it includes rows the model was fitted on.
    _, auc = classification_report(classifier['df_label'], classifier['pred'], classifier['df_train'])
    print(auc)

    # Write the per-class scores of the test documents to sheet 'a'.
    pred = classifier['test_pred']
    data = pd.DataFrame(pred)
    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas 2.x.
    with pd.ExcelWriter('cccccc.xlsx') as writer:
        data.to_excel(writer, sheet_name='a', float_format='%.7f')

猜你喜欢

转载自www.cnblogs.com/DutlKY/p/10885573.html
今日推荐