# Code
# coding: utf-8
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import KFold
import csv
import numpy as np
# Labelled training CSV; opened by load_dataset().
csv_file = r"F:\train.csv"
# CSV holding the documents to score; the script entry point passes it to
# predict_data().
test_file = r"F:\test.csv"
def load_dataset(path=None):
    """Load the labelled training rows from a GB18030-encoded CSV file.

    Only rows whose third column is exactly ``'Negative'`` or ``'Positive'``
    are kept; every other row (other label values, blank lines, rows with
    fewer than three columns) is skipped.

    Args:
        path: CSV file to read. When omitted, the module-level ``csv_file``
            is used, so the original zero-argument call keeps working.

    Returns:
        A ``pandas.DataFrame`` with columns ``label`` (third CSV column) and
        ``document`` (second CSV column), in file order.
    """
    if path is None:
        path = csv_file
    wanted_labels = ('Negative', 'Positive')
    # The context manager closes the handle even if parsing raises.
    with open(path, 'r', encoding='gb18030', errors='ignore') as handle:
        dataset = [
            (row[2], row[1])
            for row in csv.reader(handle)
            # Check the length first: csv.reader yields [] for blank lines,
            # which would otherwise raise IndexError on row[2].
            if len(row) > 2 and row[2] in wanted_labels
        ]
    return pd.DataFrame(data=dataset, columns=['label', 'document'])
def plot_labels_rel_documents(df):
    """Display a bar chart of the number of documents per label.

    Args:
        df: DataFrame with ``label`` and ``document`` columns.
    """
    # Open an 8x6 figure before plotting.
    plt.figure(figsize=(8, 6))
    docs_per_label = df.groupby('label')['document'].count()
    docs_per_label.plot.bar(ylim=0)
    plt.show()
def preprocess_df(df_train):
    """Drop rows without text and add an integer ``label_id`` column.

    Args:
        df_train: DataFrame with exactly two columns, one of them named
            ``document``. The input frame itself is not modified.

    Returns:
        A new DataFrame with columns ``label``, ``document`` and
        ``label_id``. ``label_id`` comes from ``Series.factorize`` on the
        ``label`` column, so ids follow first-appearance order.
    """
    # Work on an explicit copy: assigning columns on a boolean-mask slice of
    # the caller's frame is chained assignment and triggers
    # SettingWithCopyWarning.
    cleaned = df_train[pd.notnull(df_train['document'])].copy()
    cleaned.columns = ['label', 'document']
    cleaned['label_id'] = cleaned['label'].factorize()[0]
    return cleaned
def train_lsvc_classifier(df_train, test, test_size=0.0001, random_state=0):
    """Fit a LinearSVC on character n-gram TF-IDF features and score ``test``.

    Args:
        df_train: DataFrame providing a ``document`` column (text) and a
            ``label_id`` column (class ids).
        test: DataFrame with a ``document`` column holding the texts to score.
        test_size: Hold-out fraction passed to ``train_test_split``. The
            default reproduces the original hard-coded value.
        random_state: Seed passed to ``train_test_split``. The default
            reproduces the original hard-coded value.

    Returns:
        A dict with the keys:
            ``'df'``: dense TF-IDF feature matrix of ``df_train.document``.
            ``'df_label'``: ``df_train.label_id``.
            ``'y_test'`` / ``'y_pred'``: hold-out labels and the model's
                predictions for them.
            ``'test_pred'``: result of ``_predict_proba_lr`` on the
                vectorised ``test`` documents.
            ``'pred'``: predictions for every row of ``df_train``, including
                the rows the model was fitted on.
            ``'df_train'``: the ``df_train`` argument, unchanged.
    """
    # min_df must be an int >= 1 (or a float in [0, 1]); scikit-learn's
    # parameter validation rejects the integer 0. Every vocabulary term
    # occurs in at least one document, so 1 selects the same terms.
    tfidf = TfidfVectorizer(
        sublinear_tf=True,
        min_df=1,
        max_df=0.8,
        norm='l2',
        encoding='utf-8',
        ngram_range=(1, 4),
        analyzer='char',
    )
    # NOTE(review): .toarray() densifies the TF-IDF matrices, which can use a
    # lot of memory on a large corpus. Kept because the dense matrix is
    # returned under 'df'.
    features = tfidf.fit_transform(df_train.document).toarray()
    labels = df_train.label_id
    test_features = tfidf.transform(test.document).toarray()

    # Randomly split the training set into a fit part and a hold-out part.
    # For cross-validation, sklearn.model_selection.cross_val_score is the
    # usual tool.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state)

    model = LinearSVC()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pred = model.predict(features)
    # NOTE(review): _predict_proba_lr is an underscore-prefixed (private)
    # scikit-learn helper and may change between releases. Presumably it
    # returns per-class scores derived from the decision function; confirm
    # against the installed version.
    test_pred = model._predict_proba_lr(test_features)
    print(test_pred)

    return {
        'df': features,
        'df_label': labels,
        'y_test': y_test,
        'y_pred': y_pred,
        'test_pred': test_pred,
        'pred': pred,
        'df_train': df_train,
    }
def predict_data(csv_file):
    """Read the documents to classify from a GB18030-encoded CSV file.

    Each non-blank row contributes one entry, taken from its second column.
    No row is treated as a header, so a header row (if the file has one) is
    returned as a document too.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A ``pandas.DataFrame`` with a single ``document`` column, in file
        order.
    """
    # The context manager closes the handle even if parsing raises.
    with open(csv_file, 'r', encoding='gb18030', errors='ignore') as handle:
        documents = [
            # A row with no second column becomes an empty document instead
            # of raising IndexError, so later rows keep their position.
            row[1] if len(row) > 1 else ''
            for row in csv.reader(handle)
            # csv.reader yields [] for blank lines; those are not records.
            if row
        ]
    return pd.DataFrame(data=documents, columns=['document'])
def classification_report(y_test, y_pred, df_train):
    """Build the scikit-learn text report and ROC AUC for a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        df_train: DataFrame whose unique ``label`` values are used as the
            report's ``target_names``.

    Returns:
        A ``(report, auc)`` tuple: the result of
        ``metrics.classification_report`` followed by the result of
        ``metrics.roc_auc_score``.
    """
    class_names = df_train['label'].unique()
    text_report = metrics.classification_report(
        y_test, y_pred, target_names=class_names)
    auc_score = metrics.roc_auc_score(y_test, y_pred)
    return text_report, auc_score
if __name__ == '__main__':
    # Load the labelled rows and add integer label ids.
    train = load_dataset()
    preprocessed_df = preprocess_df(train)
    # plot_labels_rel_documents(preprocessed_df)

    # Read the documents to score, then fit the model and score them.
    test = predict_data(test_file)
    classifier = train_lsvc_classifier(preprocessed_df, test)

    # AUC here compares predictions on the full training set with its labels;
    # the text report is discarded.
    _, auc = classification_report(
        classifier['df_label'], classifier['pred'], classifier['df_train'])
    print(auc)

    data = pd.DataFrame(classifier['test_pred'])
    # ExcelWriter.save() was removed in pandas 2.0. The context manager
    # writes and closes the workbook, and sheet_name is passed by keyword
    # because positional use is deprecated.
    with pd.ExcelWriter('cccccc.xlsx') as writer:
        data.to_excel(writer, sheet_name='a', float_format='%.7f')