[Preliminary Round] 2019 CCCC Big Data Challenge: Short-Text Sentiment Classification

For this short-text sentiment classification task I started by following someone else's baseline with an LSTM, which scored around 0.82 online; I then switched to a bidirectional variant, i.e. a Bi-LSTM model.
The code is as follows:

import numpy as np
import pandas as pd
import re
train = pd.read_csv("C:/Users/Nicht_sehen/Desktop/train.csv", lineterminator='\n')
test = pd.read_csv("C:/Users/Nicht_sehen/Desktop/test.csv", lineterminator='\n')

# Map the text labels to integers
train['label'] = train['label'].map({'Negative': 0, 'Positive': 1})

# Clean the data: strip symbols that carry no real meaning
def clean(string):
    string = re.sub(r"\'", "", string)
    string = re.sub(r",", " ", string)
    string = re.sub(r"\t", "", string)
    string = re.sub(r"\n", "", string)
    string = re.sub(r"\d", "", string)
    string = re.sub(r"\.", "", string)
    # handle escape-like residue such as "\x..." before stripping bare
    # backslashes; in the original order these two patterns could never match
    string = re.sub(r"\\x\.+", "", string)
    string = re.sub(r"\\x+", "", string)
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\-", " ", string)
    string = re.sub(r"^_.", "", string)
    string = re.sub(r"^ ", "", string)
    string = re.sub(r" $", "", string)  # drop a trailing space (originally a no-op " " -> " ")
    string = re.sub(r"_", " ", string)
    string = re.sub(r"!", " ", string)
    string = re.sub(r"\(", " ", string)
    string = re.sub(r"\)", " ", string)
    string = re.sub(r"\?", " ", string)
    string = re.sub(r"\#\.", " ", string)

    return string.lower()

def hashing(word):
  word = re.sub(r'ain$', r'ein', word)
  word = re.sub(r'ai', r'ae', word)
  word = re.sub(r'ay$', r'e', word)
  word = re.sub(r'ey$', r'e', word)
  word = re.sub(r'ie$', r'y', word)
  word = re.sub(r'^es', r'is', word)
  word = re.sub(r'a+', r'a', word)
  word = re.sub(r'j+', r'j', word)
  word = re.sub(r'd+', r'd', word)
  word = re.sub(r'u', r'o', word)
  word = re.sub(r'o+', r'o', word)
  word = re.sub(r'ee+', r'i', word)
  if not re.match(r'ar', word):
    word = re.sub(r'ar', r'r', word)
  word = re.sub(r'iy+', r'i', word)
  word = re.sub(r'ih+', r'eh', word)
  word = re.sub(r's+', r's', word)
  # bug fix: the original tested the literal string 'word', so this branch
  # never ran; also guard against an empty word before indexing word[-1]
  if word and re.search(r'[rst]y', word) and word[-1] != 'y':
    word = re.sub(r'y', r'i', word)
  if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
    word = re.sub(r'i$', r'y', word)
  if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
    word = re.sub(r'h', '', word)
  word = re.sub(r'k', r'q', word)
  return word
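# A hand-traced illustration of what hashing() does (crude spelling
# normalization, e.g. collapsing letter elongations common in short posts):
#   hashing("cool")  -> "col"
#   hashing("baaad") -> "bad"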

# Second cleaning pass: apply clean() and hashing() word by word
def delk(sentence):
    words = sentence.split(' ')
    sen = ''
    for word in words:
        if not word:  # split(' ') yields '' for consecutive spaces, never ' '
            continue
        sen = sen + ' ' + hashing(clean(word))
    return sen

train['review'] = [delk(s) for s in train['review']]
test['review'] = [delk(s) for s in test['review']]
# print(train['review'][3])
print('data clean end')

# Slice out the review and label columns
X_train = train.values[:, 1]
X_test = test.values[:, 1]
temp_test = test.values
Y_train = train.values[:, -1]
print('data slice end')

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras import regularizers

max_features = 3000
batch_size = 32
X_all = np.concatenate([X_train, X_test])
len_train = len(X_train)
# use max_features here as well; the original passed num_words=2500 while the
# Embedding layer was sized for 3000, leaving part of the embedding matrix unused
tokenizer = Tokenizer(num_words=max_features,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True, split=' ')
tokenizer.fit_on_texts(X_all)
X = tokenizer.texts_to_sequences(X_all)
# Zero-pad all sequences to the same length
X = pad_sequences(X)
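# Note: pad_sequences pads/truncates at the start of each sequence by default
# (padding='pre'); since train and test were tokenized together, they share
# one common max length.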

model = Sequential()
model.add(Embedding(max_features, 128, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.001)))
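# Bidirectional runs the 64-unit LSTM over the sequence in both directions and
# concatenates the two outputs (default merge_mode='concat'), so the layer
# feeds a 128-dim vector into the final sigmoid.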

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

x_train = X[:len_train]
x_test = X[len_train:]
y_train = Y_train.astype('int8')  # slicing .values leaves an object array

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=5)

print("training end!")

pre_prob = model.predict(x_test)
print(pre_prob[:5])  # quick sanity check of the predicted probabilities
submit = pd.DataFrame(columns=['ID', 'Pred'])
submit['ID'] = temp_test[:, 0]
submit['Pred'] = pre_prob
submit.to_csv('C:/Users/Nicht_sehen/Desktop/te.csv', index=None)

Since the texts are short, the model must not be trained for too long or it overfits quickly; with epochs=5 the online score came to 0.84.
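If you would rather not hand-tune the epoch count, a validation split plus early stopping is the usual guard against this. A minimal sketch, assuming the standard Keras EarlyStopping callback and the x_train/y_train from the script above:

from keras.callbacks import EarlyStopping

# stop once validation loss stops improving and keep the best weights
es = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=20,              # generous upper bound; the callback decides
          validation_split=0.1,   # hold out 10% of the training data
          callbacks=[es])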

Because deep models overfit easily on short texts, I next tried a traditional SVM on TF-IDF features, but the result was not great: with 10-fold cross-validation the local score was only around 0.79-0.80. The code is as follows:

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer

data_path="./1.txt"
train = pd.read_csv("C:/Users/Nicht_sehen/Desktop/train.csv", lineterminator='\n')
test = pd.read_csv(data_path)

# Map the text labels to integers
train['label'] = train['label'].map({'Negative': 0, 'Positive': 1})

# clean(), hashing() and delk() are the same preprocessing helpers as in the
# Bi-LSTM script above, so they are not repeated here

train['review'] = [delk(s) for s in train['review']]
test['review'] = [delk(s) for s in test['review']]
# print(train['review'][3])
print('data clean end')

# Slice out the review and label columns
X_train = train.values[:, 1]
X_test = test.values[:, 1]
temp_test = test.values
Y_train = train.values[:, -1]

y_train = np.array(Y_train)
y_train = y_train.astype('int8')
print('data slice end')

vectorizer = TfidfVectorizer(sublinear_tf=True,
                             ngram_range=(1, 2),
                             max_df=0.5)
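# Above: sublinear_tf replaces raw term counts with 1 + log(tf);
# ngram_range=(1, 2) uses unigrams plus bigrams; max_df=0.5 discards terms
# appearing in more than half of all documents (near-stopwords).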

X = np.concatenate([X_train, X_test])
len_train = len(X_train)
vectorizer.fit(X)
X = vectorizer.transform(X)

x_train = X[:len_train]
x_test = X[len_train:]

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
# shuffle must be True for random_state to have any effect
# (newer scikit-learn raises an error otherwise)
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=2500)
predictions = np.zeros(x_test.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    print("Fold :{}".format(fold_ + 1))
    trn_data = x_train[trn_idx]
    trn_label= y_train[trn_idx]
    val_data = x_train[val_idx]
    val_label= y_train[val_idx]
    model_SVM = SVC(C=1, kernel="linear")
    model_SVM.fit(trn_data, trn_label)
    print("auc score: {:<8.5f}".format(metrics.roc_auc_score(val_label, model_SVM.predict(val_data))))
    predictions += model_SVM.predict(x_test) / folds.n_splits

output = pd.DataFrame({"ID": test["ID"], "Pred": predictions})
output.to_csv('C:/Users/Nicht_sehen/Desktop/sgd.csv',index=None)
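One caveat about the loop above: predict() returns hard 0/1 labels, so both the printed AUC and the averaged test predictions throw away ranking information. A hedged drop-in sketch for those lines, using SVC's decision_function (signed distances to the separating hyperplane) instead:

    # inside the fold loop, in place of the predict() calls:
    val_scores = model_SVM.decision_function(val_data)
    print("auc score: {:<8.5f}".format(metrics.roc_auc_score(val_label, val_scores)))
    predictions += model_SVM.decision_function(x_test) / folds.n_splits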

In the end I went with SGD + TF-IDF and made it through the preliminary round. The cutoff was around 0.85; local CV gave 0.87 and the public leaderboard also showed 0.87. The private leaderboard score is hidden, but it showed as passed. :)
The code is as follows:

import numpy as np
import pandas as pd
import re

data_path="./1.txt"
train = pd.read_csv("C:/Users/Nicht_sehen/Desktop/train.csv", lineterminator='\n')
test = pd.read_csv(data_path)
# Map the text labels to integers
train['label'] = train['label'].map({'Negative': 0, 'Positive': 1})

# clean(), hashing() and delk() are the same preprocessing helpers as in the
# Bi-LSTM script above, so they are not repeated here


train['review'] = [delk(s) for s in train['review']]
test['review'] = [delk(s) for s in test['review']]
print('data clean end')

# Slice out the review and label columns
X_train = train.values[:, 1]
X_test = test.values[:, 1]
temp_test = test.values
Y_train = train.values[:, -1]

y_train = np.array(Y_train)
y_train = y_train.astype('int8')
print('data slice end')

# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(sublinear_tf=True,
                     ngram_range=(1, 2),
                     max_df=0.6)

X = np.concatenate([X_train, X_test])
len_train = len(X_train)
tf.fit(X)
X = tf.transform(X)
x_train = X[:len_train]
x_test = X[len_train:]

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.linear_model import SGDClassifier

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=2000)  # shuffle=True so random_state takes effect
predictions = np.zeros(x_test.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    trn_data = x_train[trn_idx]
    trn_label = y_train[trn_idx]
    val_data = x_train[val_idx]
    val_label = y_train[val_idx]
    SGD = SGDClassifier(alpha=0.00001, random_state=2, shuffle=True, loss='log')  # 'log' was renamed 'log_loss' in newer scikit-learn
    SGD.fit(trn_data, trn_label)
    print("score: {:.5f}".format(metrics.roc_auc_score(val_label, SGD.predict_proba(val_data)[:, 1])))
    predictions += SGD.predict_proba(x_test)[:, 1] / folds.n_splits

# Save the submission
pre = pd.DataFrame(columns=['ID','Pred'])
pre['ID'] = test["ID"]
pre['Pred'] = predictions
pre.to_csv('C:/Users/Nicht_sehen/Desktop/sgd.csv', index=None)
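To get one overall cross-validated AUC instead of ten per-fold prints, the usual trick is to collect out-of-fold predictions. A minimal sketch reusing the variables from the script above:

oof = np.zeros(x_train.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    SGD = SGDClassifier(alpha=0.00001, random_state=2, shuffle=True, loss='log')
    SGD.fit(x_train[trn_idx], y_train[trn_idx])
    oof[val_idx] = SGD.predict_proba(x_train[val_idx])[:, 1]  # each row predicted exactly once
print("overall CV auc: {:.5f}".format(metrics.roc_auc_score(y_train, oof)))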

Summary:
1. This preliminary round taught me some text-processing techniques, such as TF-IDF and word2vec (a minimal sketch follows below).
2. On short-text classification, the traditional methods beat the neural networks here, or at least generalized better.
3. I also looked into models such as HAN and TextCNN (they just didn't perform well, possibly because my data preprocessing was lacking).
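Since word2vec is mentioned above but never appears in the code, here is a minimal, hedged sketch of training embeddings on the cleaned reviews with gensim (assuming gensim 4.x, where the dimension parameter is called vector_size; gensim 3.x calls it size):

from gensim.models import Word2Vec

# each cleaned review becomes a list of tokens; train 100-dim skip-gram vectors
sentences = [s.split() for s in train['review']]
w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=2, sg=1)
some_word = w2v.wv.index_to_key[0]  # most frequent token in the corpus
print(some_word, w2v.wv.most_similar(some_word, topn=5))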
