# Part 1: Bag of words + Random Forest (labeled data)
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import nltk
from nltk.corpus import stopwords
# Load the labeled IMDB training data (tab-separated columns: id, sentiment, review).
datapath = os.path.join('labeledTrainData.tsv')
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0 — confirm the pinned pandas version, or migrate to `on_bad_lines='skip'`.
df = pd.read_csv(datapath,sep='\t',error_bad_lines=False)
print('Number of reviews:{}'.format(len(df)))
df.head()
# English stopword set shared by clean_text below.
eng_stopwords = set(stopwords.words('english'))
def clean_text(text, remove_stopwords=True):
    """Strip HTML, keep only letters, lowercase, and drop English stopwords.

    Args:
        text: raw review text (may contain HTML markup).
        remove_stopwords: when True (the default, matching the original
            behavior) filter out NLTK English stopwords.  The parameter is
            added for consistency with the second `clean_text` defined later
            in this file and is backward compatible.

    Returns:
        The cleaned review as a single space-joined string.
    """
    # Strip HTML tags first so tag names don't leak into the vocabulary.
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Replace every non-letter character with a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return ' '.join(words)
# Clean every review once up front so the vectorizer sees plain text only.
df['clean_review'] = df.review.apply(clean_text)
df.head()
# max_features=5000 keeps the 5000 most frequent words as the vocabulary
# (it is a vocabulary-size cap, not a frequency threshold).
vectorizer = CountVectorizer(max_features = 5000)
# Dense (n_reviews, 5000) word-count matrix.
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
train_data_features.shape
# Random forest with 100 trees.  random_state is pinned for reproducibility,
# consistent with the classifier in the word2vec section of this file.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)
# Free the training data before loading the test set to keep peak memory down.
del df
del train_data_features
# Load the test set and push it through the same cleaning + vocabulary.
datafile = os.path.join('testData.tsv')
# NOTE(review): `error_bad_lines` was removed in pandas 2.0 — verify the
# pinned pandas version, or switch to `on_bad_lines='skip'`.
df = pd.read_csv(datafile,sep='\t',error_bad_lines=False)
print('Number of reviews:{}'.format(len(df)))
df['clean_review'] = df.review.apply(clean_text)
df.head()
# transform (not fit_transform): reuse the vocabulary fitted on the train set.
test_data_features = vectorizer.transform(df.clean_review).toarray()
test_data_features.shape
result = forest.predict(test_data_features)
# Submission-style frame: one predicted sentiment per review id.
output = pd.DataFrame({'id':df.id,'sentiment':result})
output.head()
# Part 2: word2vec + Random Forest (uses unlabeled data)
import functools
import itertools
import os
import re
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
def load_dataset(name, nrows=None):
    """Load one of the IMDB review TSV files into a DataFrame.

    Args:
        name: one of 'unlabeled_train', 'labeled_train', 'test'.
        nrows: optional row limit forwarded to pandas.  (Fix: the original
            accepted this parameter but silently ignored it.)

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: if `name` is not a known dataset key.
    """
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv',
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join(datasets[name])
    # NOTE(review): `error_bad_lines` was removed in pandas 2.0; confirm the
    # pinned pandas version or migrate to `on_bad_lines='skip'`.
    df = pd.read_csv(data_file, sep='\t', error_bad_lines=False, nrows=nrows)
    print('Number of reviews:{}'.format(len(df)))
    return df
# Unlabeled reviews — used only for training the word2vec embeddings.
df = load_dataset('unlabeled_train')
df.head()
# English stopword set shared by clean_text below.
eng_stopwords = set(stopwords.words('english'))
def clean_text(text, remove_stopwords=False):
    """Normalize a raw review into a list of lowercase word tokens.

    HTML markup is stripped, non-letter characters become spaces, and
    English stopwords are dropped only when `remove_stopwords` is True.
    """
    stripped = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', stripped)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]
# Pre-trained Punkt sentence splitter for English (requires nltk punkt data).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def print_call_counts(f):
    """Decorator: print progress on every 1000th call of the wrapped function.

    Cheap progress feedback when `f` is applied across a large DataFrame.
    Fix: uses functools.wraps so the wrapped function keeps its __name__ and
    docstring (the original wrapper discarded them).
    """
    n = 0

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        # Fires on calls 1, 1001, 2001, ... so the very first call is logged.
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)

    return wrapped
@print_call_counts
def split_sentences(review):
    """Split one review into sentences, each tokenized via clean_text."""
    sentence_texts = tokenizer.tokenize(review.strip())
    return [clean_text(raw) for raw in sentence_texts if raw]
# NOTE(review): the original line used the IPython magic `%time`, which is a
# syntax error in a plain .py file — replaced with explicit timing.
# Also, `sum(list_of_lists, [])` is quadratic; chain.from_iterable is linear.
start = time.perf_counter()
sentences = list(itertools.chain.from_iterable(df.review.apply(split_sentences)))
print('Wall time: {:.1f} s'.format(time.perf_counter() - start))
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))
import logging
# Surface gensim's per-epoch progress logging on stdout.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level = logging.INFO )
# Word2vec training hyperparameters.
num_features = 300    # dimensionality of the word vectors
min_word_count = 40   # ignore words seen fewer than min_count times
num_workers = 4       # parallel worker threads
context = 10          # context window size
downsampling = 1e-3   # downsampling threshold for very frequent words
# Model filename encodes the hyperparameters, e.g. 300features_40minwords_10context.model
model_name = '{}features_{}minwords_{}context.model'.format(num_features,min_word_count,context)
print('Training model...')
# Train word2vec on the unlabeled sentences (gensim 4 API: vector_size).
model = Word2Vec(sentences, workers=num_workers,
                 vector_size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling)
# NOTE(review): init_sims() is deprecated in gensim 4 (vector normalization
# now happens lazily inside similarity queries), and with replace=True it
# destroyed the raw vectors; the call is dropped as unnecessary for the
# most_similar/doesnt_match usage below.
# Make sure the target directory exists — model.save does not create it.
os.makedirs(os.path.join('.', 'models'), exist_ok=True)
model.save(os.path.join('.', 'models', model_name))
# Quick sanity checks on the learned embeddings.
print(model.wv.doesnt_match('man woman child kitchen'.split()))
print(model.wv.doesnt_match('france england germany berlin'.split()))
model.wv.most_similar('man')
# Reload the saved model (demonstrates the persistence round-trip).
model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(os.path.join('.', 'models', model_name))
# Switch to the labeled training data for the classifier.
df = load_dataset('labeled_train')
df.head()
def to_review_vector(review):
    """Represent a review as the mean of its in-vocabulary word vectors.

    Args:
        review: raw review text.

    Returns:
        pd.Series of length model.wv.vector_size.  Fix: returns an all-zero
        vector when the review contains no in-vocabulary words — the
        original called .mean(axis=0) on an empty array, producing
        NaNs/errors that would poison the classifier input.
    """
    words = clean_text(review, remove_stopwords=True)
    # One row per known word; each row is that word's embedding.
    vectors = np.array([model.wv[w] for w in words if w in model.wv])
    if vectors.size == 0:
        # No known words: fall back to the zero vector.
        return pd.Series(np.zeros(model.wv.vector_size))
    return pd.Series(vectors.mean(axis=0))
# One averaged-embedding feature row (num_features columns) per review.
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()
# Random forest with 100 trees; fixed seed for reproducibility.
# NOTE(review): RandomForestClassifier is imported only in the first half of
# this file — this section relies on that earlier import.
forest = RandomForestClassifier(n_estimators = 100,random_state=42) # 100 trees
forest = forest.fit(train_data_features,df.sentiment)
# Free the training data before loading the test set.
del df
del train_data_features
# Predict on the test set using the same averaged-embedding features.
df = load_dataset('test')
df.head()
test_data_features = df.review.apply(to_review_vector)
test_data_features.head()
result = forest.predict(test_data_features)
# Submission-style frame: one predicted sentiment per review id.
output = pd.DataFrame({'id': df.id, 'sentiment': result})
# Fix: create the output directory if missing — to_csv does not create it.
os.makedirs(os.path.join('.', 'data'), exist_ok=True)
output.to_csv(os.path.join('.', 'data', 'Word2Vec_model.csv'), index=False)
output.head()