NLP Product Feasibility Evaluation -- Python/nltk

#!/usr/bin/env python
# coding: utf-8
# author: LIU, Jing

# In[1]:


# Pre-processing
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize

# Modeling
from gensim import corpora, models

# For first-time users: download these two NLTK data packages once
# (the punkt tokenizer and the stop-word list)
#nltk.download('punkt')
#nltk.download('stopwords')

import numpy as np
import pandas as pd


# In[2]:


df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv', index_col=0)
sentiment_label = df["Recommended IND"]
df.head()


# In[157]:


# Pre-processing: keep the columns needed for modeling
pdtextpreprocess = df[["Title", "Review Text", "Rating"]].copy()  # .copy() avoids SettingWithCopyWarning below
pdtextpreprocess['index'] = pdtextpreprocess.index

# Prepend the title to the review text so both contribute to each document
documents = [str(m) + " " + str(n) for m, n in zip(pdtextpreprocess["Title"], pdtextpreprocess["Review Text"])]
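
# Note: missing titles/reviews are NaN, and str(np.nan) yields the literal
# token "nan". A minimal alternative sketch (an assumption, not what the rest
# of the notebook uses; documents_nonan is a new illustrative name):
documents_nonan = (pdtextpreprocess["Title"].fillna("") + " "
                   + pdtextpreprocess["Review Text"].fillna("")).tolist()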


# In[5]:


# Clean-up pipeline: tokenize, lowercase, remove stop words and punctuation,
# stem, then drop stems that occur only once in the corpus
stop_words = set(stopwords.words('english'))
english_punctuations = set(string.punctuation)
stemmer = PorterStemmer()
#stemmer = LancasterStemmer()

def preprocess(documents):
    texts_tokenized = [[word.lower() for word in word_tokenize(str(document))] for document in documents]
    texts_filtered = [[word for word in document if word not in stop_words and word not in english_punctuations]
                      for document in texts_tokenized]
    texts_stemmed = [[stemmer.stem(word) for word in document] for document in texts_filtered]
    stem_counts = Counter(stem for text in texts_stemmed for stem in text)  # much faster than list.count in a loop
    return [[stem for stem in text if stem_counts[stem] > 1] for text in texts_stemmed]  # keep stems that appear more than once

texts = preprocess(documents)
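
# Quick sanity check (illustrative): one raw document next to its cleaned tokens
print(documents[0])
print(texts[0])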


# In[6]:


from gensim.corpora import Dictionary

dictionary = Dictionary(texts)  # fit dictionary
corpus = [dictionary.doc2bow(line) for line in texts]  # convert corpus to BoW format

tfidf = models.TfidfModel(corpus)  # fit the TF-IDF weighting
corpus_tfidf = tfidf[corpus]  # represent each document by its TF-IDF weights
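
# A quick way to inspect the result (illustrative only): the five
# highest-weighted stems of the first review
for term_id, weight in sorted(corpus_tfidf[0], key=lambda x: -x[1])[:5]:
    print(dictionary[term_id], round(weight, 3))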


# ## Build the LSA and LDA topic models for the reviews. 

# In[13]:


# LSA model (LsiModel defaults to num_topics=200 when unspecified)
from gensim.models.lsimodel import LsiModel
lsi = LsiModel(corpus_tfidf, id2word=dictionary)
topics_lsi = lsi.show_topics()
for tpc in topics_lsi:
    print(tpc)


# In[26]:


# LDA model, fit on the raw BoW counts (LdaModel defaults to num_topics=100)
from gensim.models.ldamodel import LdaModel
ldamodel = LdaModel(corpus, id2word=dictionary)
topics_lda = ldamodel.show_topics()
for tpc in topics_lda:
    print(tpc)
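
# Neither topic model above sets num_topics explicitly, so gensim's defaults
# apply. A minimal sketch of a smaller, more interpretable LDA fit (10 topics
# is an arbitrary choice, not from the original; random_state makes it repeatable):
lda10 = LdaModel(corpus, id2word=dictionary, num_topics=10, passes=5, random_state=42)
for tpc in lda10.show_topics(num_topics=10):
    print(tpc)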


# ## Predict sentiment
# 
# Predict the sentiment (Recommended / Not Recommended) of each review by fitting a logistic regression model. The cells below vectorize the raw title + review text directly (the TF-IDF branch of `extract_features`) rather than using bag-of-topics features from the topic models; a sketch of the latter follows for reference.
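
# In[ ]:


# For reference, a minimal sketch (an illustration, not used downstream) of
# turning the fitted LDA model into dense "bag-of-topics" feature vectors,
# one row per review; topic_features is a new name introduced here:
from gensim.matutils import corpus2dense
topic_features = corpus2dense(ldamodel[corpus], num_terms=ldamodel.num_topics).T
topic_features.shape  # (n_reviews, n_topics)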

# In[28]:


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# In[65]:


# GET FEATURES: assemble one frame with the cleaned tokens and the label, then split it
df_lr = pdtextpreprocess.copy()
df_lr['doc'] = texts
df_lr['Recommended IND'] = sentiment_label

# train_test_split defaults to a 75/25 split
training_data, testing_data = train_test_split(df_lr, random_state=2000)
training_data = training_data.set_index('index')
testing_data = testing_data.set_index('index')
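
# Quick check (illustrative) that the label balance is similar in both splits
print(training_data['Recommended IND'].value_counts(normalize=True))
print(testing_data['Recommended IND'].value_counts(normalize=True))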


# In[66]:


testing_data


# In[46]:


Y_train = training_data['Recommended IND'].values
Y_test = testing_data['Recommended IND'].values


# In[69]:


df_train = training_data[['Title', 'Review Text', 'Rating', 'doc']].copy()  # .copy() so later column assignments don't warn
df_test = testing_data[['Title', 'Review Text', 'Rating', 'doc']].copy()


# In[90]:


# Rebuild the title + review-text documents separately for each split
documents_train = [str(m) + " " + str(n) for m, n in zip(df_train["Title"], df_train["Review Text"])]
documents_test = [str(m) + " " + str(n) for m, n in zip(df_test["Title"], df_test["Review Text"])]


# In[93]:


# Apply the same clean-up pipeline to each split on its own (note: the
# rare-stem pruning is therefore computed per split)
texts_train = preprocess(documents_train)
texts_test = preprocess(documents_test)


# In[132]:


def extract_features(training_data, testing_data, feature_type="binary"):
    """Vectorize raw documents; feature_type selects binary presence,
    raw counts, or TF-IDF (the fallback for any other value)."""
    if "binary" in feature_type:
        # BINARY FEATURE REPRESENTATION
        cv = CountVectorizer(binary=True)
        train_feature_set = cv.fit_transform(training_data)
        test_feature_set = cv.transform(testing_data)
        return train_feature_set, test_feature_set, cv

    elif "counts" in feature_type:
        # COUNT-BASED FEATURE REPRESENTATION
        cv = CountVectorizer(binary=False)
        train_feature_set = cv.fit_transform(training_data)
        test_feature_set = cv.transform(testing_data)
        return train_feature_set, test_feature_set, cv

    else:
        # TF-IDF-BASED FEATURE REPRESENTATION
        tfidf_vectorizer = TfidfVectorizer(use_idf=True)
        train_feature_set = tfidf_vectorizer.fit_transform(training_data)
        test_feature_set = tfidf_vectorizer.transform(testing_data)
        return train_feature_set, test_feature_set, tfidf_vectorizer

# In[133]:


# 'feature_rep' matches neither "binary" nor "counts", so the TF-IDF branch runs
X_train, X_test, feature_transformer = extract_features(documents_train, documents_test, feature_type='feature_rep')


# In[143]:


# give liblinear enough iterations to converge on this feature set
scikit_log_reg = LogisticRegression(solver='liblinear', max_iter=1000)
lr_model = scikit_log_reg.fit(X_train, Y_train)


# In[144]:


preds=lr_model.predict(X_test)


# In[159]:


actual_index = df_test.index
df_test['pred_Recommend'] = preds
df_test['actual_Recommend'] = sentiment_label[actual_index]


# In[161]:


df_test


# ## Calculate the prediction error
# (Defined as the percentage of incorrect predictions)

# In[165]:


from sklearn.metrics import accuracy_score
y_pred = df_test['pred_Recommend']
y_actual = df_test['actual_Recommend']
# accuracy_score expects (y_true, y_pred); the error defined above is 1 - accuracy
1 - accuracy_score(y_actual, y_pred)
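
# Beyond the single error rate, a per-class breakdown (a sketch using the
# variables above, not part of the original analysis):
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_actual, y_pred))
print(classification_report(y_actual, y_pred))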
