# 0 Read the data and inspect its structure ----
df_raw <- read.csv("SMS_spam.csv", stringsAsFactors = FALSE)
str(df_raw)
length(df_raw$type)

# Split the data into the feature text X and the class label y;
# convert y to a factor for classification
X <- df_raw$text
y <- factor(df_raw$type)
length(y)

# Inspect the structure and composition of the class label vector y
str(y)
table(y)

# Install and load the text-mining package
# install.packages("tm")
library(NLP)
library(tm)

# Create the corpus from the raw message texts
X_corpus <- VCorpus(VectorSource(X))
# 1 Clean the text data ----
# 1.1 Convert the text to lower case
X_corpus_clean <- tm_map(X_corpus, content_transformer(tolower))
# 1.2 Remove numbers from the text
X_corpus_clean <- tm_map(X_corpus_clean, removeNumbers)
# 1.3 Remove stop words from the text
X_corpus_clean <- tm_map(X_corpus_clean, removeWords, stopwords())
# 1.4 Remove punctuation from the text
X_corpus_clean <- tm_map(X_corpus_clean, removePunctuation)
# Add the stemming package
# install.packages("SnowballC")
library(SnowballC)
# 1.5 Reduce each word in the text to its stem
X_corpus_clean <- tm_map(X_corpus_clean, stemDocument)
# 1.6 Strip extra whitespace
X_corpus_clean <- tm_map(X_corpus_clean, stripWhitespace)
# 1.7 Tokenize the documents into words: create the document-term matrix
X_dtm <- DocumentTermMatrix(X_corpus_clean)
# 2 Prepare the input data ----
# 2.1 Split into training and test data sets
X_dtm_train <- X_dtm[1:4169, ]
X_dtm_test  <- X_dtm[4170:5559, ]
y_train <- y[1:4169]
y_test  <- y[4170:5559]
# Note: the rows of df_raw are already in random order, so the first 75%
# can be taken directly as the training data
# 2.2 Check whether the class distribution is skewed
prop.table(table(y_train))
prop.table(table(y_test))
# 2.3 Filter the DTM: keep only frequently occurring terms
# (the threshold 5 can be tuned by trial and error to adjust model performance)
X_freq_words <- findFreqTerms(X_dtm_train, 5)
# Filter the DTMs down to the frequent terms
X_dtm_train_freq <- X_dtm_train[, X_freq_words]
X_dtm_test_freq  <- X_dtm_test[, X_freq_words]
# 2.4 Encode the numeric matrix as categorical text
# 2.4.1 Define the conversion function: any positive count becomes "Yes"
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
# 2.4.2 Convert the training and test matrices column by column
X_train <- apply(X_dtm_train_freq, MARGIN = 2, convert_counts)
X_test  <- apply(X_dtm_test_freq, MARGIN = 2, convert_counts)
# 3 Train the model on the data ----
# install.packages("e1071")
library(e1071)
# Train the model; the Laplace smoothing parameter defaults to 0
NB_classifier <- naiveBayes(X_train, y_train)
# 4 Evaluate model performance ----
# 4.1 Predict on the test-set samples
y_pred <- predict(NB_classifier, X_test)
# Compare the predicted values with the true values
library(gmodels)
CrossTable(x = y_test, y = y_pred, prop.chisq = FALSE, prop.t = FALSE,
           prop.c = FALSE, dnn = c("actual", "predict"))
# The confusion matrix of model NB_classifier's predictions on the test set gives:
# Accuracy = 0.864 + 0.110 = 0.974
# Tuning the model parameters
# 5 Improve model performance ----
# 5.1 Retrain the model with a Laplace estimate of 1
NB_classifier2 <- naiveBayes(X_train, y_train, laplace = 1)
# 5.2 Predict on the test-set samples
y_pred2 <- predict(NB_classifier2, X_test)
# 5.3 Compare the predicted values with the true values
CrossTable(x = y_test, y = y_pred2, prop.chisq = FALSE, prop.t = FALSE,
           prop.c = FALSE, dnn = c("actual", "predict"))
# After tuning, the confusion matrix of model NB_classifier2's predictions on the test set gives:
# Accuracy = 0.865 + 0.112 = 0.977
# Note:
# After adding the Laplace estimate, the model's accuracy improved from 0.974
# to 0.977 — a further gain on an already highly accurate model, which is not easy.