

library(tidyverse) # 清洗数据
library(here) # 设置数据文件路径
library(tidytext) # 分词及创建稀疏矩阵
library(e1071) # 建模
library(gmodels) # 评估模型


# Complete feature-engineering pipeline: read the labelled SMS file, tokenise
# the messages, and build one row per message holding a No/Yes indicator for
# every frequent word, followed by the message label (`type1`) and id (`row`).

# Read and tokenise once, so the word matrix and the labels come from exactly
# the same rows of the same file.
sms_tokens <- read_csv(here('content', 'post', 'data', '02-sms_spam.csv')) %>% 
  mutate(type = factor(type),
         row = row_number()) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words, by = 'word') %>%  # `word` is the only shared column
  filter(!str_detect(word, '\\d'))        # discard tokens containing a digit

sms <- sms_tokens %>% 
  cast_sparse(row, word) %>% 
  as.matrix() %>% 
  as_tibble() %>% 
  select(which(colSums(.) > 4)) %>%       # keep words whose column sum is > 4
  # Recode the indicators before the label/id columns are attached, so the
  # integer `row` id is not swept up and turned into a No/Yes factor.
  mutate_all(factor, levels = c(0, 1), labels = c('No', 'Yes')) %>% 
  # bind_cols() pairs rows by position: both tables must list the messages in
  # the same order. The label is named `type1` explicitly so it cannot clash
  # with a word column called `type` and does not depend on how bind_cols()
  # repairs duplicated names.
  bind_cols(sms_tokens %>% 
              select(type1 = type, row) %>% 
              distinct())




# Load the raw SMS file into `sms`, then display it.
sms <- read_csv(here('content', 'post', 'data', '02-sms_spam.csv'))
sms
## # A tibble: 5,574 x 2
##    type  text                                                              
##    <chr> <chr>                                                             
##  1 ham   Go until jurong point, crazy.. Available only in bugis n great wo~
##  2 ham   Ok lar... Joking wif u oni...                                     
##  3 spam  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 200~
##  4 ham   U dun say so early hor... U c already then say...                 
##  5 ham   Nah I don't think he goes to usf, he lives around here though     
##  6 spam  FreeMsg Hey there darling it's been 3 week's now and no word back~
##  7 ham   Even my brother is not like to speak with me. They treat me like ~
##  8 ham   As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vet~
##  9 spam  WINNER!! As a valued network customer you have been selected to r~
## 10 spam  Had your mobile 11 months or more? U R entitled to Update to the ~
## # ... with 5,564 more rows


# Reload the data, converting the label to a factor and tagging every message
# with its row number so tokens can later be traced back to their message.
sms <- mutate(
  read_csv(here('content', 'post', 'data', '02-sms_spam.csv')),
  type = factor(type),
  row = row_number()
)


# Tokenise the messages: one row per token, with stop words and any token
# containing a digit removed. The result is stored in `sms` and displayed.
sms <- read_csv(here('content', 'post', 'data', '02-sms_spam.csv')) %>% 
  mutate(type = factor(type), row = row_number()) %>% 
  unnest_tokens(output = word, input = text) %>% 
  anti_join(stop_words) %>% 
  filter(!str_detect(word, '\\d'))
sms
## # A tibble: 34,390 x 3
##    type    row word  
##    <fct> <int> <chr> 
##  1 ham       1 jurong
##  2 ham       1 crazy 
##  3 ham       1 bugis 
##  4 ham       1 world 
##  5 ham       1 la    
##  6 ham       1 buffet
##  7 ham       1 cine  
##  8 ham       1 amore 
##  9 ham       1 wat   
## 10 ham       2 lar   
## # ... with 34,380 more rows



# Spread the tokens into a wide message-by-word indicator matrix and show it
# as a tibble (one row per message, one column per word).
(sms <- read_csv(here('content', 'post', 'data', '02-sms_spam.csv')) %>% 
  mutate(type = factor(type),
         row = row_number()) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words, by = 'word') %>%  # `word` is the only shared column
  filter(!str_detect(word, '\\d')) %>% 
  cast_sparse(row, word) %>% 
  as.matrix() %>% 
  # Final step and closing parenthesis: without them the statement is left
  # open and the remainder of the script cannot be parsed.
  as_tibble())
## # A tibble: 5,454 x 7,440
##    jurong crazy bugis world    la buffet  cine amore   wat   lar joking
##     <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1      1     1     1     1     1      1     1     1     1     0      0
##  2      0     0     0     0     0      0     0     0     0     1      1
##  3      0     0     0     0     0      0     0     0     0     0      0
##  4      0     0     0     0     0      0     0     0     0     0      0
##  5      0     0     0     0     0      0     0     0     0     0      0
##  6      0     0     0     0     0      0     0     0     0     0      0
##  7      0     0     0     0     0      0     0     0     0     0      0
##  8      0     0     0     0     0      0     0     0     0     0      0
##  9      0     0     0     0     0      0     0     0     0     0      0
## 10      0     0     0     0     0      0     0     0     0     0      0
## # ... with 5,444 more rows, and 7,429 more variables: wif <dbl>,
## #   oni <dbl>, free <dbl>, entry <dbl>, wkly <dbl>, comp <dbl>, win <dbl>,
## #   fa <dbl>, cup <dbl>, final <dbl>, tkts <dbl>, text <dbl>,
## #   receive <dbl>, question <dbl>, std <dbl>, txt <dbl>, rate <dbl>,
## #   apply <dbl>, dun <dbl>, hor <dbl>, nah <dbl>, usf <dbl>, lives <dbl>,
## #   freemsg <dbl>, hey <dbl>, darling <dbl>, `week's` <dbl>, word <dbl>,
## #   fun <dbl>, tb <dbl>, xxx <dbl>, chgs <dbl>, send <dbl>, rcv <dbl>,
## #   brother <dbl>, speak <dbl>, treat <dbl>, aids <dbl>, patent <dbl>,
## #   request <dbl>, melle <dbl>, oru <dbl>, minnaminunginte <dbl>,
## #   nurungu <dbl>, vettam <dbl>, set <dbl>, callertune <dbl>,
## #   callers <dbl>, press <dbl>, copy <dbl>, friends <dbl>, winner <dbl>,
## #   valued <dbl>, network <dbl>, customer <dbl>, selected <dbl>,
## #   receivea <dbl>, prize <dbl>, reward <dbl>, claim <dbl>, call <dbl>,
## #   code <dbl>, valid <dbl>, hours <dbl>, mobile <dbl>, months <dbl>,
## #   entitled <dbl>, update <dbl>, colour <dbl>, mobiles <dbl>,
## #   camera <dbl>, gonna <dbl>, home <dbl>, talk <dbl>, stuff <dbl>,
## #   anymore <dbl>, tonight <dbl>, cried <dbl>, chances <dbl>, cash <dbl>,
## #   pounds <dbl>, cost <dbl>, day <dbl>, tsandcs <dbl>, reply <dbl>,
## #   hl <dbl>, info <dbl>, urgent <dbl>, won <dbl>, week <dbl>,
## #   membership <dbl>, jackpot <dbl>, www.dbuk.net <dbl>, lccltd <dbl>,
## #   pobox <dbl>, searching <dbl>, words <dbl>, breather <dbl>,
## #   promise <dbl>, wont <dbl>, ...


# Same wide message-by-word table, restricted to the columns whose sum is
# greater than 4. The result is stored in `sms` and displayed.
sms <- read_csv(here('content', 'post', 'data', '02-sms_spam.csv')) %>% 
  mutate(type = factor(type), row = row_number()) %>% 
  unnest_tokens(output = word, input = text) %>% 
  anti_join(stop_words) %>% 
  filter(!str_detect(word, '\\d')) %>% 
  cast_sparse(row = row, column = word) %>% 
  as.matrix() %>% 
  as_tibble() %>% 
  select(which(colSums(.) > 4))
sms
## # A tibble: 5,454 x 1,312
##    crazy bugis world    la  cine   wat   lar joking   wif  free entry  wkly
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1     1     1     1     1     1     0      0     0     0     0     0
##  2     0     0     0     0     0     0     1      1     1     0     0     0
##  3     0     0     0     0     0     0     0      0     0     1     1     1
##  4     0     0     0     0     0     0     0      0     0     0     0     0
##  5     0     0     0     0     0     0     0      0     0     0     0     0
##  6     0     0     0     0     0     0     0      0     0     0     0     0
##  7     0     0     0     0     0     0     0      0     0     0     0     0
##  8     0     0     0     0     0     0     0      0     0     0     0     0
##  9     0     0     0     0     0     0     0      0     0     0     0     0
## 10     0     0     0     0     0     0     0      0     0     1     0     0
## # ... with 5,444 more rows, and 1,300 more variables: comp <dbl>,
## #   win <dbl>, cup <dbl>, final <dbl>, text <dbl>, receive <dbl>,
## #   question <dbl>, std <dbl>, txt <dbl>, rate <dbl>, apply <dbl>,
## #   dun <dbl>, nah <dbl>, usf <dbl>, freemsg <dbl>, hey <dbl>,
## #   darling <dbl>, word <dbl>, fun <dbl>, xxx <dbl>, send <dbl>,
## #   brother <dbl>, speak <dbl>, treat <dbl>, request <dbl>, set <dbl>,
## #   callertune <dbl>, callers <dbl>, press <dbl>, copy <dbl>,
## #   friends <dbl>, winner <dbl>, valued <dbl>, network <dbl>,
## #   customer <dbl>, selected <dbl>, prize <dbl>, reward <dbl>,
## #   claim <dbl>, call <dbl>, code <dbl>, valid <dbl>, hours <dbl>,
## #   mobile <dbl>, months <dbl>, entitled <dbl>, update <dbl>,
## #   colour <dbl>, mobiles <dbl>, camera <dbl>, gonna <dbl>, home <dbl>,
## #   talk <dbl>, stuff <dbl>, anymore <dbl>, tonight <dbl>, cash <dbl>,
## #   pounds <dbl>, cost <dbl>, day <dbl>, reply <dbl>, hl <dbl>,
## #   info <dbl>, urgent <dbl>, won <dbl>, week <dbl>, pobox <dbl>,
## #   searching <dbl>, words <dbl>, promise <dbl>, wont <dbl>,
## #   wonderful <dbl>, times <dbl>, date <dbl>, sunday <dbl>, credit <dbl>,
## #   click <dbl>, wap <dbl>, link <dbl>, message <dbl>, http <dbl>,
## #   watching <dbl>, eh <dbl>, remember <dbl>, naughty <dbl>, wet <dbl>,
## #   fine <dbl>, feel <dbl>, england <dbl>, dont <dbl>, miss <dbl>,
## #   team <dbl>, news <dbl>, ur <dbl>, national <dbl>, `i‘m` <dbl>,
## #   ha <dbl>, ü <dbl>, pay <dbl>, da <dbl>, ...


# Final modelling table: one row per message with a No/Yes indicator for every
# frequent word, followed by the message label (`type1`) and id (`row`).

# Read and tokenise once, so the word matrix and the labels come from exactly
# the same rows instead of two separate passes over the file.
sms_tokens <- read_csv(here('content', 'post', 'data', '02-sms_spam.csv')) %>% 
  mutate(type = factor(type),
         row = row_number()) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words, by = 'word') %>%  # `word` is the only shared column
  filter(!str_detect(word, '\\d'))        # discard tokens containing a digit

sms <- sms_tokens %>% 
  cast_sparse(row, word) %>% 
  as.matrix() %>% 
  as_tibble() %>% 
  select(which(colSums(.) > 4)) %>%       # keep words whose column sum is > 4
  # Recode the indicators before the label/id columns are attached, so the
  # integer `row` id is not swept up and turned into a No/Yes factor.
  mutate_all(factor, levels = c(0, 1), labels = c('No', 'Yes')) %>% 
  # bind_cols() pairs rows by position: both tables must list the messages in
  # the same order. The label is named `type1` explicitly so it cannot clash
  # with a word column called `type` and does not depend on how bind_cols()
  # repairs duplicated names.
  bind_cols(sms_tokens %>% 
              select(type1 = type, row) %>% 
              distinct())


# Split into training and test sets by row position.
# setdiff() treats data frames as sets: it collapses duplicated rows and drops
# every held-out message whose columns are identical to some training message,
# so it does not return the true complement of the sample. Indexing by the
# sampled positions keeps every message in exactly one of the two sets.
# NOTE(review): no seed is set, so the split differs between runs.
train_rows <- sample(nrow(sms), 4169)
sms_train <- sms[train_rows, ]
sms_test <- sms[-train_rows, ]



# Fit a naive Bayes classifier on the word indicators and evaluate it on the
# held-out messages.
# The label and row id are the last two columns of the table; dropping them by
# position relative to the end (rather than by the literal indices 1313/1314)
# keeps the label out of the predictors even if the vocabulary size changes.
sms_class <- naiveBayes(sms_train[, seq_len(ncol(sms_train) - 2)],
                        sms_train$type1)
sms_pred <- predict(sms_class, sms_test)


# Confusion table of predicted vs. actual labels with row and column
# proportions; chi-square and table proportions are switched off.
CrossTable(sms_pred, sms_test$type1, 
           prop.chisq = FALSE, prop.t = FALSE,
           dnn = c('predicted', 'actual'))
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## Total Observations in Table:  890 
##              | actual 
##    predicted |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |       789 |        14 |       803 | 
##              |     0.983 |     0.017 |     0.902 | 
##              |     0.996 |     0.143 |           | 
## -------------|-----------|-----------|-----------|
##         spam |         3 |        84 |        87 | 
##              |     0.034 |     0.966 |     0.098 | 
##              |     0.004 |     0.857 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       792 |        98 |       890 | 
##              |     0.890 |     0.110 |           | 
## -------------|-----------|-----------|-----------|



# Refit the classifier with Laplace smoothing (laplace = 1) and evaluate it on
# the same held-out messages for comparison with the unsmoothed model.
# The label and row id are the last two columns of the table; dropping them by
# position relative to the end (rather than by the literal indices 1313/1314)
# keeps the label out of the predictors even if the vocabulary size changes.
sms_class1 <- naiveBayes(sms_train[, seq_len(ncol(sms_train) - 2)],
                         sms_train$type1, laplace = 1)
sms_pred1 <- predict(sms_class1, sms_test)

# Confusion table of predicted vs. actual labels with row and column
# proportions; chi-square and table proportions are switched off.
CrossTable(sms_pred1, sms_test$type1, 
           prop.chisq = FALSE, prop.t = FALSE,
           dnn = c('predicted', 'actual'))
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## Total Observations in Table:  890 
##              | actual 
##    predicted |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |       790 |        16 |       806 | 
##              |     0.980 |     0.020 |     0.906 | 
##              |     0.997 |     0.163 |           | 
## -------------|-----------|-----------|-----------|
##         spam |         2 |        82 |        84 | 
##              |     0.024 |     0.976 |     0.094 | 
##              |     0.003 |     0.837 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       792 |        98 |       890 | 
##              |     0.890 |     0.110 |           | 
## -------------|-----------|-----------|-----------|
