
Commit 31819fc

Author: algorithmica-repository (committed)
Uploading of spam filter code using naivebayes
1 parent 75ea020 commit 31819fc

File tree

1 file changed: +119 −0 lines changed


naive-bayes/naivebayes.R

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
# to simplify selections
library(dplyr)
# for stemming the words
library(SnowballC)
# libraries required by caret
library(klaR)
library(e1071)
# for the Naive Bayes modelling
library(caret)
# to process the text into a corpus
library(tm)
# to get nice-looking tables
library(pander)

# Set seed for reproducibility
set.seed(1234)

# Helper: class frequencies as rounded percentages
frqtab = function(x) {
  round(100 * prop.table(table(x)), 1)
}

# Read the data
setwd("E:/data analytics/datasets")
sms_raw = read.table(unz("smsspamcollection.zip", "SMSSpamCollection"),
                     header=FALSE, sep="\t", quote="", stringsAsFactors=FALSE)
# Shuffle the rows
sms_raw = sms_raw[sample(nrow(sms_raw)),]

# Explore the dataset
dim(sms_raw)
str(sms_raw)
head(sms_raw)

colnames(sms_raw) = c("type", "text")
sms_raw$type = factor(sms_raw$type)

# Preparing the dataset
sms_corpus = Corpus(VectorSource(sms_raw$text))

inspect(sms_corpus[1:10])

# To avoid issues with the DocumentTermMatrix method, use one of the following solutions:
# 1) Wrap non-standard transformations in content_transformer() to avoid the type-conversion issue
# 2) Add tm_map(PlainTextDocument) after all the cleaning is done

# getTransformations() returns the standard transformations

sms_corpus_clean = sms_corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords(kind="en")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

inspect(sms_corpus_clean[1:10])
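
# Optional sketch of solution 2 above (not applied here, since content_transformer() is the route used):
# sms_corpus_clean = tm_map(sms_corpus_clean, PlainTextDocument)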

# Generate the training and test partitions from the raw data
# About createDataPartition:
# y = the output to split on, which in this case is the message type (spam vs. non-spam).
# p = the proportion of data that goes into the first chunk after splitting;
#     here we split into two chunks of 75% and 25%.
# We then subset the data using the index returned by createDataPartition.

train_index = createDataPartition(sms_raw$type, p=0.75, list=FALSE)
sms_raw_train = sms_raw[train_index,]
sms_raw_test = sms_raw[-train_index,]

# Explore the training and test datasets
dim(sms_raw_train)
dim(sms_raw_test)

# Compare class frequencies across the original, training, and test sets
ft_orig = frqtab(sms_raw$type)
ft_train = frqtab(sms_raw_train$type)
ft_test = frqtab(sms_raw_test$type)
ft_df = as.data.frame(cbind(ft_orig, ft_train, ft_test))
colnames(ft_df) <- c("Original", "Training set", "Test set")
pander(ft_df, style="rmarkdown",
       caption="Comparison of SMS type frequencies among datasets")

sms_corpus_clean_train = sms_corpus_clean[train_index]
sms_corpus_clean_test = sms_corpus_clean[-train_index]

# Feature reduction:
# keep only terms that are at least 2 characters long (minWordLength=2) and
# drop terms that are absent from more than 98% of the documents
sms_train = DocumentTermMatrix(sms_corpus_clean_train, control=list(minWordLength=2))
sms_train = removeSparseTerms(sms_train, 0.98)
sms_test = DocumentTermMatrix(sms_corpus_clean_test, control=list(minWordLength=2))
sms_test = removeSparseTerms(sms_test, 0.98)
dim(sms_train)
dim(sms_test)
inspect(sms_train[1:10,1:10])
inspect(sms_test[1:10,1:10])
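
# Note (an aside, not part of the committed workflow): the test DTM above is built from its own
# vocabulary, so its columns need not line up with the training columns. One common way to keep
# the two feature sets aligned is to build the test matrix against the training dictionary, e.g.:
# sms_test = DocumentTermMatrix(sms_corpus_clean_test,
#                               control=list(dictionary=Terms(sms_train)))
# Also, recent tm releases ignore minWordLength; wordLengths=c(2, Inf) is the equivalent control there.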

# Convert the dtm into Boolean (presence/absence) values instead of term frequencies
convert_counts <- function(x) {
  x = ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "Yes"))
}
sms_train = sms_train %>% apply(MARGIN=2, FUN=convert_counts)
sms_test = sms_test %>% apply(MARGIN=2, FUN=convert_counts)
dim(sms_train)
dim(sms_test)
sms_train[1:10,1:10]
sms_test[1:10,1:10]
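
# Why Yes/No factors (background note): e1071's naiveBayes() models numeric predictors with a
# per-class Gaussian and non-numeric predictors with per-class frequency tables, so
# presence/absence indicators give a Bernoulli-style naive Bayes over word occurrence.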

# Train the model
sms_model = naiveBayes(sms_train, sms_raw_train$type)
sms_model
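
# Optional variant (an assumption, not in the committed script): Laplace smoothing guards against
# zero conditional probabilities when a term never co-occurs with one class in training, e.g.
# sms_model = naiveBayes(sms_train, sms_raw_train$type, laplace = 1)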

# Test the model
sms_predict = predict(sms_model, sms_test)

cm = confusionMatrix(sms_predict, sms_raw_test$type, positive="spam")
cm
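
# Quick follow-up views on the caret confusionMatrix object (standard accessors, added as an aside)
cm$overall["Accuracy"]                          # overall accuracy on the test set
cm$byClass[c("Sensitivity", "Specificity")]     # rates with "spam" as the positive class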
