-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_mining.R
More file actions
46 lines (31 loc) · 987 Bytes
/
text_mining.R
File metadata and controls
46 lines (31 loc) · 987 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Ref: https://www.youtube.com/watch?v=j1V2McKbkLo
# Text mining, machine learning in R
# init
## Load the libraries whose functions are used below
# Packages attached for this script.
libs <- c("tm", "plyr", "class")
# library() stops with an error when a package is not installed, whereas
# require() only returns FALSE and lets the script fail later on the first
# missing function. invisible() suppresses the list lapply() would print.
invisible(lapply(libs, library, character.only = TRUE))
# set options
# Keep strings as character vectors when data frames are built. This is
# already the default since R 4.0; the call only matters on older versions.
options(stringsAsFactors = FALSE)

# set parameters
# Candidate names; each one is also the sub-directory of `pathname` that
# generateTDM() reads.
candidates <- c("romney", "obama")
# Root directory holding one sub-directory per candidate.
pathname <- "speeches"
# clean text
# Clean a corpus before it is turned into a term-document matrix.
#
# Args:
#   corpus: A corpus object that tm_map() can transform.
#
# Returns:
#   The corpus with punctuation removed and whitespace stripped.
cleanCorpus <- function(corpus) {
  # Each step feeds on the result of the previous one, so transformations
  # accumulate instead of restarting from the raw input.
  corpus.tmp <- tm_map(corpus, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  # NOTE(review): the original had a third mapping call here that named no
  # transformation and was applied to the raw `corpus`; it could not run and
  # would have discarded the steps above, so it was dropped. Add further
  # cleaning steps (on corpus.tmp) here if they were intended -- confirm.
  corpus.tmp
}
# build TDM (Term Document Matrix)
# Build the term-document matrix for one candidate's documents.
#
# Args:
#   cand: Candidate name; also the sub-directory of `path` that is read.
#   path: Directory that contains one sub-directory per candidate.
#
# Returns:
#   A list with two elements: `name` (the value of `cand`) and `tdm` (the
#   term-document matrix after sparse terms have been removed).
generateTDM <- function(cand, path) {
  s.dir <- file.path(path, cand)
  # NOTE(review): "ANSI" is kept from the original, but it may not be an
  # encoding name recognised on every platform -- confirm against the files.
  s.cor <- Corpus(DirSource(directory = s.dir, encoding = "ANSI"))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  # 0.7 is the sparsity threshold handed to removeSparseTerms(); terms that
  # are too sparse across the documents are dropped from the matrix.
  s.tdm <- removeSparseTerms(s.tdm, 0.7)
  list(name = cand, tdm = s.tdm)
}
# Run generateTDM() once per candidate; the result is a list with one entry
# per element of `candidates`, in the same order.
tdm <- lapply(
  X = candidates,
  FUN = generateTDM,
  path = pathname
)
# attach name
# stack
# hold-out
# model