-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpre_processing.r
More file actions
87 lines (74 loc) · 4.22 KB
/
pre_processing.r
File metadata and controls
87 lines (74 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
library(SnowballC)
getClasses <- function(original_files){
classes <- original_files
classes <- gsub("[0-9]", "", classes)
classes <- gsub("/", "", classes)
classes <- gsub(".news-bydatenews-bydate-train", "", classes)
}
#Tokenizing
tokenizing <- function(original_files, concatenated_files){
# if the merged dataset does exist, append to it
for (file in original_files){
text <-readLines(file, encoding="latin1")
#finds where the number of lines are in the file
nLine_line <- grep("Lines", text)
final_line <- as.integer(sub("Lines: ", "", text[nLine_line]))
#threads the 'dog' case
if(is.na(final_line[1])){
final_line <- 26
}
if(final_line < length(text)){
header_line <- length(text) - final_line
#removes the meta-info headers
text <- text[header_line:final_line]
}
#removes replies
#replied_lines <- grep("^>", text)
#if(length(replied_lines) > 0){
# text <- text[-replied_lines]
#}
#change the list to an array with all the chars
formatted_text = paste(unlist(text), collapse=' ')
#Tokenizing
#lower case all strings AND replacwith 'D'
formatted_text = gsub("[0-9]", " ", tolower(formatted_text))
formatted_text = gsub("_", " ", formatted_text)
#replace special characters with empty space
formatted_text = gsub("\\W", " ", formatted_text)
concatenated_files <- append(concatenated_files, formatted_text)
}
concatenated_files
}
#StopWords
#only remove stopWords from a given char vector.
remove_stop_words <- function(vectorized_text){
english_stop_words = c('i','my','myself','we','us','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she',
'her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am',
'is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','would','could','should','ought','might','however',
'will','would','shall','should','can','could','may','might','must','ought','im','youre','hes','shes', 'its','were', 'theyre','ive', 'youve','weve',
'theyve', 'id','youd','hed','shed','wed','theyd','ill','youll', 'hell','shell','well','theyll','isn','aren','wasn','weren','hasn', 'haven',
'hadn','doesn','don','didn','won','wouldn','shan','shouldn','cant','cannot','couldn','mustn','let','thats','whos','whats','heres',
'theres','whens','wheres','whys','hows','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with',
'about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over',
'under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some',
'such','no','nor','not','only','own','same','so','than','too','very','one','every','least','less','many','now','ever','never','say','says','said',
'also','get','go','goes','just','made','make','put','see','seen','whether','like','well','back','even','still','way','take','since','another',
'however','two','three','four','five','first','second','new','old','high','long','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
'p','q','r','s','t','u','v','x','w','y','z','re',"","ll","co","uk","subject", "edu","therefore", "fromyour", "himto", "com", "write", "post", "article",
"from", "lines", "organization", "summary", "keywords", "writing", "\"")
removed_stop_words = match(vectorized_text, english_stop_words)
#Words removed will have a match in the array, words that does not appear will always be NA.
remaning_words = vectorized_text[is.na(removed_stop_words)]
remaning_words
}
#Stemmer e StopWords
stemming <- function(tokenized_text, concatenated_files) {
for (file in tokenized_text){
#separe the original vector in a char vector, then unlist to be able to use the stopWords and stem
vectorized_text = unlist(strsplit(file, " "))
smaller_text <- remove_stop_words(vectorized_text)
final_text = wordStem(smaller_text)
concatenated_files <- append(concatenated_files, paste(final_text, collapse=" "))
}
concatenated_files
}