# Source: CSSR/DataProcessing.R (master branch), BenjaminGaiser/CSSR on GitHub
#####
# Data Processing File
# Ben Gaiser and Jeremy Russell
# 7 October 2016
# Purpose: Some preliminary descriptive statistics for our two data frames
#####

# Candidate project directories (presumably one per author's machine);
# the whole vector is handed to repmis::set_valid_wd()
wrkdir <- c(
  'C:/Users/Benji/Desktop/Statistics/Git/Repositories/CSSR',
  '~/Hertie School/Fall 2016/CollaborativeSocialScienceDataAnalysis/CSSR'
)
repmis::set_valid_wd(wrkdir)

# Run the data gathering script before anything else; the path is relative,
# so it is resolved against the current working directory
source(file = 'DataGathering.R')

#####
# Data Frame 1: Alcohol Consumption
# Units: Average serving sizes per person
# Source: 538 from World Health Organisation, Global Information System on Alcohol and Health (GISAH), 2010
# Variables of interest:
#   Independent Variable X: 'beer_servings'
#   Dependent Variable Y: 'total_litres_of_pure_alcohol'
#####

# What the data look like - initial descriptive statistics
summary(AlcoholConsumption)
describe(AlcoholConsumption)

# More descriptive statistics
stat.desc(AlcoholConsumption)

# Who drinks the most in total litres of pure alcohol?
# The row index is computed once and reused instead of being typed in by
# hand, so the printed row stays correct if the data are reordered or updated
RowOfMaxAlcohol <- which.max(AlcoholConsumption$total_litres_of_pure_alcohol)
RowOfMaxAlcohol
# When this script was written the answer was row 16, i.e. Belarus
AlcoholConsumption[RowOfMaxAlcohol, ]

# Who drinks the least in total litres of pure alcohol?
# which.min() reports only the FIRST minimum, so it is printed for reference
# and every row tied at the minimum is listed afterwards
RowOfMinAlcohol <- which.min(AlcoholConsumption$total_litres_of_pure_alcohol)
RowOfMinAlcohol
# When this script was written the first minimum was row 1, i.e. Afghanistan
RowsOfMinAlcohol <- which(
  AlcoholConsumption$total_litres_of_pure_alcohol ==
    min(AlcoholConsumption$total_litres_of_pure_alcohol, na.rm = TRUE)
)
AlcoholConsumption[RowsOfMinAlcohol, ]

#####
# Hypothesis: Beer is the main driver of total litres of pure alcohol consumed per country
#####

# Correlation of each beverage type with total pure alcohol, as a first
# check of whether the hypothesis is plausible
with(AlcoholConsumption, cor(beer_servings, total_litres_of_pure_alcohol))    # 0.84
with(AlcoholConsumption, cor(wine_servings, total_litres_of_pure_alcohol))    # 0.67
with(AlcoholConsumption, cor(spirit_servings, total_litres_of_pure_alcohol))  # 0.65
# Of the three, beer_servings is the most closely correlated with total_litres_of_pure_alcohol

#####
# Plotting our findings in a scatterplot with a line of best fit and the 95 % confidence interval
#####

# 1. Step: Creating a function for plotting a ggplot
# Scatterplot of total_litres_of_pure_alcohol against beer_servings with a
# linear smoothing line, titled with key statistics of a fitted model.
#
# Args:
#   fit: a fitted lm object. It is used ONLY for the statistics in the title
#        (adjusted R-squared, intercept, slope, p-value of the slope); the
#        points and the smoothing line are drawn from the global
#        AlcoholConsumption data frame, not from `fit`.
# Returns:
#   The ggplot object, which is drawn when printed.
ggplotRegAlcCons <- function(fit){
  stopifnot(inherits(fit, 'lm'))

  # Summarise once and reuse it. coef() and the full element name
  # 'coefficients' replace `$coef`, which only worked through partial matching
  FitSummary <- summary(fit)
  Estimates <- coef(fit)
  PlotTitle <- paste('Adj R2 = ', signif(FitSummary$adj.r.squared, 3),
                     'Intercept =', signif(Estimates[[1]], 3),
                     ' Slope =', signif(Estimates[[2]], 1),
                     # Row 2, column 4 of the coefficient table: p-value of the slope
                     ' P =', signif(FitSummary$coefficients[2, 4], 2))

  # NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
  # lines; `size` is kept so the script still runs on the version it was
  # written against - confirm before changing
  ggplot(AlcoholConsumption, aes(beer_servings, total_litres_of_pure_alcohol)) +
    geom_point(colour = 'blue') +
    stat_smooth(method = 'lm', col = 'red', size = 0.75) +
    labs(title = PlotTitle)
}

# 2. Step: Fitting the simple linear regression behind the line of best fit
FitOfData <- lm(formula = total_litres_of_pure_alcohol ~ beer_servings,
                data = AlcoholConsumption)

# 3. Step: Drawing the scatterplot, titled with the statistics of the fit
ggplotRegAlcCons(fit = FitOfData)
# Beer seems to explain the story well, with an adjusted R-squared of 0.70

#####
# Analyzing five countries of interest to see how they differ in their 'beer_servings' values
#####

# Finding Germany, USA, South Africa, China and Australia.
# Names are matched exactly (not as substrings) and the row numbers are
# derived from the data instead of being copied in by hand, so the subset
# stays correct if the data are reordered or updated
CountriesOfInterest <- c('Germany', 'USA', 'South Africa', 'China', 'Australia')
RowsOfInterest <- which(AlcoholConsumption$country %in% CountriesOfInterest)
# When this script was written: rows 9 (Australia), 37 (China), 66 (Germany),
# 160 (South Africa) and 185 (USA)
RowsOfInterest

# Fail loudly if a country is missing or listed more than once
if (length(RowsOfInterest) != length(CountriesOfInterest)) {
  stop('Expected exactly one row for each country of interest', call. = FALSE)
}

# Subsetting the data 'AlcoholConsumption' for ease of commanding
SubsetOfFiveCountries <- AlcoholConsumption[RowsOfInterest, ]

# Plotting our findings in a scatterplot, we see that Germany drinks most out of the five countries on both variables
ggplot(SubsetOfFiveCountries,
       aes(beer_servings, total_litres_of_pure_alcohol)) +
  geom_point(aes(colour = factor(country))) +
  scale_colour_discrete(name='Countries')

#####
# Data Frame 2: Swiss Data Set
# Variables of interest:
#   Independent Variable X: 'Catholic': % Catholic as opposed to Protestant
#   Dependent Variable Y: 'Fertility': Ig, 'common standardized fertility measure'
# Source: Swiss Fertility and Socioeconomic Indicators (1888), R Data Set
#####

# What the data look like - initial descriptive statistics for every column
summary(object = swiss)

#####
# Hypothesis: Catholics have a higher fertility rate than Protestants
#####

# Closer look at the two variables of interest: a descriptive summary first,
# then the variance and the standard deviation of each
describe(swiss$Fertility)
describe(swiss$Catholic)
with(swiss, var(Fertility))
with(swiss, var(Catholic))
with(swiss, sd(Fertility))
with(swiss, sd(Catholic))
# 'Catholic' has a high variance and standard deviation for a continuous variable bounded between 0 and 100

# Plain scatterplot of both variables to see their relationship
ggplot(data = swiss, mapping = aes(x = Catholic, y = Fertility)) +
  geom_point()

# Plot fertility and Catholic with ggplot
# Scatterplot of Fertility against Catholic from the `swiss` data with a
# linear smoothing line, titled with key statistics of a fitted model.
#
# Args:
#   fit: a fitted lm object. It is used ONLY for the statistics in the title
#        (adjusted R-squared, intercept, slope, p-value of the slope); the
#        points and the smoothing line are drawn from the `swiss` data frame,
#        not from `fit`.
# Returns:
#   The ggplot object, which is drawn when printed.
ggplotRegSwiss <- function(fit){
  stopifnot(inherits(fit, 'lm'))

  # Summarise once and reuse it. coef() and the full element name
  # 'coefficients' replace `$coef`, which only worked through partial matching
  FitSummary <- summary(fit)
  Estimates <- coef(fit)
  PlotTitle <- paste('Adj R2 = ', signif(FitSummary$adj.r.squared, 3),
                     'Intercept =', signif(Estimates[[1]], 3),
                     ' Slope =', signif(Estimates[[2]], 1),
                     # Row 2, column 4 of the coefficient table: p-value of the slope
                     ' P =', signif(FitSummary$coefficients[2, 4], 2))

  # NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
  # lines; `size` is kept so the script still runs on the version it was
  # written against - confirm before changing
  ggplot(swiss, aes(Catholic, Fertility)) +
    geom_point(colour = 'blue') +
    stat_smooth(method = 'lm', col = 'red', size = 0.75) +
    labs(title = PlotTitle)
}
# Simple linear regression of Fertility on Catholic, then the annotated plot
FitOfDataSwiss <- lm(formula = Fertility ~ Catholic, data = swiss)
ggplotRegSwiss(fit = FitOfDataSwiss)

# Despite outliers, this is still a significant relationship, explaining about 20 % of the variance (adjusted R-squared)

#####
# Which cantons are neither mostly Protestant nor mostly Catholic?
#####

# 1. Step: Binning 'Catholic' into four labelled groups of 25 percentage
# points each. Passing `labels` to cut() avoids string-matching its
# auto-generated interval names (such as '(0,25]'), and include.lowest = TRUE
# keeps a value of exactly 0 in the first group instead of turning it into NA
CatholicLabels <- c('Protestant', 'Protestant to Catholic',
                    'Catholic to Protestant', 'Catholic')
swiss$CatholicCat <- cut(swiss$Catholic, breaks = seq(0, 100, 25),
                         labels = CatholicLabels, include.lowest = TRUE)
# 2. Step: Storing the groups as a character variable rather than a factor
swiss$CatholicCat <- as.character(swiss$CatholicCat)
# 3. Step: Finding the cantons which are 'Protestant to Catholic' and
# 'Catholic to Protestant' (exact comparison, not pattern matching)
which(swiss$CatholicCat == 'Protestant to Catholic') # 4 and 45
which(swiss$CatholicCat == 'Catholic to Protestant') # 46 and 47
# 4. Step: Looking up the names, using row numbers derived from the data
MixedCantonRows <- which(swiss$CatholicCat %in% CatholicLabels[2:3])
swiss[MixedCantonRows, ]
# Moutier, V. De Geneve, Rive Droite and Rive Gauche are the only cantons
# where at least a third of the population does not belong to the majority
# religion
# NOTE(review): the R documentation for `swiss` describes its rows as
# provinces rather than cantons - confirm the intended wording