algorithmica-repository
diff --git a/‎assignments/sol1.R‎
Lines changed: 216 additions & 0 deletions b/‎assignments/sol1.R‎
Lines changed: 216 additions & 0 deletions
diff --git a/‎assignments/sol2.R‎
Lines changed: 97 additions & 0 deletions b/‎assignments/sol2.R‎
Lines changed: 97 additions & 0 deletions
diff --git a/‎datasets/20news-bydate.tar.gz‎
13.8 MB b/‎datasets/20news-bydate.tar.gz‎
13.8 MB
@@ -0,0 +1,216 @@
+# The later problems in this script use dplyr (%>%, filter(), summarise(),
+# the join verbs), so load it once at the top of the script.
+library(dplyr)
+
+#Problem 1
+#Q1: read the rainfall table using read.table() defaults
+rain.df <- read.table("E:/data analytics/datasets/rainfall.dat")
+#Q2: number of rows
+nrow(rain.df)
+#Q3: number of columns
+ncol(rain.df)
+#Q4: current column names
+colnames(rain.df)
+#Q5: value at row 2, column 4
+rain.df[2, 4]
+#Q6: the whole second row
+rain.df[2, ]
+#Q7: name the columns year, month, day, then "0".."23"
+# (the 24 trailing columns are presumably hours of the day -- confirm)
+names(rain.df) <- c("year", "month", "day", 0:23)
+str(rain.df)
+#Q8: daily total = row-wise sum over columns 4 to 27
+# rowSums() is the vectorised equivalent of apply(..., MARGIN = 1, FUN = sum)
+rain.df$daily <- rowSums(rain.df[, 4:27])
+head(rain.df)
+tail(rain.df)
+
+#rain.df = rain.df %>% rowwise() %>% mutate(daily=sum(0,1))
+#rain.df = rain.df %>% group_by(1:n()) %>% mutate(daily=sum(0:23))
+#rain.df[20,]
+
+#Problem 2
+
+# Q1: load the ISIT table; header = TRUE takes column names from the first line
+DeepSea <- read.table(file = "E:/data analytics/datasets/ISIT.txt", header = TRUE)
+
+# Q2: column names, structure, dimensions and first rows
+names(DeepSea)
+str(DeepSea)
+dim(DeepSea)
+head(DeepSea)
+
+# Q3: distinct values of Station
+unique(DeepSea$Station)
+
+# Q4: rows for station 1 -- base R first, then the dplyr equivalent
+DeepSea.sta1 <- DeepSea[DeepSea$Station == 1, ]
+DeepSea.sta1
+
+DeepSea.sta1 <- DeepSea %>% filter(Station == 1)
+DeepSea.sta1
+
+# Q5: row count and min / mean / max of SampleDepth at station 1
+nrow(DeepSea.sta1)
+min(DeepSea.sta1$SampleDepth)
+mean(DeepSea.sta1$SampleDepth)
+max(DeepSea.sta1$SampleDepth)
+
+DeepSea.sta1 %>%
+  summarise(
+    total = n(),
+    min_depth = min(SampleDepth),
+    mean_depth = mean(SampleDepth),
+    max_depth = max(SampleDepth)
+  )
+
+# Q6: number of rows at each of stations 1 to 5
+nrow(DeepSea[DeepSea$Station == 1, ])
+nrow(DeepSea[DeepSea$Station == 2, ])
+nrow(DeepSea[DeepSea$Station == 3, ])
+nrow(DeepSea[DeepSea$Station == 4, ])
+nrow(DeepSea[DeepSea$Station == 5, ])
+
+DeepSea %>%
+  filter(Station %in% 1:5) %>%
+  group_by(Station) %>%
+  summarize(count = n())
+
+# Q7: per-station row counts, keeping only stations whose count has
+# min_rank() above 2 (i.e. dropping the two lowest-ranked counts)
+DeepSea %>%
+  group_by(Station) %>%
+  summarize(count = n()) %>%
+  filter(min_rank(count) > 2)
+
+
+# Q8: rows recorded in months 8, 9 or 10
+# %in% never yields NA, so rows with a missing Month are not counted.
+DeepSea.fall <- DeepSea[DeepSea$Month %in% 8:10, ]
+nrow(DeepSea.fall)
+
+DeepSea %>% filter(Month %in% 8:10) %>% summarise(count = n())
+
+# Q9: rows with SampleDepth above 2000
+DeepSea.dep2000 <- DeepSea[DeepSea$SampleDepth > 2000, ]
+nrow(DeepSea.dep2000)
+
+# Print the dplyr count instead of assigning it: assigning would replace the
+# DeepSea.dep2000 subset above with a one-row summary and show nothing.
+DeepSea %>% filter(SampleDepth > 2000) %>% summarise(count = n())
+
+# Q10: rows deeper than 2000, in months 8 to 10, in year 2001
+DeepSea.dep2000.fall2001 <- DeepSea[
+  DeepSea$SampleDepth > 2000 & DeepSea$Month %in% 8:10 & DeepSea$Year == 2001,
+]
+nrow(DeepSea.dep2000.fall2001)
+
+DeepSea %>%
+  filter(SampleDepth > 2000 & Month %in% 8:10 & Year == 2001) %>%
+  summarise(count = n())
+
+
+#Problem 3
+
+# Step 1: first table (its key column is ID, see the joins below)
+DeepSea1 <- read.table(file = "E:/data analytics/datasets/DeepSea1.txt", header = TRUE)
+
+# Step 2: second table (its key column is SampleID)
+DeepSea2 <- read.table(file = "E:/data analytics/datasets/DeepSea2.txt", header = TRUE)
+
+# Step 3: peek at both tables
+head(DeepSea1)
+head(DeepSea2)
+
+# Step 4: inner join on DeepSea1$ID == DeepSea2$SampleID, then compare the
+# joined row count with the two inputs
+DeepSea <- merge(DeepSea1, DeepSea2, by.x = "ID", by.y = "SampleID")
+nrow(DeepSea)
+nrow(DeepSea1)
+nrow(DeepSea2)
+
+# dplyr equivalent; nrow() needs the joined table as its argument
+DeepSea <- inner_join(DeepSea1, DeepSea2, by = c("ID" = "SampleID"))
+nrow(DeepSea)
+
+# Step 5: full outer join -- keeps unmatched rows from both tables
+DeepSea.full <- merge(DeepSea1, DeepSea2, by.x = "ID", by.y = "SampleID", all = TRUE)
+nrow(DeepSea.full)
+head(DeepSea.full)
+
+# full_join() is the dplyr counterpart of merge(all = TRUE); right_join()
+# would drop DeepSea1 rows that have no match in DeepSea2.
+nrow(full_join(DeepSea1, DeepSea2, by = c("ID" = "SampleID")))
+
+# Step 6: keep four columns
+DeepSea[, c("Year", "Month", "Station", "SampleDepth")]
+
+DeepSea %>% select(Year, Month, Station, SampleDepth)
+
+# Step 7: same columns, ordered by increasing SampleDepth
+DeepSea[order(DeepSea$SampleDepth), c("Year", "Month", "Station", "SampleDepth")]
+
+DeepSea %>%
+  select(Year, Month, Station, SampleDepth) %>%
+  arrange(SampleDepth)
+
+# Step 8: ordered by Station, then by decreasing SampleDepth within a station
+DeepSea[
+  order(DeepSea$Station, -DeepSea$SampleDepth),
+  c("Year", "Month", "Station", "SampleDepth")
+]
+
+DeepSea %>%
+  select(Year, Month, Station, SampleDepth) %>%
+  arrange(Station, desc(SampleDepth))
+
+# Step 9: factor versions of Year and Month (base R, then dplyr)
+DeepSea$fYear <- factor(DeepSea$Year)
+DeepSea$fMonth <- factor(DeepSea$Month)
+
+DeepSea <- DeepSea %>% mutate(fYear = factor(Year), fMonth = factor(Month))
+str(DeepSea)
+
+# Step 10: levels of the new factors
+levels(DeepSea$fYear)
+levels(DeepSea$fMonth)
+
+# Step 11: month factor with readable labels for the codes 3, 4, 8 and 10
+DeepSea$fMonthName <- factor(
+  DeepSea$Month,
+  levels = c(3, 4, 8, 10),
+  labels = c("March", "April", "August", "October")
+)
+
+# Inside mutate() refer to the column as Month, not DeepSea$Month, so the
+# expression always uses the data being piped in.
+DeepSea <- DeepSea %>%
+  mutate(
+    fMonthName = factor(
+      Month,
+      levels = c(3, 4, 8, 10),
+      labels = c("March", "April", "August", "October")
+    )
+  )
+str(DeepSea)
+
+
+# Step 12: write the selected columns to a tab-separated DeepSea.txt in the
+# current working directory (base R selection, then dplyr selection; the
+# second write replaces the file because append = FALSE)
+write.table(
+  DeepSea[, c("ID", "Year", "Month", "Station", "SampleDepth", "fYear", "fMonth", "fMonthName")],
+  file = "DeepSea.txt", sep = "\t", quote = TRUE, append = FALSE,
+  na = "NA", row.names = FALSE
+)
+
+res <- DeepSea %>%
+  select(ID, Year, Month, Station, SampleDepth, fYear, fMonth, fMonthName)
+write.table(
+  res,
+  file = "DeepSea.txt", sep = "\t", quote = TRUE, append = FALSE,
+  na = "NA", row.names = FALSE
+)
+
+
+#Problem 4 (the section above is already labelled Problem 3)
+# Step 1: read the temperature table by its full path, like the other
+# problems, instead of changing the session's working directory.
+Temp <- read.table(file = "E:/data analytics/datasets/Temperature.txt", header = TRUE)
+
+# Step 2: column names, structure, dimensions and first rows
+names(Temp)
+str(Temp)
+dim(Temp)
+head(Temp)
+
+# Step 3: mean and standard deviation of Temperature per Month, ignoring NAs
+tapply(Temp$Temperature, INDEX = Temp$Month, FUN = mean, na.rm = TRUE)
+tapply(Temp$Temperature, INDEX = Temp$Month, FUN = sd, na.rm = TRUE)
+
+Temp %>%
+  group_by(Month) %>%
+  summarise(
+    temp_mean = mean(Temperature, na.rm = TRUE),
+    temp_sd = sd(Temperature, na.rm = TRUE)
+  )
+
+# Step 4: the same statistics per Month x Station combination
+tapply(Temp$Temperature, INDEX = list(Temp$Month, Temp$Station), FUN = mean, na.rm = TRUE)
+tapply(Temp$Temperature, INDEX = list(Temp$Month, Temp$Station), FUN = sd, na.rm = TRUE)
+
+Temp %>%
+  group_by(Month, Station) %>%
+  summarise(
+    temp_mean = mean(Temperature, na.rm = TRUE),
+    temp_sd = sd(Temperature, na.rm = TRUE)
+  )
+
+
+# Step 5: per-Month statistics restricted to Year 1990
+Temp.1990 <- Temp[Temp$Year == 1990, ]
+tapply(Temp.1990$Temperature, INDEX = Temp.1990$Month, FUN = mean, na.rm = TRUE)
+tapply(Temp.1990$Temperature, INDEX = Temp.1990$Month, FUN = sd, na.rm = TRUE)
+
+Temp %>%
+  filter(Year == 1990) %>%
+  group_by(Month) %>%
+  summarise(
+    temp_mean = mean(Temperature, na.rm = TRUE),
+    temp_sd = sd(Temperature, na.rm = TRUE)
+  )
+
+
+# Step 6: mean and sd of Salinity, Temperature and CHLFa over all rows
+# vapply() pins the result to one number per column.
+vapply(Temp[, c("Salinity", "Temperature", "CHLFa")], mean, numeric(1), na.rm = TRUE)
+vapply(Temp[, c("Salinity", "Temperature", "CHLFa")], sd, numeric(1), na.rm = TRUE)
+
+# One summary per variable; output columns are named after the variable
+# they describe.
+Temp %>%
+  summarise(
+    salinity_mean = mean(Salinity, na.rm = TRUE),
+    salinity_sd = sd(Salinity, na.rm = TRUE)
+  )
+Temp %>%
+  summarise(
+    temp_mean = mean(Temperature, na.rm = TRUE),
+    temp_sd = sd(Temperature, na.rm = TRUE)
+  )
+Temp %>%
+  summarise(
+    chlfa_mean = mean(CHLFa, na.rm = TRUE),
+    chlfa_sd = sd(CHLFa, na.rm = TRUE)
+  )
+
+# NOTE(review): summarise_each()/funs() are deprecated in recent dplyr;
+# across() is the replacement once the installed dplyr is confirmed >= 1.0.
+Temp %>%
+  select(Salinity, Temperature, CHLFa) %>%
+  summarise_each(funs(mean(., na.rm = TRUE), sd(., na.rm = TRUE)))
+
+# Step 7: the same three-variable statistics for station 'DANT' only
+Temp.dant <- Temp[Temp$Station == 'DANT', ]
+vapply(Temp.dant[, c("Salinity", "Temperature", "CHLFa")], mean, numeric(1), na.rm = TRUE)
+vapply(Temp.dant[, c("Salinity", "Temperature", "CHLFa")], sd, numeric(1), na.rm = TRUE)
+
+Temp %>%
+  filter(Station == 'DANT') %>%
+  select(Salinity, Temperature, CHLFa) %>%
+  summarise_each(funs(mean(., na.rm = TRUE), sd(., na.rm = TRUE)))
+
+
+# Step 8: summary() of the three variables
+summary(Temp[, c("Salinity", "Temperature", "CHLFa")])
+
+Temp %>% select(Salinity, Temperature, CHLFa) %>% summary()
+
+# Step 9: row counts per Station, per Year, and per Station x Year
+table(Temp$Station)
+table(Temp$Year)
+table(Temp$Station, Temp$Year)
+
+
+
@@ -0,0 +1,97 @@
+# load required libraries
+library(dplyr)
+
+# Read the forest-fires CSV and echo the whole table
+Forest <- read.csv(
+  file = "E:/data analytics/datasets/forestfires.csv",
+  sep = ",",
+  header = TRUE
+)
+Forest
+
+# Task 1
+
+# Total number of observations
+nrow(Forest)
+
+Forest %>%
+  summarize(count = n())
+
+# Observations with a fire (area > 0): base indexing, subset(), then dplyr
+nrow(Forest[Forest$area > 0, ])
+
+nrow(subset(Forest, subset = area > 0))
+
+Forest %>%
+  filter(area > 0) %>%
+  summarise(count = n())
+
+
+# Observations with rain (rain > 0)
+nrow(Forest[Forest$rain > 0, ])
+
+Forest %>%
+  filter(rain > 0) %>%
+  summarise(count = n())
+
+# Observations with both a fire and rain
+nrow(Forest[Forest$area > 0 & Forest$rain > 0, ])
+
+Forest %>%
+  filter(area > 0 & rain > 0) %>%
+  summarise(count = n())
+
+
+# Task 2
+
+# month, day and area for every observation
+Forest[, c("month", "day", "area")]
+
+Forest %>%
+  select(month, day, area)
+
+# month, day and area for the observations with a fire
+Forest[Forest$area > 0, c("month", "day", "area")]
+
+Forest %>%
+  filter(area > 0) %>%
+  select(month, day, area)
+
+
+# Task 3
+
+# How large are the five largest fires (i.e. having largest area)?
+# top_n() ranks by the LAST column when no ordering variable is given, so
+# name `area` explicitly: the result then stays correct after columns are
+# added (Task 4 appends `fire`) or reordered.
+Forest %>%
+  arrange(desc(area)) %>%
+  top_n(5, area) %>%
+  select(area)
+
+# What are the corresponding month, temp, RH, wind, rain, area?
+Forest %>%
+  arrange(desc(area)) %>%
+  top_n(5, area) %>%
+  select(month, temp, RH, wind, rain, area)
+
+
+# Task 4
+
+# Put the month levels in calendar order (levels shown before and after)
+levels(Forest$month)
+Forest$month <- factor(
+  Forest$month,
+  levels = c(
+    "jan", "feb", "mar", "apr", "may", "jun",
+    "jul", "aug", "sep", "oct", "nov", "dec"
+  )
+)
+levels(Forest$month)
+
+# Flag each observation with whether a fire occurred (area > 0).
+# Variant 1: base R, stored as a factor
+Forest$fire <- factor(Forest$area > 0)
+
+# Variant 2: dplyr, stored as a logical
+Forest <- Forest %>%
+  mutate(fire = area > 0)
+str(Forest)
+
+# Variant 3: dplyr, then converted to a factor
+Forest <- Forest %>%
+  mutate(fire = area > 0)
+Forest$fire <- factor(Forest$fire)
+str(Forest)
+
+
+# Task 5
+
+# Mean area / wind / temp / RH per month: tapply() splits the first vector
+# by month and applies mean() to each group
+tapply(X = Forest$area, INDEX = Forest$month, FUN = mean)
+tapply(X = Forest$wind, INDEX = Forest$month, FUN = mean)
+tapply(X = Forest$temp, INDEX = Forest$month, FUN = mean)
+tapply(X = Forest$RH, INDEX = Forest$month, FUN = mean)
+
+Forest %>%
+  group_by(month) %>%
+  summarise(
+    mean_area = mean(area),
+    mean_wind = mean(wind),
+    mean_temp = mean(temp),
+    mean_rh = mean(RH)
+  )
+
+# Observations per month
+table(Forest$month)
+
+Forest %>%
+  group_by(month) %>%
+  summarise(count = n())
+
+# Observations with a fire, per month
+table(Forest[Forest$area > 0, ]$month)
+
+Forest %>%
+  filter(area > 0) %>%
+  group_by(month) %>%
+  summarise(count = n())
+
+# Share of observations with a fire, per month
+table(Forest[Forest$area > 0, ]$month) / table(Forest$month)
+
+Forest %>%
+  group_by(month) %>%
+  summarise(prob = sum(area > 0) / n())