ML_project/Select_Days2.R at master · KNMI-DataLab/ML_project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#############################################################################################################


# This script loads the raw and the filtered GMS data of three selected days,
# combines them into a single data frame in the input format expected by the
# ML scripts, and stores the result as .csv and as R data.

#############################################################################################################

## Setup ----
# The workspace is intentionally NOT cleared with rm(list = ls()): that call
# deletes unrelated objects of whoever sources this script, and it does not
# reset loaded packages or options anyway. Run the script in a fresh R session
# instead.

# Packages: reshape2 for melt(), dplyr for right_join(),
# lubridate for hour() and minute()
library(reshape2)
library(dplyr)
library(lubridate)

# Time zone used to parse every timestamp and to derive the time features
Time_Zone <- "GMT"

# Directories holding one .csv file per day
Raw_Dir <- "/data/project/GMS/data/GMSraw"
Filtered_Dir <- "/data/project/GMS/data/GMSfiltered"

# Read one GMS .csv file and parse its timestamp column(s) in a fixed time zone.
#
# Columns declared as "POSIXct" in `col_classes` are read as text and converted
# with an explicit `tz`. Letting read.csv() do the conversion parses the text
# in the time zone of the R session, and a later
# as.POSIXct(<POSIXct>, tz = ...) does not re-parse it, so the resulting
# instants would depend on the machine the script runs on.
#
# Args:
#   path:        path of the .csv file
#   col_classes: column classes as accepted by read.csv(colClasses = )
#   tz:          time zone in which the timestamps are interpreted
#   ...:         further arguments passed on to read.csv(), e.g. header
# Returns:
#   the file content as a data frame; columns declared "POSIXct" are POSIXct
#   vectors in time zone `tz`
read_gms_csv <- function(path, col_classes, tz = Time_Zone, ...) {
  time_cols <- which(col_classes == "POSIXct")
  col_classes[time_cols] <- "character"

  gms_data <- read.csv(path, colClasses = col_classes, ...)
  for (col in time_cols) {
    gms_data[[col]] <- as.POSIXct(gms_data[[col]], tz = tz)
  }
  gms_data
}

## Load GMS data files of the relevant days ----
Col_Classes_1 <- c("integer", "POSIXct", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "factor", "factor", "factor", "numeric", "logical", "logical", "character")

Data_2013_12_03 <- read_gms_csv(file.path(Raw_Dir, "2013-12-03.csv"),
                                Col_Classes_1)
Data_2014_12_28 <- read_gms_csv(file.path(Raw_Dir, "2014-12-28.csv"),
                                Col_Classes_1)
Data_2015_03_24 <- read_gms_csv(file.path(Raw_Dir, "2015-03-24.csv"),
                                Col_Classes_1)

# Make a data frame containing all the selected days
Three_Days <- rbind(Data_2013_12_03, Data_2014_12_28, Data_2015_03_24)

# Everything below relies on TIMESTAMP being the parsed date-time column
# (assumes it is the column declared "POSIXct" in Col_Classes_1)
stopifnot(inherits(Three_Days$TIMESTAMP, "POSIXct"))

## Drop variables that are not of interest by naming the columns to keep ----
Keep <- c("LOCATION", "TIMESTAMP", paste0("TW_", 1:12), "TL", "TD")

Three_Days <- Three_Days[Keep]

## Melt the data frame ----
# ID vars are LOCATION, TIMESTAMP, TL & TD; every TW_* column becomes a set of
# rows holding the column name (SENSOR) and its value (TEMP)
Three_Days <- melt(Three_Days,
                   id.vars = c("LOCATION", "TIMESTAMP", "TL", "TD"))
colnames(Three_Days)[5:6] <- c("SENSOR", "TEMP")

# melt() returns the sensor names as a factor; the filtered data hold them as
# character and SENSOR is a join key below
Three_Days$SENSOR <- as.character(Three_Days$SENSOR)

## Load the filtered GMS data of the same days ----
# These files are read without a header row; column names are assigned below
Col_Classes_2 <- c("integer", "numeric", "character", "POSIXct", "character")

Data_filtered_14_12_28 <- read_gms_csv(file.path(Filtered_Dir, "2014-12-28.csv"),
                                       Col_Classes_2, header = FALSE)
Data_filtered_15_03_24 <- read_gms_csv(file.path(Filtered_Dir, "2015-03-24.csv"),
                                       Col_Classes_2, header = FALSE)
Data_filtered_13_12_03 <- read_gms_csv(file.path(Filtered_Dir, "2013-12-03.csv"),
                                       Col_Classes_2, header = FALSE)

# Merge all these filtered days into one data frame
Data_filtered <- rbind(Data_filtered_14_12_28, Data_filtered_15_03_24,
                       Data_filtered_13_12_03)

colnames(Data_filtered) <- c("LOCATION", "TEMP", "SENSOR", "TIMESTAMP", "QUALITY")

## Merge GMS data & filtered data by their common columns ----
# right_join() keeps every row of Data_filtered and drops raw rows without a
# filtered counterpart. A filtered row without a raw match is still kept, with
# NA in TL and TD.
# NOTE(review): TEMP is a floating-point join key, so a match requires both
# files to store exactly the same value - confirm this holds for the data.
Three_Days_3 <- right_join(x = Three_Days, y = Data_filtered,
                           by = c("LOCATION", "TIMESTAMP", "SENSOR", "TEMP"))

## Add a Day of Year (DOY) and an Hour of Day (HOD) column ----
# Build a DOY column; the time zone is passed explicitly so that the day does
# not shift with the time zone of the R session
DOY_Days <- as.numeric(format(Three_Days_3$TIMESTAMP, format = "%j",
                              tz = Time_Zone))

# How many DOY? (printed explicitly so it also shows when the file is sourced)
print(length(unique(DOY_Days)))

# Build a HOD column as fractional hours; hour() and minute() follow the time
# zone stored on TIMESTAMP, which is Time_Zone for both joined inputs
HOD_Days <- hour(Three_Days_3$TIMESTAMP) + minute(Three_Days_3$TIMESTAMP) / 60

# Add the columns to the start of the Three_Days_3 data frame
Three_Days_3 <- cbind(DOY_Days, HOD_Days, Three_Days_3)

# Check if there are any 'bad' stations
# (either test stations from RWS or stations which have been moved) in the dataframe.
# This vector is the single list of stations used for both the report and the
# removal below, so the two cannot drift apart.
bad_stations <- c("108", "422", "818", "1015", "1501", "1502", "1503")

# %in% compares the numeric LOCATION ids with the ids given as text and never
# returns NA, unlike ==.
is_bad_station <- Three_Days_3$LOCATION %in% bad_stations

# Report for every bad station whether it occurs in the data
for (station in bad_stations) {
  print(station)
  print(station %in% Three_Days_3$LOCATION)
}

# Remove the 'bad' stations. Rows with a missing LOCATION are kept unchanged;
# with == they would have been turned into all-NA rows.
Three_Days_3 <- Three_Days_3[!is_bad_station, ]


# Persist the final data frame twice:
# - as .csv (write.csv defaults, so row names are written as well)
write.csv(Three_Days_3,
          file = "/usr/people/kleingel/Projects/MLProject/Three_Days.csv")

# - as an R data file that restores the object under the name Three_Days_3
save(Three_Days_3,
     file = "/usr/people/kleingel/Projects/MLProject/Three_Days.Rda")