Import libraries and define options

library(dplyr)
library(reshape2)
library(chron)
library(ggplot2)
source("GRB001.R")
## Loading required package: outliers

Define options

Sys.setlocale("LC_TIME","C")
options(stringsAsFactors=FALSE)
options(chron.year.abb=FALSE)
theme_set(theme_bw()) # just my preference for plots

Load data

Based on past work, we can define a function that reads in the data and additionally provides several time variables.

R provides many functions for extraction of time information, but for atmospheric applications we often classify time periods according to season (which is not provided). We will define our own function to convert month to season:

Month2Season <- function(month) {
  ## month is an integer (1-12)
  ## a factor with levels {"DJF", "MAM", "JJA", "SON"} is returned
  seasons <- c("DJF", "MAM", "JJA", "SON")
  index <- findInterval(month %% 12, seq(0, 12, 3))
  factor(seasons[index], seasons)
}

Test this new function:

Month2Season(c(1, 3, 12))
## [1] DJF MAM DJF
## Levels: DJF MAM JJA SON

Next, we define the function for importing the time series:

ReadTSeries <- function(filename, timecolumn="datetime", timeformat="%d.%m.%Y %H:%M") {
  ## read the table, strip units in column names, rename time column
  ##   and change data type of time column from a string of characters to
  ##   a numeric type so that we can perform operations on it
  data <- read.table(filename, skip=5, header=TRUE, sep=";", check.names=FALSE)
  names(data) <- sub("[ ].*$","",names(data)) # strip units for simplification
  names(data) <- sub("Date/time", timecolumn, names(data), fixed=TRUE)
  data[,timecolumn] <- as.chron(data[,timecolumn], timeformat) - 1/24 # end time -> start time
  ## extract additional variables from the time column
  data[,"year"] <- years(data[,timecolumn])
  data[,"month"] <- months(data[,timecolumn])
  data[,"day"] <- days(data[,timecolumn])
  data[,"hour"] <- hours(data[,timecolumn])
  data[,"dayofwk"] <- weekdays(data[,timecolumn])
  data[,"daytype"] <- ifelse(data[,"dayofwk"] %in% c("Sat","Sun"), "Weekend", "Weekday")
  data[,"season"] <- Month2Season(unclass(data[,"month"]))
  ## return value
  data
}

Read and merge (with full_join) Lausanne (LAU) and Z├╝rich (ZUE) data:

datapath <- "data/2013"

df <- full_join(cbind(site="LAU", ReadTSeries(file.path(datapath, "LAU.csv"))),
                cbind(site="ZUE", ReadTSeries(file.path(datapath, "ZUE.csv"))))
## Warning in strptime(x, format, tz = tz): unknown timezone 'zone/tz/2018c.
## 1.0/zoneinfo/Europe/Zurich'
## Joining, by = c("site", "datetime", "O3", "NO2", "CO", "PM10", "TEMP", "PREC", "RAD", "year", "month", "day", "hour", "dayofwk", "daytype", "season")

We can see that this data frame contains data from both sites.

head(df)
##   site              datetime   O3  NO2  CO PM10 TEMP PREC  RAD year month
## 1  LAU (12/31/2012 00:00:00)  7.8 56.3 0.5 16.1  3.8    0 -2.4 2012   Dec
## 2  LAU (12/31/2012 01:00:00) 22.4 38.0 0.4 11.6  4.1    0 -2.3 2012   Dec
## 3  LAU (12/31/2012 02:00:00) 14.5 37.2 0.3 10.3  3.1    0 -2.1 2012   Dec
## 4  LAU (12/31/2012 03:00:00) 28.7 25.4 0.3 10.5  3.5    0 -2.2 2012   Dec
## 5  LAU (12/31/2012 04:00:00) 19.6 33.7 0.3  9.0  2.9    0 -2.2 2012   Dec
## 6  LAU (12/31/2012 05:00:00) 30.8 51.2 0.3  8.7  3.2    0 -2.3 2012   Dec
##   day hour dayofwk daytype season SO2 NMVOC EC
## 1  31    0     Mon Weekday    DJF  NA    NA NA
## 2  31    1     Mon Weekday    DJF  NA    NA NA
## 3  31    2     Mon Weekday    DJF  NA    NA NA
## 4  31    3     Mon Weekday    DJF  NA    NA NA
## 5  31    4     Mon Weekday    DJF  NA    NA NA
## 6  31    5     Mon Weekday    DJF  NA    NA NA
tail(df)
##       site              datetime  O3  NO2  CO PM10 TEMP PREC  RAD year
## 17563  ZUE (12/31/2013 18:00:00) 1.4 49.1 0.6 28.5  1.0    0 -2.7 2013
## 17564  ZUE (12/31/2013 19:00:00) 1.5 47.9 0.6 31.9  0.2    0 -2.4 2013
## 17565  ZUE (12/31/2013 20:00:00) 2.0 48.6 0.7 34.1 -0.1    0 -2.5 2013
## 17566  ZUE (12/31/2013 21:00:00) 2.5 48.4 0.7 40.7  0.0    0 -2.4 2013
## 17567  ZUE (12/31/2013 22:00:00) 2.2 43.0 0.6 48.5 -0.4    0 -2.5 2013
## 17568  ZUE (12/31/2013 23:00:00) 2.4 43.5 0.6 53.3 -0.7    0 -2.5 2013
##       month day hour dayofwk daytype season SO2 NMVOC  EC
## 17563   Dec  31   18     Tue Weekday    DJF 3.4   0.2 2.4
## 17564   Dec  31   19     Tue Weekday    DJF 3.4   0.2 2.4
## 17565   Dec  31   20     Tue Weekday    DJF 3.3   0.2 2.6
## 17566   Dec  31   21     Tue Weekday    DJF 4.2   0.2 2.6
## 17567   Dec  31   22     Tue Weekday    DJF 3.6   0.2 2.5
## 17568   Dec  31   23     Tue Weekday    DJF 4.4   0.2 2.5

Let us save this data frame for later.

saveRDS(df, "data/2013/lau-zue.rds")

Elongate the data frame, as before.

lf <- melt(df, id.vars=c("site", "datetime", "season", "year", "month", "day", "hour", "dayofwk", "daytype"))

View variability in pollutant concentrations

Plotting your data is very good practice. Check for general trends and extreme values.

View all the measurements:

ggp <- ggplot(lf)+                                   # `lf` is the data frame
  facet_grid(variable~site, scale="free_y")+         # panels created out of these variables
  geom_line(aes(datetime, value, color=site))+       # plot `value` vs. `time` as lines
  scale_x_chron()+                                   # format x-axis labels (time units)
  theme(axis.text.x=element_text(angle=30, hjust=1)) # rotate x-axis labels
print(ggp)                                           # view the plot