> # Load required packages
> library(dplyr)
> library(ggplot2)
> library(ggthemes)
>
> # Read data from csv file
> dat <- read.csv("data/Google_Analytics_Sessions.csv", stringsAsFactors = FALSE)
>
> # First view of the dataset
> str(dat)
'data.frame': 794 obs. of 2 variables:
$ Day : chr "01.01.17" "02.01.17" "03.01.17" "04.01.17" ...
$ Sessions: num 105 304 355 394 356 338 255 302 461 489 ...
>
> # Convert the date stored as character to a date (with base package)
> dat$Day <- as.Date(dat$Day, format = "%d.%m.%y")
>
> # Graphical representation of the time series with the built-in graphics package (Figure 5)
> plot(dat$Day, dat$Sessions, type = "l")
>
> # Data preparation with dplyr
> datPrep <- dat %>%
+   mutate(
+ # Set values below 50 to NA, because this is known to be a measurement error.
+ Sessuibs = ifelse(Sessions < 50, NA, Sessions),
+     # Determine weekday
+     wday = weekdays(Day),
+     # Determine month
+     month = months(Day)
+   )
>
> # Split into training and test data
> datPrep$isTrainData <- datPrep$Day < "2019-01-01"
>
> # Simple linear model. The character variables "wday" and "month" will be
> # automatically interpreted as categorial variables.
> # The model only uses the data up to the end of 2018, i.e. for which the variable
> # isTrainData has the value TRUE
> mod1 <- lm(data = datPrep[datPrep$isTrainData, ], formula = Meetings ~ wday + month)
>
> # Model summary
> summary(mod1)
Call:
lm(formula = Sessions ~ wday + month, data = datPrep[datPrep$isTrainData, ])
Residuals:
    Min      1Q  Median      3Q     Max
-464.80  -61.88   -6.52   62.38  479.19
Coefficients:
               Estimate Std. Error t value Pr(>|t|)
(Intercept)     517.340     18.270  28.316  < 2e-16 ***
wdayThursday    -31.046     16.119  -1.926   0.0545 .
wdayFriday     -116.981     16.122  -7.256 1.08e-12 ***
wdayWednesday    -2.817     16.111  -0.175   0.8612
wdayMonday       -7.527     16.030  -0.470   0.6388
wdaySaturday   -262.736     16.164 -16.255  < 2e-16 ***
wdaySunday     -202.964     16.077 -12.624  < 2e-16 ***
monthAugust     -16.028     20.777  -0.771   0.4407
monthDecember    -6.106     20.768  -0.294   0.7688
monthFebruary   109.349     21.304   5.133 3.72e-07 ***
monthJanuary    133.990     20.770   6.451 2.10e-10 ***
monthJuly       211.030     20.849  10.122  < 2e-16 ***
monthJune       167.145     22.755   7.345 5.85e-13 ***
monthMay         24.528     21.411   1.146   0.2524
monthMarch      -10.467     20.858  -0.502   0.6159
monthNovember    99.135     20.943   4.734 2.68e-06 ***
monthOctober    -27.591     20.770  -1.328   0.1845
monthSeptember   53.475     21.030   2.543   0.0112 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 114.2 on 685 degrees of freedom
  (27 observations deleted due to missingness)
Multiple R-squared:  0.5528,     Adjusted R-squared:  0.5417
F-statistic: 49.81 on 17 and 685 DF,  p-value: < 2.2e-16
> # Calculation of the values predicted by the model (for the whole period)
> datPrep$pred <- predict(mod1, newdata = datPrep)
>
> # Graphical representation of true and predicted values with ggplot2
> p <- ggplot(datPrep) +
+   geom_line(aes(x = Day, y = Sessions, linetype = !isTrainData)) +
+   geom_line(aes(x = Day, y = pred, colour = !isTrainData)) +
+   scale_color_manual(values = c("#2B4894", "#cd5364"), limits = c(FALSE, TRUE)) +
+   labs(colour = "Test period", linetype = "Test period")
> p # Equivalent to print(p)
>
> # Alternative model with splines
> library(mgcv)
> library(lubridate)
> datPrep$dayOfYear <- yday(datPrep$Day)
> datPrep$linTrend <- 1:nrow(datPrep)
> mod2 <- gam(data = datPrep[datPrep$isTrainData, ],
+             formula = Sessions ~
+             wday + linTrend + s(dayOfYear, bs = "cc", k = 20, by = as.factor(wday)))
> summary(mod2)
Family: gaussian
Link function: identity
Formula:
Sessions ~ wday + linTrend + s(dayOfYear, bs = "cc", k = 20,
    by = as.factor(wday))
Parametric coefficients:
                Estimate Std. Error t value Pr(>|t|)
(Intercept)     532.0423    12.3948  42.925  < 2e-16 ***
wdayThursday    -32.2007    14.1368  -2.278   0.0231 *
wdayFriday     -119.6377    14.1354  -8.464  < 2e-16 ***
wdayWednesday    -4.9203    14.1359  -0.348   0.7279
wdayMonday       -5.8354    14.0639  -0.415   0.6784
wdaySaturday   -265.6973    14.1727 -18.747  < 2e-16 ***
wdaySunday     -204.1019    14.1028 -14.472  < 2e-16 ***
linTrend          0.1254     0.0205   6.118 1.71e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(dayOfYear):as.factor(wday)Tuesday    15.32     18 6.413  < 2e-16 ***
s(dayOfYear):as.factor(wday)Thursday   12.97     18 3.902 1.89e-10 ***
s(dayOfYear):as.factor(wday)Friday     12.15     18 3.953 4.59e-11 ***
s(dayOfYear):as.factor(wday)Wednesday  13.85     18 5.314 9.31e-15 ***
s(dayOfYear):as.factor(wday)Monday     16.64     18 8.382  < 2e-16 ***
s(dayOfYear):as.factor(wday)Saturday   11.29     18 3.307 3.00e-09 ***
s(dayOfYear):as.factor(wday)Sunday     12.92     18 4.843 1.02e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R-sq.(adj) =  0.648   Deviance explained = 69.9%
GCV =  11749  Scale est. = 10025     n = 703
> datPrep$pred2 <- predict(mod2, newdata = datPrep)
> ggplot(datPrep) +
+   geom_line(aes(x = Day, y = Sessions, linetype = !isTrainData)) +
+   geom_line(aes(x = Day, y = pred2, colour = !isTrainData)) +
+   scale_color_manual(values = c("#2B4894", "#cd5364"), limits = c(FALSE, TRUE)) +
+   labs(colour = "Test period", linetype = "Test period")