> # Load required packages > library(dplyr) > library(ggplot2) > library(ggthemes) > > # Read data from csv file > dat <- read.csv("data/Google_Analytics_Sessions.csv", stringsAsFactors = FALSE) > > # First view of the dataset > str(dat) 'data.frame': 794 obs. of 2 variables: $ Day : chr "01.01.17" "02.01.17" "03.01.17" "04.01.17" ... $ Sessions: num 105 304 355 394 356 338 255 302 461 489 ... > > # Convert the date stored as character to a date (with base package) > dat$Day <- as.Date(dat$Day, format = "%d.%m.%y") > > # Graphical representation of the time series with the built-in graphics package (Figure 5) > plot(dat$Day, dat$Sessions, type = "l") > > # Data preparation with dplyr > datPrep <- dat %>% + mutate( + # Set values below 50 to NA, because this is known to be a measurement error. + Sessuibs = ifelse(Sessions < 50, NA, Sessions), + # Determine weekday + wday = weekdays(Day), + # Determine month + month = months(Day) + ) > > # Split into training and test data > datPrep$isTrainData <- datPrep$Day < "2019-01-01" > > # Simple linear model. The character variables "wday" and "month" will be > # automatically interpreted as categorial variables. > # The model only uses the data up to the end of 2018, i.e. for which the variable > # isTrainData has the value TRUE > mod1 <- lm(data = datPrep[datPrep$isTrainData, ], formula = Meetings ~ wday + month) > > # Model summary > summary(mod1) Call: lm(formula = Sessions ~ wday + month, data = datPrep[datPrep$isTrainData, ]) Residuals: Min 1Q Median 3Q Max -464.80 -61.88 -6.52 62.38 479.19 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 517.340 18.270 28.316 < 2e-16 *** wdayThursday -31.046 16.119 -1.926 0.0545 . wdayFriday -116.981 16.122 -7.256 1.08e-12 *** wdayWednesday -2.817 16.111 -0.175 0.8612 wdayMonday -7.527 16.030 -0.470 0.6388 wdaySaturday -262.736 16.164 -16.255 < 2e-16 *** wdaySunday -202.964 16.077 -12.624 < 2e-16 *** monthAugust -16.028 20.777 -0.771 0.4407 monthDecember -6.106 20.768 -0.294 0.7688 monthFebruary 109.349 21.304 5.133 3.72e-07 *** monthJanuary 133.990 20.770 6.451 2.10e-10 *** monthJuly 211.030 20.849 10.122 < 2e-16 *** monthJune 167.145 22.755 7.345 5.85e-13 *** monthMay 24.528 21.411 1.146 0.2524 monthMarch -10.467 20.858 -0.502 0.6159 monthNovember 99.135 20.943 4.734 2.68e-06 *** monthOctober -27.591 20.770 -1.328 0.1845 monthSeptember 53.475 21.030 2.543 0.0112 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 114.2 on 685 degrees of freedom (27 observations deleted due to missingness) Multiple R-squared: 0.5528, Adjusted R-squared: 0.5417 F-statistic: 49.81 on 17 and 685 DF, p-value: < 2.2e-16 > # Calculation of the values predicted by the model (for the whole period) > datPrep$pred <- predict(mod1, newdata = datPrep) > > # Graphical representation of true and predicted values with ggplot2 > p <- ggplot(datPrep) + + geom_line(aes(x = Day, y = Sessions, linetype = !isTrainData)) + + geom_line(aes(x = Day, y = pred, colour = !isTrainData)) + + scale_color_manual(values = c("#2B4894", "#cd5364"), limits = c(FALSE, TRUE)) + + labs(colour = "Test period", linetype = "Test period") > p # Equivalent to print(p) > > # Alternative model with splines > library(mgcv) > library(lubridate) > datPrep$dayOfYear <- yday(datPrep$Day) > datPrep$linTrend <- 1:nrow(datPrep) > mod2 <- gam(data = datPrep[datPrep$isTrainData, ], + formula = Sessions ~ + wday + linTrend + s(dayOfYear, bs = "cc", k = 20, by = as.factor(wday))) > summary(mod2) Family: gaussian Link function: identity Formula: Sessions ~ wday + linTrend + s(dayOfYear, bs = "cc", k = 20, by = as.factor(wday)) Parametric coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 532.0423 12.3948 42.925 < 2e-16 *** wdayThursday -32.2007 14.1368 -2.278 0.0231 * wdayFriday -119.6377 14.1354 -8.464 < 2e-16 *** wdayWednesday -4.9203 14.1359 -0.348 0.7279 wdayMonday -5.8354 14.0639 -0.415 0.6784 wdaySaturday -265.6973 14.1727 -18.747 < 2e-16 *** wdaySunday -204.1019 14.1028 -14.472 < 2e-16 *** linTrend 0.1254 0.0205 6.118 1.71e-09 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Approximate significance of smooth terms: edf Ref.df F p-value s(dayOfYear):as.factor(wday)Tuesday 15.32 18 6.413 < 2e-16 *** s(dayOfYear):as.factor(wday)Thursday 12.97 18 3.902 1.89e-10 *** s(dayOfYear):as.factor(wday)Friday 12.15 18 3.953 4.59e-11 *** s(dayOfYear):as.factor(wday)Wednesday 13.85 18 5.314 9.31e-15 *** s(dayOfYear):as.factor(wday)Monday 16.64 18 8.382 < 2e-16 *** s(dayOfYear):as.factor(wday)Saturday 11.29 18 3.307 3.00e-09 *** s(dayOfYear):as.factor(wday)Sunday 12.92 18 4.843 1.02e-13 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 R-sq.(adj) = 0.648 Deviance explained = 69.9% GCV = 11749 Scale est. = 10025 n = 703 > datPrep$pred2 <- predict(mod2, newdata = datPrep) > ggplot(datPrep) + + geom_line(aes(x = Day, y = Sessions, linetype = !isTrainData)) + + geom_line(aes(x = Day, y = pred2, colour = !isTrainData)) + + scale_color_manual(values = c("#2B4894", "#cd5364"), limits = c(FALSE, TRUE)) + + labs(colour = "Test period", linetype = "Test period")