---
title: "user_sample_3"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{user_sample_3}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

-   plot of bias comparison
-   Load data for multiple stations and run counterfactual.
-   Investigate effect of 9 Euro ticket for station Köln, Bottrop, Lünen

### Prerequisites

Installation for user as described in README

### Steps

1.  adapt the data_dir (in windows e.g. //de-inf-008/<YOUR_USER>\$/Eigene Dateien/ubair-master/data/)

``` r
library(ubair)
library(ggplot2)
# set data_dir where the data is stored
data_dir <- "../../Daten/user_sample_data/"
# This might take a few seconds for large files
data <- load_uba_data_from_dir(data_dir = data_dir)
```

2.  Set sample variables.

``` r
target <- "NO2"
stations <- list(
  Lünen = "DENW006", Bottrop = "DENW021",
  Köln = "DENW212"
)
meteo_variables <- c("TMP", "RFE", "WIG", "WIR", "LDR")

# dates for 9 Euro ticket effect
application_start <- lubridate::ymd("20220301") # = start reference time
date_effect_start <- lubridate::ymd_hm("20220601 00:00")
application_end <- lubridate::ymd("20220831") # = end effect time

buffer <- 0 # number of data points to be ignored before effect

trend <- "linear"
model_type <- "lightgbm"

window_size <- 14 # days of data to calculate the mean in prediction results
```

3.  create a params.yaml. Either

-   copy the default params.yaml to your file system, adapt it and load it or
-   load the default and adapt the params programmatically

``` r
params <- load_params()
# adapt params programatically
params$target <- target
params$meteo_variables <- meteo_variables
```

4.  Prepare data of each station for training and run counterfactual. See function documentation for further details.

``` r
results_all <- data.table::data.table()
for (station_name in names(stations)) {
  env_data <- clean_data(data, station = stations[station_name])
  dt_prepared <- prepare_data_for_modelling(env_data, params)
  dt_prepared <- dt_prepared[complete.cases(dt_prepared)]
  split_data <- split_data_counterfactual(
    dt_prepared,
    application_start = application_start,
    application_end = application_end
  )
  res <- run_counterfactual(split_data,
    params,
    detrending_function = trend,
    model_type = model_type,
    alpha = 0.9,
    log_transform = TRUE
  )
  predictions <- res$prediction
  predictions[, station_name := station_name]
  predictions[, station := stations[station_name]]
  predictions[, model := model_type]
  predictions[, trend := trend]
  results_all <- rbind(results_all, predictions)
}
#> Warning in log(dt_train_new$value): NaNs wurden erzeugt
#> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000316 seconds.
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 1549
#> [LightGBM] [Info] Number of data points in the train set: 60472, number of used features: 8
#> [LightGBM] [Info] Start training from score -0.000000
#> Warning in log(dt_train_new$value): NaNs wurden erzeugt
#> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000346 seconds.
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 1550
#> [LightGBM] [Info] Number of data points in the train set: 60133, number of used features: 8
#> [LightGBM] [Info] Start training from score -0.000000
#> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000315 seconds.
#> You can set `force_row_wise=true` to remove the overhead.
#> And if memory is not enough, you can set `force_col_wise=true`.
#> [LightGBM] [Info] Total Bins 1552
#> [LightGBM] [Info] Number of data points in the train set: 60494, number of used features: 8
#> [LightGBM] [Info] Start training from score 0.000000
```

5.  Plot bias (prediction - observation) to compare multiple stations

``` r
results_all[, bias := prediction - value]
results_all[, station_label := paste0(station_name, " (", station, ")")]
results_all[, station := as.character(unlist(station))]
results_all[, smoothed_bias := data.table::frollmean(bias,
  n = 24 * window_size,
  align = "center"
),
by = station
]
bias_plot <- ggplot2::ggplot(
  results_all,
  ggplot2::aes(
    x = date,
    y = smoothed_bias,
    color = station_label,
    group = station_label
  )
) +
  geom_line() +
  labs(
    title = paste("NO2 Bias Over Time by Station (", window_size, "-day mean)"),
    x = "Date",
    y = "NO2 Bias \n(prediction - observation) [µg/m³]",
    color = "Station"
  ) +
  ggplot2::geom_vline(
    xintercept = date_effect_start, linetype = 4,
    colour = "black"
  ) +
  ggplot2::theme_bw() +
  ggplot2::scale_x_datetime(
    date_minor_breaks = "1 month",
    date_breaks = "2 month"
  ) +
  ggplot2::theme(legend.position = "bottom")
bias_plot
#> Warning: Removed 1005 rows containing missing values or values outside the scale range
#> (`geom_line()`).
```

![plot of chunk bias_plot](figure/bias_plot-1.png)

6.  Plot counterfactual for Köln.

``` r
station_name_plot <- "Köln"
counterfactual_plot <- plot_counterfactual(
  dplyr::filter(results_all, station_name == station_name_plot), params,
  window_size = window_size,
  date_effect_start,
  buffer = buffer
)
counterfactual_plot
```

![plot of chunk counterfactual_plot_3](figure/counterfactual_plot_3-1.png) \`\`\`