5 Merging the files

Objectives

In this chapter, the formatted data from the previous chapters are aggregated to produce the data table used in the local projections.

The agricultural production for the crops of interest is detrended.

The resulting dataset, named df, is exported in ../data/output/df_lp.rda.

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(cli)
library(gam)

Loading required package: splines
Loading required package: foreach

Attaching package: 'foreach'

The following objects are masked from 'package:purrr':

    accumulate, when

Loaded gam 1.22-3

5.1 Load Intermediate Files

The weather data (Chapter 1) can be loaded:

# Restore the object(s) stored in this .rda file into the current environment
# (presumably `weather_regions_df`, which is used below -- TODO confirm)
load("../data/output/weather/weather_regions_df.rda")

The agricultural data (Chapter 2):

# Restore the object(s) stored in this .rda file into the current environment
# (presumably `data_total`, which is extended below -- TODO confirm)
load("../data/output/minagri/dataset_agri_2001_2015.rda")

The macroeconomic data and commodity prices (Chapter 3):

# Restore the objects stored in these .rda files into the current environment
# (presumably `df_macro` and `int_prices`, which are joined below -- TODO confirm)
load("../data/output/macro/df_macro.rda")
load("../data/output/macro/df_int_prices.rda")

The share of natural regions and the El Niño–Southern Oscillation (Chapter 4):

# Restore the objects stored in these .rda files into the current environment
# (presumably `natural_region_dep` and `ONI_temp`, used below -- TODO confirm)
load("../data/output/natural_region_dep.rda")
load("../data/output/weather/ONI_temp.rda")

5.2 Merge the Files

We add ENSO data to the weather dataset:

# Weather data augmented with the ENSO series and with the deviation of each
# weather variable from its average within (IDDPTO, month, State) cells.
# NOTE(review): `State` presumably comes from `ONI_temp` and identifies the
# ENSO phase -- confirm.
Weather <-
  weather_regions_df |>
  left_join(
    # The join keys of the ENSO table are converted to numeric before joining
    ONI_temp |>
      mutate(across(c(Year, month), as.numeric)) |>
      rename(enso_start = date_start, enso_end = date_end),
    by = join_by(year == Year, month)
  ) |>
  group_by(IDDPTO, month, State) |>
  mutate(
    # Deviation from the group mean, one new `<variable>_dev_ENSO` column each
    across(
      .cols = c(temp_min, temp_max, temp_mean, precip_sum, precip_piscop_sum),
      .fns = \(x) x - mean(x),
      .names = "{.col}_dev_ENSO"
    )
  ) |>
  ungroup() |>
  labelled::set_variable_labels(
    temp_min_dev_ENSO = "Deviation of Min. Temperature from ENSO Normals",
    temp_max_dev_ENSO = "Deviation of Max. Temperature from ENSO Normals",
    temp_mean_dev_ENSO = "Deviation of Mean Temperature from ENSO Normals",
    precip_sum_dev_ENSO = "Deviation of Total Rainfall from ENSO Normals (Chirps)",
    precip_piscop_sum_dev_ENSO = "Deviation of Total Rainfall from ENSO Normals (Piscop)"
  )

Let us merge all these datasets in a single one:

# Successively attach to the agricultural data: the weather/ENSO variables,
# the macroeconomic series, the commodity prices, and the regional shares.
data_total <-
  data_total |>
  # Weather data and ENSO (the weather table names the region `DEPARTAMEN`)
  left_join(
    dplyr::select(Weather, -IDDPTO),
    by = join_by(year, month, region == DEPARTAMEN, date)
  ) |>
  # Macroeconomic data (`y` is renamed `gdp`)
  left_join(
    rename(df_macro, gdp = y),
    by = join_by(date)
  ) |>
  # Commodity prices data
  left_join(
    int_prices,
    by = join_by(date, product, product_eng)
  ) |>
  # Share of each type of region
  left_join(
    natural_region_dep,
    by = join_by(region)
  )

Here are the first rows of that tibble:

# Print the merged tibble (only its first rows are displayed)
data_total

# A tibble: 25,920 × 111
   region_id region   product       date       ln_prices ln_produc  year month
       <int> <chr>    <chr>         <date>         <dbl>     <dbl> <dbl> <dbl>
 1         1 AMAZONAS MAÍZ AMILÁCEO 2001-01-01     0          0     2001     1
 2         1 AMAZONAS MAÍZ AMILÁCEO 2001-02-01     0.536      6.58  2001     2
 3         1 AMAZONAS MAÍZ AMILÁCEO 2001-03-01    NA          6.26  2001     3
 4         1 AMAZONAS MAÍZ AMILÁCEO 2001-04-01     0.560      6.26  2001     4
 5         1 AMAZONAS MAÍZ AMILÁCEO 2001-05-01     0.565      6.04  2001     5
 6         1 AMAZONAS MAÍZ AMILÁCEO 2001-06-01    NA          7.00  2001     6
 7         1 AMAZONAS MAÍZ AMILÁCEO 2001-07-01     0.531      7.00  2001     7
 8         1 AMAZONAS MAÍZ AMILÁCEO 2001-08-01     0.525      8.37  2001     8
 9         1 AMAZONAS MAÍZ AMILÁCEO 2001-09-01    NA          6.53  2001     9
10         1 AMAZONAS MAÍZ AMILÁCEO 2001-10-01     0.519      6.53  2001    10
# ℹ 25,910 more rows
# ℹ 103 more variables: Value_prod <dbl>, surf_m <dbl>, Value_surfR <dbl>,
#   Value_prices <dbl>, campaign <dbl>, campaign_plain <chr>,
#   month_campaign <dbl>, surf_lag_calend <dbl>, product_eng <chr>,
#   perc_product <dbl>, perc_product_mean <dbl>, diff_plant_harv <dbl>,
#   exposition <dbl>, exposition_trend <dbl>, exposition_detrended <dbl>,
#   exposition_norm <dbl>, temp_min <dbl>, temp_max <dbl>, temp_mean <dbl>, …

Some descriptive statistics are shown in Chapter 6.

Lastly, the dataset can be saved for later use.

# Write the merged dataset to disk, both as an R data file and as a CSV file
save(data_total, file = "../data/output/dataset_2001_2015.rda")
write_csv(data_total, file = "../data/output/dataset_2001_2015.csv")

5.3 Dataset for the Local Projections

Now, let us create the dataset specifically used to estimate the models.

Let us make sure that the region data are encoded as a factor.

# Encode the region identifier as a factor
data_total <- mutate(data_total, region_id = factor(region_id))

The crops we focus on:

# English crop names, matched against the `product_eng` column below
crops <- c("Rice", "Dent corn", "Potato", "Cassava")

The number of observations with no production (zero or negative values) in each region, for each crop:

# For each crop of interest and each region, count the observations whose
# production is zero or negative, most affected series first
data_total |>
  filter(is.element(product_eng, crops)) |>
  group_by(product_eng, region_id) |>
  summarise(
    n = sum(Value_prod <= 0)
  ) |>
  arrange(desc(n))

`summarise()` has grouped output by 'product_eng'. You can override using the
`.groups` argument.

# A tibble: 96 × 3
# Groups:   product_eng [4]
   product_eng region_id     n
   <chr>       <fct>     <int>
 1 Potato      15          180
 2 Potato      21          180
 3 Potato      23          180
 4 Potato      24          180
 5 Rice        3           180
 6 Rice        8           180
 7 Rice        10          180
 8 Rice        17          180
 9 Rice        22          180
10 Rice        14          171
# ℹ 86 more rows

5.3.1 Definition of the Variable of Interest

Warning

We compute percentage deviation of production from monthly regional average, but we will actually not use those values in the subsequent estimations. In the first version of the analysis we used to do so, but this implied estimating a monthly regional trend. As kindly pointed out by a reviewer, the estimation of the trend should not be performed independently of the estimation. In the current version of this work, we use demeaned values of production and estimate the trend in the regressions.

This section outlines a two-step procedure for expressing agricultural production data at the monthly regional level for a specific crop and month as a percentage deviation from the monthly regional crop-specific average. The procedure involves handling missing values.

Step 1: Handling Missing Values

In the first step, we address missing values by linear interpolation. This approach helps us estimate the missing values by considering the neighboring data points.
- Step 1.1: Imputing missing values with linear interpolation.
  
  The missing values get replaced by linear interpolation. However, if there are more than three consecutive missing values (the `maxgap = 3` setting used in the code below), they are not replaced with interpolated values. Instead, the series for the specific crop in the given region is split based on the locations of the missing values. The split with the highest number of consecutive non-missing values is retained, while the other splits are discarded.
- Step 1.2: Dropping Series with Remaining Missing Values
  
  After imputing missing values using linear interpolation, we check if any missing values still remain in the dataset. If there are any remaining missing values for a particular series, we choose to exclude that series from further analysis. By doing so, we ensure that the subsequent detrending process is performed only on reliable and complete data.
Step 2: Normalized Agricultural Production

For each month \(m\), region \(i\), and crop \(c\), we calculate the average production over the entire period (January 2001 to December 2015): \[\overline{y}_{c,i,m} = \frac{1}{n_{T_c}} \sum_{t=1}^{T_c} y_{c,i,m,t}^{\text{raw}} \] Then, we express agricultural production relative to the average: \[y_{c,i,m,t} = \begin{cases} \frac{y_{c,i,m,t}^{\text{raw}}}{\overline{y}_{c,i,m}}, & \overline{y}_{c,i,m} > 0\\ 0, & \overline{y}_{c,i,m} = 0 \end{cases}\] A value of \(y_{c,i,m,t}>1\) means that the production for crop \(c\) in region \(i\) during month \(m\) of year \(t\) is higher than the average monthly production for that crop and region over the period 2001 to 2015. For example, a value of 1.5 means that the production is 50% higher than average.
Step 2 (alternative version): Deviation from regional monthly average, in percent (this step is no longer used in the new version of the analysis: it led to discarding too many observations)

Once we have addressed the missing values, we proceed to the second step, which consists in computing the deviation of production from the monthly regional average. First, we compute the average production of each crop \(c\) in each region \(i\) for calendar month \(m\): \[\overline{y}_{c,i,m} = \frac{1}{n_{T_c}} \sum_{t=1}^{T_c} y_{c,i,m,t}^{raw}\] Then, we compute the percentage deviation from this average at each date \(t\): \[y_{c,i,m,t} = \frac{y_{c,i,m,t}^{raw} - \overline{y}_{c,i,m}}{\overline{y}_{c,i,m}}\]

Let us implement this process in R. First, we need to define two functions to handle the missing values:

The get_index_longest_non_na() function retrieves the indices of the longest consecutive sequence without missing values from a given input vector. It helps us identify the positions of elements in that sequence.
The keep_values_longest_non_na() function uses the obtained indices to create a logical vector. Each element of this vector indicates whether the corresponding element in the input vector belongs to the longest consecutive sequence of non-missing values. This allows us to filter the data and retain only the values from the longest consecutive sequence without missing values.

These two functions combined help us handle missing data in the agricultural production series and ensure that we work with the most complete sequences for each region and crop.

The first function:

#' Returns the index of the longest sequence of non NA values in a vector
#'
#' The vector is split into runs of consecutive missing / non-missing values.
#' The positions of the longest run of non-missing values are returned. When
#' several runs share the maximum length, the first one is returned.
#'
#' @param y vector of numerical values
#' @returns integer vector with the positions (in `y`) of the longest run of
#'  consecutive non-missing values; an empty integer vector if `y` is empty or
#'  only contains missing values
#' @export
get_index_longest_non_na <- function(y) {
  # Runs of consecutive non-missing (TRUE) / missing (FALSE) values
  runs <- rle(!is.na(y))

  # Empty input or only missing values: there is no position to keep
  if (!any(runs$values)) {
    return(integer(0))
  }

  # First and last position of each run
  run_ends <- cumsum(runs$lengths)
  run_starts <- run_ends - runs$lengths + 1L

  # Runs of missing values get a size of 0 so that they can never be selected;
  # which.max() returns the first run in case of ties
  run_sizes <- runs$lengths * runs$values
  ind_max <- which.max(run_sizes)

  seq(run_starts[ind_max], run_ends[ind_max])
}

The second one:

#' Returns a logical vector that identifies the longest sequence of non NA
#' values within the input vector
#'
#' @param y numeric vector
#' @returns logical vector of the same length as `y`: `TRUE` for the positions
#'  returned by `get_index_longest_non_na()`, `FALSE` otherwise
keep_values_longest_non_na <- function(y) {
  # seq_along() (rather than seq(1, length(y))) so that an empty input gives
  # an empty result instead of a vector built from c(1, 0)
  seq_along(y) %in% get_index_longest_non_na(y)
}

Note

Those two functions are defined in weatherperu/R/utils.R.

We define another function, pct_prod_production(), that takes the data frame of observations as input, as well as a crop name and a region ID. It returns a tibble with the following variables:

product_eng: the English name of the crop
region_id: the ID of the region
month: month number
date: date
y_new_normalized (our variable of interest in Chapter 7): the production divided by (i.e., expressed relative to) the month-specific average for the crop of interest in the region of interest
y_new: the production (in tons) where missing values were imputed, if possible
y_dev_pct: the production expressed as the percentage deviation from the monthly-specific average (for the crop of interest, in the region of interest)
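y: the residuals of month-specific OLS regressions of y_new_normalized on a quadratic trend in t (without intercept), i.e., y_new_normalized from which the estimated quadratic trend has been removed
t: time index of the observation (row number in the retained series, in chronological order), used as the trend variable in the month-specific regressions.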

#' Computes the percentage deviation of production from monthly regional average
#'
#' The series of a crop in a region is processed as follows. Negative
#' production values are treated as missing. Missing values are linearly
#' interpolated, except in runs of more than three consecutive missing values
#' (`maxgap = 3`). Values that are still missing among the first two or the
#' last two observations are dropped, and only the longest run of consecutive
#' non-missing values is kept. The series is discarded (`NULL` is returned) if
#' it is empty, if missing values remain, or if it contains more than eight
#' consecutive zeros.
#'
#' @param df data, with (at least) the columns `product_eng`, `region_id`,
#'  `month`, `date` and `Value_prod`
#' @param crop_name name of the crop
#' @param region_id id of the region
#'
#' @returns `NULL` if the series is discarded; otherwise a tibble with the
#'  product, the region id, the month, the date, the production where missing
#'  values were imputed (`y_new`), the percentage deviation of production from
#'  its monthly regional average (`y_dev_pct`), the production divided by its
#'  monthly regional average (`y_new_normalized`), the residuals of
#'  month-specific regressions of `y_new_normalized` on a quadratic trend
#'  without intercept (`y`), and the time index used as the trend (`t`)
#' @export
#' @importFrom dplyr filter arrange mutate select row_number group_by ungroup
#' @importFrom dplyr case_when n
#' @importFrom tidyr nest unnest
#' @importFrom purrr map
#' @importFrom imputeTS na_interpolation
#' @importFrom stats lm residuals
pct_prod_production <- function(df,
                                crop_name,
                                region_id) {
  # The current data: one crop in one region, in chronological order.
  # `!!` forces the function arguments to be used on the right-hand side,
  # since `region_id` is also the name of a column of `df`.
  df_current <-
    df |>
    filter(
      product_eng == !!crop_name,
      region_id == !!region_id
    ) |>
    arrange(date)

  ## Dealing with missing values ----
  # Negative production values are treated as missing
  df_current <-
    df_current |>
    mutate(
      y_new = ifelse(Value_prod < 0, NA, Value_prod)
    )

  if (anyNA(df_current$y_new)) {

    # Replacing NAs by interpolation
    # Runs of more than three contiguous NAs are not replaced (maxgap = 3)
    df_current <-
      df_current |>
      mutate(
        y_new = imputeTS::na_interpolation(y_new, maxgap = 3)
      )

    # Removing obs at the beginning/end (first two and last two rows) if they
    # are still missing
    df_current <-
      df_current |>
      filter(
        !(is.na(y_new) & row_number() %in% c(1:2, (n() - 1):n()))
      )

    # Keeping the longest series of continuous non-NA values
    if (nrow(df_current) > 0) {
      df_current <-
        df_current |>
        filter(keep_values_longest_non_na(y_new))
    }
  }

  # The series is discarded if nothing is left or if missing values remain
  if (nrow(df_current) == 0 || anyNA(df_current$y_new)) {
    return(NULL)
  }

  # If there are more than 8 contiguous 0, the series is discarded
  rle_y_new <- rle(df_current$y_new)
  check_contiguous_zeros <- rle_y_new$lengths[rle_y_new$values == 0]
  if (any(check_contiguous_zeros > 8)) {
    return(NULL)
  }

  ## Normalization by, and percent deviation from, monthly regional average ----
  df_current |>
    group_by(month) |>
    arrange(date) |>
    mutate(
      # A month in which production is always 0 gets a value of 0
      y_new_normalized = case_when(
        mean(y_new) == 0 ~ 0,
        TRUE ~ y_new / mean(y_new)
      ),
      y_dev_pct = case_when(
        mean(y_new) == 0 ~ 0,
        TRUE ~ (y_new - mean(y_new)) / mean(y_new)
      )
    ) |>
    ungroup() |>
    # Time index over the whole series (not restarted for each month)
    mutate(t = row_number()) |>
    nest(.by = c(product_eng, region_id, month)) |>
    # distinct OLS per month: quadratic trend in t, without intercept
    mutate(
      ols_fit = map(data, \(d) lm(y_new_normalized ~ -1 + t + I(t^2), data = d)),
      resid   = map(ols_fit, residuals)
    ) |>
    unnest(cols = c(data, resid)) |>
    mutate(
      y = resid
    ) |>
    select(
      product_eng, region_id, month, date,
      y_new, y_dev_pct, y_new_normalized, y, t
    ) |>
    arrange(date)
}

For example, for potatoes in region with id 1:

# Series for potatoes in the region whose id is 1
pct_prod_production(df = data_total, crop_name = "Potato", region_id = 1)

# A tibble: 180 × 9
   product_eng region_id month date       y_new y_dev_pct y_new_normalized     y
   <chr>       <fct>     <dbl> <date>     <dbl>     <dbl>            <dbl> <dbl>
 1 Potato      1             1 2001-01-01 2360    -0.301             0.699 0.671
 2 Potato      1             2 2001-02-01 2841    -0.379             0.621 0.565
 3 Potato      1             3 2001-03-01 4585    -0.151             0.849 0.774
 4 Potato      1             4 2001-04-01 4585    -0.107             0.893 0.786
 5 Potato      1             5 2001-05-01 6065     0.162             1.16  1.05 
 6 Potato      1             6 2001-06-01 3998.   -0.307             0.693 0.581
 7 Potato      1             7 2001-07-01 3998.   -0.351             0.649 0.520
 8 Potato      1             8 2001-08-01 5921    -0.0959            0.904 0.749
 9 Potato      1             9 2001-09-01 4751    -0.250             0.750 0.567
10 Potato      1            10 2001-10-01 4751    -0.151             0.849 0.649
# ℹ 170 more rows
# ℹ 1 more variable: t <int>

We can apply this function to all crops of interest, in each region. Let us define a table that contains all the possible values for the combination of crops and regions:

# One row per (crop, region) pair observed for the crops of interest
product_and_regions <-
  data_total |>
  filter(product_eng %in% crops) |>
  distinct(product_eng, region_id)

Then we apply the pct_prod_production() function to all these different cases, and store the results in a list named df_pct_pred_production:

# One list element per (crop, region) pair; an element is NULL when
# pct_prod_production() discards the series
df_pct_pred_production <- vector(mode = "list", length = nrow(product_and_regions))
cli_progress_bar(total = nrow(product_and_regions))
# seq_len() (rather than 1:nrow()) so that nothing is run on an empty table
for (i in seq_len(nrow(product_and_regions))) {
  df_pct_pred_production[[i]] <- pct_prod_production(
    df = data_total,
    crop_name = product_and_regions$product_eng[i],
    region_id = product_and_regions$region_id[i]
  )
  cli_progress_update(set = i)
}

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo

■■■■■■■■■■                        29% | ETA:  2s

■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■    96% | ETA:  0s

■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■  100% | ETA:  0s

The elements of the list are all tibbles, with the same column names. We can merge them in a single tibble.

# Stack the per-series tibbles into a single tibble
df_pct_pred_production <-
  df_pct_pred_production |>
  bind_rows()

We can have a look at the number of months with 0 values for the agricultural production.

# Number of months with a production equal to 0 in each retained series,
# series with the most zeros first
df_pct_pred_production |>
  group_by(product_eng, region_id) |>
  summarise(
    nb_0 = sum(y_new == 0)
  ) |>
  arrange(desc(nb_0))

`summarise()` has grouped output by 'product_eng'. You can override using the
`.groups` argument.

# A tibble: 78 × 3
# Groups:   product_eng [4]
   product_eng region_id  nb_0
   <chr>       <fct>     <int>
 1 Rice        2           108
 2 Rice        4           100
 3 Potato      20           71
 4 Rice        7            68
 5 Dent corn   3            67
 6 Dent corn   8            66
 7 Rice        16           61
 8 Dent corn   22           59
 9 Cassava     10           54
10 Rice        23           52
# ℹ 68 more rows

Now, let us add the other columns to the tibble that contains the percentage deviation production data:

# Attach all the columns of the merged dataset to the production series,
# matching on crop, region, month and date
df <- left_join(
  df_pct_pred_production,
  data_total,
  by = join_by(product_eng, region_id, month, date)
)

Let us also impute missing values for the weather variables.

# Names of the numeric columns of the weather data, except the year and month
weather_variables <-
  weather_regions_df |>
  select(where(is.numeric)) |>
  select(-c(year, month)) |>
  names()

The current number of missing values:

# Number of missing values in each weather variable, as a named vector
df |>
  summarise(
    across(
      .cols = all_of(weather_variables),
      .fns = \(x) sum(is.na(x)),
      .names = "{.col}_nb_na"
    )
  ) |>
  unlist()

                temp_min_nb_na                 temp_max_nb_na 
                             0                              0 
               temp_mean_nb_na               precip_sum_nb_na 
                             0                              0 
       precip_piscop_sum_nb_na        perc_gamma_precip_nb_na 
                             0                              0 
perc_gamma_precip_piscop_nb_na                 gdd_rice_nb_na 
                             0                              0 
               gdd_maize_nb_na               gdd_potato_nb_na 
                             0                              0 
             gdd_cassava_nb_na                 hdd_rice_nb_na 
                             0                              0 
               hdd_maize_nb_na               hdd_potato_nb_na 
                             0                              0 
             hdd_cassava_nb_na             temp_min_dev_nb_na 
                             0                              0 
            temp_max_dev_nb_na            temp_mean_dev_nb_na 
                             0                              0 
          precip_sum_dev_nb_na    precip_piscop_sum_dev_nb_na 
                             0                              0 
            gdd_rice_dev_nb_na            gdd_maize_dev_nb_na 
                             0                              0 
          gdd_potato_dev_nb_na          gdd_cassava_dev_nb_na 
                             0                              0 
            hdd_rice_dev_nb_na            hdd_maize_dev_nb_na 
                             0                              0 
          hdd_potato_dev_nb_na          hdd_cassava_dev_nb_na 
                             0                              0 
     cold_surprise_maize_nb_na       cold_surprise_rice_nb_na 
                             0                              0 
    cold_surprise_potato_nb_na    cold_surprise_cassava_nb_na 
                             0                              0 
      hot_surprise_maize_nb_na        hot_surprise_rice_nb_na 
                             0                              0 
     hot_surprise_potato_nb_na     hot_surprise_cassava_nb_na 
                             0                              0 
      dry_surprise_maize_nb_na        dry_surprise_rice_nb_na 
                             0                              0 
     dry_surprise_potato_nb_na     dry_surprise_cassava_nb_na 
                             0                              0 
      wet_surprise_maize_nb_na        wet_surprise_rice_nb_na 
                             0                              0 
     wet_surprise_potato_nb_na     wet_surprise_cassava_nb_na 
                             0                              0 
                   spi_1_nb_na                    spi_3_nb_na 
                             0                              0 
                   spi_6_nb_na                   spi_12_nb_na 
                             0                              0 
                  spei_1_nb_na                   spei_3_nb_na 
                             0                              0 
                  spei_6_nb_na                  spei_12_nb_na 
                             0                              0 
            spi_piscop_1_nb_na             spi_piscop_3_nb_na 
                             0                              0 
            spi_piscop_6_nb_na            spi_piscop_12_nb_na 
                             0                              0 
           spei_piscop_1_nb_na            spei_piscop_3_nb_na 
                             0                              0 
           spei_piscop_6_nb_na           spei_piscop_12_nb_na 
                             0                              0

In case of missing values, we use linear interpolation to replace them:

# Linear interpolation of the missing weather values (runs of more than three
# consecutive NAs are left as is).
# The interpolation is done within each (crop, region) series: `df` stacks the
# series one after the other, so interpolating over the whole column would
# fill a gap located at the boundary between two series with values from a
# different crop/region. Within a series, rows are in chronological order.
df <-
  df |>
  group_by(product_eng, region_id) |>
  mutate(
    across(
      .cols = all_of(weather_variables),
      .fns = \(x) {
        # Interpolation needs at least two observed values; otherwise the
        # variable is left unchanged for this series
        if (sum(!is.na(x)) < 2) {
          x
        } else {
          imputeTS::na_interpolation(x, maxgap = 3)
        }
      }
    )
  ) |>
  ungroup()

The number of remaining missing values:

# Number of missing values left in each weather variable, as a named vector
df |>
  summarise(
    across(
      .cols = all_of(weather_variables),
      .fns = \(x) sum(is.na(x)),
      .names = "{.col}_nb_na"
    )
  ) |>
  unlist()

                temp_min_nb_na                 temp_max_nb_na 
                             0                              0 
               temp_mean_nb_na               precip_sum_nb_na 
                             0                              0 
       precip_piscop_sum_nb_na        perc_gamma_precip_nb_na 
                             0                              0 
perc_gamma_precip_piscop_nb_na                 gdd_rice_nb_na 
                             0                              0 
               gdd_maize_nb_na               gdd_potato_nb_na 
                             0                              0 
             gdd_cassava_nb_na                 hdd_rice_nb_na 
                             0                              0 
               hdd_maize_nb_na               hdd_potato_nb_na 
                             0                              0 
             hdd_cassava_nb_na             temp_min_dev_nb_na 
                             0                              0 
            temp_max_dev_nb_na            temp_mean_dev_nb_na 
                             0                              0 
          precip_sum_dev_nb_na    precip_piscop_sum_dev_nb_na 
                             0                              0 
            gdd_rice_dev_nb_na            gdd_maize_dev_nb_na 
                             0                              0 
          gdd_potato_dev_nb_na          gdd_cassava_dev_nb_na 
                             0                              0 
            hdd_rice_dev_nb_na            hdd_maize_dev_nb_na 
                             0                              0 
          hdd_potato_dev_nb_na          hdd_cassava_dev_nb_na 
                             0                              0 
     cold_surprise_maize_nb_na       cold_surprise_rice_nb_na 
                             0                              0 
    cold_surprise_potato_nb_na    cold_surprise_cassava_nb_na 
                             0                              0 
      hot_surprise_maize_nb_na        hot_surprise_rice_nb_na 
                             0                              0 
     hot_surprise_potato_nb_na     hot_surprise_cassava_nb_na 
                             0                              0 
      dry_surprise_maize_nb_na        dry_surprise_rice_nb_na 
                             0                              0 
     dry_surprise_potato_nb_na     dry_surprise_cassava_nb_na 
                             0                              0 
      wet_surprise_maize_nb_na        wet_surprise_rice_nb_na 
                             0                              0 
     wet_surprise_potato_nb_na     wet_surprise_cassava_nb_na 
                             0                              0 
                   spi_1_nb_na                    spi_3_nb_na 
                             0                              0 
                   spi_6_nb_na                   spi_12_nb_na 
                             0                              0 
                  spei_1_nb_na                   spei_3_nb_na 
                             0                              0 
                  spei_6_nb_na                  spei_12_nb_na 
                             0                              0 
            spi_piscop_1_nb_na             spi_piscop_3_nb_na 
                             0                              0 
            spi_piscop_6_nb_na            spi_piscop_12_nb_na 
                             0                              0 
           spei_piscop_1_nb_na            spei_piscop_3_nb_na 
                             0                              0 
           spei_piscop_6_nb_na           spei_piscop_12_nb_na 
                             0                              0

5.3.2 Saving the file

The dataset that can be used to estimate the impact of weather shocks on agricultural production can be saved in the data output folder:

# Add labels to the new columns
# Label the imputed production and its percentage deviation, then save the
# dataset in the data output folder
df <- labelled::set_variable_labels(
  df,
  y_new = "Monthly Agricultural Production (tons)",
  y_dev_pct = "Agricultural Production (percent deviation from monthly regional values)"
)

save(df, file = "../data/output/df_lp.rda")