2  Data Cleaning

Objectives

This chapter presents the codes used to clean data. It especially explains the different steps we took to deal with missing values.

Let us load the data obtained in Chapter 1.

Let us first load {tidyverse}:

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Then the data:

load("../data/df_merged.rda")
dim(df)
[1] 18561   841

Let us keep track on the number of observation each time we filter out observations.

nb_obs_df <- tibble(
    step = c("raw", "age_15"), 
    n = c(19940, nrow(df))
  )
nb_obs_df
# A tibble: 2 × 2
  step       n
  <chr>  <dbl>
1 raw    19940
2 age_15 18561

We can create a table with the labels associated with the name of each variable.

variable_names <- tribble(
  ~variable, ~label, ~type,
  "PERSONNE_pb_depress", "Depression", "qualitative", 
  "PERSONNE_pb_asthm", "Asthma", "qualitative",
  "PERSONNE_pb_bronchit", "Bronchitis", "qualitative",
  "PERSONNE_pb_infarctus", "Heart Attack", "qualitative",
  "PERSONNE_pb_coronair", "Artery Disease", "qualitative",
  "PERSONNE_pb_hypertens", "Hypertension", "qualitative",
  "PERSONNE_pb_avc", "Stroke", "qualitative",
  "PERSONNE_pb_arthros", "Osteoarthritis", "qualitative",
  "PERSONNE_pb_lombalgi", "Low Back Pain", "qualitative",
  "PERSONNE_pb_cervical", "Neck Pain", "qualitative",
  "PERSONNE_pb_diabet", "Diabetes", "qualitative",
  "PERSONNE_pb_allergi", "Allergy", "qualitative",
  "PERSONNE_pb_cirrhos", "Cirrhosis", "qualitative",
  "PERSONNE_pb_urinair", "Urinary Incontinence", "qualitative",
  # "PERSONNE_pb_non", "No Illness", "qualitative",
  "PERSONNE_score_t_corrected", "MHI-5 Score", "numerical",
  "PERSONNE_etat_sante", "Self-Assessed Health Condition", "qualitative",
  "PERSONNE_age", "Age", "numerical",
  "PERSONNE_sexe", "Gender", "qualitative", 
  "PERSONNE_couple", "Couple", "qualitative", 
  "PERSONNE_etatleg", "Marital Status", "qualitative", 
  "PERSONNE_statut", "Professional Status", "qualitative", 
  "PERSONNE_ss", "Social Security", "qualitative", 
  "PERSONNE_regime", "Social Security System", "qualitative", 
  "PERSONNE_rap_pcs8", "Occupation", "qualitative", 
  "PERSONNE_ald", "Long-term condition (Self-declared)", "qualitative",
  "SOINS_ald_am", "Long-term condition (SNIIRAM)", "qualitative",
  #
  #
  # Household
  #
  #
  "MENAGE_rap_zau", "Zoning in Urban Areas", "ordinal",
  "MENAGE_region", "Region", "qualitative", 
  "MENAGE_revdetail", "Income", "numerical",
  "MENAGE_revucinsee", "Net Income per Cons. Unit", "numerical",
  "MENAGE_tu","Size Urban Area", "ordinal", 
  "MENAGE_nbpers", "Household size", "numerical",
  "ensol_2011", "Sunlight", "numerical",
  # "MENAGE_revenu", "Income (class)", "ordinal", 
  # "MENAGE_typmen", "Household Type", "qualitative", 
  # "CODE_REG", "Region Code", "qualitative",
  #
  #
  # Mutual Insurance
  #
  #
  "MUTUELLE_assu", "Insurance", "qualitative", 
  "MUTUELLE_typcc", "Mutual Coverage Type", "qualitative",
  #
  # Expenses
  # Outpatient ~= Sum of the others
  #
  "SOINS_depamb", "Exp. Outpatient", "numerical",
  "SOINS_depomn", "Exp. General Practitioner", "numerical",
  "SOINS_depspe", "Exp. Specialist", "numerical",
  "SOINS_deppha", "Exp. Pharmacy", "numerical",
  "SOINS_depkin", "Exp. Physiotherapist", "numerical",
  "SOINS_depinf", "Exp. Nurse", "numerical",
  "SOINS_depden", "Exp. Dentist", "numerical",
  "SOINS_depmat", "Exp. Equipment", "numerical",
  "SOINS_deptra", "Exp. Transport", "numerical",
  "SOINS_depopt", "Exp. Optical", "numerical",
  "SOINS_deppro", "Exp. Prostheses", "numerical",
  "SOINS_depurg", "Exp. Emergency w/o hospitalization", "numerical",
  #
  #
  # Reimbursement
  #
  "SOINS_remamb", "Reimbursement Outpatient", "numerical",
  "SOINS_remomn", "Reimbursement General Practitioner", "numerical",
  "SOINS_remspe", "Reimbursement Specialist", "numerical",
  "SOINS_rempha", "Reimbursement Pharmacy", "numerical",
  "SOINS_remkin", "Reimbursement Physiotherapist", "numerical",
  "SOINS_reminf", "Reimbursement Nurse", "numerical",
  "SOINS_remden", "Reimbursement Dentist", "numerical",
  "SOINS_remmat", "Reimbursement Equipment", "numerical",
  "SOINS_remtra", "Reimbursement Transport", "numerical",
  "SOINS_remopt", "Reimbursement Optical", "numerical",
  "SOINS_rempro", "Reimbursement Prostheses", "numerical",
  "SOINS_remurg", "Reimbursement Emergency w/o hospitalization", "numerical",
  #
  #
  # Co-payment (Ticket moderateur)
  #
  "SOINS_tmamb", "Co-payment Outpatient", "numerical",
  "SOINS_tmomn", "Co-payment General Practitioner", "numerical",
  "SOINS_tmspe", "Co-payment Specialist", "numerical",
  "SOINS_tmpha", "Co-payment Pharmacy", "numerical",
  "SOINS_tmkin", "Co-payment Physiotherapist", "numerical",
  "SOINS_tminf", "Co-payment Nurse", "numerical",
  "SOINS_tmden", "Co-payment Dentist", "numerical",
  "SOINS_tmmat", "Co-payment Equipment", "numerical",
  "SOINS_tmtra", "Co-payment Transport", "numerical",
  "SOINS_tmopt", "Co-payment Optical", "numerical",
  "SOINS_tmpro", "Co-payment Prostheses", "numerical",
  "SOINS_tmurg", "Co-payment Emergency w/o hospitalization", "numerical",
  #
  # Extra-fees
  #
  "SOINS_dpaamb", "Extra-fees Outpatient", "numerical",
  "SOINS_dpaomn", "Extra-fees General Practitioner", "numerical",
  "SOINS_dpaspe", "Extra-fees Specialist", "numerical",
  "SOINS_dpapha", "Extra-fees Pharmacy", "numerical",
  "SOINS_dpakin", "Extra-fees Physiotherapist", "numerical",
  "SOINS_dpainf", "Extra-fees Nurse", "numerical",
  "SOINS_dpaden", "Extra-fees Dentist", "numerical",
  "SOINS_dpamat", "Extra-fees Equipment", "numerical",
  "SOINS_dpatra", "Extra-fees Transport", "numerical",
  "SOINS_dpaopt", "Extra-fees Optical", "numerical",
  "SOINS_dpapro", "Extra-fees Prostheses", "numerical",
  "SOINS_dpaurg", "Extra-fees Emergency w/o hospitalization", "numerical",
  #
  #
  # Deductible (Franchise)
  #
  "SOINS_pf_framb", "Deduct. Outpatient", "numerical",
  "SOINS_pf_fromn", "Deduct. General Practitioner", "numerical",
  "SOINS_pf_frspe", "Deduct. Specialist", "numerical",
  "SOINS_pf_frpha", "Deduct. Pharmacy", "numerical",
  "SOINS_pf_frkin", "Deduct. Physiotherapist", "numerical",
  "SOINS_pf_frinf", "Deduct. Nurse", "numerical",
  "SOINS_pf_frden", "Deduct. Dentist", "numerical",
  # "SOINS_pf_frmat", "Deduct. Equipment", "numerical", # does not exist in the data
  "SOINS_pf_frtra", "Deduct. Transport", "numerical",
  # "SOINS_pf_fropt", "Deduct. Optical", "numerical",
  # "SOINS_pf_frpro", "Deduct. Prostheses", "numerical",
  "SOINS_pf_frurg", "Deduct. Emergency w/o hospitalization", "numerical",
  #
  "SOINS_seac_omn", "No. Medical Sessions General Pract.", "numerical",
  "SOINS_seac_spe",  "No. Medical Sessions Specialist", "numerical",
  #
  #
  "OPINION1_renonc_cons", "Waiver General Practitioner", "qualitative",
  "OPINION1_renonc_dent", "Waiver Dental Care", "qualitative",
  "OPINION1_renonc_fin", "Waiver Other Health Care", "qualitative",
  "OPINION1_renonc_loin", "Waiver Health Care Too Far", "qualitative",
  "OPINION1_renonc_long", "Waiver Appointment Delay Too Long", "qualitative",
  #
  # Working conditions
  #
  "QST_ct_depech", "Have to Hurry to Do Job", "qualitative", 
  "QST_ct_liberte", "Very Little Freedom to Do Job", "qualitative", 
  "QST_ct_apprend", "Job Allows to Learn New Things", "qualitative", 
  "QST_ct_aidecol", "Colleagues Help Carry out Tasks", "qualitative", 
  "QST_ct_travnuit", "Job Requires not to Sleep Betw. Midnight and 5 a.m.", "qualitative", 
  "QST_ct_repet", "Repetitive Work under Time Constraints / Line Job", "qualitative", 
  "QST_ct_lourd", "Exposed to Carrying Heavy Loads", "qualitative", 
  "QST_ct_posture", "Exposed to Painful Postures", "qualitative", 
  "QST_ct_produit", "Exposed to Harmful/Toxic Products/Substances", "qualitative",
  #
  "QES_association", "Participation in Group Activities", "qualitative",
  "QES_tpsami", "Frequency Meeting with Friends/Neighbors", "qualitative",
  "QES_tpsasso", "Frequency Meeting with People in Organizations", "qualitative",
  "QES_tpscolleg", "Frequency Meeting with Colleagues Outside Work", "qualitative",
  "QES_tpsfamil", "Frequency Meeting with Family Living Outside Household", "qualitative",
  "QES_mere_etude", "Mother's Level of Education", "qualitative",
  "QES_pere_etude", "Father's Level of Education", "qualitative"
)

save(variable_names, file = "../data/out/variable_names.rda")

We only keep the variables in the table variable_names, as well as those created at the end of Chapter 1.

df_tmp <- 
  df |> 
  select(!!variable_names$variable, inf_q1_mhi_5, inf_q1_mhi_3, id) |> 
  select(
    -PERSONNE_etatleg, -MENAGE_revdetail, -MENAGE_rap_zau,
    -SOINS_remamb, -SOINS_tmamb,
    -SOINS_dpaamb, -SOINS_pf_framb
  ) |> 
  mutate(PERSONNE_pb_depress = as.character(PERSONNE_pb_depress))

Note: the following variables were excluded for the following reasons:

Some additional information should be provided regarding healthcare variables. These variables are composed of :

The following relationships exist between these variables:

To avoid collinearity problems, we retain only the following variables in the final model: tm, dpa, rem, pf_fr, pa.

Then the variables that begin with SOINS_dep need to be removed:

retirer <- 
  variable_names$variable[str_which(variable_names$variable, "^SOINS_dep")]
retirer
 [1] "SOINS_depamb" "SOINS_depomn" "SOINS_depspe" "SOINS_deppha" "SOINS_depkin"
 [6] "SOINS_depinf" "SOINS_depden" "SOINS_depmat" "SOINS_deptra" "SOINS_depopt"
[11] "SOINS_deppro" "SOINS_depurg"
df_tmp <-
  df_tmp |> 
  select(-!!retirer)

Let us remove people for which the MHI-5 score was not computed:

sum(is.na(df_tmp$inf_q1_mhi_5))
[1] 6188
df_tmp <- 
  df_tmp |> 
  filter(!is.na(inf_q1_mhi_5))

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "missing_mhi5_score", n = nrow(df_tmp))
  )
nb_obs_df
# A tibble: 3 × 2
  step                   n
  <chr>              <dbl>
1 raw                19940
2 age_15             18561
3 missing_mhi5_score 12373

Let us remove the individuals who did not report their status with regard to depression:

table(df_tmp$PERSONNE_pb_depress)

   Depression No depression  Not reported 
          673         11199           501 

Distribution of MHI-5 score among the individuals that are about to be discarded:

df_tmp |> 
  filter(PERSONNE_pb_depress == "Not reported") |> 
  group_by(inf_q1_mhi_5) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))
# A tibble: 2 × 3
  inf_q1_mhi_5     n  prop
  <fct>        <int> <dbl>
1 <=Q1           147  29.3
2 >Q1            354  70.7
df_tmp <- 
  df_tmp |> 
  filter(PERSONNE_pb_depress != "Not reported")

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "pb_depress_not_reported", n = nrow(df_tmp))
  )
nb_obs_df
# A tibble: 4 × 2
  step                        n
  <chr>                   <dbl>
1 raw                     19940
2 age_15                  18561
3 missing_mhi5_score      12373
4 pb_depress_not_reported 11872

Let us focus on the MHI-5 score here and discard the MHI-3. Let us create a variable (status) that classifies individuals depending on their MHI-5 score and their answer regarding depression:

df_tmp <- 
  df_tmp |> 
  select(-inf_q1_mhi_3) |>
  mutate(
    status = case_when(
      PERSONNE_pb_depress == "Depression" & inf_q1_mhi_5 == "<=Q1" ~ "D_and_inf_Q1",
      PERSONNE_pb_depress == "Depression" & inf_q1_mhi_5 == ">Q1" ~ "D_and_sup_Q1",
      PERSONNE_pb_depress == "No depression" & inf_q1_mhi_5 == "<=Q1" ~ "Not_D_and_inf_Q1",
      PERSONNE_pb_depress == "No depression" & inf_q1_mhi_5 == ">Q1" ~ "Not_D_and_sup_Q1",
      PERSONNE_pb_depress == "Not reported" & inf_q1_mhi_5 == "<=Q1" ~ "Not_R_and_inf_Q1",
      PERSONNE_pb_depress == "Not reported" & inf_q1_mhi_5 == ">Q1" ~ "Not_R_and_sup_Q1",
      TRUE~"problem"
    )
  )

We can look at the distribution of this newly created variable:

table(df_tmp$status)

    D_and_inf_Q1     D_and_sup_Q1 Not_D_and_inf_Q1 Not_D_and_sup_Q1 
             572              101             3193             8006 
round(100*prop.table(table(df_tmp$status)), 2)

    D_and_inf_Q1     D_and_sup_Q1 Not_D_and_inf_Q1 Not_D_and_sup_Q1 
            4.82             0.85            26.90            67.44 

No values coded as problem:

df_tmp |> 
  filter(status == "problem") |> 
  select(status, PERSONNE_pb_depress, inf_q1_mhi_5) |> 
  filter(!is.na(inf_q1_mhi_5))
# A tibble: 0 × 3
# ℹ 3 variables: status <chr>, PERSONNE_pb_depress <chr>, inf_q1_mhi_5 <fct>

We are interested in people who self report having not experienced depression during the past year. Other people will be discarded. The proportion of imaginary healthy patients among the individuals that are about to be discarded:

df_tmp |> 
  filter(! status %in% c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1")) |> 
  group_by(inf_q1_mhi_5, status) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))
# A tibble: 2 × 4
  inf_q1_mhi_5 status           n  prop
  <fct>        <chr>        <int> <dbl>
1 <=Q1         D_and_inf_Q1   572  85.0
2 >Q1          D_and_sup_Q1   101  15.0

Let us discard other people. We define the status variable as a binary factor:

df_tmp <- 
  df_tmp |> 
  filter(status %in% c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1")) |> 
  mutate(status = factor(
    status, level = c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1"))
  )

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "not_D", n = nrow(df_tmp))
  )
nb_obs_df
# A tibble: 5 × 2
  step                        n
  <chr>                   <dbl>
1 raw                     19940
2 age_15                  18561
3 missing_mhi5_score      12373
4 pb_depress_not_reported 11872
5 not_D                   11199

2.1 Looking at missing data

We need to check where are the missing data. Let us have a look at the “Working conditions” questions (columns which begin with QST_c), and count the number and corresponding proportion of missing values:

df_tmp |> 
  select(matches("^QST_c")) |> 
  map_df(
    function(x){
      tibble(
        no_obs = length(x),
        nb_not_surveyed = sum(x == "Not surveyed"),
        prop = round(100 * nb_not_surveyed / no_obs, 2))
    },
    .id = "variable"
  ) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Table 2.1: Missing values in the Working Conditions questions
variable no_obs nb_not_surveyed prop
QST_ct_depech 11199 5714 51.02
QST_ct_liberte 11199 5714 51.02
QST_ct_apprend 11199 5714 51.02
QST_ct_aidecol 11199 5714 51.02
QST_ct_travnuit 11199 5714 51.02
QST_ct_repet 11199 5714 51.02
QST_ct_lourd 11199 5714 51.02
QST_ct_posture 11199 5714 51.02
QST_ct_produit 11199 5714 51.02

As we can see from the previous result, around 50% of respondents did not give an answer to the questions related to work. It does not only concern unemployed people (1,289 individuals are classified as “Inactive having never worked”):

table(df_tmp$PERSONNE_rap_pcs8) |> sort()

                               Farmer                     Craftsman, trader 
                                  383                                   567 
                     Unskilled worker                   Commercial employee 
                                  892                                  1315 
Executive and intellectual profession               Administrative employee 
                                 1443                                  1498 
         Inactive having never worked                        Skilled worker 
                                 1595                                  1621 
              Intermediate occupation 
                                 1875 

Let us have a look at the distribution of PERSONNE_rap_pcs8 for individuals for which QST_ct_depech is missing:

df_tmp |> 
  filter(`QST_ct_depech` == "Not surveyed") |> 
  magrittr::extract2("PERSONNE_rap_pcs8") |> 
  table()

                               Farmer                     Craftsman, trader 
                                  252                                   263 
Executive and intellectual profession               Intermediate occupation 
                                  513                                   668 
              Administrative employee                   Commercial employee 
                                  617                                   598 
                       Skilled worker                      Unskilled worker 
                                  702                                   498 
         Inactive having never worked 
                                 1595 
df_tmp |> 
  filter(QST_ct_depech == "Not surveyed") |> 
  select_at(vars(matches("^QST_c")))
# A tibble: 5,714 × 9
   QST_ct_depech QST_ct_liberte QST_ct_apprend QST_ct_aidecol QST_ct_travnuit
   <fct>         <fct>          <fct>          <fct>          <fct>          
 1 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 2 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 3 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 4 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 5 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 6 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 7 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 8 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 9 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
10 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
# ℹ 5,704 more rows
# ℹ 4 more variables: QST_ct_repet <fct>, QST_ct_lourd <fct>,
#   QST_ct_posture <fct>, QST_ct_produit <fct>

We do the same for variables which give information relative to social interactions and which give information on the parents’ background (i.e., variables that begin with QES). First, let us have a look at the proportion of NA in those:

df_tmp |> 
  select(matches("^QES")) |> 
  map_df(
    function(x) {
      tibble(
        no_obs = length(x),
        nb_not_surveyed = sum(x == "Not surveyed", na.rm=T),
        prop = round(100 * nb_not_surveyed / no_obs, 2))
    },
    .id = "variable"
  ) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Table 2.2: Missing values in QES variables
variable no_obs nb_not_surveyed prop
QES_association 11199 44 0.39
QES_tpsami 11199 44 0.39
QES_tpsasso 11199 44 0.39
QES_tpscolleg 11199 44 0.39
QES_tpsfamil 11199 44 0.39
QES_mere_etude 11199 44 0.39
QES_pere_etude 11199 44 0.39

Let us have a look at the proportion of missing values in the current dataset:

pct_missing <- 
  sapply(df_tmp, function(x) round(100 * sum(is.na(x)) / length(x), 1)) |> 
  sort(decreasing = TRUE)
pct_missing
              SOINS_remomn               SOINS_remspe 
                      45.4                       45.4 
              SOINS_rempha               SOINS_remkin 
                      45.4                       45.4 
              SOINS_reminf               SOINS_remden 
                      45.4                       45.4 
              SOINS_remmat               SOINS_remtra 
                      45.4                       45.4 
              SOINS_remopt               SOINS_rempro 
                      45.4                       45.4 
              SOINS_remurg                SOINS_tmomn 
                      45.4                       45.4 
               SOINS_tmspe                SOINS_tmpha 
                      45.4                       45.4 
               SOINS_tmkin                SOINS_tminf 
                      45.4                       45.4 
               SOINS_tmden                SOINS_tmmat 
                      45.4                       45.4 
               SOINS_tmtra                SOINS_tmopt 
                      45.4                       45.4 
               SOINS_tmpro                SOINS_tmurg 
                      45.4                       45.4 
              SOINS_dpaomn               SOINS_dpaspe 
                      45.4                       45.4 
              SOINS_dpapha               SOINS_dpakin 
                      45.4                       45.4 
              SOINS_dpainf               SOINS_dpaden 
                      45.4                       45.4 
              SOINS_dpamat               SOINS_dpatra 
                      45.4                       45.4 
              SOINS_dpaopt               SOINS_dpapro 
                      45.4                       45.4 
              SOINS_dpaurg             SOINS_pf_fromn 
                      45.4                       45.4 
            SOINS_pf_frspe             SOINS_pf_frpha 
                      45.4                       45.4 
            SOINS_pf_frkin             SOINS_pf_frinf 
                      45.4                       45.4 
            SOINS_pf_frden             SOINS_pf_frtra 
                      45.4                       45.4 
            SOINS_pf_frurg             SOINS_seac_omn 
                      45.4                       45.4 
            SOINS_seac_spe          MENAGE_revucinsee 
                      45.4                       14.0 
               PERSONNE_ss            PERSONNE_regime 
                       0.4                        0.2 
         PERSONNE_rap_pcs8        PERSONNE_pb_depress 
                       0.1                        0.0 
         PERSONNE_pb_asthm       PERSONNE_pb_bronchit 
                       0.0                        0.0 
     PERSONNE_pb_infarctus       PERSONNE_pb_coronair 
                       0.0                        0.0 
     PERSONNE_pb_hypertens            PERSONNE_pb_avc 
                       0.0                        0.0 
       PERSONNE_pb_arthros       PERSONNE_pb_lombalgi 
                       0.0                        0.0 
      PERSONNE_pb_cervical         PERSONNE_pb_diabet 
                       0.0                        0.0 
       PERSONNE_pb_allergi        PERSONNE_pb_cirrhos 
                       0.0                        0.0 
       PERSONNE_pb_urinair PERSONNE_score_t_corrected 
                       0.0                        0.0 
       PERSONNE_etat_sante               PERSONNE_age 
                       0.0                        0.0 
             PERSONNE_sexe            PERSONNE_couple 
                       0.0                        0.0 
           PERSONNE_statut               PERSONNE_ald 
                       0.0                        0.0 
              SOINS_ald_am              MENAGE_region 
                       0.0                        0.0 
                 MENAGE_tu              MENAGE_nbpers 
                       0.0                        0.0 
                ensol_2011              MUTUELLE_assu 
                       0.0                        0.0 
            MUTUELLE_typcc       OPINION1_renonc_cons 
                       0.0                        0.0 
      OPINION1_renonc_dent        OPINION1_renonc_fin 
                       0.0                        0.0 
      OPINION1_renonc_loin       OPINION1_renonc_long 
                       0.0                        0.0 
             QST_ct_depech             QST_ct_liberte 
                       0.0                        0.0 
            QST_ct_apprend             QST_ct_aidecol 
                       0.0                        0.0 
           QST_ct_travnuit               QST_ct_repet 
                       0.0                        0.0 
              QST_ct_lourd             QST_ct_posture 
                       0.0                        0.0 
            QST_ct_produit            QES_association 
                       0.0                        0.0 
                QES_tpsami                QES_tpsasso 
                       0.0                        0.0 
             QES_tpscolleg               QES_tpsfamil 
                       0.0                        0.0 
            QES_mere_etude             QES_pere_etude 
                       0.0                        0.0 
              inf_q1_mhi_5                         id 
                       0.0                        0.0 
                    status 
                       0.0 

Ordinal variables need to be altered prior to the estimation (because of SHAP). We change them to numerical variables. To keep a track of the corresponding numerical values associated with each label, we create a table.

Then, we create the table correspondance_ordinal that contains the numerical values associated with the different categories.

correspondance_ordinal <- 
  tibble(
    variable = "MENAGE_tu",
    old_value = df_tmp$MENAGE_tu |> unique(),
    new_value = as.numeric(df_tmp$MENAGE_tu) |> unique()
  )
correspondance_ordinal
# A tibble: 5 × 3
  variable  old_value               new_value
  <chr>     <ord>                       <dbl>
1 MENAGE_tu Small Municipality              1
2 MENAGE_tu Medium Municipality             2
3 MENAGE_tu Not surveyed                    6
4 MENAGE_tu Paris metropolitan area         4
5 MENAGE_tu Large Municipality              3
save(correspondance_ordinal, file = "../data/out/correspondance_ordinal.rda")

Now that we have stored the changes we plan on applying, we can proceed to these changes:

df_tmp <- 
  df_tmp |>
  mutate_if(is.character, as.factor) |> 
  mutate(MENAGE_tu = as.numeric(df_tmp$MENAGE_tu))

We need to remove the variables that wee used to construct the target variable:

df_tmp <- 
  df_tmp |> 
  select(
    -PERSONNE_pb_depress,
    -inf_q1_mhi_5,
    -PERSONNE_score_t_corrected,
    -PERSONNE_etat_sante
  )

Then, we remove observations that contain missing values. Let us keep a track on the number of observation removed at each time.

nb_obs <- nrow(df_tmp)

eye_on_proportions <- vector(mode="list", length = ncol(df_tmp))

for(p in 1:ncol(df_tmp)){
  variable_to_check <- colnames(df_tmp)[p]
  
  eye_on_proportions[[p]] <- 
    df_tmp |> 
    filter(is.na(!!sym(variable_to_check))) |> 
    group_by(status) |> 
    count() |> ungroup() |> 
    mutate(prop = round(100 * n / sum(n), digits = 2))
  
  # Filtering out the observations with missing values
  df_tmp <- 
    df_tmp |> 
    filter(!is.na(!!sym(variable_to_check)))
  
  if(nrow(df_tmp) < nb_obs){
    nb_obs_df_tmp <- 
      tibble(step = variable_to_check, 
             n = nrow(df_tmp))
    nb_obs_df <- 
      nb_obs_df |> 
      bind_rows(nb_obs_df_tmp)
    nb_obs <- nrow(df_tmp)
  }
  
}
names(eye_on_proportions) <- colnames(df_tmp)

Let us have a look at the nb_obs_df table. The first 4 rows give:

  1. the number of observations with the raw sample
  2. how many observations were left after removing people younger than 15 years old
  3. homw many observations were left when removing individuals who did not report their status with regard to depression
  4. how many individuals reported feeling not depressed over the last 12 months.

The remaining rows indicate how many individuals (column n) are left in the sample once individuals with missing data for the variable reported in column step are removed from the sample. The number given in column n_drop indicates the loss due to lack of data for the variable of interest.

nb_obs_df |> 
  mutate(n_drop = n - dplyr::lag(n)) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Table 2.3: Number of remaining data and loss of individuals due to missing data
step n n_drop
raw 19940 NA
age_15 18561 -1379
missing_mhi5_score 12373 -6188
pb_depress_not_reported 11872 -501
not_D 11199 -673
PERSONNE_ss 11151 -48
PERSONNE_regime 11147 -4
PERSONNE_rap_pcs8 11137 -10
MENAGE_revucinsee 9576 -1561
SOINS_remomn 5306 -4270
QES_tpsasso 5305 -1

At the end, we are left with a dataset with no more missing data:

any(is.na(df_tmp))
[1] FALSE

Lastly, we remove the region variable and the self-declared health condition:

var_to_temove <- c("MENAGE_region", "code_insee", "MENAGE_region_nom")
var_to_keep <- colnames(df_tmp)[!colnames(df_tmp) %in% var_to_temove]
df_tmp <- 
  df_tmp |>
  select(!!var_to_keep)

The dimension of the final dataset:

dim(df_tmp)
[1] 5305   94

Recall we coded missing values for categorical variables as Not reported and missing values because the individuals were not included in the sub-survey as Not surveyed. Let us look at how many observation fall into each of those categories:

df_tmp |> 
  select_if(is.factor) |> 
  map_df(
    ~tibble(nb_NA = sum(. == "Not reported"),
            nb_not_surveyed = sum(. == "Not surveyed")),
    .id = "variable"
  ) |> 
  knitr::kable(format = "markdown")
variable nb_NA nb_not_surveyed
PERSONNE_pb_asthm 0 0
PERSONNE_pb_bronchit 0 0
PERSONNE_pb_infarctus 0 0
PERSONNE_pb_coronair 0 0
PERSONNE_pb_hypertens 0 0
PERSONNE_pb_avc 0 0
PERSONNE_pb_arthros 0 0
PERSONNE_pb_lombalgi 0 0
PERSONNE_pb_cervical 0 0
PERSONNE_pb_diabet 0 0
PERSONNE_pb_allergi 0 0
PERSONNE_pb_cirrhos 0 0
PERSONNE_pb_urinair 0 0
PERSONNE_sexe 0 0
PERSONNE_couple 12 0
PERSONNE_statut 559 0
PERSONNE_ss 0 0
PERSONNE_regime 0 0
PERSONNE_rap_pcs8 0 0
PERSONNE_ald 0 0
SOINS_ald_am 0 0
MUTUELLE_assu 16 680
MUTUELLE_typcc 0 680
OPINION1_renonc_cons 0 1092
OPINION1_renonc_dent 0 1092
OPINION1_renonc_fin 0 1092
OPINION1_renonc_loin 0 1092
OPINION1_renonc_long 0 1092
QST_ct_depech 29 2725
QST_ct_liberte 41 2725
QST_ct_apprend 27 2725
QST_ct_aidecol 36 2725
QST_ct_travnuit 36 2725
QST_ct_repet 38 2725
QST_ct_lourd 37 2725
QST_ct_posture 35 2725
QST_ct_produit 35 2725
QES_association 93 15
QES_tpsami 119 15
QES_tpsasso 228 15
QES_tpscolleg 484 15
QES_tpsfamil 139 15
QES_mere_etude 84 15
QES_pere_etude 83 15
status 0 0

We decide here to merge these two different values as No answer:

df_tmp <- 
  df_tmp |> 
  mutate(
    across(
      .cols = is.factor|is.character,
      .fns = function(x) {
        fct_recode(
          x,
          "No answer" = "Not reported",
          "No answer" = "Not surveyed"
        )
      }
    )
  )
df_tmp |> 
  select_if(is.factor) |> 
  map_df(
    ~tibble(nb_NA = sum(. == "No answer")),
    .id = "variable"
  ) |> 
  knitr::kable(format = "markdown")
variable nb_NA
PERSONNE_pb_asthm 0
PERSONNE_pb_bronchit 0
PERSONNE_pb_infarctus 0
PERSONNE_pb_coronair 0
PERSONNE_pb_hypertens 0
PERSONNE_pb_avc 0
PERSONNE_pb_arthros 0
PERSONNE_pb_lombalgi 0
PERSONNE_pb_cervical 0
PERSONNE_pb_diabet 0
PERSONNE_pb_allergi 0
PERSONNE_pb_cirrhos 0
PERSONNE_pb_urinair 0
PERSONNE_sexe 0
PERSONNE_couple 12
PERSONNE_statut 559
PERSONNE_ss 0
PERSONNE_regime 0
PERSONNE_rap_pcs8 0
PERSONNE_ald 0
SOINS_ald_am 0
MUTUELLE_assu 696
MUTUELLE_typcc 680
OPINION1_renonc_cons 1092
OPINION1_renonc_dent 1092
OPINION1_renonc_fin 1092
OPINION1_renonc_loin 1092
OPINION1_renonc_long 1092
QST_ct_depech 2754
QST_ct_liberte 2766
QST_ct_apprend 2752
QST_ct_aidecol 2761
QST_ct_travnuit 2761
QST_ct_repet 2763
QST_ct_lourd 2762
QST_ct_posture 2760
QST_ct_produit 2760
QES_association 108
QES_tpsami 134
QES_tpsasso 243
QES_tpscolleg 499
QES_tpsfamil 154
QES_mere_etude 99
QES_pere_etude 98
status 0

Let us remove the 12 observations where the couple status is not reported (too few observations for this category). The proportion of imaginary healthy patients among the individuals that are about to be discarded:

df_tmp |> 
  filter(PERSONNE_couple == "No answer") |> 
  left_join(df |> select(id, inf_q1_mhi_5), by = "id") |> 
  group_by(inf_q1_mhi_5, status) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))
# A tibble: 2 × 4
  inf_q1_mhi_5 status               n  prop
  <fct>        <fct>            <int> <dbl>
1 <=Q1         Not_D_and_inf_Q1     1  8.33
2 >Q1          Not_D_and_sup_Q1    11 91.7 
df_tmp <- 
  df_tmp |> 
  filter(PERSONNE_couple != "No answer")

For factor variables related to health conditions, we drop the unused levels.

df_tmp <- 
  df_tmp |> 
  mutate(
    across(
      c(
        PERSONNE_pb_asthm,
        PERSONNE_pb_bronchit,
        PERSONNE_pb_infarctus,
        PERSONNE_pb_coronair,
        PERSONNE_pb_hypertens,
        PERSONNE_pb_avc,
        PERSONNE_pb_arthros,
        PERSONNE_pb_lombalgi,
        PERSONNE_pb_cervical,
        PERSONNE_pb_diabet,
        PERSONNE_pb_allergi,
        PERSONNE_pb_cirrhos,
        PERSONNE_pb_urinair
      ),
      ~fct_drop(.x)
    )
  )

And we save the results:

df_clean <- df_tmp
save(df_clean, file = "../data/df_clean.rda")