2 Data Cleaning

Objectives

This chapter presents the codes used to clean data. It especially explains the different steps we took to deal with missing values.

Let us load the data obtained in Chapter 1.

Let us first load {tidyverse}:

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Then the data:

load("../data/df_merged.rda")
dim(df)

[1] 18561   841

Let us keep track on the number of observation each time we filter out observations.

nb_obs_df <- tibble(
    step = c("raw", "age_15"), 
    n = c(19940, nrow(df))
  )
nb_obs_df

# A tibble: 2 × 2
  step       n
  <chr>  <dbl>
1 raw    19940
2 age_15 18561

We can create a table with the labels associated with the name of each variable.

variable_names <- tribble(
  ~variable, ~label, ~type,
  "PERSONNE_pb_depress", "Depression", "qualitative", 
  "PERSONNE_pb_asthm", "Asthma", "qualitative",
  "PERSONNE_pb_bronchit", "Bronchitis", "qualitative",
  "PERSONNE_pb_infarctus", "Heart Attack", "qualitative",
  "PERSONNE_pb_coronair", "Artery Disease", "qualitative",
  "PERSONNE_pb_hypertens", "Hypertension", "qualitative",
  "PERSONNE_pb_avc", "Stroke", "qualitative",
  "PERSONNE_pb_arthros", "Osteoarthritis", "qualitative",
  "PERSONNE_pb_lombalgi", "Low Back Pain", "qualitative",
  "PERSONNE_pb_cervical", "Neck Pain", "qualitative",
  "PERSONNE_pb_diabet", "Diabetes", "qualitative",
  "PERSONNE_pb_allergi", "Allergy", "qualitative",
  "PERSONNE_pb_cirrhos", "Cirrhosis", "qualitative",
  "PERSONNE_pb_urinair", "Urinary Incontinence", "qualitative",
  # "PERSONNE_pb_non", "No Illness", "qualitative",
  "PERSONNE_score_t_corrected", "MHI-5 Score", "numerical",
  "PERSONNE_etat_sante", "Self-Assessed Health Condition", "qualitative",
  "PERSONNE_age", "Age", "numerical",
  "PERSONNE_sexe", "Gender", "qualitative", 
  "PERSONNE_couple", "Couple", "qualitative", 
  "PERSONNE_etatleg", "Marital Status", "qualitative", 
  "PERSONNE_statut", "Professional Status", "qualitative", 
  "PERSONNE_ss", "Social Security", "qualitative", 
  "PERSONNE_regime", "Social Security System", "qualitative", 
  "PERSONNE_rap_pcs8", "Occupation", "qualitative", 
  "PERSONNE_ald", "Long-term condition (Self-declared)", "qualitative",
  "SOINS_ald_am", "Long-term condition (SNIIRAM)", "qualitative",
  #
  #
  # Household
  #
  #
  "MENAGE_rap_zau", "Zoning in Urban Areas", "ordinal",
  "MENAGE_region", "Region", "qualitative", 
  "MENAGE_revdetail", "Income", "numerical",
  "MENAGE_revucinsee", "Net Income per Cons. Unit", "numerical",
  "MENAGE_tu","Size Urban Area", "ordinal", 
  "MENAGE_nbpers", "Household size", "numerical",
  "ensol_2011", "Sunlight", "numerical",
  # "MENAGE_revenu", "Income (class)", "ordinal", 
  # "MENAGE_typmen", "Household Type", "qualitative", 
  # "CODE_REG", "Region Code", "qualitative",
  #
  #
  # Mutual Insurance
  #
  #
  "MUTUELLE_assu", "Insurance", "qualitative", 
  "MUTUELLE_typcc", "Mutual Coverage Type", "qualitative",
  #
  # Expenses
  # Outpatient ~= Sum of the others
  #
  "SOINS_depamb", "Exp. Outpatient", "numerical",
  "SOINS_depomn", "Exp. General Practitioner", "numerical",
  "SOINS_depspe", "Exp. Specialist", "numerical",
  "SOINS_deppha", "Exp. Pharmacy", "numerical",
  "SOINS_depkin", "Exp. Physiotherapist", "numerical",
  "SOINS_depinf", "Exp. Nurse", "numerical",
  "SOINS_depden", "Exp. Dentist", "numerical",
  "SOINS_depmat", "Exp. Equipment", "numerical",
  "SOINS_deptra", "Exp. Transport", "numerical",
  "SOINS_depopt", "Exp. Optical", "numerical",
  "SOINS_deppro", "Exp. Prostheses", "numerical",
  "SOINS_depurg", "Exp. Emergency w/o hospitalization", "numerical",
  #
  #
  # Reimbursement
  #
  "SOINS_remamb", "Reimbursement Outpatient", "numerical",
  "SOINS_remomn", "Reimbursement General Practitioner", "numerical",
  "SOINS_remspe", "Reimbursement Specialist", "numerical",
  "SOINS_rempha", "Reimbursement Pharmacy", "numerical",
  "SOINS_remkin", "Reimbursement Physiotherapist", "numerical",
  "SOINS_reminf", "Reimbursement Nurse", "numerical",
  "SOINS_remden", "Reimbursement Dentist", "numerical",
  "SOINS_remmat", "Reimbursement Equipment", "numerical",
  "SOINS_remtra", "Reimbursement Transport", "numerical",
  "SOINS_remopt", "Reimbursement Optical", "numerical",
  "SOINS_rempro", "Reimbursement Prostheses", "numerical",
  "SOINS_remurg", "Reimbursement Emergency w/o hospitalization", "numerical",
  #
  #
  # Co-payment (Ticket moderateur)
  #
  "SOINS_tmamb", "Co-payment Outpatient", "numerical",
  "SOINS_tmomn", "Co-payment General Practitioner", "numerical",
  "SOINS_tmspe", "Co-payment Specialist", "numerical",
  "SOINS_tmpha", "Co-payment Pharmacy", "numerical",
  "SOINS_tmkin", "Co-payment Physiotherapist", "numerical",
  "SOINS_tminf", "Co-payment Nurse", "numerical",
  "SOINS_tmden", "Co-payment Dentist", "numerical",
  "SOINS_tmmat", "Co-payment Equipment", "numerical",
  "SOINS_tmtra", "Co-payment Transport", "numerical",
  "SOINS_tmopt", "Co-payment Optical", "numerical",
  "SOINS_tmpro", "Co-payment Prostheses", "numerical",
  "SOINS_tmurg", "Co-payment Emergency w/o hospitalization", "numerical",
  #
  # Extra-fees
  #
  "SOINS_dpaamb", "Extra-fees Outpatient", "numerical",
  "SOINS_dpaomn", "Extra-fees General Practitioner", "numerical",
  "SOINS_dpaspe", "Extra-fees Specialist", "numerical",
  "SOINS_dpapha", "Extra-fees Pharmacy", "numerical",
  "SOINS_dpakin", "Extra-fees Physiotherapist", "numerical",
  "SOINS_dpainf", "Extra-fees Nurse", "numerical",
  "SOINS_dpaden", "Extra-fees Dentist", "numerical",
  "SOINS_dpamat", "Extra-fees Equipment", "numerical",
  "SOINS_dpatra", "Extra-fees Transport", "numerical",
  "SOINS_dpaopt", "Extra-fees Optical", "numerical",
  "SOINS_dpapro", "Extra-fees Prostheses", "numerical",
  "SOINS_dpaurg", "Extra-fees Emergency w/o hospitalization", "numerical",
  #
  #
  # Deductible (Franchise)
  #
  "SOINS_pf_framb", "Deduct. Outpatient", "numerical",
  "SOINS_pf_fromn", "Deduct. General Practitioner", "numerical",
  "SOINS_pf_frspe", "Deduct. Specialist", "numerical",
  "SOINS_pf_frpha", "Deduct. Pharmacy", "numerical",
  "SOINS_pf_frkin", "Deduct. Physiotherapist", "numerical",
  "SOINS_pf_frinf", "Deduct. Nurse", "numerical",
  "SOINS_pf_frden", "Deduct. Dentist", "numerical",
  # "SOINS_pf_frmat", "Deduct. Equipment", "numerical", # does not exist in the data
  "SOINS_pf_frtra", "Deduct. Transport", "numerical",
  # "SOINS_pf_fropt", "Deduct. Optical", "numerical",
  # "SOINS_pf_frpro", "Deduct. Prostheses", "numerical",
  "SOINS_pf_frurg", "Deduct. Emergency w/o hospitalization", "numerical",
  #
  "SOINS_seac_omn", "No. Medical Sessions General Pract.", "numerical",
  "SOINS_seac_spe",  "No. Medical Sessions Specialist", "numerical",
  #
  #
  "OPINION1_renonc_cons", "Waiver General Practitioner", "qualitative",
  "OPINION1_renonc_dent", "Waiver Dental Care", "qualitative",
  "OPINION1_renonc_fin", "Waiver Other Health Care", "qualitative",
  "OPINION1_renonc_loin", "Waiver Health Care Too Far", "qualitative",
  "OPINION1_renonc_long", "Waiver Appointment Delay Too Long", "qualitative",
  #
  # Working conditions
  #
  "QST_ct_depech", "Have to Hurry to Do Job", "qualitative", 
  "QST_ct_liberte", "Very Little Freedom to Do Job", "qualitative", 
  "QST_ct_apprend", "Job Allows to Learn New Things", "qualitative", 
  "QST_ct_aidecol", "Colleagues Help Carry out Tasks", "qualitative", 
  "QST_ct_travnuit", "Job Requires not to Sleep Betw. Midnight and 5 a.m.", "qualitative", 
  "QST_ct_repet", "Repetitive Work under Time Constraints / Line Job", "qualitative", 
  "QST_ct_lourd", "Exposed to Carrying Heavy Loads", "qualitative", 
  "QST_ct_posture", "Exposed to Painful Postures", "qualitative", 
  "QST_ct_produit", "Exposed to Harmful/Toxic Products/Substances", "qualitative",
  #
  "QES_association", "Participation in Group Activities", "qualitative",
  "QES_tpsami", "Frequency Meeting with Friends/Neighbors", "qualitative",
  "QES_tpsasso", "Frequency Meeting with People in Organizations", "qualitative",
  "QES_tpscolleg", "Frequency Meeting with Colleagues Outside Work", "qualitative",
  "QES_tpsfamil", "Frequency Meeting with Family Living Outside Household", "qualitative",
  "QES_mere_etude", "Mother's Level of Education", "qualitative",
  "QES_pere_etude", "Father's Level of Education", "qualitative"
)

save(variable_names, file = "../data/out/variable_names.rda")

We only keep the variables in the table variable_names, as well as those created at the end of Chapter 1.

df_tmp <- 
  df |> 
  select(!!variable_names$variable, inf_q1_mhi_5, inf_q1_mhi_3, id) |> 
  select(
    -PERSONNE_etatleg, -MENAGE_revdetail, -MENAGE_rap_zau,
    -SOINS_remamb, -SOINS_tmamb,
    -SOINS_dpaamb, -SOINS_pf_framb
  ) |> 
  mutate(PERSONNE_pb_depress = as.character(PERSONNE_pb_depress))

Note: the following variables were excluded for the following reasons:

PERSONNE_etatleg gives the marital status; we keep PERSONNE_couple which states whether the person lives as a couple
MENAGE_revdetail is the income in numerical values; we prefer keeping MENAGE_revucinsee (Net Income per Cons. Unit)
MENAGE_rap_zau: the variable MENAGE_tu already provides a very similar information
SOINS_remamb, SOINS_tmamb, SOINS_dpaamb, SOINS_pf_frambSOINS_pf_framb : these are the sum of other reimbursements; they would therefore cause colinearity issues.

Some additional information should be provided regarding healthcare variables. These variables are composed of :

expenditures (dep, dépenses)
health insurance reimbursements (remb, remboursements par l’assurance maladie)
co-payment (tm, ticket modérateur)
fee overruns (dpa, dépassement d’honoraires, non remboursés)
amounts to be charged (cha, montants à charge)
deductible (pf_fr, Participation forfaitaire/Franchise)
guaranteed participation (pa, Participation assurée).

The following relationships exist between these variables:

dep = cha + rem + pf_fr + pa
cha = tm + dpa
dep = tm + dpa + rem + pf_fr + pa.

To avoid collinearity problems, we retain only the following variables in the final model: tm, dpa, rem, pf_fr, pa.

Then the variables that begin with SOINS_dep need to be removed:

retirer <- 
  variable_names$variable[str_which(variable_names$variable, "^SOINS_dep")]
retirer

 [1] "SOINS_depamb" "SOINS_depomn" "SOINS_depspe" "SOINS_deppha" "SOINS_depkin"
 [6] "SOINS_depinf" "SOINS_depden" "SOINS_depmat" "SOINS_deptra" "SOINS_depopt"
[11] "SOINS_deppro" "SOINS_depurg"

df_tmp <-
  df_tmp |> 
  select(-!!retirer)

Let us remove people for which the MHI-5 score was not computed:

sum(is.na(df_tmp$inf_q1_mhi_5))

[1] 6188

df_tmp <- 
  df_tmp |> 
  filter(!is.na(inf_q1_mhi_5))

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "missing_mhi5_score", n = nrow(df_tmp))
  )
nb_obs_df

# A tibble: 3 × 2
  step                   n
  <chr>              <dbl>
1 raw                19940
2 age_15             18561
3 missing_mhi5_score 12373

Let us remove the individuals who did not report their status with regard to depression:

table(df_tmp$PERSONNE_pb_depress)


   Depression No depression  Not reported 
          673         11199           501

Distribution of MHI-5 score among the individuals that are about to be discarded:

df_tmp |> 
  filter(PERSONNE_pb_depress == "Not reported") |> 
  group_by(inf_q1_mhi_5) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))

# A tibble: 2 × 3
  inf_q1_mhi_5     n  prop
  <fct>        <int> <dbl>
1 <=Q1           147  29.3
2 >Q1            354  70.7

df_tmp <- 
  df_tmp |> 
  filter(PERSONNE_pb_depress != "Not reported")

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "pb_depress_not_reported", n = nrow(df_tmp))
  )
nb_obs_df

# A tibble: 4 × 2
  step                        n
  <chr>                   <dbl>
1 raw                     19940
2 age_15                  18561
3 missing_mhi5_score      12373
4 pb_depress_not_reported 11872

Let us focus on the MHI-5 score here and discard the MHI-3. Let us create a variable (status) that classifies individuals depending on their MHI-5 score and their answer regarding depression:

df_tmp <- 
  df_tmp |> 
  select(-inf_q1_mhi_3) |>
  mutate(
    status = case_when(
      PERSONNE_pb_depress == "Depression" & inf_q1_mhi_5 == "<=Q1" ~ "D_and_inf_Q1",
      PERSONNE_pb_depress == "Depression" & inf_q1_mhi_5 == ">Q1" ~ "D_and_sup_Q1",
      PERSONNE_pb_depress == "No depression" & inf_q1_mhi_5 == "<=Q1" ~ "Not_D_and_inf_Q1",
      PERSONNE_pb_depress == "No depression" & inf_q1_mhi_5 == ">Q1" ~ "Not_D_and_sup_Q1",
      PERSONNE_pb_depress == "Not reported" & inf_q1_mhi_5 == "<=Q1" ~ "Not_R_and_inf_Q1",
      PERSONNE_pb_depress == "Not reported" & inf_q1_mhi_5 == ">Q1" ~ "Not_R_and_sup_Q1",
      TRUE~"problem"
    )
  )

We can look at the distribution of this newly created variable:

table(df_tmp$status)


    D_and_inf_Q1     D_and_sup_Q1 Not_D_and_inf_Q1 Not_D_and_sup_Q1 
             572              101             3193             8006

round(100*prop.table(table(df_tmp$status)), 2)


    D_and_inf_Q1     D_and_sup_Q1 Not_D_and_inf_Q1 Not_D_and_sup_Q1 
            4.82             0.85            26.90            67.44

No values coded as problem:

df_tmp |> 
  filter(status == "problem") |> 
  select(status, PERSONNE_pb_depress, inf_q1_mhi_5) |> 
  filter(!is.na(inf_q1_mhi_5))

# A tibble: 0 × 3
# ℹ 3 variables: status <chr>, PERSONNE_pb_depress <chr>, inf_q1_mhi_5 <fct>

We are interested in people who self report having not experienced depression during the past year. Other people will be discarded. The proportion of imaginary healthy patients among the individuals that are about to be discarded:

df_tmp |> 
  filter(! status %in% c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1")) |> 
  group_by(inf_q1_mhi_5, status) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))

# A tibble: 2 × 4
  inf_q1_mhi_5 status           n  prop
  <fct>        <chr>        <int> <dbl>
1 <=Q1         D_and_inf_Q1   572  85.0
2 >Q1          D_and_sup_Q1   101  15.0

Let us discard other people. We define the status variable as a binary factor:

df_tmp <- 
  df_tmp |> 
  filter(status %in% c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1")) |> 
  mutate(status = factor(
    status, level = c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1"))
  )

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "not_D", n = nrow(df_tmp))
  )
nb_obs_df

# A tibble: 5 × 2
  step                        n
  <chr>                   <dbl>
1 raw                     19940
2 age_15                  18561
3 missing_mhi5_score      12373
4 pb_depress_not_reported 11872
5 not_D                   11199

2.1 Looking at missing data

We need to check where are the missing data. Let us have a look at the “Working conditions” questions (columns which begin with QST_c), and count the number and corresponding proportion of missing values:

df_tmp |> 
  select(matches("^QST_c")) |> 
  map_df(
    function(x){
      tibble(
        no_obs = length(x),
        nb_not_surveyed = sum(x == "Not surveyed"),
        prop = round(100 * nb_not_surveyed / no_obs, 2))
    },
    .id = "variable"
  ) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Table 2.1: Missing values in the Working Conditions questions

variable	no_obs	nb_not_surveyed	prop
QST_ct_depech	11199	5714	51.02
QST_ct_liberte	11199	5714	51.02
QST_ct_apprend	11199	5714	51.02
QST_ct_aidecol	11199	5714	51.02
QST_ct_travnuit	11199	5714	51.02
QST_ct_repet	11199	5714	51.02
QST_ct_lourd	11199	5714	51.02
QST_ct_posture	11199	5714	51.02
QST_ct_produit	11199	5714	51.02

As we can see from the previous result, around 50% of respondents did not give an answer to the questions related to work. It does not only concern unemployed people (1,289 individuals are classified as “Inactive having never worked”):

table(df_tmp$PERSONNE_rap_pcs8) |> sort()


                               Farmer                     Craftsman, trader 
                                  383                                   567 
                     Unskilled worker                   Commercial employee 
                                  892                                  1315 
Executive and intellectual profession               Administrative employee 
                                 1443                                  1498 
         Inactive having never worked                        Skilled worker 
                                 1595                                  1621 
              Intermediate occupation 
                                 1875

Let us have a look at the distribution of PERSONNE_rap_pcs8 for individuals for which QST_ct_depech is missing:

df_tmp |> 
  filter(`QST_ct_depech` == "Not surveyed") |> 
  magrittr::extract2("PERSONNE_rap_pcs8") |> 
  table()


                               Farmer                     Craftsman, trader 
                                  252                                   263 
Executive and intellectual profession               Intermediate occupation 
                                  513                                   668 
              Administrative employee                   Commercial employee 
                                  617                                   598 
                       Skilled worker                      Unskilled worker 
                                  702                                   498 
         Inactive having never worked 
                                 1595

df_tmp |> 
  filter(QST_ct_depech == "Not surveyed") |> 
  select_at(vars(matches("^QST_c")))

# A tibble: 5,714 × 9
   QST_ct_depech QST_ct_liberte QST_ct_apprend QST_ct_aidecol QST_ct_travnuit
   <fct>         <fct>          <fct>          <fct>          <fct>          
 1 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 2 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 3 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 4 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 5 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 6 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 7 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 8 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 9 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
10 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
# ℹ 5,704 more rows
# ℹ 4 more variables: QST_ct_repet <fct>, QST_ct_lourd <fct>,
#   QST_ct_posture <fct>, QST_ct_produit <fct>

We do the same for variables which give information relative to social interactions and which give information on the parents’ background (i.e., variables that begin with QES). First, let us have a look at the proportion of NA in those:

df_tmp |> 
  select(matches("^QES")) |> 
  map_df(
    function(x) {
      tibble(
        no_obs = length(x),
        nb_not_surveyed = sum(x == "Not surveyed", na.rm=T),
        prop = round(100 * nb_not_surveyed / no_obs, 2))
    },
    .id = "variable"
  ) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Table 2.2: Missing values in QES variables

variable	no_obs	nb_not_surveyed	prop
QES_association	11199	44	0.39
QES_tpsami	11199	44	0.39
QES_tpsasso	11199	44	0.39
QES_tpscolleg	11199	44	0.39
QES_tpsfamil	11199	44	0.39
QES_mere_etude	11199	44	0.39
QES_pere_etude	11199	44	0.39

Let us have a look at the proportion of missing values in the current dataset:

pct_missing <- 
  sapply(df_tmp, function(x) round(100 * sum(is.na(x)) / length(x), 1)) |> 
  sort(decreasing = TRUE)
pct_missing

              SOINS_remomn               SOINS_remspe 
                      45.4                       45.4 
              SOINS_rempha               SOINS_remkin 
                      45.4                       45.4 
              SOINS_reminf               SOINS_remden 
                      45.4                       45.4 
              SOINS_remmat               SOINS_remtra 
                      45.4                       45.4 
              SOINS_remopt               SOINS_rempro 
                      45.4                       45.4 
              SOINS_remurg                SOINS_tmomn 
                      45.4                       45.4 
               SOINS_tmspe                SOINS_tmpha 
                      45.4                       45.4 
               SOINS_tmkin                SOINS_tminf 
                      45.4                       45.4 
               SOINS_tmden                SOINS_tmmat 
                      45.4                       45.4 
               SOINS_tmtra                SOINS_tmopt 
                      45.4                       45.4 
               SOINS_tmpro                SOINS_tmurg 
                      45.4                       45.4 
              SOINS_dpaomn               SOINS_dpaspe 
                      45.4                       45.4 
              SOINS_dpapha               SOINS_dpakin 
                      45.4                       45.4 
              SOINS_dpainf               SOINS_dpaden 
                      45.4                       45.4 
              SOINS_dpamat               SOINS_dpatra 
                      45.4                       45.4 
              SOINS_dpaopt               SOINS_dpapro 
                      45.4                       45.4 
              SOINS_dpaurg             SOINS_pf_fromn 
                      45.4                       45.4 
            SOINS_pf_frspe             SOINS_pf_frpha 
                      45.4                       45.4 
            SOINS_pf_frkin             SOINS_pf_frinf 
                      45.4                       45.4 
            SOINS_pf_frden             SOINS_pf_frtra 
                      45.4                       45.4 
            SOINS_pf_frurg             SOINS_seac_omn 
                      45.4                       45.4 
            SOINS_seac_spe          MENAGE_revucinsee 
                      45.4                       14.0 
               PERSONNE_ss            PERSONNE_regime 
                       0.4                        0.2 
         PERSONNE_rap_pcs8        PERSONNE_pb_depress 
                       0.1                        0.0 
         PERSONNE_pb_asthm       PERSONNE_pb_bronchit 
                       0.0                        0.0 
     PERSONNE_pb_infarctus       PERSONNE_pb_coronair 
                       0.0                        0.0 
     PERSONNE_pb_hypertens            PERSONNE_pb_avc 
                       0.0                        0.0 
       PERSONNE_pb_arthros       PERSONNE_pb_lombalgi 
                       0.0                        0.0 
      PERSONNE_pb_cervical         PERSONNE_pb_diabet 
                       0.0                        0.0 
       PERSONNE_pb_allergi        PERSONNE_pb_cirrhos 
                       0.0                        0.0 
       PERSONNE_pb_urinair PERSONNE_score_t_corrected 
                       0.0                        0.0 
       PERSONNE_etat_sante               PERSONNE_age 
                       0.0                        0.0 
             PERSONNE_sexe            PERSONNE_couple 
                       0.0                        0.0 
           PERSONNE_statut               PERSONNE_ald 
                       0.0                        0.0 
              SOINS_ald_am              MENAGE_region 
                       0.0                        0.0 
                 MENAGE_tu              MENAGE_nbpers 
                       0.0                        0.0 
                ensol_2011              MUTUELLE_assu 
                       0.0                        0.0 
            MUTUELLE_typcc       OPINION1_renonc_cons 
                       0.0                        0.0 
      OPINION1_renonc_dent        OPINION1_renonc_fin 
                       0.0                        0.0 
      OPINION1_renonc_loin       OPINION1_renonc_long 
                       0.0                        0.0 
             QST_ct_depech             QST_ct_liberte 
                       0.0                        0.0 
            QST_ct_apprend             QST_ct_aidecol 
                       0.0                        0.0 
           QST_ct_travnuit               QST_ct_repet 
                       0.0                        0.0 
              QST_ct_lourd             QST_ct_posture 
                       0.0                        0.0 
            QST_ct_produit            QES_association 
                       0.0                        0.0 
                QES_tpsami                QES_tpsasso 
                       0.0                        0.0 
             QES_tpscolleg               QES_tpsfamil 
                       0.0                        0.0 
            QES_mere_etude             QES_pere_etude 
                       0.0                        0.0 
              inf_q1_mhi_5                         id 
                       0.0                        0.0 
                    status 
                       0.0

Ordinal variables need to be altered prior to the estimation (because of SHAP). We change them to numerical variables. To keep a track of the corresponding numerical values associated with each label, we create a table.

Then, we create the table correspondance_ordinal that contains the numerical values associated with the different categories.

correspondance_ordinal <- 
  tibble(
    variable = "MENAGE_tu",
    old_value = df_tmp$MENAGE_tu |> unique(),
    new_value = as.numeric(df_tmp$MENAGE_tu) |> unique()
  )
correspondance_ordinal

# A tibble: 5 × 3
  variable  old_value               new_value
  <chr>     <ord>                       <dbl>
1 MENAGE_tu Small Municipality              1
2 MENAGE_tu Medium Municipality             2
3 MENAGE_tu Not surveyed                    6
4 MENAGE_tu Paris metropolitan area         4
5 MENAGE_tu Large Municipality              3

save(correspondance_ordinal, file = "../data/out/correspondance_ordinal.rda")

Now that we have stored the changes we plan on applying, we can proceed to these changes:

df_tmp <- 
  df_tmp |>
  mutate_if(is.character, as.factor) |> 
  mutate(MENAGE_tu = as.numeric(df_tmp$MENAGE_tu))

We need to remove the variables that wee used to construct the target variable:

df_tmp <- 
  df_tmp |> 
  select(
    -PERSONNE_pb_depress,
    -inf_q1_mhi_5,
    -PERSONNE_score_t_corrected,
    -PERSONNE_etat_sante
  )

Then, we remove observations that contain missing values. Let us keep a track on the number of observation removed at each time.

nb_obs <- nrow(df_tmp)

eye_on_proportions <- vector(mode="list", length = ncol(df_tmp))

for(p in 1:ncol(df_tmp)){
  variable_to_check <- colnames(df_tmp)[p]
  
  eye_on_proportions[[p]] <- 
    df_tmp |> 
    filter(is.na(!!sym(variable_to_check))) |> 
    group_by(status) |> 
    count() |> ungroup() |> 
    mutate(prop = round(100 * n / sum(n), digits = 2))
  
  # Filtering out the observations with missing values
  df_tmp <- 
    df_tmp |> 
    filter(!is.na(!!sym(variable_to_check)))
  
  if(nrow(df_tmp) < nb_obs){
    nb_obs_df_tmp <- 
      tibble(step = variable_to_check, 
             n = nrow(df_tmp))
    nb_obs_df <- 
      nb_obs_df |> 
      bind_rows(nb_obs_df_tmp)
    nb_obs <- nrow(df_tmp)
  }
  
}
names(eye_on_proportions) <- colnames(df_tmp)

Let us have a look at the nb_obs_df table. The first 4 rows give:

the number of observations with the raw sample
how many observations were left after removing people younger than 15 years old
homw many observations were left when removing individuals who did not report their status with regard to depression
how many individuals reported feeling not depressed over the last 12 months.

The remaining rows indicate how many individuals (column n) are left in the sample once individuals with missing data for the variable reported in column step are removed from the sample. The number given in column n_drop indicates the loss due to lack of data for the variable of interest.

nb_obs_df |> 
  mutate(n_drop = n - dplyr::lag(n)) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Table 2.3: Number of remaining data and loss of individuals due to missing data

step	n	n_drop
raw	19940	NA
age_15	18561	-1379
missing_mhi5_score	12373	-6188
pb_depress_not_reported	11872	-501
not_D	11199	-673
PERSONNE_ss	11151	-48
PERSONNE_regime	11147	-4
PERSONNE_rap_pcs8	11137	-10
MENAGE_revucinsee	9576	-1561
SOINS_remomn	5306	-4270
QES_tpsasso	5305	-1

At the end, we are left with a dataset with no more missing data:

any(is.na(df_tmp))

[1] FALSE

Lastly, we remove the region variable and the self-declared health condition:

var_to_temove <- c("MENAGE_region", "code_insee", "MENAGE_region_nom")
var_to_keep <- colnames(df_tmp)[!colnames(df_tmp) %in% var_to_temove]
df_tmp <- 
  df_tmp |>
  select(!!var_to_keep)

The dimension of the final dataset:

dim(df_tmp)

[1] 5305   94

Recall we coded missing values for categorical variables as Not reported and missing values because the individuals were not included in the sub-survey as Not surveyed. Let us look at how many observation fall into each of those categories:

df_tmp |> 
  select_if(is.factor) |> 
  map_df(
    ~tibble(nb_NA = sum(. == "Not reported"),
            nb_not_surveyed = sum(. == "Not surveyed")),
    .id = "variable"
  ) |> 
  knitr::kable(format = "markdown")

variable	nb_NA	nb_not_surveyed
PERSONNE_pb_asthm	0	0
PERSONNE_pb_bronchit	0	0
PERSONNE_pb_infarctus	0	0
PERSONNE_pb_coronair	0	0
PERSONNE_pb_hypertens	0	0
PERSONNE_pb_avc	0	0
PERSONNE_pb_arthros	0	0
PERSONNE_pb_lombalgi	0	0
PERSONNE_pb_cervical	0	0
PERSONNE_pb_diabet	0	0
PERSONNE_pb_allergi	0	0
PERSONNE_pb_cirrhos	0	0
PERSONNE_pb_urinair	0	0
PERSONNE_sexe	0	0
PERSONNE_couple	12	0
PERSONNE_statut	559	0
PERSONNE_ss	0	0
PERSONNE_regime	0	0
PERSONNE_rap_pcs8	0	0
PERSONNE_ald	0	0
SOINS_ald_am	0	0
MUTUELLE_assu	16	680
MUTUELLE_typcc	0	680
OPINION1_renonc_cons	0	1092
OPINION1_renonc_dent	0	1092
OPINION1_renonc_fin	0	1092
OPINION1_renonc_loin	0	1092
OPINION1_renonc_long	0	1092
QST_ct_depech	29	2725
QST_ct_liberte	41	2725
QST_ct_apprend	27	2725
QST_ct_aidecol	36	2725
QST_ct_travnuit	36	2725
QST_ct_repet	38	2725
QST_ct_lourd	37	2725
QST_ct_posture	35	2725
QST_ct_produit	35	2725
QES_association	93	15
QES_tpsami	119	15
QES_tpsasso	228	15
QES_tpscolleg	484	15
QES_tpsfamil	139	15
QES_mere_etude	84	15
QES_pere_etude	83	15
status	0	0

We decide here to merge these two different values as No answer:

df_tmp <- 
  df_tmp |> 
  mutate(
    across(
      .cols = is.factor|is.character,
      .fns = function(x) {
        fct_recode(
          x,
          "No answer" = "Not reported",
          "No answer" = "Not surveyed"
        )
      }
    )
  )

df_tmp |> 
  select_if(is.factor) |> 
  map_df(
    ~tibble(nb_NA = sum(. == "No answer")),
    .id = "variable"
  ) |> 
  knitr::kable(format = "markdown")

variable	nb_NA
PERSONNE_pb_asthm	0
PERSONNE_pb_bronchit	0
PERSONNE_pb_infarctus	0
PERSONNE_pb_coronair	0
PERSONNE_pb_hypertens	0
PERSONNE_pb_avc	0
PERSONNE_pb_arthros	0
PERSONNE_pb_lombalgi	0
PERSONNE_pb_cervical	0
PERSONNE_pb_diabet	0
PERSONNE_pb_allergi	0
PERSONNE_pb_cirrhos	0
PERSONNE_pb_urinair	0
PERSONNE_sexe	0
PERSONNE_couple	12
PERSONNE_statut	559
PERSONNE_ss	0
PERSONNE_regime	0
PERSONNE_rap_pcs8	0
PERSONNE_ald	0
SOINS_ald_am	0
MUTUELLE_assu	696
MUTUELLE_typcc	680
OPINION1_renonc_cons	1092
OPINION1_renonc_dent	1092
OPINION1_renonc_fin	1092
OPINION1_renonc_loin	1092
OPINION1_renonc_long	1092
QST_ct_depech	2754
QST_ct_liberte	2766
QST_ct_apprend	2752
QST_ct_aidecol	2761
QST_ct_travnuit	2761
QST_ct_repet	2763
QST_ct_lourd	2762
QST_ct_posture	2760
QST_ct_produit	2760
QES_association	108
QES_tpsami	134
QES_tpsasso	243
QES_tpscolleg	499
QES_tpsfamil	154
QES_mere_etude	99
QES_pere_etude	98
status	0

Let us remove the 12 observations where the couple status is not reported (too few observations for this category). The proportion of imaginary healthy patients among the individuals that are about to be discarded:

df_tmp |> 
  filter(PERSONNE_couple == "No answer") |> 
  left_join(df |> select(id, inf_q1_mhi_5), by = "id") |> 
  group_by(inf_q1_mhi_5, status) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))

# A tibble: 2 × 4
  inf_q1_mhi_5 status               n  prop
  <fct>        <fct>            <int> <dbl>
1 <=Q1         Not_D_and_inf_Q1     1  8.33
2 >Q1          Not_D_and_sup_Q1    11 91.7

df_tmp <- 
  df_tmp |> 
  filter(PERSONNE_couple != "No answer")

For factor variables related to health conditions, we drop the unused levels.

df_tmp <- 
  df_tmp |> 
  mutate(
    across(
      c(
        PERSONNE_pb_asthm,
        PERSONNE_pb_bronchit,
        PERSONNE_pb_infarctus,
        PERSONNE_pb_coronair,
        PERSONNE_pb_hypertens,
        PERSONNE_pb_avc,
        PERSONNE_pb_arthros,
        PERSONNE_pb_lombalgi,
        PERSONNE_pb_cervical,
        PERSONNE_pb_diabet,
        PERSONNE_pb_allergi,
        PERSONNE_pb_cirrhos,
        PERSONNE_pb_urinair
      ),
      ~fct_drop(.x)
    )
  )

And we save the results:

df_clean <- df_tmp
save(df_clean, file = "../data/df_clean.rda")