15  Data Cleaning

Objectives

This chapter presents the codes used to clean data. It especially explains the different steps we took to deal with missing values. We use the self-assessed health status instead of the self-declared status with respect to depression here to define the target variable.

Let us load the data obtained in Chapter 1.

Let us first load {tidyverse}:

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Then the data:

load("../data/df_merged.rda")
dim(df)
[1] 18561   841

Let us keep track on the number of observation each time we filter out observations.

nb_obs_df <- tibble(
    step = c("raw", "age_15"), 
    n = c(19940, nrow(df))
  )
nb_obs_df
# A tibble: 2 × 2
  step       n
  <chr>  <dbl>
1 raw    19940
2 age_15 18561

We can create a table with the labels associated with the name of each variable.

variable_names <- tribble(
  ~variable, ~label, ~type,
  "PERSONNE_pb_depress", "Depression", "qualitative", 
  "PERSONNE_pb_asthm", "Asthma", "qualitative",
  "PERSONNE_pb_bronchit", "Bronchitis", "qualitative",
  "PERSONNE_pb_infarctus", "Heart Attack", "qualitative",
  "PERSONNE_pb_coronair", "Artery Disease", "qualitative",
  "PERSONNE_pb_hypertens", "Hypertension", "qualitative",
  "PERSONNE_pb_avc", "Stroke", "qualitative",
  "PERSONNE_pb_arthros", "Osteoarthritis", "qualitative",
  "PERSONNE_pb_lombalgi", "Low Back Pain", "qualitative",
  "PERSONNE_pb_cervical", "Neck Pain", "qualitative",
  "PERSONNE_pb_diabet", "Diabetes", "qualitative",
  "PERSONNE_pb_allergi", "Allergy", "qualitative",
  "PERSONNE_pb_cirrhos", "Cirrhosis", "qualitative",
  "PERSONNE_pb_urinair", "Urinary Incontinence", "qualitative",
  # "PERSONNE_pb_non", "No Illness", "qualitative",
  "PERSONNE_score_t_corrected", "MHI-5 Score", "numerical",
  "PERSONNE_etat_sante", "Self-Assessed Health Condition", "qualitative",
  "PERSONNE_age", "Age", "numerical",
  "PERSONNE_sexe", "Gender", "qualitative", 
  "PERSONNE_couple", "Couple", "qualitative", 
  "PERSONNE_etatleg", "Marital Status", "qualitative", 
  "PERSONNE_statut", "Professional Status", "qualitative", 
  "PERSONNE_ss", "Social Security", "qualitative", 
  "PERSONNE_regime", "Social Security System", "qualitative", 
  "PERSONNE_rap_pcs8", "Occupation", "qualitative", 
  "PERSONNE_ald", "Long-term condition (Self-declared)", "qualitative",
  "SOINS_ald_am", "Long-term condition (SNIIRAM)", "qualitative",
  #
  #
  # Household
  #
  #
  "MENAGE_rap_zau", "Zoning in Urban Areas", "ordinal",
  "MENAGE_region", "Region", "qualitative", 
  "MENAGE_revdetail", "Income", "numerical",
  "MENAGE_revucinsee", "Net Income per Cons. Unit", "numerical",
  "MENAGE_tu","Size Urban Area", "ordinal", 
  "MENAGE_nbpers", "Household size", "numerical",
  "ensol_2011", "Sunlight", "numerical",
  # "MENAGE_revenu", "Income (class)", "ordinal", 
  # "MENAGE_typmen", "Household Type", "qualitative", 
  # "CODE_REG", "Region Code", "qualitative",
  #
  #
  # Mutual Insurance
  #
  #
  "MUTUELLE_assu", "Insurance", "qualitative", 
  "MUTUELLE_typcc", "Mutual Coverage Type", "qualitative",
  #
  # Expenses
  # Outpatient ~= Sum of the others
  #
  "SOINS_depamb", "Exp. Outpatient", "numerical",
  "SOINS_depomn", "Exp. General Practitioner", "numerical",
  "SOINS_depspe", "Exp. Specialist", "numerical",
  "SOINS_deppha", "Exp. Pharmacy", "numerical",
  "SOINS_depkin", "Exp. Physiotherapist", "numerical",
  "SOINS_depinf", "Exp. Nurse", "numerical",
  "SOINS_depden", "Exp. Dentist", "numerical",
  "SOINS_depmat", "Exp. Equipment", "numerical",
  "SOINS_deptra", "Exp. Transport", "numerical",
  "SOINS_depopt", "Exp. Optical", "numerical",
  "SOINS_deppro", "Exp. Prostheses", "numerical",
  "SOINS_depurg", "Exp. Emergency w/o hospitalization", "numerical",
  #
  #
  # Reimbursement
  #
  "SOINS_remamb", "Reimbursement Outpatient", "numerical",
  "SOINS_remomn", "Reimbursement General Practitioner", "numerical",
  "SOINS_remspe", "Reimbursement Specialist", "numerical",
  "SOINS_rempha", "Reimbursement Pharmacy", "numerical",
  "SOINS_remkin", "Reimbursement Physiotherapist", "numerical",
  "SOINS_reminf", "Reimbursement Nurse", "numerical",
  "SOINS_remden", "Reimbursement Dentist", "numerical",
  "SOINS_remmat", "Reimbursement Equipment", "numerical",
  "SOINS_remtra", "Reimbursement Transport", "numerical",
  "SOINS_remopt", "Reimbursement Optical", "numerical",
  "SOINS_rempro", "Reimbursement Prostheses", "numerical",
  "SOINS_remurg", "Reimbursement Emergency w/o hospitalization", "numerical",
  #
  #
  # Co-payment (Ticket moderateur)
  #
  "SOINS_tmamb", "Co-payment Outpatient", "numerical",
  "SOINS_tmomn", "Co-payment General Practitioner", "numerical",
  "SOINS_tmspe", "Co-payment Specialist", "numerical",
  "SOINS_tmpha", "Co-payment Pharmacy", "numerical",
  "SOINS_tmkin", "Co-payment Physiotherapist", "numerical",
  "SOINS_tminf", "Co-payment Nurse", "numerical",
  "SOINS_tmden", "Co-payment Dentist", "numerical",
  "SOINS_tmmat", "Co-payment Equipment", "numerical",
  "SOINS_tmtra", "Co-payment Transport", "numerical",
  "SOINS_tmopt", "Co-payment Optical", "numerical",
  "SOINS_tmpro", "Co-payment Prostheses", "numerical",
  "SOINS_tmurg", "Co-payment Emergency w/o hospitalization", "numerical",
  #
  # Extra-fees
  #
  "SOINS_dpaamb", "Extra-fees Outpatient", "numerical",
  "SOINS_dpaomn", "Extra-fees General Practitioner", "numerical",
  "SOINS_dpaspe", "Extra-fees Specialist", "numerical",
  "SOINS_dpapha", "Extra-fees Pharmacy", "numerical",
  "SOINS_dpakin", "Extra-fees Physiotherapist", "numerical",
  "SOINS_dpainf", "Extra-fees Nurse", "numerical",
  "SOINS_dpaden", "Extra-fees Dentist", "numerical",
  "SOINS_dpamat", "Extra-fees Equipment", "numerical",
  "SOINS_dpatra", "Extra-fees Transport", "numerical",
  "SOINS_dpaopt", "Extra-fees Optical", "numerical",
  "SOINS_dpapro", "Extra-fees Prostheses", "numerical",
  "SOINS_dpaurg", "Extra-fees Emergency w/o hospitalization", "numerical",
  #
  #
  # Deductible (Franchise)
  #
  "SOINS_pf_framb", "Deduct. Outpatient", "numerical",
  "SOINS_pf_fromn", "Deduct. General Practitioner", "numerical",
  "SOINS_pf_frspe", "Deduct. Specialist", "numerical",
  "SOINS_pf_frpha", "Deduct. Pharmacy", "numerical",
  "SOINS_pf_frkin", "Deduct. Physiotherapist", "numerical",
  "SOINS_pf_frinf", "Deduct. Nurse", "numerical",
  "SOINS_pf_frden", "Deduct. Dentist", "numerical",
  # "SOINS_pf_frmat", "Deduct. Equipment", "numerical", # does not exist in the data
  "SOINS_pf_frtra", "Deduct. Transport", "numerical",
  # "SOINS_pf_fropt", "Deduct. Optical", "numerical",
  # "SOINS_pf_frpro", "Deduct. Prostheses", "numerical",
  "SOINS_pf_frurg", "Deduct. Emergency w/o hospitalization", "numerical",
  #
  "SOINS_seac_omn", "No. Medical Sessions General Pract.", "numerical",
  "SOINS_seac_spe",  "No. Medical Sessions Specialist", "numerical",
  #
  #
  "OPINION1_renonc_cons", "Waiver General Practitioner", "qualitative",
  "OPINION1_renonc_dent", "Waiver Dental Care", "qualitative",
  "OPINION1_renonc_fin", "Waiver Other Health Care", "qualitative",
  "OPINION1_renonc_loin", "Waiver Health Care Too Far", "qualitative",
  "OPINION1_renonc_long", "Waiver Appointment Delay Too Long", "qualitative",
  #
  # Working conditions
  #
  "QST_ct_depech", "Have to Hurry to Do Job", "qualitative", 
  "QST_ct_liberte", "Very Little Freedom to Do Job", "qualitative", 
  "QST_ct_apprend", "Job Allows to Learn New Things", "qualitative", 
  "QST_ct_aidecol", "Colleagues Help Carry out Tasks", "qualitative", 
  "QST_ct_travnuit", "Job Requires not to Sleep Betw. Midnight and 5 a.m.", "qualitative", 
  "QST_ct_repet", "Repetitive Work under Time Constraints / Line Job", "qualitative", 
  "QST_ct_lourd", "Exposed to Carrying Heavy Loads", "qualitative", 
  "QST_ct_posture", "Exposed to Painful Postures", "qualitative", 
  "QST_ct_produit", "Exposed to Harmful/Toxic Products/Substances", "qualitative",
  #
  "QES_association", "Participation in Group Activities", "qualitative",
  "QES_tpsami", "Frequency Meeting with Friends/Neighbors", "qualitative",
  "QES_tpsasso", "Frequency Meeting with People in Organizations", "qualitative",
  "QES_tpscolleg", "Frequency Meeting with Colleagues Outside Work", "qualitative",
  "QES_tpsfamil", "Frequency Meeting with Family Living Outside Household", "qualitative",
  "QES_mere_etude", "Mother's Level of Education", "qualitative",
  "QES_pere_etude", "Father's Level of Education", "qualitative"
)

save(variable_names, file = "../data/out/variable_names.rda")

We only keep the variables in the table variable_names, as well as those created at the end of CHapter 1.

df_tmp <- 
  df |> 
  select(!!variable_names$variable, inf_q1_mhi_5, inf_q1_mhi_3, id) |> 
  select(
    -PERSONNE_etatleg, -MENAGE_revdetail, -MENAGE_rap_zau,
    -SOINS_remamb, -SOINS_tmamb,
    -SOINS_dpaamb, -SOINS_pf_framb
  ) |> 
  mutate(PERSONNE_pb_depress = as.character(PERSONNE_pb_depress))

Note: the following variables were excluded for the following reasons:

Some additional information should be provided regarding healthcare variables. These variables are composed of :

The following relationships exist between these variables:

To avoid collinearity problems, we retain only the following variables in the final model: tm, dpa, rem, pf_fr, pa.

Then the variables that begin with SOINS_dep need to be removed:

retirer <- 
  variable_names$variable[str_which(variable_names$variable, "^SOINS_dep")]
retirer
 [1] "SOINS_depamb" "SOINS_depomn" "SOINS_depspe" "SOINS_deppha" "SOINS_depkin"
 [6] "SOINS_depinf" "SOINS_depden" "SOINS_depmat" "SOINS_deptra" "SOINS_depopt"
[11] "SOINS_deppro" "SOINS_depurg"
df_tmp <-
  df_tmp |> 
  select(-!!retirer)

Let us remove people for which the MHI-5 score was not computed:

sum(is.na(df_tmp$inf_q1_mhi_5))
[1] 6188
df_tmp <- 
  df_tmp |> 
  filter(!is.na(inf_q1_mhi_5))

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "missing_mhi5_score", n = nrow(df_tmp))
  )
nb_obs_df
# A tibble: 3 × 2
  step                   n
  <chr>              <dbl>
1 raw                19940
2 age_15             18561
3 missing_mhi5_score 12373

Let us remove the individuals who did not report their health status:

table(df_tmp$PERSONNE_etat_sante)

                  Not reported                Very Bad or Bad 
                            69                            845 
Very Good, Good or Fairly Good 
                         11459 

Distribution of MHI-5 score among the individuals that are about to be discarded:

df_tmp |> 
  filter(PERSONNE_etat_sante == "Not reported") |> 
  group_by(inf_q1_mhi_5) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))
# A tibble: 2 × 3
  inf_q1_mhi_5     n  prop
  <fct>        <int> <dbl>
1 <=Q1            33  47.8
2 >Q1             36  52.2
df_tmp <- 
  df_tmp |> 
  filter(PERSONNE_etat_sante != "Not reported")

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "health_status_not_reported", n = nrow(df_tmp))
  )
nb_obs_df
# A tibble: 4 × 2
  step                           n
  <chr>                      <dbl>
1 raw                        19940
2 age_15                     18561
3 missing_mhi5_score         12373
4 health_status_not_reported 12304

Let us focus on the MHI-5 score here and discard the MHI-3. Let us create a variable (status) that classifies individuals depending on their MHI-5 score and their answer regarding depression:

df_tmp <- 
  df_tmp |> 
  select(-inf_q1_mhi_3) |>
  mutate(
    status = case_when(
      PERSONNE_etat_sante == "Very Bad or Bad" & inf_q1_mhi_5 == "<=Q1" ~ "D_and_inf_Q1",
      PERSONNE_etat_sante == "Very Bad or Bad" & inf_q1_mhi_5 == ">Q1" ~ "D_and_sup_Q1",
      PERSONNE_etat_sante == "Very Good, Good or Fairly Good" & inf_q1_mhi_5 == "<=Q1" ~ "Not_D_and_inf_Q1",
      PERSONNE_etat_sante == "Very Good, Good or Fairly Good" & inf_q1_mhi_5 == ">Q1" ~ "Not_D_and_sup_Q1",
      TRUE~"problem"
    )
  )

We can look at the distribution of this newly created variable:

table(df_tmp$status)

    D_and_inf_Q1     D_and_sup_Q1 Not_D_and_inf_Q1 Not_D_and_sup_Q1 
             633              212             3246             8213 
round(100*prop.table(table(df_tmp$status)), 2)

    D_and_inf_Q1     D_and_sup_Q1 Not_D_and_inf_Q1 Not_D_and_sup_Q1 
            5.14             1.72            26.38            66.75 

No values coded as problem:

df_tmp |> 
  filter(status == "problem") |> 
  select(status, PERSONNE_etat_sante, inf_q1_mhi_5) |> 
  filter(!is.na(inf_q1_mhi_5))
# A tibble: 0 × 3
# ℹ 3 variables: status <chr>, PERSONNE_etat_sante <chr>, inf_q1_mhi_5 <fct>

We are interested in people who self report having a good SAH status. Other people will be discarded. The proportion of imaginary healthy patients among the individuals that are about to be discarded:

df_tmp |> 
  filter(! status %in% c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1")) |> 
  group_by(inf_q1_mhi_5, status) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))
# A tibble: 2 × 4
  inf_q1_mhi_5 status           n  prop
  <fct>        <chr>        <int> <dbl>
1 <=Q1         D_and_inf_Q1   633  74.9
2 >Q1          D_and_sup_Q1   212  25.1

Let us discard other people. We define the status variable as a binary factor:

df_tmp <- 
  df_tmp |> 
  filter(status %in% c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1")) |> 
  mutate(status = factor(
    status, level = c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1"))
  )

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "not_D", n = nrow(df_tmp))
  )
nb_obs_df
# A tibble: 5 × 2
  step                           n
  <chr>                      <dbl>
1 raw                        19940
2 age_15                     18561
3 missing_mhi5_score         12373
4 health_status_not_reported 12304
5 not_D                      11459

15.1 Looking at missing data

We need to check where are the missing data. Let us have a look at the “Working conditions” questions (columns which begin with QST_c), and count the number and corresponding proportion of missing values:

df_tmp |> 
  select(matches("^QST_c")) |> 
  map_df(
    function(x){
      tibble(
        no_obs = length(x),
        nb_not_surveyed = sum(x == "Not surveyed"),
        prop = round(100 * nb_not_surveyed / no_obs, 2))
    },
    .id = "variable"
  ) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Table 15.1: Missing values in the Working Conditions questions
variable no_obs nb_not_surveyed prop
QST_ct_depech 11459 5638 49.2
QST_ct_liberte 11459 5638 49.2
QST_ct_apprend 11459 5638 49.2
QST_ct_aidecol 11459 5638 49.2
QST_ct_travnuit 11459 5638 49.2
QST_ct_repet 11459 5638 49.2
QST_ct_lourd 11459 5638 49.2
QST_ct_posture 11459 5638 49.2
QST_ct_produit 11459 5638 49.2

As we can see from the previous result, around 50% of respondents did not give an answer to the questions related to work. It does not only concern unemployed people:

table(df_tmp$PERSONNE_rap_pcs8) |> sort()

                               Farmer                     Craftsman, trader 
                                  376                                   558 
                     Unskilled worker                   Commercial employee 
                                  872                                  1360 
Executive and intellectual profession               Administrative employee 
                                 1464                                  1549 
                       Skilled worker          Inactive having never worked 
                                 1633                                  1658 
              Intermediate occupation 
                                 1979 

Let us have a look at the distribution of PERSONNE_rap_pcs8 for individuals for which QST_ct_depech is missing:

df_tmp |> 
  filter(`QST_ct_depech` == "Not surveyed") |> 
  magrittr::extract2("PERSONNE_rap_pcs8") |> 
  table()

                               Farmer                     Craftsman, trader 
                                  238                                   244 
Executive and intellectual profession               Intermediate occupation 
                                  496                                   687 
              Administrative employee                   Commercial employee 
                                  605                                   593 
                       Skilled worker                      Unskilled worker 
                                  652                                   457 
         Inactive having never worked 
                                 1658 
df_tmp |> 
  filter(QST_ct_depech == "Not surveyed") |> 
  select_at(vars(matches("^QST_c")))
# A tibble: 5,638 × 9
   QST_ct_depech QST_ct_liberte QST_ct_apprend QST_ct_aidecol QST_ct_travnuit
   <fct>         <fct>          <fct>          <fct>          <fct>          
 1 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 2 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 3 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 4 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 5 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 6 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 7 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 8 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 9 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
10 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
# ℹ 5,628 more rows
# ℹ 4 more variables: QST_ct_repet <fct>, QST_ct_lourd <fct>,
#   QST_ct_posture <fct>, QST_ct_produit <fct>

We do the same for variables which give information relative to social interactions and which give information on the parents’ background (i.e., variables that begin with QES). First, let us have a look at the proportion of NA in those:

df_tmp |> 
  select(matches("^QES")) |> 
  map_df(
    function(x) {
      tibble(
        no_obs = length(x),
        nb_not_surveyed = sum(x == "Not surveyed", na.rm=T),
        prop = round(100 * nb_not_surveyed / no_obs, 2))
    },
    .id = "variable"
  ) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Table 15.2: Missing values in QES variables
variable no_obs nb_not_surveyed prop
QES_association 11459 43 0.38
QES_tpsami 11459 43 0.38
QES_tpsasso 11459 43 0.38
QES_tpscolleg 11459 43 0.38
QES_tpsfamil 11459 43 0.38
QES_mere_etude 11459 43 0.38
QES_pere_etude 11459 43 0.38

Let us have a look at the proportion of missing values in the current dataset:

pct_missing <- 
  sapply(df_tmp, function(x) round(100 * sum(is.na(x)) / length(x), 1)) |> 
  sort(decreasing = TRUE)
pct_missing
              SOINS_remomn               SOINS_remspe 
                      45.8                       45.8 
              SOINS_rempha               SOINS_remkin 
                      45.8                       45.8 
              SOINS_reminf               SOINS_remden 
                      45.8                       45.8 
              SOINS_remmat               SOINS_remtra 
                      45.8                       45.8 
              SOINS_remopt               SOINS_rempro 
                      45.8                       45.8 
              SOINS_remurg                SOINS_tmomn 
                      45.8                       45.8 
               SOINS_tmspe                SOINS_tmpha 
                      45.8                       45.8 
               SOINS_tmkin                SOINS_tminf 
                      45.8                       45.8 
               SOINS_tmden                SOINS_tmmat 
                      45.8                       45.8 
               SOINS_tmtra                SOINS_tmopt 
                      45.8                       45.8 
               SOINS_tmpro                SOINS_tmurg 
                      45.8                       45.8 
              SOINS_dpaomn               SOINS_dpaspe 
                      45.8                       45.8 
              SOINS_dpapha               SOINS_dpakin 
                      45.8                       45.8 
              SOINS_dpainf               SOINS_dpaden 
                      45.8                       45.8 
              SOINS_dpamat               SOINS_dpatra 
                      45.8                       45.8 
              SOINS_dpaopt               SOINS_dpapro 
                      45.8                       45.8 
              SOINS_dpaurg             SOINS_pf_fromn 
                      45.8                       45.8 
            SOINS_pf_frspe             SOINS_pf_frpha 
                      45.8                       45.8 
            SOINS_pf_frkin             SOINS_pf_frinf 
                      45.8                       45.8 
            SOINS_pf_frden             SOINS_pf_frtra 
                      45.8                       45.8 
            SOINS_pf_frurg             SOINS_seac_omn 
                      45.8                       45.8 
            SOINS_seac_spe          MENAGE_revucinsee 
                      45.8                       14.1 
               PERSONNE_ss            PERSONNE_regime 
                       0.5                        0.2 
         PERSONNE_rap_pcs8        PERSONNE_pb_depress 
                       0.1                        0.0 
         PERSONNE_pb_asthm       PERSONNE_pb_bronchit 
                       0.0                        0.0 
     PERSONNE_pb_infarctus       PERSONNE_pb_coronair 
                       0.0                        0.0 
     PERSONNE_pb_hypertens            PERSONNE_pb_avc 
                       0.0                        0.0 
       PERSONNE_pb_arthros       PERSONNE_pb_lombalgi 
                       0.0                        0.0 
      PERSONNE_pb_cervical         PERSONNE_pb_diabet 
                       0.0                        0.0 
       PERSONNE_pb_allergi        PERSONNE_pb_cirrhos 
                       0.0                        0.0 
       PERSONNE_pb_urinair PERSONNE_score_t_corrected 
                       0.0                        0.0 
       PERSONNE_etat_sante               PERSONNE_age 
                       0.0                        0.0 
             PERSONNE_sexe            PERSONNE_couple 
                       0.0                        0.0 
           PERSONNE_statut               PERSONNE_ald 
                       0.0                        0.0 
              SOINS_ald_am              MENAGE_region 
                       0.0                        0.0 
                 MENAGE_tu              MENAGE_nbpers 
                       0.0                        0.0 
                ensol_2011              MUTUELLE_assu 
                       0.0                        0.0 
            MUTUELLE_typcc       OPINION1_renonc_cons 
                       0.0                        0.0 
      OPINION1_renonc_dent        OPINION1_renonc_fin 
                       0.0                        0.0 
      OPINION1_renonc_loin       OPINION1_renonc_long 
                       0.0                        0.0 
             QST_ct_depech             QST_ct_liberte 
                       0.0                        0.0 
            QST_ct_apprend             QST_ct_aidecol 
                       0.0                        0.0 
           QST_ct_travnuit               QST_ct_repet 
                       0.0                        0.0 
              QST_ct_lourd             QST_ct_posture 
                       0.0                        0.0 
            QST_ct_produit            QES_association 
                       0.0                        0.0 
                QES_tpsami                QES_tpsasso 
                       0.0                        0.0 
             QES_tpscolleg               QES_tpsfamil 
                       0.0                        0.0 
            QES_mere_etude             QES_pere_etude 
                       0.0                        0.0 
              inf_q1_mhi_5                         id 
                       0.0                        0.0 
                    status 
                       0.0 

Ordinal variables need to be altered prior to the estimation (because of SHAP). We change them to numerical variables. To keep a track of the corresponding numerical values associated with each label, we create a table.

Then, we create the table correspondance_ordinal that contains the numerical values associated with the different categories.

correspondance_ordinal <- 
  tibble(
    variable = "MENAGE_tu",
    old_value = df_tmp$MENAGE_tu |> unique(),
    new_value = as.numeric(df_tmp$MENAGE_tu) |> unique()
  )
correspondance_ordinal
# A tibble: 5 × 3
  variable  old_value               new_value
  <chr>     <ord>                       <dbl>
1 MENAGE_tu Small Municipality              1
2 MENAGE_tu Medium Municipality             2
3 MENAGE_tu Not surveyed                    6
4 MENAGE_tu Paris metropolitan area         4
5 MENAGE_tu Large Municipality              3
save(correspondance_ordinal, file = "../data/out/correspondance_ordinal_sah.rda")

Now that we have stored the changes we plan on applying, we can proceed to these changes:

df_tmp <- 
  df_tmp |>
  mutate_if(is.character, as.factor) |> 
  mutate(MENAGE_tu = as.numeric(df_tmp$MENAGE_tu))

We need to remove the variables that wee used to construct the target variable:

df_tmp <- 
  df_tmp |> 
  select(
    -PERSONNE_pb_depress,
    -inf_q1_mhi_5,
    -PERSONNE_score_t_corrected,
    -PERSONNE_etat_sante
  )

Then, we remove observations that contain missing values. Let us keep a track on the number of observation removed at each time.

nb_obs <- nrow(df_tmp)

eye_on_proportions <- vector(mode="list", length = ncol(df_tmp))

for(p in 1:ncol(df_tmp)){
  variable_to_check <- colnames(df_tmp)[p]
  
  eye_on_proportions[[p]] <- 
    df_tmp |> 
    filter(is.na(!!sym(variable_to_check))) |> 
    group_by(status) |> 
    count() |> ungroup() |> 
    mutate(prop = round(100 * n / sum(n), digits = 2))
  
  # Filtering out the observations with missing values
  df_tmp <- 
    df_tmp |> 
    filter(!is.na(!!sym(variable_to_check)))
  
  if(nrow(df_tmp) < nb_obs){
    nb_obs_df_tmp <- 
      tibble(step = variable_to_check, 
             n = nrow(df_tmp))
    nb_obs_df <- 
      nb_obs_df |> 
      bind_rows(nb_obs_df_tmp)
    nb_obs <- nrow(df_tmp)
  }
  
}
names(eye_on_proportions) <- colnames(df_tmp)

Let us have a look at the nb_obs_df table. The first 4 rows give:

  1. the number of observations with the raw sample
  2. how many observations were left after removing people younger than 15 years old
  3. how many observations were left when removing individuals who did not report their health status
  4. how many individuals reported feeling good (SAH).

The remaining rows indicate how many individuals (column n) are left in the sample once individuals with missing data for the variable reported in column step are removed from the sample. The number given in column n_drop indicates the loss due to lack of data for the variable of interest.

nb_obs_df |> 
  mutate(n_drop = n - dplyr::lag(n)) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")
Table 15.3: Number of remaining data and loss of individuals due to missing data
step n n_drop
raw 19940 NA
age_15 18561 -1379
missing_mhi5_score 12373 -6188
health_status_not_reported 12304 -69
not_D 11459 -845
PERSONNE_ss 11407 -52
PERSONNE_regime 11403 -4
PERSONNE_rap_pcs8 11393 -10
MENAGE_revucinsee 9794 -1599
SOINS_remomn 5393 -4401
QES_tpsasso 5392 -1

At the end, we are left with a dataset with no more missing data:

any(is.na(df_tmp))
[1] FALSE

Lastly, we remove the region variable and the self-declared health condition:

var_to_temove <- c("MENAGE_region", "code_insee", "MENAGE_region_nom")
var_to_keep <- colnames(df_tmp)[!colnames(df_tmp) %in% var_to_temove]
df_tmp <- 
  df_tmp |>
  select(!!var_to_keep)

The dimension of the final dataset:

dim(df_tmp)
[1] 5392   94

Recall we coded missing values for categorical variables as Not reported and missing values because the individuals were not included in the sub-survey as Not surveyed. Let us look at how many observation fall into each of those categories:

df_tmp |> 
  select_if(is.factor) |> 
  map_df(
    ~tibble(nb_NA = sum(. == "Not reported"),
            nb_not_surveyed = sum(. == "Not surveyed")),
    .id = "variable"
  ) |> 
  knitr::kable(format = "markdown")
variable nb_NA nb_not_surveyed
PERSONNE_pb_asthm 205 0
PERSONNE_pb_bronchit 205 0
PERSONNE_pb_infarctus 205 0
PERSONNE_pb_coronair 205 0
PERSONNE_pb_hypertens 205 0
PERSONNE_pb_avc 205 0
PERSONNE_pb_arthros 205 0
PERSONNE_pb_lombalgi 205 0
PERSONNE_pb_cervical 205 0
PERSONNE_pb_diabet 205 0
PERSONNE_pb_allergi 205 0
PERSONNE_pb_cirrhos 205 0
PERSONNE_pb_urinair 205 0
PERSONNE_sexe 0 0
PERSONNE_couple 12 0
PERSONNE_statut 587 0
PERSONNE_ss 0 0
PERSONNE_regime 0 0
PERSONNE_rap_pcs8 0 0
PERSONNE_ald 0 0
SOINS_ald_am 0 0
MUTUELLE_assu 18 671
MUTUELLE_typcc 0 671
OPINION1_renonc_cons 0 1118
OPINION1_renonc_dent 0 1118
OPINION1_renonc_fin 0 1118
OPINION1_renonc_loin 0 1118
OPINION1_renonc_long 0 1118
QST_ct_depech 30 2655
QST_ct_liberte 44 2655
QST_ct_apprend 29 2655
QST_ct_aidecol 40 2655
QST_ct_travnuit 40 2655
QST_ct_repet 43 2655
QST_ct_lourd 40 2655
QST_ct_posture 40 2655
QST_ct_produit 37 2655
QES_association 94 17
QES_tpsami 127 17
QES_tpsasso 239 17
QES_tpscolleg 496 17
QES_tpsfamil 143 17
QES_mere_etude 93 17
QES_pere_etude 86 17
status 0 0

We decide here to merge these two different values as No answer:

df_tmp <- 
  df_tmp |> 
  mutate(
    across(
      .cols = is.factor|is.character,
      .fns = function(x) {
        fct_recode(
          x,
          "No answer" = "Not reported",
          "No answer" = "Not surveyed"
        )
      }
    )
  )
df_tmp |> 
  select_if(is.factor) |> 
  map_df(
    ~tibble(nb_NA = sum(. == "No answer")),
    .id = "variable"
  ) |> 
  knitr::kable(format = "markdown")
variable nb_NA
PERSONNE_pb_asthm 205
PERSONNE_pb_bronchit 205
PERSONNE_pb_infarctus 205
PERSONNE_pb_coronair 205
PERSONNE_pb_hypertens 205
PERSONNE_pb_avc 205
PERSONNE_pb_arthros 205
PERSONNE_pb_lombalgi 205
PERSONNE_pb_cervical 205
PERSONNE_pb_diabet 205
PERSONNE_pb_allergi 205
PERSONNE_pb_cirrhos 205
PERSONNE_pb_urinair 205
PERSONNE_sexe 0
PERSONNE_couple 12
PERSONNE_statut 587
PERSONNE_ss 0
PERSONNE_regime 0
PERSONNE_rap_pcs8 0
PERSONNE_ald 0
SOINS_ald_am 0
MUTUELLE_assu 689
MUTUELLE_typcc 671
OPINION1_renonc_cons 1118
OPINION1_renonc_dent 1118
OPINION1_renonc_fin 1118
OPINION1_renonc_loin 1118
OPINION1_renonc_long 1118
QST_ct_depech 2685
QST_ct_liberte 2699
QST_ct_apprend 2684
QST_ct_aidecol 2695
QST_ct_travnuit 2695
QST_ct_repet 2698
QST_ct_lourd 2695
QST_ct_posture 2695
QST_ct_produit 2692
QES_association 111
QES_tpsami 144
QES_tpsasso 256
QES_tpscolleg 513
QES_tpsfamil 160
QES_mere_etude 110
QES_pere_etude 103
status 0

Let us remove the 12 observations where the couple status is not reported (too few observations for this category). The proportion of imaginary healthy patients among the individuals that are about to be discarded:

df_tmp |> 
  filter(PERSONNE_couple == "No answer") |> 
  left_join(df |> select(id, inf_q1_mhi_5), by = "id") |> 
  group_by(inf_q1_mhi_5, status) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))
# A tibble: 2 × 4
  inf_q1_mhi_5 status               n  prop
  <fct>        <fct>            <int> <dbl>
1 <=Q1         Not_D_and_inf_Q1     1  8.33
2 >Q1          Not_D_and_sup_Q1    11 91.7 
df_tmp <- 
  df_tmp |> 
  filter(PERSONNE_couple != "No answer")

For factor variables related to health conditions, we drop the unused levels.

df_tmp <- 
  df_tmp |> 
  mutate(
    across(
      c(
        PERSONNE_pb_asthm,
        PERSONNE_pb_bronchit,
        PERSONNE_pb_infarctus,
        PERSONNE_pb_coronair,
        PERSONNE_pb_hypertens,
        PERSONNE_pb_avc,
        PERSONNE_pb_arthros,
        PERSONNE_pb_lombalgi,
        PERSONNE_pb_cervical,
        PERSONNE_pb_diabet,
        PERSONNE_pb_allergi,
        PERSONNE_pb_cirrhos,
        PERSONNE_pb_urinair
      ),
      ~fct_drop(.x)
    )
  )

And we save the results:

df_clean <- df_tmp
save(df_clean, file = "../data/df_clean_sah.rda")