15 Data Cleaning

Objectives

This chapter presents the codes used to clean data. It especially explains the different steps we took to deal with missing values. We use the self-assessed health status instead of the self-declared status with respect to depression here to define the target variable.

Let us load the data obtained in Chapter 1.

Let us first load {tidyverse}:

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Then the data:

load("../data/df_merged.rda")
dim(df)

[1] 18561   841

Let us keep track on the number of observation each time we filter out observations.

nb_obs_df <- tibble(
    step = c("raw", "age_15"), 
    n = c(19940, nrow(df))
  )
nb_obs_df

# A tibble: 2 × 2
  step       n
  <chr>  <dbl>
1 raw    19940
2 age_15 18561

We can create a table with the labels associated with the name of each variable.

variable_names <- tribble(
  ~variable, ~label, ~type,
  "PERSONNE_pb_depress", "Depression", "qualitative", 
  "PERSONNE_pb_asthm", "Asthma", "qualitative",
  "PERSONNE_pb_bronchit", "Bronchitis", "qualitative",
  "PERSONNE_pb_infarctus", "Heart Attack", "qualitative",
  "PERSONNE_pb_coronair", "Artery Disease", "qualitative",
  "PERSONNE_pb_hypertens", "Hypertension", "qualitative",
  "PERSONNE_pb_avc", "Stroke", "qualitative",
  "PERSONNE_pb_arthros", "Osteoarthritis", "qualitative",
  "PERSONNE_pb_lombalgi", "Low Back Pain", "qualitative",
  "PERSONNE_pb_cervical", "Neck Pain", "qualitative",
  "PERSONNE_pb_diabet", "Diabetes", "qualitative",
  "PERSONNE_pb_allergi", "Allergy", "qualitative",
  "PERSONNE_pb_cirrhos", "Cirrhosis", "qualitative",
  "PERSONNE_pb_urinair", "Urinary Incontinence", "qualitative",
  # "PERSONNE_pb_non", "No Illness", "qualitative",
  "PERSONNE_score_t_corrected", "MHI-5 Score", "numerical",
  "PERSONNE_etat_sante", "Self-Assessed Health Condition", "qualitative",
  "PERSONNE_age", "Age", "numerical",
  "PERSONNE_sexe", "Gender", "qualitative", 
  "PERSONNE_couple", "Couple", "qualitative", 
  "PERSONNE_etatleg", "Marital Status", "qualitative", 
  "PERSONNE_statut", "Professional Status", "qualitative", 
  "PERSONNE_ss", "Social Security", "qualitative", 
  "PERSONNE_regime", "Social Security System", "qualitative", 
  "PERSONNE_rap_pcs8", "Occupation", "qualitative", 
  "PERSONNE_ald", "Long-term condition (Self-declared)", "qualitative",
  "SOINS_ald_am", "Long-term condition (SNIIRAM)", "qualitative",
  #
  #
  # Household
  #
  #
  "MENAGE_rap_zau", "Zoning in Urban Areas", "ordinal",
  "MENAGE_region", "Region", "qualitative", 
  "MENAGE_revdetail", "Income", "numerical",
  "MENAGE_revucinsee", "Net Income per Cons. Unit", "numerical",
  "MENAGE_tu","Size Urban Area", "ordinal", 
  "MENAGE_nbpers", "Household size", "numerical",
  "ensol_2011", "Sunlight", "numerical",
  # "MENAGE_revenu", "Income (class)", "ordinal", 
  # "MENAGE_typmen", "Household Type", "qualitative", 
  # "CODE_REG", "Region Code", "qualitative",
  #
  #
  # Mutual Insurance
  #
  #
  "MUTUELLE_assu", "Insurance", "qualitative", 
  "MUTUELLE_typcc", "Mutual Coverage Type", "qualitative",
  #
  # Expenses
  # Outpatient ~= Sum of the others
  #
  "SOINS_depamb", "Exp. Outpatient", "numerical",
  "SOINS_depomn", "Exp. General Practitioner", "numerical",
  "SOINS_depspe", "Exp. Specialist", "numerical",
  "SOINS_deppha", "Exp. Pharmacy", "numerical",
  "SOINS_depkin", "Exp. Physiotherapist", "numerical",
  "SOINS_depinf", "Exp. Nurse", "numerical",
  "SOINS_depden", "Exp. Dentist", "numerical",
  "SOINS_depmat", "Exp. Equipment", "numerical",
  "SOINS_deptra", "Exp. Transport", "numerical",
  "SOINS_depopt", "Exp. Optical", "numerical",
  "SOINS_deppro", "Exp. Prostheses", "numerical",
  "SOINS_depurg", "Exp. Emergency w/o hospitalization", "numerical",
  #
  #
  # Reimbursement
  #
  "SOINS_remamb", "Reimbursement Outpatient", "numerical",
  "SOINS_remomn", "Reimbursement General Practitioner", "numerical",
  "SOINS_remspe", "Reimbursement Specialist", "numerical",
  "SOINS_rempha", "Reimbursement Pharmacy", "numerical",
  "SOINS_remkin", "Reimbursement Physiotherapist", "numerical",
  "SOINS_reminf", "Reimbursement Nurse", "numerical",
  "SOINS_remden", "Reimbursement Dentist", "numerical",
  "SOINS_remmat", "Reimbursement Equipment", "numerical",
  "SOINS_remtra", "Reimbursement Transport", "numerical",
  "SOINS_remopt", "Reimbursement Optical", "numerical",
  "SOINS_rempro", "Reimbursement Prostheses", "numerical",
  "SOINS_remurg", "Reimbursement Emergency w/o hospitalization", "numerical",
  #
  #
  # Co-payment (Ticket moderateur)
  #
  "SOINS_tmamb", "Co-payment Outpatient", "numerical",
  "SOINS_tmomn", "Co-payment General Practitioner", "numerical",
  "SOINS_tmspe", "Co-payment Specialist", "numerical",
  "SOINS_tmpha", "Co-payment Pharmacy", "numerical",
  "SOINS_tmkin", "Co-payment Physiotherapist", "numerical",
  "SOINS_tminf", "Co-payment Nurse", "numerical",
  "SOINS_tmden", "Co-payment Dentist", "numerical",
  "SOINS_tmmat", "Co-payment Equipment", "numerical",
  "SOINS_tmtra", "Co-payment Transport", "numerical",
  "SOINS_tmopt", "Co-payment Optical", "numerical",
  "SOINS_tmpro", "Co-payment Prostheses", "numerical",
  "SOINS_tmurg", "Co-payment Emergency w/o hospitalization", "numerical",
  #
  # Extra-fees
  #
  "SOINS_dpaamb", "Extra-fees Outpatient", "numerical",
  "SOINS_dpaomn", "Extra-fees General Practitioner", "numerical",
  "SOINS_dpaspe", "Extra-fees Specialist", "numerical",
  "SOINS_dpapha", "Extra-fees Pharmacy", "numerical",
  "SOINS_dpakin", "Extra-fees Physiotherapist", "numerical",
  "SOINS_dpainf", "Extra-fees Nurse", "numerical",
  "SOINS_dpaden", "Extra-fees Dentist", "numerical",
  "SOINS_dpamat", "Extra-fees Equipment", "numerical",
  "SOINS_dpatra", "Extra-fees Transport", "numerical",
  "SOINS_dpaopt", "Extra-fees Optical", "numerical",
  "SOINS_dpapro", "Extra-fees Prostheses", "numerical",
  "SOINS_dpaurg", "Extra-fees Emergency w/o hospitalization", "numerical",
  #
  #
  # Deductible (Franchise)
  #
  "SOINS_pf_framb", "Deduct. Outpatient", "numerical",
  "SOINS_pf_fromn", "Deduct. General Practitioner", "numerical",
  "SOINS_pf_frspe", "Deduct. Specialist", "numerical",
  "SOINS_pf_frpha", "Deduct. Pharmacy", "numerical",
  "SOINS_pf_frkin", "Deduct. Physiotherapist", "numerical",
  "SOINS_pf_frinf", "Deduct. Nurse", "numerical",
  "SOINS_pf_frden", "Deduct. Dentist", "numerical",
  # "SOINS_pf_frmat", "Deduct. Equipment", "numerical", # does not exist in the data
  "SOINS_pf_frtra", "Deduct. Transport", "numerical",
  # "SOINS_pf_fropt", "Deduct. Optical", "numerical",
  # "SOINS_pf_frpro", "Deduct. Prostheses", "numerical",
  "SOINS_pf_frurg", "Deduct. Emergency w/o hospitalization", "numerical",
  #
  "SOINS_seac_omn", "No. Medical Sessions General Pract.", "numerical",
  "SOINS_seac_spe",  "No. Medical Sessions Specialist", "numerical",
  #
  #
  "OPINION1_renonc_cons", "Waiver General Practitioner", "qualitative",
  "OPINION1_renonc_dent", "Waiver Dental Care", "qualitative",
  "OPINION1_renonc_fin", "Waiver Other Health Care", "qualitative",
  "OPINION1_renonc_loin", "Waiver Health Care Too Far", "qualitative",
  "OPINION1_renonc_long", "Waiver Appointment Delay Too Long", "qualitative",
  #
  # Working conditions
  #
  "QST_ct_depech", "Have to Hurry to Do Job", "qualitative", 
  "QST_ct_liberte", "Very Little Freedom to Do Job", "qualitative", 
  "QST_ct_apprend", "Job Allows to Learn New Things", "qualitative", 
  "QST_ct_aidecol", "Colleagues Help Carry out Tasks", "qualitative", 
  "QST_ct_travnuit", "Job Requires not to Sleep Betw. Midnight and 5 a.m.", "qualitative", 
  "QST_ct_repet", "Repetitive Work under Time Constraints / Line Job", "qualitative", 
  "QST_ct_lourd", "Exposed to Carrying Heavy Loads", "qualitative", 
  "QST_ct_posture", "Exposed to Painful Postures", "qualitative", 
  "QST_ct_produit", "Exposed to Harmful/Toxic Products/Substances", "qualitative",
  #
  "QES_association", "Participation in Group Activities", "qualitative",
  "QES_tpsami", "Frequency Meeting with Friends/Neighbors", "qualitative",
  "QES_tpsasso", "Frequency Meeting with People in Organizations", "qualitative",
  "QES_tpscolleg", "Frequency Meeting with Colleagues Outside Work", "qualitative",
  "QES_tpsfamil", "Frequency Meeting with Family Living Outside Household", "qualitative",
  "QES_mere_etude", "Mother's Level of Education", "qualitative",
  "QES_pere_etude", "Father's Level of Education", "qualitative"
)

save(variable_names, file = "../data/out/variable_names.rda")

We only keep the variables in the table variable_names, as well as those created at the end of CHapter 1.

df_tmp <- 
  df |> 
  select(!!variable_names$variable, inf_q1_mhi_5, inf_q1_mhi_3, id) |> 
  select(
    -PERSONNE_etatleg, -MENAGE_revdetail, -MENAGE_rap_zau,
    -SOINS_remamb, -SOINS_tmamb,
    -SOINS_dpaamb, -SOINS_pf_framb
  ) |> 
  mutate(PERSONNE_pb_depress = as.character(PERSONNE_pb_depress))

Note: the following variables were excluded for the following reasons:

PERSONNE_etatleg gives the marital status; we keep PERSONNE_couple which states whether the person lives as a couple
MENAGE_revdetail is the income in numerical values; we prefer keeping MENAGE_revucinsee (Net Income per Cons. Unit)
MENAGE_rap_zau: the variable MENAGE_tu already provides a very similar information
SOINS_remamb, SOINS_tmamb, SOINS_dpaamb, SOINS_pf_frambSOINS_pf_framb : these are the sum of other reimbursements; they would therefore cause colinearity issues.

Some additional information should be provided regarding healthcare variables. These variables are composed of :

expenditures (dep, dépenses)
health insurance reimbursements (remb, remboursements par l’assurance maladie)
co-payment (tm, ticket modérateur)
fee overruns (dpa, dépassement d’honoraires, non remboursés)
amounts to be charged (cha, montants à charge)
deductible (pf_fr, Participation forfaitaire/Franchise)
guaranteed participation (pa, Participation assurée).

The following relationships exist between these variables:

dep = cha + rem + pf_fr + pa
cha = tm + dpa
dep = tm + dpa + rem + pf_fr + pa.

To avoid collinearity problems, we retain only the following variables in the final model: tm, dpa, rem, pf_fr, pa.

Then the variables that begin with SOINS_dep need to be removed:

retirer <- 
  variable_names$variable[str_which(variable_names$variable, "^SOINS_dep")]
retirer

 [1] "SOINS_depamb" "SOINS_depomn" "SOINS_depspe" "SOINS_deppha" "SOINS_depkin"
 [6] "SOINS_depinf" "SOINS_depden" "SOINS_depmat" "SOINS_deptra" "SOINS_depopt"
[11] "SOINS_deppro" "SOINS_depurg"

df_tmp <-
  df_tmp |> 
  select(-!!retirer)

Let us remove people for which the MHI-5 score was not computed:

sum(is.na(df_tmp$inf_q1_mhi_5))

[1] 6188

df_tmp <- 
  df_tmp |> 
  filter(!is.na(inf_q1_mhi_5))

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "missing_mhi5_score", n = nrow(df_tmp))
  )
nb_obs_df

# A tibble: 3 × 2
  step                   n
  <chr>              <dbl>
1 raw                19940
2 age_15             18561
3 missing_mhi5_score 12373

Let us remove the individuals who did not report their health status:

table(df_tmp$PERSONNE_etat_sante)


                  Not reported                Very Bad or Bad 
                            69                            845 
Very Good, Good or Fairly Good 
                         11459

Distribution of MHI-5 score among the individuals that are about to be discarded:

df_tmp |> 
  filter(PERSONNE_etat_sante == "Not reported") |> 
  group_by(inf_q1_mhi_5) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))

# A tibble: 2 × 3
  inf_q1_mhi_5     n  prop
  <fct>        <int> <dbl>
1 <=Q1            33  47.8
2 >Q1             36  52.2

df_tmp <- 
  df_tmp |> 
  filter(PERSONNE_etat_sante != "Not reported")

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "health_status_not_reported", n = nrow(df_tmp))
  )
nb_obs_df

# A tibble: 4 × 2
  step                           n
  <chr>                      <dbl>
1 raw                        19940
2 age_15                     18561
3 missing_mhi5_score         12373
4 health_status_not_reported 12304

Let us focus on the MHI-5 score here and discard the MHI-3. Let us create a variable (status) that classifies individuals depending on their MHI-5 score and their answer regarding depression:

df_tmp <- 
  df_tmp |> 
  select(-inf_q1_mhi_3) |>
  mutate(
    status = case_when(
      PERSONNE_etat_sante == "Very Bad or Bad" & inf_q1_mhi_5 == "<=Q1" ~ "D_and_inf_Q1",
      PERSONNE_etat_sante == "Very Bad or Bad" & inf_q1_mhi_5 == ">Q1" ~ "D_and_sup_Q1",
      PERSONNE_etat_sante == "Very Good, Good or Fairly Good" & inf_q1_mhi_5 == "<=Q1" ~ "Not_D_and_inf_Q1",
      PERSONNE_etat_sante == "Very Good, Good or Fairly Good" & inf_q1_mhi_5 == ">Q1" ~ "Not_D_and_sup_Q1",
      TRUE~"problem"
    )
  )

We can look at the distribution of this newly created variable:

table(df_tmp$status)


    D_and_inf_Q1     D_and_sup_Q1 Not_D_and_inf_Q1 Not_D_and_sup_Q1 
             633              212             3246             8213

round(100*prop.table(table(df_tmp$status)), 2)


    D_and_inf_Q1     D_and_sup_Q1 Not_D_and_inf_Q1 Not_D_and_sup_Q1 
            5.14             1.72            26.38            66.75

No values coded as problem:

df_tmp |> 
  filter(status == "problem") |> 
  select(status, PERSONNE_etat_sante, inf_q1_mhi_5) |> 
  filter(!is.na(inf_q1_mhi_5))

# A tibble: 0 × 3
# ℹ 3 variables: status <chr>, PERSONNE_etat_sante <chr>, inf_q1_mhi_5 <fct>

We are interested in people who self report having a good SAH status. Other people will be discarded. The proportion of imaginary healthy patients among the individuals that are about to be discarded:

df_tmp |> 
  filter(! status %in% c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1")) |> 
  group_by(inf_q1_mhi_5, status) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))

# A tibble: 2 × 4
  inf_q1_mhi_5 status           n  prop
  <fct>        <chr>        <int> <dbl>
1 <=Q1         D_and_inf_Q1   633  74.9
2 >Q1          D_and_sup_Q1   212  25.1

Let us discard other people. We define the status variable as a binary factor:

df_tmp <- 
  df_tmp |> 
  filter(status %in% c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1")) |> 
  mutate(status = factor(
    status, level = c("Not_D_and_inf_Q1", "Not_D_and_sup_Q1"))
  )

Let us keep track on the number of individuals that were removed:

nb_obs_df <- 
  nb_obs_df |> 
  bind_rows(
    tibble(step = "not_D", n = nrow(df_tmp))
  )
nb_obs_df

# A tibble: 5 × 2
  step                           n
  <chr>                      <dbl>
1 raw                        19940
2 age_15                     18561
3 missing_mhi5_score         12373
4 health_status_not_reported 12304
5 not_D                      11459

15.1 Looking at missing data

We need to check where are the missing data. Let us have a look at the “Working conditions” questions (columns which begin with QST_c), and count the number and corresponding proportion of missing values:

df_tmp |> 
  select(matches("^QST_c")) |> 
  map_df(
    function(x){
      tibble(
        no_obs = length(x),
        nb_not_surveyed = sum(x == "Not surveyed"),
        prop = round(100 * nb_not_surveyed / no_obs, 2))
    },
    .id = "variable"
  ) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Table 15.1: Missing values in the Working Conditions questions

variable	no_obs	nb_not_surveyed	prop
QST_ct_depech	11459	5638	49.2
QST_ct_liberte	11459	5638	49.2
QST_ct_apprend	11459	5638	49.2
QST_ct_aidecol	11459	5638	49.2
QST_ct_travnuit	11459	5638	49.2
QST_ct_repet	11459	5638	49.2
QST_ct_lourd	11459	5638	49.2
QST_ct_posture	11459	5638	49.2
QST_ct_produit	11459	5638	49.2

As we can see from the previous result, around 50% of respondents did not give an answer to the questions related to work. It does not only concern unemployed people:

table(df_tmp$PERSONNE_rap_pcs8) |> sort()


                               Farmer                     Craftsman, trader 
                                  376                                   558 
                     Unskilled worker                   Commercial employee 
                                  872                                  1360 
Executive and intellectual profession               Administrative employee 
                                 1464                                  1549 
                       Skilled worker          Inactive having never worked 
                                 1633                                  1658 
              Intermediate occupation 
                                 1979

Let us have a look at the distribution of PERSONNE_rap_pcs8 for individuals for which QST_ct_depech is missing:

df_tmp |> 
  filter(`QST_ct_depech` == "Not surveyed") |> 
  magrittr::extract2("PERSONNE_rap_pcs8") |> 
  table()


                               Farmer                     Craftsman, trader 
                                  238                                   244 
Executive and intellectual profession               Intermediate occupation 
                                  496                                   687 
              Administrative employee                   Commercial employee 
                                  605                                   593 
                       Skilled worker                      Unskilled worker 
                                  652                                   457 
         Inactive having never worked 
                                 1658

df_tmp |> 
  filter(QST_ct_depech == "Not surveyed") |> 
  select_at(vars(matches("^QST_c")))

# A tibble: 5,638 × 9
   QST_ct_depech QST_ct_liberte QST_ct_apprend QST_ct_aidecol QST_ct_travnuit
   <fct>         <fct>          <fct>          <fct>          <fct>          
 1 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 2 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 3 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 4 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 5 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 6 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 7 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 8 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
 9 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
10 Not surveyed  Not surveyed   Not surveyed   Not surveyed   Not surveyed   
# ℹ 5,628 more rows
# ℹ 4 more variables: QST_ct_repet <fct>, QST_ct_lourd <fct>,
#   QST_ct_posture <fct>, QST_ct_produit <fct>

We do the same for variables which give information relative to social interactions and which give information on the parents’ background (i.e., variables that begin with QES). First, let us have a look at the proportion of NA in those:

df_tmp |> 
  select(matches("^QES")) |> 
  map_df(
    function(x) {
      tibble(
        no_obs = length(x),
        nb_not_surveyed = sum(x == "Not surveyed", na.rm=T),
        prop = round(100 * nb_not_surveyed / no_obs, 2))
    },
    .id = "variable"
  ) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Table 15.2: Missing values in QES variables

variable	no_obs	nb_not_surveyed	prop
QES_association	11459	43	0.38
QES_tpsami	11459	43	0.38
QES_tpsasso	11459	43	0.38
QES_tpscolleg	11459	43	0.38
QES_tpsfamil	11459	43	0.38
QES_mere_etude	11459	43	0.38
QES_pere_etude	11459	43	0.38

Let us have a look at the proportion of missing values in the current dataset:

pct_missing <- 
  sapply(df_tmp, function(x) round(100 * sum(is.na(x)) / length(x), 1)) |> 
  sort(decreasing = TRUE)
pct_missing

              SOINS_remomn               SOINS_remspe 
                      45.8                       45.8 
              SOINS_rempha               SOINS_remkin 
                      45.8                       45.8 
              SOINS_reminf               SOINS_remden 
                      45.8                       45.8 
              SOINS_remmat               SOINS_remtra 
                      45.8                       45.8 
              SOINS_remopt               SOINS_rempro 
                      45.8                       45.8 
              SOINS_remurg                SOINS_tmomn 
                      45.8                       45.8 
               SOINS_tmspe                SOINS_tmpha 
                      45.8                       45.8 
               SOINS_tmkin                SOINS_tminf 
                      45.8                       45.8 
               SOINS_tmden                SOINS_tmmat 
                      45.8                       45.8 
               SOINS_tmtra                SOINS_tmopt 
                      45.8                       45.8 
               SOINS_tmpro                SOINS_tmurg 
                      45.8                       45.8 
              SOINS_dpaomn               SOINS_dpaspe 
                      45.8                       45.8 
              SOINS_dpapha               SOINS_dpakin 
                      45.8                       45.8 
              SOINS_dpainf               SOINS_dpaden 
                      45.8                       45.8 
              SOINS_dpamat               SOINS_dpatra 
                      45.8                       45.8 
              SOINS_dpaopt               SOINS_dpapro 
                      45.8                       45.8 
              SOINS_dpaurg             SOINS_pf_fromn 
                      45.8                       45.8 
            SOINS_pf_frspe             SOINS_pf_frpha 
                      45.8                       45.8 
            SOINS_pf_frkin             SOINS_pf_frinf 
                      45.8                       45.8 
            SOINS_pf_frden             SOINS_pf_frtra 
                      45.8                       45.8 
            SOINS_pf_frurg             SOINS_seac_omn 
                      45.8                       45.8 
            SOINS_seac_spe          MENAGE_revucinsee 
                      45.8                       14.1 
               PERSONNE_ss            PERSONNE_regime 
                       0.5                        0.2 
         PERSONNE_rap_pcs8        PERSONNE_pb_depress 
                       0.1                        0.0 
         PERSONNE_pb_asthm       PERSONNE_pb_bronchit 
                       0.0                        0.0 
     PERSONNE_pb_infarctus       PERSONNE_pb_coronair 
                       0.0                        0.0 
     PERSONNE_pb_hypertens            PERSONNE_pb_avc 
                       0.0                        0.0 
       PERSONNE_pb_arthros       PERSONNE_pb_lombalgi 
                       0.0                        0.0 
      PERSONNE_pb_cervical         PERSONNE_pb_diabet 
                       0.0                        0.0 
       PERSONNE_pb_allergi        PERSONNE_pb_cirrhos 
                       0.0                        0.0 
       PERSONNE_pb_urinair PERSONNE_score_t_corrected 
                       0.0                        0.0 
       PERSONNE_etat_sante               PERSONNE_age 
                       0.0                        0.0 
             PERSONNE_sexe            PERSONNE_couple 
                       0.0                        0.0 
           PERSONNE_statut               PERSONNE_ald 
                       0.0                        0.0 
              SOINS_ald_am              MENAGE_region 
                       0.0                        0.0 
                 MENAGE_tu              MENAGE_nbpers 
                       0.0                        0.0 
                ensol_2011              MUTUELLE_assu 
                       0.0                        0.0 
            MUTUELLE_typcc       OPINION1_renonc_cons 
                       0.0                        0.0 
      OPINION1_renonc_dent        OPINION1_renonc_fin 
                       0.0                        0.0 
      OPINION1_renonc_loin       OPINION1_renonc_long 
                       0.0                        0.0 
             QST_ct_depech             QST_ct_liberte 
                       0.0                        0.0 
            QST_ct_apprend             QST_ct_aidecol 
                       0.0                        0.0 
           QST_ct_travnuit               QST_ct_repet 
                       0.0                        0.0 
              QST_ct_lourd             QST_ct_posture 
                       0.0                        0.0 
            QST_ct_produit            QES_association 
                       0.0                        0.0 
                QES_tpsami                QES_tpsasso 
                       0.0                        0.0 
             QES_tpscolleg               QES_tpsfamil 
                       0.0                        0.0 
            QES_mere_etude             QES_pere_etude 
                       0.0                        0.0 
              inf_q1_mhi_5                         id 
                       0.0                        0.0 
                    status 
                       0.0

Ordinal variables need to be altered prior to the estimation (because of SHAP). We change them to numerical variables. To keep a track of the corresponding numerical values associated with each label, we create a table.

Then, we create the table correspondance_ordinal that contains the numerical values associated with the different categories.

correspondance_ordinal <- 
  tibble(
    variable = "MENAGE_tu",
    old_value = df_tmp$MENAGE_tu |> unique(),
    new_value = as.numeric(df_tmp$MENAGE_tu) |> unique()
  )
correspondance_ordinal

# A tibble: 5 × 3
  variable  old_value               new_value
  <chr>     <ord>                       <dbl>
1 MENAGE_tu Small Municipality              1
2 MENAGE_tu Medium Municipality             2
3 MENAGE_tu Not surveyed                    6
4 MENAGE_tu Paris metropolitan area         4
5 MENAGE_tu Large Municipality              3

save(correspondance_ordinal, file = "../data/out/correspondance_ordinal_sah.rda")

Now that we have stored the changes we plan on applying, we can proceed to these changes:

df_tmp <- 
  df_tmp |>
  mutate_if(is.character, as.factor) |> 
  mutate(MENAGE_tu = as.numeric(df_tmp$MENAGE_tu))

We need to remove the variables that wee used to construct the target variable:

df_tmp <- 
  df_tmp |> 
  select(
    -PERSONNE_pb_depress,
    -inf_q1_mhi_5,
    -PERSONNE_score_t_corrected,
    -PERSONNE_etat_sante
  )

Then, we remove observations that contain missing values. Let us keep a track on the number of observation removed at each time.

nb_obs <- nrow(df_tmp)

eye_on_proportions <- vector(mode="list", length = ncol(df_tmp))

for(p in 1:ncol(df_tmp)){
  variable_to_check <- colnames(df_tmp)[p]
  
  eye_on_proportions[[p]] <- 
    df_tmp |> 
    filter(is.na(!!sym(variable_to_check))) |> 
    group_by(status) |> 
    count() |> ungroup() |> 
    mutate(prop = round(100 * n / sum(n), digits = 2))
  
  # Filtering out the observations with missing values
  df_tmp <- 
    df_tmp |> 
    filter(!is.na(!!sym(variable_to_check)))
  
  if(nrow(df_tmp) < nb_obs){
    nb_obs_df_tmp <- 
      tibble(step = variable_to_check, 
             n = nrow(df_tmp))
    nb_obs_df <- 
      nb_obs_df |> 
      bind_rows(nb_obs_df_tmp)
    nb_obs <- nrow(df_tmp)
  }
  
}
names(eye_on_proportions) <- colnames(df_tmp)

Let us have a look at the nb_obs_df table. The first 4 rows give:

the number of observations with the raw sample
how many observations were left after removing people younger than 15 years old
how many observations were left when removing individuals who did not report their health status
how many individuals reported feeling good (SAH).

The remaining rows indicate how many individuals (column n) are left in the sample once individuals with missing data for the variable reported in column step are removed from the sample. The number given in column n_drop indicates the loss due to lack of data for the variable of interest.

nb_obs_df |> 
  mutate(n_drop = n - dplyr::lag(n)) |> 
  knitr::kable() |> 
  kableExtra::kable_classic(full_width = F, html_font = "Cambria")

Table 15.3: Number of remaining data and loss of individuals due to missing data

step	n	n_drop
raw	19940	NA
age_15	18561	-1379
missing_mhi5_score	12373	-6188
health_status_not_reported	12304	-69
not_D	11459	-845
PERSONNE_ss	11407	-52
PERSONNE_regime	11403	-4
PERSONNE_rap_pcs8	11393	-10
MENAGE_revucinsee	9794	-1599
SOINS_remomn	5393	-4401
QES_tpsasso	5392	-1

At the end, we are left with a dataset with no more missing data:

any(is.na(df_tmp))

[1] FALSE

Lastly, we remove the region variable and the self-declared health condition:

var_to_temove <- c("MENAGE_region", "code_insee", "MENAGE_region_nom")
var_to_keep <- colnames(df_tmp)[!colnames(df_tmp) %in% var_to_temove]
df_tmp <- 
  df_tmp |>
  select(!!var_to_keep)

The dimension of the final dataset:

dim(df_tmp)

[1] 5392   94

Recall we coded missing values for categorical variables as Not reported and missing values because the individuals were not included in the sub-survey as Not surveyed. Let us look at how many observation fall into each of those categories:

df_tmp |> 
  select_if(is.factor) |> 
  map_df(
    ~tibble(nb_NA = sum(. == "Not reported"),
            nb_not_surveyed = sum(. == "Not surveyed")),
    .id = "variable"
  ) |> 
  knitr::kable(format = "markdown")

variable	nb_NA	nb_not_surveyed
PERSONNE_pb_asthm	205	0
PERSONNE_pb_bronchit	205	0
PERSONNE_pb_infarctus	205	0
PERSONNE_pb_coronair	205	0
PERSONNE_pb_hypertens	205	0
PERSONNE_pb_avc	205	0
PERSONNE_pb_arthros	205	0
PERSONNE_pb_lombalgi	205	0
PERSONNE_pb_cervical	205	0
PERSONNE_pb_diabet	205	0
PERSONNE_pb_allergi	205	0
PERSONNE_pb_cirrhos	205	0
PERSONNE_pb_urinair	205	0
PERSONNE_sexe	0	0
PERSONNE_couple	12	0
PERSONNE_statut	587	0
PERSONNE_ss	0	0
PERSONNE_regime	0	0
PERSONNE_rap_pcs8	0	0
PERSONNE_ald	0	0
SOINS_ald_am	0	0
MUTUELLE_assu	18	671
MUTUELLE_typcc	0	671
OPINION1_renonc_cons	0	1118
OPINION1_renonc_dent	0	1118
OPINION1_renonc_fin	0	1118
OPINION1_renonc_loin	0	1118
OPINION1_renonc_long	0	1118
QST_ct_depech	30	2655
QST_ct_liberte	44	2655
QST_ct_apprend	29	2655
QST_ct_aidecol	40	2655
QST_ct_travnuit	40	2655
QST_ct_repet	43	2655
QST_ct_lourd	40	2655
QST_ct_posture	40	2655
QST_ct_produit	37	2655
QES_association	94	17
QES_tpsami	127	17
QES_tpsasso	239	17
QES_tpscolleg	496	17
QES_tpsfamil	143	17
QES_mere_etude	93	17
QES_pere_etude	86	17
status	0	0

We decide here to merge these two different values as No answer:

df_tmp <- 
  df_tmp |> 
  mutate(
    across(
      .cols = is.factor|is.character,
      .fns = function(x) {
        fct_recode(
          x,
          "No answer" = "Not reported",
          "No answer" = "Not surveyed"
        )
      }
    )
  )

df_tmp |> 
  select_if(is.factor) |> 
  map_df(
    ~tibble(nb_NA = sum(. == "No answer")),
    .id = "variable"
  ) |> 
  knitr::kable(format = "markdown")

variable	nb_NA
PERSONNE_pb_asthm	205
PERSONNE_pb_bronchit	205
PERSONNE_pb_infarctus	205
PERSONNE_pb_coronair	205
PERSONNE_pb_hypertens	205
PERSONNE_pb_avc	205
PERSONNE_pb_arthros	205
PERSONNE_pb_lombalgi	205
PERSONNE_pb_cervical	205
PERSONNE_pb_diabet	205
PERSONNE_pb_allergi	205
PERSONNE_pb_cirrhos	205
PERSONNE_pb_urinair	205
PERSONNE_sexe	0
PERSONNE_couple	12
PERSONNE_statut	587
PERSONNE_ss	0
PERSONNE_regime	0
PERSONNE_rap_pcs8	0
PERSONNE_ald	0
SOINS_ald_am	0
MUTUELLE_assu	689
MUTUELLE_typcc	671
OPINION1_renonc_cons	1118
OPINION1_renonc_dent	1118
OPINION1_renonc_fin	1118
OPINION1_renonc_loin	1118
OPINION1_renonc_long	1118
QST_ct_depech	2685
QST_ct_liberte	2699
QST_ct_apprend	2684
QST_ct_aidecol	2695
QST_ct_travnuit	2695
QST_ct_repet	2698
QST_ct_lourd	2695
QST_ct_posture	2695
QST_ct_produit	2692
QES_association	111
QES_tpsami	144
QES_tpsasso	256
QES_tpscolleg	513
QES_tpsfamil	160
QES_mere_etude	110
QES_pere_etude	103
status	0

Let us remove the 12 observations where the couple status is not reported (too few observations for this category). The proportion of imaginary healthy patients among the individuals that are about to be discarded:

df_tmp |> 
  filter(PERSONNE_couple == "No answer") |> 
  left_join(df |> select(id, inf_q1_mhi_5), by = "id") |> 
  group_by(inf_q1_mhi_5, status) |> 
  count() |> ungroup() |> 
  mutate(prop = round(100 * n / sum(n), digits = 2))

# A tibble: 2 × 4
  inf_q1_mhi_5 status               n  prop
  <fct>        <fct>            <int> <dbl>
1 <=Q1         Not_D_and_inf_Q1     1  8.33
2 >Q1          Not_D_and_sup_Q1    11 91.7

df_tmp <- 
  df_tmp |> 
  filter(PERSONNE_couple != "No answer")

For factor variables related to health conditions, we drop the unused levels.

df_tmp <- 
  df_tmp |> 
  mutate(
    across(
      c(
        PERSONNE_pb_asthm,
        PERSONNE_pb_bronchit,
        PERSONNE_pb_infarctus,
        PERSONNE_pb_coronair,
        PERSONNE_pb_hypertens,
        PERSONNE_pb_avc,
        PERSONNE_pb_arthros,
        PERSONNE_pb_lombalgi,
        PERSONNE_pb_cervical,
        PERSONNE_pb_diabet,
        PERSONNE_pb_allergi,
        PERSONNE_pb_cirrhos,
        PERSONNE_pb_urinair
      ),
      ~fct_drop(.x)
    )
  )

And we save the results:

df_clean <- df_tmp
save(df_clean, file = "../data/df_clean_sah.rda")