Arthur Charpentier and Ewen Gallic
freakonometrics.hypotheses.org
egallic.fr
Janvier 2018.


1 Methodological aspects: data preparation

Since the data contain almost a billion observations, a conventional computer struggles to import them into R in one go. The strategy adopted here is to split the observations into chunks.

1.1 Cutting into chunks

1.1.1 Segmentation into smaller sections

The data we have are divided into three files. In spite of this splitting, the number of lines in each file remains too large for the computer. We therefore split the data further, into smaller pieces.

To import a chunk, we use the fread function. We read the observations in small pieces of \(20\times10^6\) lines. First, we need to know the total number of observations on the chunk we are working on. The following instruction allows us to reveal this information:

library(dplyr)
library(stringr)
library(ggplot2)
library(readr)
library(tidyr)
library(stringi)
library(data.table)
library(pbapply)

# Count the rows of `fichier` while reading only its first column
nrow(fread(fichier, select = 1L))

Column names are easily obtained:

# Read zero data rows, so that only the header is parsed, and keep its names
col_label <- colnames(fread(fichier, nrows = 0))

The first chunk contains \(412~406~274\) lines. We can create a variable indicating the start and end values of lines to import, so that we read only \(20\times10^6\) lines at most at each import.

# Number of lines of the file currently processed
# NOTE(review): assumed to include the header line - confirm
nb_lignes <- 412406274
# Maximum number of lines read at each import
taille_troncon <- 20 * 10^6

# Start line of each import. The value appended at the end (nb_lignes + 1) is
# one past the end of the file, so that the last import stops exactly on the
# last line instead of one line before it.
seq_nbs <- seq(1, nb_lignes, by = taille_troncon)
seq_nbs <- c(seq_nbs, nb_lignes + 1)
seq_nbs[1] <- 2 # The first line holds the column names
# One row per import: first (debut) and last (fin) line to read
a_parcourir <- data.frame(debut = seq_nbs[-length(seq_nbs)], fin = seq_nbs[-1] - 1)

The import of each chunk is done using the fread function, as follows (here, for the first chunk):

# Import of one section of the file (here, the first one)
numero_chunk <- 1 # Section 1
# First and last line of the section
val_deb <- a_parcourir[["debut"]][numero_chunk]
val_fin <- a_parcourir[["fin"]][numero_chunk]
# Number of lines to read, and number of lines to skip before reading
val_nrows <- val_fin - val_deb + 1
val_skip <- val_deb - 1
# Every column is read as text; the header is not part of the section, so the
# column names are supplied explicitly
chunk <- fread(
  fichier,
  skip = val_skip,
  nrows = val_nrows,
  header = FALSE,
  sep = ";",
  quote = '',
  fill = TRUE,
  encoding = "Latin-1",
  col.names = col_label,
  colClasses = rep("character", 19),
  stringsAsFactors = FALSE,
  na.strings = c("NA", "NaN", " ", "", "0")
)

For our purposes, the observations can be separated from one another, provided that all the observations belonging to the same user's tree are kept together. We therefore group the observations according to the first alphanumeric character of the user name.

# Build sub-chunks according to the first letter of the tree owner's name
chunk[, prem_lettre := str_to_lower(str_sub(sourcename, 1, 1))]

# The distinct first characters found in the user names
motifs <- unique(chunk$prem_lettre) %>% sort()

# Keep only the letters, and add one pattern standing for "not a letter"
# (digits, for instance)
motifs <- c(motifs[str_detect(motifs, "[:letter:]")], "[^[:letter:]]")

# Zero-padded identifiers used in the file names. sprintf("%02s", ...) is not
# used here: the "0" flag is only defined for numbers, and strings end up
# padded with spaces on some platforms.
id_fichier <- str_pad(no_chunks_geneanet, width = 2, pad = "0")
id_chunk <- str_pad(numero_chunk, width = 2, pad = "0")

# Save one file per first letter of the user name
# (min = 0 so that the bar can be created even when there is a single pattern)
pb2 <- txtProgressBar(min = 0, max = length(motifs), style = 3, char = "@")
for (ii in seq_along(motifs)) {
  prem <- motifs[ii]
  # The last pattern is the "not a letter" one: it is stored under the key "0"
  if (ii == length(motifs)) prem <- "0"
  chunk_lettre <- chunk[str_detect(prem_lettre, motifs[ii])]
  # The helper column is not part of the saved data
  chunk_lettre[, prem_lettre := NULL]
  save(chunk_lettre, file = str_c("../data/Geneanet/raw/chunks/chunk_", id_fichier, "_", id_chunk, "_", prem, ".rda"))
  setTxtProgressBar(pb2, ii)
}
close(pb2)

We then just create a small function to run on the three files provided by Geneanet, and save the result in a data file.

1.1.2 Grouping small chunks by user names

Since the initial data is divided into three files, the small chunks obtained in the previous step must be grouped together, because the data of a given user (say, toto) may be spread over two or more of these files.

# Folder holding the small chunks
path_to_files <- "../data/Geneanet/raw/chunks/"
# Full paths of the saved chunks
fichiers <- list.files(path_to_files, pattern = "\\.rda", full.names = TRUE)

# First user-name character of each chunk, read from the file names:
# drop the "chunk_0X_NN_" prefix, then the extension
motifs <- str_replace(
  str_replace(basename(fichiers), "chunk_0(1|2|3)_[[:digit:]]{2}_", ""),
  "\\.rda", ""
)
motifs <- sort(unique(motifs))
# Letters first, then one pattern standing for "not a letter"
motifs <- c(motifs[str_detect(motifs, "[:letter:]")], "[^[:letter:]]")

# Create the output folder of the grouped chunks, if needed
if (!dir.exists("../data/Geneanet/raw/chunks_letter/")) {
  dir.create("../data/Geneanet/raw/chunks_letter/", recursive = TRUE)
}

#' save_chunks_lettre
#'
#' Gathers all the small chunks sharing the same first user-name character,
#' removes duplicated rows and saves the result in a single file.
#' Reads the global vector `fichiers` (paths of the small chunks).
#'
#' @motif: (string) a letter, or "[^[:letter:]]" for the "not a letter" group
#' e.g. motif <- "z"
save_chunks_lettre <- function(motif){
  # On disk, the "not a letter" group is identified by the key "0"
  cle <- if(motif == "[^[:letter:]]") "0" else motif
  # Keep the files whose name ends with "_<cle>.rda". Using the pattern itself
  # as a regular expression ("[^[:letter:]].rda") would also match names such
  # as "_a.rda", i.e. every file, for the "not a letter" group.
  fichiers_charger <- fichiers[endsWith(fichiers, str_c("_", cle, ".rda"))]
  # Load each file (it defines `chunk_lettre`) and drop its duplicated rows
  chunks <- pblapply(fichiers_charger, function(x) {load(x) ; unique(chunk_lettre)})
  # Stack the files, and drop the rows duplicated across files
  chunk_lettre <- unique(rbindlist(chunks))
  save(chunk_lettre, file = str_c("../data/Geneanet/raw/chunks_letter/chunk_", cle, ".rda"))
}

# Group the small chunks and save the result, one call per element of `motifs`
pblapply(motifs, save_chunks_lettre)

1.1.3 Splitting into 5 files for each user letter

In order to accelerate the following steps, the user’s chunks are divided into 5 parts containing more or less the same number of different users.

# The per-letter chunks. `pattern` is a regular expression, not a glob:
# "\\.rda$" keeps the names ending with ".rda"
N <- list.files("../data/Geneanet/raw/chunks_letter/", pattern = "\\.rda$", full.names = TRUE)

# Create the folder receiving the smaller chunks, if needed
if(!dir.exists("../data/Geneanet/raw/chunks_letter_2/")) dir.create("../data/Geneanet/raw/chunks_letter_2/", recursive = TRUE)

#' chunk2
#' Splits a vector into parts of roughly equal size
#' @x: vector to split
#' @n: (integer) number of parts wanted
#' Returns a list of sub-vectors of `x`; it may hold fewer than `n` elements
#' when `x` is short, and is empty when `x` is empty
chunk2 <- function(x, n) {
  # cut() fails on an empty vector: there is nothing to split
  if (length(x) == 0) return(list())
  # cut() needs at least two intervals: a single part is the whole vector
  if (n == 1) return(list(`1` = x))
  split(x, cut(seq_along(x), n, labels = FALSE))
}

#' decouper_chunks
#'
#' Splits the chunk of one first letter into (at most) 5 files holding roughly
#' the same number of distinct users.
#' Reads the global vector `N` and calls chunk2().
#'
#' @i: position of the file in N
decouper_chunks <- function(i){
  # First character of the user names of the current chunk, read from the file
  # name ("chunk_<lettre>.rda"). basename() makes this independent from the
  # way the folder part of the path is written.
  lettre <- str_replace(basename(N[i]), "^chunk_", "")
  lettre <- str_replace(lettre, "\\.rda$", "")
  # Load the chunk: this defines `chunk_lettre` in the global environment
  load(N[i], envir = globalenv())
  # The user names
  sourcenames <- unique(chunk_lettre$sourcename)
  # Even split of the user names (not of the observations)
  chunk_sourcenames <- chunk2(sourcenames, 5)
  
  #' sauvegarder_chunk
  #' @j: index of the element of chunk_sourcenames whose users are
  #'     extracted and saved
  sauvegarder_chunk <- function(j){
    sourcenames_chunk <- chunk_sourcenames[[j]]
    chunk_lettre_tmp <- chunk_lettre[sourcename %in% sourcenames_chunk]
    save(chunk_lettre_tmp, file = str_c("../data/Geneanet/raw/chunks_letter_2//chunk_lettre_tmp_",
                                        lettre, "_", str_pad(j, width = 2, pad = "0"), ".rda"))
  }# End of sauvegarder_chunk()
  
  # chunk2() may return fewer than 5 parts when there are few users:
  # iterate over the parts that actually exist rather than over 1:5
  pblapply(seq_along(chunk_sourcenames), sauvegarder_chunk)
  
}# End of decouper_chunks()

# Split each file into 5 smaller chunks
# (seq_along() gives nothing to do when N is empty, whereas 1:length(N)
# would iterate over 1 and 0)
pblapply(seq_along(N), decouper_chunks)

1.2 Grouping by department and first simplification

To deploy the following data processing steps on multiple machines at the same time, we group individuals by department. In this same step, we group the information of each individual on a single line. Indeed, until now, a line corresponds to an event for an individual in a user’s tree.

To begin, we record in a table the different French departments (together with their region) present in the data:

# Pick one chunk
load("../data/Geneanet/raw/chunks_letter_2/chunk_lettre_tmp_0_05.rda")
# Distinct (country, region, department) triplets located in France,
# with a known department, sorted by region then department
liste_depts <- chunk_lettre_tmp %>%
  rename(dept = `sous-region`) %>%
  select(pays, region, dept) %>%
  filter(pays == "FRA", !is.na(dept)) %>%
  unique() %>%
  arrange(region, dept) %>%
  tbl_df()

save(liste_depts, file = "liste_depts.rda")

We work by chunks obtained in the previous step. It is necessary to list them.

# Full paths of every file found in the folder of the smaller chunks
N <- list.files("../data/Geneanet/raw/chunks_letter_2/", full.names = TRUE)

The following code allows us to process a single chunk, whose position in N is noted i_fichier. We take the first file as an example in the following code. A simple loop on the indices of the N elements makes it possible to treat each section.

i_fichier <- 1
fichier <- N[i_fichier]
# File names look like "chunk_lettre_tmp_<lettre>_<numero>.rda". Working on
# basename() avoids hard-coding the folder part of the path (and its "//")
# inside a regular expression.
nom_fichier <- str_replace(basename(fichier), "^chunk_lettre_tmp_", "") %>% 
  str_replace("\\.rda$", "")
# First letter of the user name: what comes before the last "_"
lettre <- str_replace(nom_fichier, "_[^_]*$", "")
# Chunk number: what comes after the last "_"
num_chunk <- str_replace(nom_fichier, "^.*_", "")

1.2.1 Some useful functions

1.2.1.1 Event Locations Filed

Three types of events are present in the database: birth, marriage and death. We create a function to indicate the information related to these events: geography and date.

#' endroit_acte
#' 
#' Builds the table of the distinct locations (and date) recorded for a given
#' type of event. Column names are suffixed with "_<acte>".
#' 
#' @x: (data.table) records of one individual
#' @acte: (string) type of event: "N" for a birth, "M" for a marriage, or "D" for a death
#' e.g. x <- nai_per ; acte <- "N"
endroit_acte <- function(x, acte){
  if(nrow(x) == 0){
    # No record for this event: a single row of missing values
    res <- 
      data.table(lieu = NA, dept = NA, region = NA, pays = NA, lat = NA, long = NA, date = NA, stringsAsFactors = FALSE)
  }else{
    # Geography of each record
    res <- data.table(
      lieu = x$lieu,
      dept = x$dept,
      region = x$region,
      pays = x$pays,
      lat = x$latitude,
      long = x$longitude,
      stringsAsFactors = FALSE
    )
    
    # The date comes from the birth or death date columns; marriages get NA
    if(acte == "M"){
      res$date <- NA
    }else if(acte == "N"){
      res$date <- x$date_naissance
    }else{
      res$date <- x$date_deces
    }
  }
  
  # One row per distinct location/date, with suffixed column names
  res <- unique(res)
  names(res) <- str_c(names(res), "_", acte)
  tbl_df(res)
}# End of endroit_acte()

1.2.1.2 Simplification of a person

The simplifier_personne() function creates a single record per individual, for a given user tree. So, instead of having one line per event, we keep only one line per individual. This has an impact when an individual has several marriages: only one of them can be kept here.

#' simplifier_personne
#' 
#' Returns, on a single row, the information about the birth, the marriage
#' and the death of one person
#' 
#' @un_id_personne: (string) unique id of the person
#' @dt_source: (data.table) table in which the records of this person
#' are looked up (must hold an `id_personne` column)
#' e.g. dt_source <- chunk_user ; un_id_personne <- ids_personnes[1]
simplifier_personne <- function(un_id_personne, dt_source){
  
  # All the records (one per event) of the person
  enregistrements_personne <- dt_source[id_personne == un_id_personne]
  
  sourcename_per <- enregistrements_personne$sourcename %>% unique()
  # Last name: the longest one found among the records
  nom_per <- enregistrements_personne %>% 
    arrange(desc(str_length(nom))) %>% 
    slice(1) %>% 
    .$nom
  # First names: the longest value found among the records
  prenoms_per <- enregistrements_personne %>% 
    arrange(desc(str_length(prenoms))) %>% 
    slice(1) %>% 
    .$prenoms
  
  # NOTE(review): these unique() calls return several values when the records
  # disagree, which yields several output rows - confirm this is intended
  sexe_per <- enregistrements_personne$sexe %>% unique()
  
  id_mere_per <- enregistrements_personne$id_mere %>% unique()
  id_pere_per <- enregistrements_personne$id_pere %>% unique()
  
  # Birth: records whose event type contains "N"
  nai_per <- enregistrements_personne[stri_detect_regex(event_type, "N")]
  nai <- endroit_acte(nai_per, "N")
  # If there are several birth places, keep the shortest known one
  if(nrow(nai)>1){
    nai <- 
      nai %>% 
      filter(!is.na(lieu_N)) %>% 
      arrange(str_length(lieu_N)) %>% 
      slice(1)
  }
  # Marriage: records whose event type contains "M"
  mar_per <- enregistrements_personne[stri_detect_regex(event_type, "M")]
  mar <- endroit_acte(mar_per, "M")
  # If there are several marriages, only the first row after sorting by date
  # is kept (the intent is to keep the oldest marriage)
  # NOTE(review): endroit_acte() sets the date to NA for "M", so this sort
  # cannot actually pick the oldest marriage - confirm
  if(nrow(mar)>1){
    mar <- 
      mar %>% 
      arrange(date_M) %>% 
      slice(1)
  }
  # Death: records whose event type contains "D"
  dec_per <- enregistrements_personne[stri_detect_regex(event_type, "D")]
  dec <- endroit_acte(dec_per, "D")
  # If there are several death places, keep the shortest known one
  if(nrow(dec)>1){
    dec <- 
      dec %>% 
      filter(!is.na(lieu_D)) %>% 
      arrange(str_length(lieu_D)) %>% 
      slice(1)
  }
  
  # Birth and death dates are taken from all the records of the person,
  # whatever their event type
  # NOTE(review): presumably a single distinct value is expected here; several
  # distinct dates would not fit in the single row kept above - confirm
  nai$date_N <- enregistrements_personne$date_naissance %>% unique()
  dec$date_D <- enregistrements_personne$date_deces %>% unique()
  
  # Identity columns, then the birth, marriage and death columns side by side
  data.table(sourcename = sourcename_per, id_personne = un_id_personne,
             nom = nom_per, prenoms = prenoms_per, sexe = sexe_per,
             id_mere = id_mere_per, id_pere = id_pere_per, stringsAsFactors = FALSE) %>% 
    cbind(., nai) %>% 
    cbind(., mar) %>% 
    cbind(., dec) %>% 
    unique()
}# End of simplifier_personne()

1.2.1.3 Simplification of individuals in the same tree

The simplifier_proprietaire() function simplifies all the individuals present in a Geneanet user tree. It is based on the simplifier_personne() function previously defined.

#' simplifier_proprietaire
#' 
#' For one tree owner, simplifies the information of every individual of the
#' tree (one row per individual), using simplifier_personne()
#' 
#' @id_user: (string) identifier of a tree owner
#' @chunk_lettre: (data.table) records of the users of the chunk
#' e.g. id_user <- "61p"
simplifier_proprietaire <- function(id_user, chunk_lettre){
  # Restrict to the individuals of the owner's tree
  chunk_user <- chunk_lettre[sourcename == id_user]
  
  # Unique identifier per person
  chunk_user[, id_personne := str_c(sourcename, ref_locale, sep = "@")]
  
  # Lookup table giving the identifier of each numbered individual.
  # It is deduplicated: an individual has one row per event, and joining on
  # the raw table would repeat every child row once per event of the parent
  # (and may trip data.table's guard against large cartesian joins).
  identifiants <- unique(chunk_user[, list(ID_num, id_personne)])
  
  # Add the identifier of the parents
  identifiants_mere <- identifiants %>% 
    rename(ID_num_mere = ID_num, id_mere = id_personne)
  
  identifiants_pere <- identifiants %>% 
    rename(ID_num_pere = ID_num, id_pere = id_personne)
  
  chunk_user <- identifiants_mere[chunk_user, on = "ID_num_mere"]
  chunk_user <- identifiants_pere[chunk_user, on = "ID_num_pere"]
  
  # Drop the variables that would disturb the simplification
  # (ref_locale is kept)
  chunk_user[, c("ID_num", "ID_num_pere", "ID_num_mere", "ID_num_conjoint") := NULL]
  
  # For each person, a record is either a birth (N), a marriage (M),
  # a death (D), or a mix of these events (e.g., NM for birth and marriage).
  # The distinction matters because the places of N, M and D may differ.
  # Only the first marriage is meant to be taken into account here.
  ids_personnes <- chunk_user[!is.na(id_personne)]$id_personne %>% unique()
  
  # One simplified row per person, stacked in a single table
  lapply(ids_personnes, simplifier_personne, dt_source = chunk_user) %>% 
    rbindlist
  
}# simplifier_proprietaire

1.2.2 Simplification for a chunk

We load a previously created chunk and work on it. Here we give the processing steps for a chunk; it is easy to create a function afterwards and apply it to each chunk.

# Load the chunk into the global environment
fichier <- N[1]
load(fichier, envir = .GlobalEnv)
# Keep the loaded table under the name chunk_lettre and drop the temporary name
chunk_lettre <- chunk_lettre_tmp
rm(chunk_lettre_tmp)

We list the different users (or tree owners), and perform some variable naming operations, to facilitate code writing.

# Distinct tree owners of the chunk, and how many there are
ids_sourcename <- unique(chunk_lettre$sourcename)
nbs_obs_ids_sourcename <- length(ids_sourcename)

# Column names used by the rest of the processing
chunk_lettre <- rename(
  chunk_lettre,
  dept = `sous-region`,
  ID_num = increment,
  ID_num_mere = numero_mere,
  ID_num_pere = numero_pere,
  ID_num_conjoint = numero_conjoint,
  prenoms = prenom
)

We have decided to follow the descendants of individuals born between 1800 and 1804 in France. The database is filtered accordingly.

# Generation 0: birth records dated between 1800 and 1804
# NOTE(review): gen_0 and chunk_lettre name the same data.table at this point,
# so `:=` may also add annee_nai to chunk_lettre - confirm this is harmless
gen_0 <- chunk_lettre
# Birth year: first four characters of the birth date
gen_0[, annee_nai := stri_sub(date_naissance, 1, 4)]
gen_0 <- gen_0[annee_nai %in% seq(1800, 1804)]
# Keep the records whose event type contains a birth ("N")
gen_0 <- gen_0[stri_detect_regex(event_type, "N")]
# Restrict to the trees in which at least one individual was born between 1800 and 1804
chunk_lettre <- chunk_lettre[sourcename %in% unique(gen_0$sourcename)]

1.2.2.1 For a section, focus on a department

In order to lighten the operations to be done by the computers, we work by departments. We propose here the method to manage the case of a department. Again, a function with the following code allows all departments to be processed later.

# Department to process, identified by its code
code_departement <- "F01"
# Position of that department in liste_depts. liste_depts$dept has no names,
# so indexing it with the code itself would return NA: the code is first
# converted to a position.
i_departement <- match(code_departement, liste_depts$dept)
if(is.na(i_departement)) stop("Unknown department: ", code_departement)
departement <- liste_depts$dept[i_departement]

# Create the folder in which the results are saved, if needed
if(!dir.exists(str_c("../data/individuals/migration/", departement, "/"))) dir.create(str_c("../data/individuals/migration/", departement, "/"), recursive = TRUE)

We filter the base to restrict ourselves to first generation individuals born in the department:

# Keep the first-generation individuals whose record is in the department
gen_0 <- gen_0[dept %in% departement]
# Tree owners found in this subset of the data
ids_sourcename <- unique(gen_0$sourcename)

We now restrict the data to the parents and the descendants of the selected individuals.

# Initialisation

# The current generation (starts with generation 0)
gen_n <- gen_0[, list(sourcename, ID_num, ID_num_mere, ID_num_pere)]
nb_obs_gen_0 <- nrow(gen_n)
# Individuals to keep, identified by (sourcename, ID_num)
conserver <- gen_n[, list(sourcename, ID_num)]

# The remaining generations (the current generation is removed)
chunk_lettre <- chunk_lettre[sourcename %in% unique(gen_0$sourcename)]
gen_restants <- chunk_lettre[,list(sourcename, ID_num, ID_num_mere, ID_num_pere)]
gen_restants <- fsetdiff(gen_restants, gen_n, all = FALSE)

# The parents of generation 0: mothers and fathers stacked in one table
parents_gen_0 <-
  gen_n %>% 
  select(sourcename, ID_num_mere) %>% 
  rename(ID_num = ID_num_mere) %>% 
  bind_rows(
    gen_n %>% 
      select(sourcename, ID_num_pere) %>% 
      rename(ID_num = ID_num_pere)
  ) %>% 
  unique()

# Same parents, with their own mother/father numbers when they are found
parents_gen_0_complet <- gen_restants[parents_gen_0, on = c("sourcename", "ID_num")]

# Remove the individuals that were found
# Caveat: this step removes individuals born of an incestuous relationship
gen_restants <- fsetdiff(gen_restants, parents_gen_0_complet, all = FALSE)


# Loop
# Among the remaining individuals, which ones have a parent in the previous generation
compteur <- 0
while(nrow(gen_n) > 0){
  compteur <- compteur+1
  # Safety stop: abort once the 15th iteration is reached
  if(compteur>=15) stop("Trop de tours")
  
  # The current generation, seen as potential mothers / fathers
  parents_m <- data.table(gen_n %>% select(sourcename, ID_num) %>% rename(ID_num_mere = ID_num))
  parents_p <- data.table(gen_n %>% select(sourcename, ID_num) %>% rename(ID_num_pere = ID_num))
  enfants <- data.table(gen_restants %>% select(sourcename, ID_num_mere, ID_num_pere, ID_num))
  
  
  # Next generation: children whose mother or father is in the current one
  # (rows without a matching child have a missing ID_num and are dropped)
  gen_n <- enfants[parents_m, on = c("sourcename", "ID_num_mere")][!is.na(ID_num)] %>% 
    rbind(
      enfants[parents_p, on = c("sourcename", "ID_num_pere")][!is.na(ID_num)]
    ) %>% 
    unique()
  
  conserver <- rbind(conserver, gen_n %>% select(sourcename, ID_num)) %>% unique()
  # The remaining generations: anti-join removing the individuals just found
  gen_restants <- gen_restants[!gen_n, on = c("sourcename", "ID_num")]
}# End of while

# The individuals to keep: parents of generation 0, generation 0 and descendants
conserver <- rbind(parents_gen_0, conserver)
# The records of these individuals
chunk_partiel <- chunk_lettre[conserver, on = c("sourcename", "ID_num")]

All that remains is to simplify the lines for all these individuals. The code focuses on doing this per Geneanet user.

# Simplify the records one tree owner at a time: one result per element of ids_sourcename
res <- pblapply(ids_sourcename, simplifier_proprietaire, chunk_lettre = chunk_partiel)

As this last operation takes a lot of time, it is possible to propose a parallelized version:

library(parallel)
# Number of workers: every core but one, and at least one
# (detectCores() may return NA when the number of cores cannot be determined)
ncl <- max(1L, detectCores() - 1L, na.rm = TRUE)
(cl <- makeCluster(ncl))

# The cluster is stopped even if the computation fails
res <- tryCatch({
  # Load the required packages on each worker
  invisible(clusterEvalQ(cl, {
    library(dplyr, warn.conflicts=FALSE, quietly=TRUE)
    library(stringr, warn.conflicts=FALSE, quietly=TRUE)
    library(tidyr, warn.conflicts=FALSE, quietly=TRUE)
    library(stringi, warn.conflicts=FALSE, quietly=TRUE)
    library(data.table, warn.conflicts=FALSE, quietly=TRUE)
  }))
  
  # Send the helper functions to the workers
  clusterExport(cl, c("simplifier_personne", "endroit_acte"))
  
  pblapply(ids_sourcename, simplifier_proprietaire, cl = cl, chunk_lettre = chunk_partiel)
}, finally = stopCluster(cl))

All that remains is to put each result in a single table:

# Stack the per-owner tables into a single one
res <- rbindlist(res)

And finally to save the result:

# The file name is built from the department, `lettre` and `num_chunk`
save(res, file = str_c("../data/individuals/migration/", departement,"/chunk_", lettre, "_", num_chunk, ".rda"))

1.3 Tree Gathering and Duplication Elimination

As each user of the Geneanet site can build his own tree, there are many duplicates. We adopt a multi-step strategy to try to join trees together, while avoiding duplication. Due to the large amount of missing data, this is a delicate task. Some duplicates surely remain in the end, but most of them will not be used in the analysis. Indeed, if they have not been identified as duplicates, it is because they carry very little information, and they will therefore be discarded later.

This step in the data cleaning process not only links the user trees together, but also completes some information that may be missing in one user tree but present in another.

1.3.1 Some useful functions

Here we present some functions useful for aggregating values.

1.3.1.1 Most Probable Value

The most_probable_value() function finds the most probable value among the ones proposed. It takes as input the data table containing the values, the name of the variable of interest, and a flag indicating whether the weights of the observations should be used. The final value is the one with the highest frequency (weighted where applicable) among the proposals.

This function is used on data concerning individuals being identified as designating the same person.

#' most_probable_value
#' Find the most probable value for a variable
#' using the values found within all candidates
#' (excluding NAs)
#' @df: (data.table) the candidates; must hold a `weight` column when
#'      `weights` is TRUE
#' @variable_name: (string) name of the variable
#' @weights: (logic) should the simplification consider the weights? (default: FALSE)
#' Returns the value with the highest (weighted) frequency, or NA when every
#' candidate is missing. Without weights, the value is returned as a string.
#' e.g. variable_name <- "date_N"
most_probable_value <- function(df, variable_name, weights = FALSE){
  valeurs <- df[[variable_name]]
  # No usable value among the candidates
  if(all(is.na(valeurs))) return(NA)
  
  if(!weights){
    # Without weights: the most frequent value. table() drops the NAs and
    # which.max() returns the first of the most frequent values.
    return(names(which.max(table(valeurs))))
  }
  
  # With weights: total weight of each distinct non-missing value
  poids <- df[["weight"]]
  res <- data.frame(valeurs, poids, stringsAsFactors = FALSE) %>% 
    filter(!is.na(valeurs)) %>% 
    group_by(valeurs) %>% 
    summarise(poids = sum(poids)) %>% 
    ungroup()
  
  # If the variable is a date, complete dates are preferred:
  # values whose month or day part is "00" are set aside
  if(variable_name %in% c("date_N", "date_M", "date_D")){
    tmp <- res %>% 
      mutate(mois = str_sub(valeurs, 5, 6),
             jour = str_sub(valeurs, 7, 8)) %>% 
      filter(!(mois == "00" | jour == "00"))
    # If some complete dates remain, rely on them only
    if(nrow(tmp)>0) res <- tmp
  }
  
  # The value carrying the largest total weight
  res %>%
    arrange(desc(poids)) %>% 
    slice(1) %>% 
    magrittr::extract2("valeurs")
}# End of most_probable_value()

1.3.1.2 Simplification of an individual

The simplifier_personne() function, using individuals identified as the same person, simplifies the values of each variable to obtain only one output observation. The values of each variable are obtained using the most_probable_value() function previously defined.

#' simplifier_personne
#' Merges the individuals identified as the same person into one observation.
#' Reads the globals `individus_simple` and `variables_names`.
#' @df_corresp: table linking each id_personne_num to its id_personne_num_groupe
#' @un_groupe_num: number of the group to simplify
#' @weights: (logic) should the simplification consider the weights? (default: TRUE)
#' e.g. df_corresp <- corresp ; un_groupe_num <- 2 ; weights <- TRUE
simplifier_personne <- function(df_corresp, un_groupe_num, weights = TRUE){
  # Members of the group, then their records
  groupe_cour <- df_corresp[id_personne_num_groupe %in% un_groupe_num]
  individus_cour <- individus_simple[id_personne_num %in% groupe_cour$id_personne_num]
  
  if(nrow(individus_cour) != 1){
    # Several candidates: most probable value of each variable of interest,
    # laid out as a one-row table
    valeurs_probables <- sapply(variables_names, most_probable_value,
                                df = individus_cour, weights = weights)
    nouvelles_valeurs <- data.table(t(valeurs_probables), stringsAsFactors = FALSE)
  }else{
    # A single candidate: its values are kept as they are
    nouvelles_valeurs <- individus_cour[, mget(c(variables_names))]
  }
  
  # The weight of the result is the total weight of the merged individuals
  cbind(id_personne_num = un_groupe_num, nouvelles_valeurs,
        weight = sum(individus_cour$weight))
}# End of simplifier_personne()

1.3.1.3 A minimum function

A function to handle missing values when using the min() function.

# Minimum of `x` ignoring missing values; NA when `x` is empty or only holds
# missing values. A plain if/else is used instead of ifelse(): the condition
# is a single value, and ifelse() would drop the attributes of the result
# (a Date would come back as a number).
my_min <- function(x) {
  if (all(is.na(x))) NA else min(x, na.rm = TRUE)
}

1.3.2 Loading data

We will retain only certain variables for individuals. The location text variables are crowded out here.

# The variables of interest
# NOTE(review): both "prenoms" and "prenom" are listed; earlier in this file
# the column `prenom` is renamed to `prenoms` - confirm that a `prenom` column
# exists in the tables these names are applied to
variables_names <- c("nom", "prenoms",
"sexe",
"lieu_N", "dept_N", "region_N", "pays_N", "lat_N", "long_N", "date_N",
"lieu_M", "dept_M", "region_M", "pays_M", "lat_M", "long_M", "date_M",
"lieu_D", "dept_D", "region_D", "pays_D", "lat_D", "long_D", "date_D",
"prenom")

It is then a question of loading the data in memory in the session. We start again with the data by department, focusing only on trees in which individuals born between 1800 and 1804 are found. We only show data processing for one department. It is easy to embed the code in a function and then deploy it across all departments.

departement <- liste_depts$dept[i_departement]
# L'ensemble des données pour ce département
N <- list.files(str_c("../data/individuals/migration/", departement, "/"), pattern = "*.rda", full.names = TRUE)
# Chargement
individus <- 
  pblapply(N, function(x){load(x) ; res}) %>%