Arthur Charpentier and Ewen Gallic
freakonometrics.hypotheses.org
egallic.fr
Janvier 2018.
Since the number of data is almost a billion observations, a conventional computer is currently a little capricious during the import into R. The strategy adopted here is to segment the observations by chunks
The data we have is divided into three files. In spite of this splitting of the data, the number of lines remains too large for the computer. We decide to carry out a more important splitting.
To import a chunk, we use the fread
function. We read the observations in small pieces of \(20\times10^6\) lines. First, we need to know the total number of observations on the chunk we are working on. The following instruction allows us to reveal this information:
library(dplyr)
library(stringr)
library(ggplot2)
library(readr)
library(tidyr)
library(stringi)
library(data.table)
library(pbapply)
nrow(fread(fichier, select = 1L))
Column names are easily obtained:
col_label <- fread(fichier, nrows = 0)
col_label <- colnames(col_label)
The first chunk contains \(412~406~274\) lines. We can create a variable indicating the start and end values of lines to import, so that we read only \(20\times10^6\) lines at most at each import.
seq_nbs <- seq(1, 412406274, by = 20*10^6)
seq_nbs <- c(seq_nbs, 412406274)
seq_nbs[1] <- 2 # La première ligne correspond aux noms de colonnes
a_parcourir <- data.frame(debut = seq_nbs[-length(seq_nbs)], fin = seq_nbs[-1]-1)
The import of each chunk is done using the fread
function, as follows (here, for the first chunk):
numero_chunk <- 1 # Tronçon 1
val_deb <- a_parcourir$debut[numero_chunk]
val_fin <- a_parcourir$fin[numero_chunk]
val_skip <- val_deb-1
val_nrows <- val_fin-val_deb+1
chunk <- fread(fichier,
encoding = "Latin-1",
header = FALSE, sep = ";",
fill=TRUE, col.names=col_label, quote='',
stringsAsFactors = FALSE,
colClasses=c(rep("character",19)),
na.strings=c("NA","NaN"," ","","0"),
skip = val_skip,
nrows = val_nrows)
For what concerns us here, we can allow ourselves to separate the observations from each other, provided we keep those from same user trees. We group each observation according to the first alpha numeric character of the users.
# Creation de chunks en fonction de la premiere lettre du nom du prorietaire
chunk[, prem_lettre:=str_to_lower(str_sub(sourcename, 1, 1))]
# Les valeurs différentes pour le nom d'utilisateur
motifs <- unique(chunk$prem_lettre) %>% sort()
# On conserve uniquement les lettres, et on ajoute une valeur pour "non-lettre" (pour les chiffres par exemple)
motifs <- c(motifs[str_detect(motifs, "[:letter:]")], "[^[:letter:]]")
# Sauvegarde dans des tronçons en fonction de la première lettre du nom d'utilisateur
pb2 <- txtProgressBar(min = 1, max = length(motifs), style = 3, char = "@")
for(ii in 1:length(motifs)){
prem <- motifs[ii]
if(ii == length(motifs)) prem <- "0"
chunk_lettre <- chunk[str_detect(prem_lettre, motifs[ii])]
chunk_lettre[,prem_lettre:=NULL]
save(chunk_lettre, file = str_c("../data/Geneanet/raw/chunks/chunk_", sprintf("%02s", no_chunks_geneanet), "_", sprintf("%02s", numero_chunk), "_", prem, ".rda"))
setTxtProgressBar(pb2, ii)
}
We then just create a small function to run on the three files provided by Geneanet, and save the result in a data file.
Since the initial data is divided into three files, the small chunks obtained in the previous step must be grouped together (if user toto
’s data is ever in at least two different files).
# Chemin vers les tronçons
path_to_files <- "../data/Geneanet/raw/chunks/"
# Liste des fichiers
fichiers <- list.files(path_to_files, pattern = "\\.rda", full.names = TRUE)
motifs <- list.files(path_to_files, pattern = "\\.rda")
motifs <- str_replace(motifs, "chunk_0(1|2|3)_[[:digit:]]{2}_", "") %>%
str_replace("\\.rda", "")
motifs <- unique(motifs) %>% sort()
motifs <- c(motifs[str_detect(motifs, "[:letter:]")], "[^[:letter:]]")
# Créé le dossier d'enregistrements des tronçons, si besoin
if(!dir.exists("../data/Geneanet/raw/chunks_letter/")) dir.create("../data/Geneanet/raw/chunks_letter/", recursive = TRUE)
# motif <- "z"
save_chunks_lettre <- function(motif){
fichiers_charger <- fichiers[str_detect(fichiers, str_c(motif, ".rda"))]
# Charger les
chunks <- pblapply(fichiers_charger, function(x) {load(x) ; unique(chunk_lettre)})
chunk_lettre <- rbindlist(chunks)
chunk_lettre <- unique(chunk_lettre)
if(motif == "[^[:letter:]]") motif <- "0"
save(chunk_lettre, file = str_c("../data/Geneanet/raw/chunks_letter/chunk_", motif, ".rda"))
}
# Regroupe les tronçons et sauvegarde le résultat
pblapply(motifs, save_chunks_lettre)
In order to accelerate the following steps, the user’s chunks are divided into 5 parts containing more or less the same number of different users.
# Les tronçons
N <- list.files("../data/Geneanet/raw/chunks_letter/", pattern = "*.rda", full.names = TRUE)
# Création du dossier d'enregistrement des petits tronçons
if(!dir.exists("../data/Geneanet/raw/chunks_letter_2/")) dir.create("../data/Geneanet/raw/chunks_letter_2/", recursive = TRUE)
# Fonction pour découper des vecteurs en parties à peu près égales
chunk2 <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
#' decouper_chunks
#' @i: indice de position du fichier dans N
decouper_chunks <- function(i){
# Obtenir la lettre des noms d'utilisateurs du tronçon courant
lettre <- str_replace(N[i], "../data/Geneanet/raw/chunks_letter//chunk_", "")
lettre <- str_replace(lettre, "\\.rda", "")
# Chargement du tronçon
load(N[i], envir = globalenv())
# Les noms d'utilisateurs
sourcenames <- unique(chunk_lettre$sourcename)
# Partage équitable des noms d'utilisateurs (!= des observations)
chunk_sourcenames <- chunk2(sourcenames, 5)
#' sauvegarder_chunk
#' @j: indice de chunk_sourcenames à extraire pour traiter
#' les utilisateurs qu'il contient
sauvegarder_chunk <- function(j){
sourcenames_chunk <- chunk_sourcenames[[j]]
chunk_lettre_tmp <- chunk_lettre[sourcename %in% sourcenames_chunk]
save(chunk_lettre_tmp, file = str_c("../data/Geneanet/raw/chunks_letter_2//chunk_lettre_tmp_",
lettre, "_", str_pad(j, width = 2, pad = "0"), ".rda"))
}# Fin de sauvegarder_chunk()
# Création de 5 petits tronçons pour le fichier courant
pblapply(1:5, sauvegarder_chunk)
}# Fin de decouper_chunks()
# Découper en 5 petits tronçons les fichiers
pblapply(1:length(N), decouper_chunks)
To deploy the following data processing steps on multiple machines at the same time, we group individuals by department. In this same step, we group the information of each individual on a single line. Indeed, until now, a line corresponds to an event for an individual in a user’s tree.
To begin, we record in a table the different French regions present in the data:
# Choix d'un tronçon
load("../data/Geneanet/raw/chunks_letter_2/chunk_lettre_tmp_0_05.rda")
liste_depts <-
chunk_lettre_tmp %>%
rename(dept = `sous-region`) %>%
select(pays, region, dept) %>%
filter(pays == "FRA") %>%
unique() %>%
arrange(region, dept) %>%
filter(!is.na(dept)) %>%
tbl_df()
save(liste_depts, file = "liste_depts.rda")
We work by chunks obtained in the previous step. It is necessary to list them.
N <- list.files("../data/Geneanet/raw/chunks_letter_2/", full.names = TRUE)
The following code allows us to process a single chunk, whose position in N
is noted i_fichier
. We take the first file as an example in the following code. A simple loop on the indices of the N
elements makes it possible to treat each section.
i_fichier <- 1
fichier <- N[i_fichier]
# La première lettre du nom d'utilisateur
lettre <- str_replace(fichier, "../data/Geneanet/raw/chunks_letter_2//chunk_lettre_tmp_", "") %>%
str_replace(., "_..\\.rda", "")
# Le numéro de tronçon
num_chunk <- str_replace(fichier, "../data/Geneanet/raw/chunks_letter_2//chunk_lettre_tmp_", "") %>%
str_replace(., "._", "") %>%
str_replace(., "\\.rda", "")
Three types of events are present in the database: birth, marriage and death. We create a function to indicate the information related to these events: geography and date.
#' endroit_acte
#'
#' Retourne un tableau de donnees indiquant la geographie de l'enregistrement
#' pour un type d'acte donnee
#'
#' @x: (data.table) informations sur un individu
#' @acte : (string) type d'acte : "N" pour naissance, "M" pour mariage, ou "D" pour deces
#' x <- nai_per ; acte <- "N"
endroit_acte <- function(x, acte){
if(nrow(x)>0){
res <-
data.table(lieu = x$lieu,
dept = x$dept,
region = x$region,
pays = x$pays,
lat = x$latitude,
long = x$longitude,
stringsAsFactors = FALSE)
if(acte == "N"){
date <- x$date_naissance
}else if(acte == "M"){
date <- NA
}else{
date <- x$date_deces
}
res$date <- date
}else{
res <-
data.table(lieu = NA, dept = NA, region = NA, pays = NA, lat = NA, long = NA, date = NA, stringsAsFactors = FALSE)
}
res <- unique(res)
names(res) <- str_c(names(res), "_", acte)
res %>% tbl_df()
}# Fin de endroit_acte()
The simplifier_personne()
function allows to create a single record per individual, for a given user tree. So, instead of having one line per event, we keep only one. This has an impact on different marriages: in fact, we can only keep one here.
#' simplifier_personne
#'
#' Retourne les informations relatives a la naissance, le mariage et le deces
#' d'une personne
#'
#' @un_id_personne: (string) id unique de la personne
#' @dt_source: (data.table) tableau de donnees dans lequel chercher les
#' informations pour cette personne
#' dt_source <- chunk_user ; un_id_personne <- ids_personnes[1]
simplifier_personne <- function(un_id_personne, dt_source){
enregistrements_personne <- dt_source[id_personne == un_id_personne]
sourcename_per <- enregistrements_personne$sourcename %>% unique()
nom_per <- enregistrements_personne %>%
arrange(desc(str_length(nom))) %>%
slice(1) %>%
.$nom
prenoms_per <- enregistrements_personne %>%
arrange(desc(str_length(prenoms))) %>%
slice(1) %>%
.$prenoms
sexe_per <- enregistrements_personne$sexe %>% unique()
id_mere_per <- enregistrements_personne$id_mere %>% unique()
id_pere_per <- enregistrements_personne$id_pere %>% unique()
# Naissance
nai_per <- enregistrements_personne[stri_detect_regex(event_type, "N")]
nai <- endroit_acte(nai_per, "N")
# S'il y a plusieurs lieux de naissance, choisir celui le plus court
if(nrow(nai)>1){
nai <-
nai %>%
filter(!is.na(lieu_N)) %>%
arrange(str_length(lieu_N)) %>%
slice(1)
}
# Mariage
mar_per <- enregistrements_personne[stri_detect_regex(event_type, "M")]
mar <- endroit_acte(mar_per, "M")
# S'il y a plusieurs dates de mariages : on ne retient que la plus ancienne
if(nrow(mar)>1){
mar <-
mar %>%
arrange(date_M) %>%
slice(1)
}
# Deces
dec_per <- enregistrements_personne[stri_detect_regex(event_type, "D")]
dec <- endroit_acte(dec_per, "D")
# S'il y a plusieurs lieux de deces, choisir celui le plus court
if(nrow(dec)>1){
dec <-
dec %>%
filter(!is.na(lieu_D)) %>%
arrange(str_length(lieu_D)) %>%
slice(1)
}
nai$date_N <- enregistrements_personne$date_naissance %>% unique()
dec$date_D <- enregistrements_personne$date_deces %>% unique()
data.table(sourcename = sourcename_per, id_personne = un_id_personne,
nom = nom_per, prenoms = prenoms_per, sexe = sexe_per,
id_mere = id_mere_per, id_pere = id_pere_per, stringsAsFactors = FALSE) %>%
cbind(., nai) %>%
cbind(., mar) %>%
cbind(., dec) %>%
unique()
}# Fin de simplifier_personne()
The simplifier_proprietaire()
function simplifies all the individuals present in a Geneanet user tree. It is based on the simplifier_personne()
function previously defined.
#' simplifier_proprietaire
#'
#' Pour un proprietaire, simplifie les informations de tous les individus
#'
#' @id_user: (string) identifiant d'un proprietaire d'arbre
#' @chunk_lettre: (tbl.df) base de donnees contenant les informations des utilisateurs dans le chunk
#' id_user <- "61p"
simplifier_proprietaire <- function(id_user, chunk_lettre){
# Se restreinde aux individus de l'arbre du proprietaire
chunk_user <- chunk_lettre[sourcename == id_user]
# Creation d'un identifiant unique par personne
chunk_user[, id_personne := str_c(sourcename, ref_locale, sep = "@")]
# Ajout de l'identifiant des parents
identifiants_mere <- chunk_user[, list(ID_num, id_personne)] %>%
rename(ID_num_mere = ID_num, id_mere = id_personne)
identifiants_pere <- chunk_user[, list(ID_num, id_personne)] %>%
rename(ID_num_pere = ID_num, id_pere = id_personne)
chunk_user <- identifiants_mere[chunk_user, on = "ID_num_mere"]
chunk_user <- identifiants_pere[chunk_user, on = "ID_num_pere"]
# Suppression de variables qui vont perturber la simplification
# chunk_user[, c("ref_locale", "ID_num", "ID_num_pere", "ID_num_mere", "ID_num_conjoint") := NULL]
chunk_user[, c("ID_num", "ID_num_pere", "ID_num_mere", "ID_num_conjoint") := NULL]
# Pour chaque personne, un enregistrement correspond soit :
# a une naissance (N), un mariage (M), un deces (D),
# ou un mixe de ces evenements (e.g., NM pour naissance et mariage)
# Cette distinction est faite parce que les lieux de N, M ou D peuvent varier
# On va prendre uniquement la date du premier mariage en compte ici.
ids_personnes <- chunk_user[!is.na(id_personne)]$id_personne %>% unique()
lapply(ids_personnes, simplifier_personne, dt_source = chunk_user) %>%
rbindlist
}# simplifier_proprietaire
We load a previously created chunk and work on it. Here we give the processing steps for a chunk; it is easy to create a function afterwards and apply it to each chunk.
# Chargement du chunk
fichier <- N[1]
load(fichier, envir = .GlobalEnv)
chunk_lettre <- chunk_lettre_tmp
rm(chunk_lettre_tmp)
We list the different users (or tree owners), and perform some variable naming operations, to facilitate code writing.
ids_sourcename <- chunk_lettre$sourcename %>% unique()
nbs_obs_ids_sourcename <- length(ids_sourcename)
chunk_lettre <- chunk_lettre %>% rename(dept = `sous-region`)
chunk_lettre <- chunk_lettre %>% rename(ID_num = increment,
ID_num_mere = numero_mere,
ID_num_pere = numero_pere,
ID_num_conjoint = numero_conjoint,
prenoms = prenom)
We have decided to follow the descendants of individuals born between 1800 and 1804 in France. The database is filtered accordingly.
gen_0 <- chunk_lettre
gen_0[, annee_nai := stri_sub(date_naissance, 1, 4)]
gen_0 <- gen_0[annee_nai %in% seq(1800, 1804)]
gen_0 <- gen_0[stri_detect_regex(event_type, "N")]
# Se restreindre uniquement aux abres dont un des individus est né entre 1800 et 1804
chunk_lettre <- chunk_lettre[sourcename %in% unique(gen_0$sourcename)]
In order to lighten the operations to be done by the computers, we work by departments. We propose here the method to manage the case of a department. Again, a function with the following code allows all departments to be processed later.
i_departement <- "F01"
departement <- liste_depts$dept[i_departement]
# Création du dossier de sauvegarde des résultats
if(!dir.exists(str_c("../data/individuals/migration/", departement, "/"))) dir.create(str_c("../data/individuals/migration/", departement, "/"), recursive = TRUE)
We filter the base to restrict ourselves to first generation individuals born in the department:
gen_0 <- gen_0[dept %in% departement]
# Noms d'utilisateurs de la sous-partie de la base de données
ids_sourcename <- gen_0$sourcename %>% unique()
It is now a question of restricting oneself to the parents and descendants of the selected individuals.
# Initialisation
# La generation courante
gen_n <- gen_0[, list(sourcename, ID_num, ID_num_mere, ID_num_pere)]
nb_obs_gen_0 <- nrow(gen_n)
conserver <- gen_n[, list(sourcename, ID_num)]
# Les generations restantes (on retire la generation courante)
chunk_lettre <- chunk_lettre[sourcename %in% unique(gen_0$sourcename)]
gen_restants <- chunk_lettre[,list(sourcename, ID_num, ID_num_mere, ID_num_pere)]
gen_restants <- fsetdiff(gen_restants, gen_n, all = FALSE)
# Les parents
parents_gen_0 <-
gen_n %>%
select(sourcename, ID_num_mere) %>%
rename(ID_num = ID_num_mere) %>%
bind_rows(
gen_n %>%
select(sourcename, ID_num_pere) %>%
rename(ID_num = ID_num_pere)
) %>%
unique()
parents_gen_0_complet <- gen_restants[parents_gen_0, on = c("sourcename", "ID_num")]
# On retire les individus trouvés
# Bémol : cette étape retire les individus nés d'une relation incestueuse
gen_restants <- fsetdiff(gen_restants, parents_gen_0_complet, all = FALSE)
# Boucle
# Parmi ces personnes, lesquelles ont pour parent quelqu'un de la generation precedente
compteur <- 0
while(nrow(gen_n) > 0){
compteur <- compteur+1
if(compteur>=15) stop("Trop de tours")
parents_m <- data.table(gen_n %>% select(sourcename, ID_num) %>% rename(ID_num_mere = ID_num))
parents_p <- data.table(gen_n %>% select(sourcename, ID_num) %>% rename(ID_num_pere = ID_num))
enfants <- data.table(gen_restants %>% select(sourcename, ID_num_mere, ID_num_pere, ID_num))
gen_n <- enfants[parents_m, on = c("sourcename", "ID_num_mere")][!is.na(ID_num)] %>%
rbind(
enfants[parents_p, on = c("sourcename", "ID_num_pere")][!is.na(ID_num)]
) %>%
unique()
conserver <- rbind(conserver, gen_n %>% select(sourcename, ID_num)) %>% unique()
# Les generations restantes
gen_restants <- gen_restants[!gen_n, on = c("sourcename", "ID_num")]
}# Fin du while
# Les individus à conserver
conserver <- rbind(parents_gen_0, conserver)
# La base concernant ces individus
chunk_partiel <- chunk_lettre[conserver, on = c("sourcename", "ID_num")]
All that remains is to simplify the lines for all these individuals. The code focuses on doing this per Geneanet user.
res <- pblapply(ids_sourcename, simplifier_proprietaire, chunk_lettre = chunk_partiel)
As this last operation takes a lot of time, it is possible to propose a parallelized version:
library(parallel)
# Nombre de clusters
ncl <- detectCores()-1
(cl <- makeCluster(ncl))
invisible(clusterEvalQ(cl, library(dplyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(tidyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringi, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(data.table, warn.conflicts=FALSE, quietly=TRUE)))
# Evnoi des fonctions/données aux clusters
clusterExport(cl, c("simplifier_personne", "endroit_acte"))
res <- pblapply(ids_sourcename, simplifier_proprietaire, cl = cl, chunk_lettre = chunk_partiel)
stopCluster(cl)
All that remains is to put each result in a single table:
res <-
res %>% rbindlist
And finally to save the result:
save(res, file = str_c("../data/individuals/migration/", departement,"/chunk_", lettre, "_", num_chunk, ".rda"))
As each user of the Geneanet site can build his own tree, there are many duplicates. We adopt a multi-step strategy to try to join trees together, while avoiding duplication. Due to the large amount of missing data, this is a delicate task. It surely persists in the end duplicates, which will, for the most part, not be used in the analysis. Indeed, if they have not been identified as duplicates, it is because they carry very little information, and will therefore be discarded later.
This step in the data cleaning process not only links the user trees together, but also completes some information that may be missing in one user tree but present in another.
Here we present some functions useful for aggregating values.
The most_probable_value()
function allows to find the most probable value among the ones proposed. It takes as input the name of the variable of interest, the data table containing the values, and a variable indicating whether weights in the observations are provided. The final value will be the one with the highest frequency (weighted where applicable) among the proposals.
This function is used on data concerning individuals being identified as designating the same person.
#' most_probable_value
#' Find the most probable value for a variable
#' using the values found within all candidates
#' (excluding NAs)
#' @variable_name: (string) name of the variable
#' @df: (data.table)
#' @weights: (logic) should the simplification consider the weights? (default: FALSE)
#' variable_name <- "date_N"
most_probable_value <- function(df, variable_name, weights = FALSE){
valeurs <- df[[variable_name]]
if(!all(is.na(valeurs))){
if(weights){
poids <- df[["weight"]]
res <- data.frame(valeurs, poids, stringsAsFactors = FALSE) %>%
group_by(valeurs) %>%
summarise(poids = sum(poids)) %>%
ungroup()
# Si la variable est une date
# on va privilegier les informatins relatives aux dates completes
if(variable_name %in% c("date_N", "date_M", "date_D")){
tmp <- res %>%
mutate(mois = str_sub(valeurs, 5, 6),
jour = str_sub(valeurs, 7, 8)) %>%
filter(!(mois == "00" | jour == "00"))
# S'il reste des informations, on se base sur elles
if(nrow(tmp)>0) res <- tmp
}
res <-
res %>%
arrange(desc(poids)) %>%
slice(1) %>%
magrittr::extract2("valeurs")
}else{# Si pas de poids
table_freq <- sort(table(valeurs))
res <- names(table_freq)[1]
}
}else{
res <- NA
}
res
}# End of most_probable_value()
The simplifier_personne()
function, using individuals identified as the same person, simplifies the values of each variable to obtain only one output observation. The values of each variable are obtained using the most_probable_value()
function previously defined.
# df_corresp <- corresp ; un_groupe_num <- 2 ; weights <- TRUE
# rm(df_corresp, un_groupe_num, groupe_cour, individus_cour, nouvelles_valeurs, weight)
#' @weights: (logic) should the simplification consider the weights? (default: TRUE)
simplifier_personne <- function(df_corresp, un_groupe_num, weights = TRUE){
groupe_cour <- df_corresp[id_personne_num_groupe %in% un_groupe_num]
individus_cour <- individus_simple[id_personne_num %in% groupe_cour$id_personne_num]
weight <- sum(individus_cour$weight)
if(nrow(individus_cour) == 1){
nouvelles_valeurs <- individus_cour[, mget(c(variables_names))]
}else{
# Les valeurs probables pour les variables d'interet
nouvelles_valeurs <-
variables_names %>%
sapply(., most_probable_value, df = individus_cour, weights = weights) %>%
t() %>%
data.table(stringsAsFactors = FALSE)
}
cbind(id_personne_num = un_groupe_num, nouvelles_valeurs, weight = weight)
}# End of simplifier_personne()
A function to handle missing values when using the min()
function.
my_min <- function(x) ifelse( !all(is.na(x)), min(x, na.rm=T), NA)
We will retain only certain variables for individuals. The location text variables are crowded out here.
# Les vairables qui nous interessent
variables_names <- c("nom", "prenoms",
"sexe",
"lieu_N", "dept_N", "region_N", "pays_N", "lat_N", "long_N", "date_N",
"lieu_M", "dept_M", "region_M", "pays_M", "lat_M", "long_M", "date_M",
"lieu_D", "dept_D", "region_D", "pays_D", "lat_D", "long_D", "date_D",
"prenom")
It is then a question of loading the data in memory in the session. We start again with the data by department, focusing only on trees in which individuals born between 1800 and 1804 are found. We only show data processing for one department. It is easy to embed the code in a function and then deploy it across all departments.
departement <- liste_depts$dept[i_departement]
# L'ensemble des données pour ce département
N <- list.files(str_c("../data/individuals/migration/", departement, "/"), pattern = "*.rda", full.names = TRUE)
# Chargement
individus <-
pblapply(N, function(x){load(x) ; res}) %>%
rbindlist() %>%
unique()
Observations for which the location coordinates are 0.00
are missing, therefore to be interpreted as not available in R.
individus <-
individus %>%
mutate(long_N = ifelse(long_N == "0.00000", yes = NA, no = long_N),
long_M = ifelse(long_M == "0.00000", yes = NA, no = long_M),
long_D = ifelse(long_D == "0.00000", yes = NA, no = long_D)) %>%
mutate(lat_N = ifelse(lat_N == "0.00000", yes = NA, no = lat_N),
lat_M = ifelse(lat_M == "0.00000", yes = NA, no = lat_M),
lat_D = ifelse(lat_D == "0.00000", yes = NA, no = lat_D))
We will rely on digital identifiers for individuals. We create an database (individus
) listing all individuals and assigning them an identifier.
# Les individus sont identifies de maniere unique par la colonne
# id_personne. Il s'agit d'une chaine de caracteres qui peut etre assez longue
# On va utiliser un identifiant numerique, ce qui sera plus pratique.
individus <-
individus %>%
tbl_df() %>%
mutate(id_personne_num = row_number()) %>%
data.table()
We will create a variable containing the first names in a simplified way, the same for the names. We remove spaces and special characters.
individus <-
individus %>%
tbl_df() %>%
mutate(nom = stri_replace_all_regex(nom, "\\((.*?)\\)", "") %>%
stri_trans_general(., "Latin-ASCII") %>%
stri_replace_all_regex(., "'|\\(|\\)|[[:blank:]]|\\?|_|\\*", "") %>%
stri_trim(.) %>%
str_to_lower()) %>%
mutate(prenom =
str_replace_all(prenoms, "\\((.*?)\\)", "") %>%
stri_replace_all_regex(., "'|\\(|\\)|\\?|_|\\*|>|\\^|\\+|[[:digit:]]|\\.|<|>", "") %>%
str_trim() %>%
gsub(" .*$", "", .) %>%
stri_trans_general(., "Latin-ASCII") %>%
stri_replace_all_regex(., "-", "") %>%
stri_trim(.) %>%
str_to_lower()) %>%
data.table()
# Les individus uniquement
individus_simple <- copy(individus)
Next, we add information about each individual’s parents, so that we can identify individuals based on their parents. If Jeanne Michu in a user’s tree has Jean Michu and Irène Dupond as parents, and in another tree, we find a person born at the same time with parents with the same names, this allows us to bring not only Jeanne, but also Jean and Irène in the two trees to merge them.
Tables are actually prepared for the parents’ information.
# On va creer une base large avec pour chaque ligne,
# les informations sur les parents egalement.
# De fait, chaque ligne representera un triplet (enfant ; mere ; pere)
ind_meres <- match(individus$id_mere, individus$id_personne)
ind_peres <- match(individus$id_pere, individus$id_personne)
# Les meres
individus_meres <-
individus[ind_meres,] %>%
magrittr::set_colnames(str_c(colnames(individus), "_mere")) %>%
select(-sourcename_mere, -id_mere_mere, -id_pere_mere, -id_personne_mere)
# Les peres
individus_peres <-
individus[ind_peres,] %>%
magrittr::set_colnames(str_c(colnames(individus), "_pere")) %>%
select(-sourcename_pere, -id_mere_pere, -id_pere_pere, -id_personne_pere)
Then we add this information to the lines of the individuals.
# On ajoute les informations relatives aux parents
individus <-
individus %>%
cbind(individus_meres) %>%
cbind(individus_peres) %>%
select(-id_mere, -id_pere) %>%
tbl_df()
rm(individus_meres, individus_peres)
We delete observations for which there is an anomaly concerning the date of birth regarding them or their parents. If one of the parents has a child before the age of 13, we consider it an anomaly.
age_min_nenf <- 13
individus <-
individus %>%
mutate(date_N_annee = str_sub(date_N, 1, 4) %>% as.numeric(),
date_N_annee_mere = str_sub(date_N_mere, 1, 4) %>% as.numeric(),
date_N_annee_pere = str_sub(date_N_pere, 1, 4) %>% as.numeric()) %>%
mutate(anomalie_mere = date_N_annee <= (date_N_annee_mere+age_min_nenf),
anomalie_mere = ifelse(is.na(anomalie_mere), yes = FALSE, no = anomalie_mere)) %>%
mutate(anomalie_pere = date_N_annee <= (date_N_annee_pere+age_min_nenf),
anomalie_pere = ifelse(is.na(anomalie_pere), yes = FALSE, no = anomalie_pere)) %>%
mutate(anomalie = anomalie_mere + anomalie_pere) %>%
filter(anomalie == 0) %>%
select(-anomalie_mere, -anomalie_pere, -anomalie) %>%
data.table()
# On repercute sur la table des individus
individus_simple <- individus_simple[id_personne_num %in% individus$id_personne_num]
individus_simple <- individus_simple[, mget(c("id_personne_num", variables_names))]
We add a column indicating the weight of each observation, in anticipation of future simplifications.
individus$weight <- 1
individus_simple$weight <- 1
We create a register showing the relationships between individuals. This register indicates for each person identified by his numerical identifier, the respective numerical identifiers of his parents. It should be noted that this register will be updated later, and will prove useful in order to carry out the mergers between individuals.
registre <-
individus %>%
select(id_personne_num, id_personne_num_mere, id_personne_num_pere) %>%
tbl_df()
To begin, we will proceed to a very coarse skimming of the base, by removing the obvious duplicates, i.e. having :
Before merging duplicate individuals into single entities, we must identify them. The register becomes useful here. As soon as two individuals are identified as designating a single person, the smaller of the two numerical identifiers becomes the new identifier for both. The same applies for parents.
corresp_meres <-
individus[, list(id_personne,id_personne_num, nom, prenom, sexe, date_N, nom_mere, prenom_mere, sexe_mere, date_N_mere)] %>%
group_by(nom, prenom, sexe, date_N, nom_mere, prenom_mere, sexe_mere, date_N_mere) %>%
mutate(id_personne_num_groupe_mere = min(id_personne_num)) %>%
ungroup() %>%
select(id_personne, id_personne_num, id_personne_num_groupe_mere) %>%
data.table()
corresp_peres <-
individus[, list(id_personne, id_personne_num, nom, prenom, sexe, date_N, nom_pere, prenom_pere, sexe_pere, date_N_pere)] %>%
group_by(nom, prenom, sexe, date_N, nom_pere, prenom_pere, sexe_pere, date_N_pere) %>%
mutate(id_personne_num_groupe_pere = min(id_personne_num)) %>%
ungroup() %>%
select(id_personne, id_personne_num, id_personne_num_groupe_pere) %>%
data.table()
corresp <-
corresp_meres %>%
left_join(corresp_peres, by = c("id_personne_num", "id_personne")) %>%
data.table()
corresp <-
corresp %>%
# rowwise() %>%
mutate(id_personne_num_groupe = pmin(id_personne_num_groupe_mere, id_personne_num_groupe_pere, na.rm=T)) %>%
select(-id_personne_num_groupe_mere, -id_personne_num_groupe_pere) %>%
data.table()
corresp <- unique(corresp)
This first simplification may (and does) allow more individuals to be grouped together, depending on the registry. We then put an iterative procedure in place to identify possible mergers. As soon as there are no more possible groupings, the loop ends.
continuer <- TRUE
nb_iterations <- 1
while(continuer){
registre_new <-
registre %>%
tbl_df() %>%
left_join(corresp %>% select(-id_personne) %>% rename(id_personne_num_new = id_personne_num_groupe), by = "id_personne_num") %>%
left_join(
corresp %>% select(-id_personne) %>% rename(id_personne_num_mere = id_personne_num,
id_personne_num_mere_new = id_personne_num_groupe),
by = "id_personne_num_mere"
) %>%
left_join(
corresp %>% select(-id_personne) %>% rename(id_personne_num_pere = id_personne_num,
id_personne_num_pere_new = id_personne_num_groupe),
by = "id_personne_num_pere"
)
registre_new <-
registre_new %>%
mutate(same_id = id_personne_num == id_personne_num_new,
same_id_mother = id_personne_num_mere == id_personne_num_mere_new,
same_id_father = id_personne_num_pere == id_personne_num_pere_new) %>%
mutate(same_id = ifelse(is.na(same_id), yes = TRUE, no = same_id),
same_id_mother = ifelse(is.na(same_id_mother), yes = TRUE, no = same_id_mother),
same_id_father = ifelse(is.na(same_id_father), yes = TRUE, no = same_id_father))
continuer <- any(!registre_new$same_id | !registre_new$same_id_mother | !registre_new$same_id_father)
if(!continuer) next
# On a pu retrouver d'autres associations !
# Celles des parents d'un enfant.
# Pour les mères
corresp_meres <-
registre_new %>%
select(id_personne_num_new, id_personne_num_mere) %>%
unique() %>%
arrange(id_personne_num_new, id_personne_num_mere) %>%
filter(!is.na(id_personne_num_mere))
corresp_meres <- data.table(corresp_meres)
corresp_meres[, id_personne_num_mere_2 := my_min(id_personne_num_mere), by = id_personne_num_new]
corresp_meres <-
corresp_meres %>% select(-id_personne_num_new) %>%
rename(id_personne_num = id_personne_num_mere, id_personne_num_groupe = id_personne_num_mere_2) %>%
unique()
corresp_meres[, id_personne_num_groupe := my_min(id_personne_num_groupe), by = id_personne_num]
# Idem pour les pères
corresp_peres <-
registre_new %>%
select(id_personne_num_new, id_personne_num_pere) %>%
unique() %>%
arrange(id_personne_num_new, id_personne_num_pere) %>%
filter(!is.na(id_personne_num_pere))
corresp_peres <- data.table(corresp_peres)
corresp_peres[, id_personne_num_pere_2 := my_min(id_personne_num_pere), by = id_personne_num_new]
corresp_peres <-
corresp_peres %>% select(-id_personne_num_new) %>%
rename(id_personne_num = id_personne_num_pere, id_personne_num_groupe = id_personne_num_pere_2) %>%
unique()
corresp_peres[, id_personne_num_groupe := my_min(id_personne_num_groupe), by = id_personne_num]
corresp <-
corresp %>%
tbl_df() %>%
left_join(
corresp_meres %>%
rename(id_personne_num_groupe_new = id_personne_num_groupe),
by = "id_personne_num"
) %>%
tbl_df() %>%
mutate(id_personne_num_groupe = pmin(id_personne_num_groupe, id_personne_num_groupe_new, na.rm=T)) %>%
select(-id_personne_num_groupe_new) %>%
left_join(
corresp_peres %>%
rename(id_personne_num_groupe_new = id_personne_num_groupe),
by = "id_personne_num"
) %>%
mutate(id_personne_num_groupe = pmin(id_personne_num_groupe, id_personne_num_groupe_new, na.rm=T)) %>%
select(-id_personne_num_groupe_new) %>%
unique()
registre_new <-
registre_new %>%
mutate(id_personne_num = id_personne_num_new,
id_personne_num_mere = id_personne_num_mere_new,
id_personne_num_pere = id_personne_num_pere_new) %>%
select(-id_personne_num_new, -id_personne_num_mere_new, -id_personne_num_pere_new,
-same_id, -same_id_mother, -same_id_father)
registre_new <- data.table(registre_new)
registre_new <- registre_new[, list(
id_personne_num_mere = min(id_personne_num_mere, na.rm=T),
id_personne_num_pere = min(id_personne_num_pere, na.rm=T)), by = id_personne_num] %>%
unique()
registre_new$id_personne_num_mere[which(registre_new$id_personne_num_mere == Inf)] <- NA
registre_new$id_personne_num_pere[which(registre_new$id_personne_num_pere == Inf)] <- NA
# toto <- data.frame(id = c(1,1,2,3,3,4,3), val = c(1,0,2,NA, NA, 2,NA)) %>% data.table()
# toto[, list(test = pmin(val)), by = id]
#
#
# registre_new <-
# registre_new %>%
# tbl_df() %>%
# group_by(id_personne_num) %>%
# summarise(id_personne_num_mere = my_min(id_personne_num_mere),
# id_personne_num_pere = my_min(id_personne_num_pere)) %>%
# ungroup()
# registre_new <- unique(registre_new)
registre <- copy(registre_new)
nb_iterations <- nb_iterations+1
}# Fin de la boucle while
corresp <- data.table(corresp)
registre <- data.table(registre)
As the duplicate tracking is done, it remains to merge the duplicate information. We use the simplifier_personne()
function presented earlier to do this. If three Mrs Michu are spotted as the same person, and two of them have the following date of birth 1800-01-01
while the third has a different day, then, due to a higher frequency for the value 1800-01-01
, we will retain the latter as Mrs Michu’s date of birth.
We simplify according to the identical groups of people we have identified.
corresp_nb <-
corresp %>%
tbl_df() %>%
group_by(id_personne_num_groupe) %>%
tally()
It is not necessary to focus on singletons, which do not require any processing and would slow down code execution. We put them aside; they will be reinstated once the simplification is complete.
groupes <- corresp_nb %>% filter(n > 1) %>% magrittr::extract2("id_personne_num_groupe")
It remains to go through each of these groups in order to make the simplification. As there are many of them, we parallel the execution of the code.
library(parallel)
ncl <- detectCores()-1
(cl <- makeCluster(ncl))
invisible(clusterEvalQ(cl, library(dplyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(tidyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringi, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(data.table, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringdist, warn.conflicts=FALSE, quietly=TRUE)))
# Envoi des fonctions/tableaux
clusterExport(cl, c("most_probable_value", "variables_names", "individus_simple"), envir=environment())
individus_simple_enfants <- pblapply(groupes, simplifier_personne, df_corresp = corresp, cl = cl)
stopCluster(cl)
individus_simple_enfants <- rbindlist(individus_simple_enfants)
We add the singletons left out.
groupes_un <- corresp_nb %>% filter(n == 1) %>% magrittr::extract2("id_personne_num_groupe")
individus_simple_un <- individus_simple[id_personne_num %in% groupes_un]
individus_simple <- rbind(individus_simple_un, individus_simple_enfants)
rm(corresp_meres, corresp_peres, individus_simple_enfants, registre_new, groupes, groupes_un, individus_simple_un)
So we have three tables:
Some less obvious duplicates remain in the database. We proceed in the same way as for the first simplification, adding information about the parent and creating a register, then simplifying duplicates. In this second simplification, we take into account spelling errors in first and last names.
# On va creer une base large avec pour chaque ligne,
# les informations sur les parents egalement.
# De fait, chaque ligne representera un triplet (enfant ; mere ; pere)
individus_2 <-
copy(individus_simple)
# On rajoute les identifiants des parents
individus_2$id_personne_num_mere <- registre$id_personne_num_mere[match(individus_simple$id_personne_num, registre$id_personne_num)]
individus_2$id_personne_num_pere <- registre$id_personne_num_pere[match(individus_simple$id_personne_num, registre$id_personne_num)]
ind_meres <- match(individus_2$id_personne_num_mere, individus_2$id_personne_num)
ind_peres <- match(individus_2$id_personne_num_pere, individus_2$id_personne_num)
# Les meres
individus_meres <-
individus_2[ind_meres,] %>%
magrittr::set_colnames(str_c(colnames(individus_2), "_mere")) %>%
select(-id_personne_num_mere, -id_personne_num_mere_mere, -id_personne_num_pere_mere)
# Les peres
individus_peres <-
individus_2[ind_peres,] %>%
magrittr::set_colnames(str_c(colnames(individus_2), "_pere")) %>%
select(-id_personne_num_pere, -id_personne_num_mere_pere, -id_personne_num_pere_pere)
# On ajoute les informations relatives aux parents
individus_2 <-
individus_2 %>%
cbind(individus_meres) %>%
cbind(individus_peres) %>%
# select(-id_mere, -id_pere) %>%
tbl_df()
individus_2 <- data.table(individus_2)
individus_2$date_N_annee <- individus_2$date_N %>% str_sub(1,4) %>% as.numeric()
We identify duplicates by trying to take into account spelling errors in first and last names. To do this, we rely on the stringdist()
function of the package of the same name.
library(stringdisy)
#' calculer_distance
#' help function for ecremer()
# variable <- "nom"
calculer_distance <- function(personne_cour, candidats, variable, threshold){
stringdist(personne_cour[,get(variable)], candidats[,get(variable)], method = "cosine") < threshold
}# End of calculer_distance()
We use the ecremer()
function defined below to identify potential duplicates. We now consider that two observations with about the same first and last name, about the same mother’s or father’s name are potentially mergeable.
#' ecremer
#' Retourne un tableau de candidats possibles
#' @candidats: (data.frame) table dans laquelle chercher les candidats
ecremer <- function(candidats, threshold = 0.1, personne_cour, prenoms = FALSE){
# Ecremer par le nom
ind_nom <- which(calculer_distance(personne_cour = personne_cour, candidats = candidats, variable = "nom", threshold = threshold))
candidats <- candidats[ind_nom,]
if(nrow(candidats)>0){
# Par le prenom
if(prenoms == TRUE){
ind_prenom <- which(calculer_distance(personne_cour = personne_cour, candidats = candidats, variable = "prenoms", threshold = threshold))
}else{
ind_prenom <- which(calculer_distance(personne_cour = personne_cour, candidats = candidats, variable = "prenom", threshold = threshold))
}
candidats <- candidats[ind_prenom,]
if(nrow(candidats)>0 & !is.na(personne_cour$nom_mere)){
ind_mere_nom <- calculer_distance(personne_cour = personne_cour, candidats = candidats, variable = "nom_mere", threshold = threshold)
ind_mere_nom <- c(which(is.na(ind_mere_nom)), which(ind_mere_nom))
candidats <- candidats[ind_mere_nom,]
}# Fin if mere
if(nrow(candidats)>0 & !is.na(personne_cour$nom_pere)){
ind_pere_nom <- calculer_distance(personne_cour = personne_cour, candidats = candidats, variable = "nom_pere", threshold = threshold)
ind_pere_nom <- c(which(is.na(ind_pere_nom)), which(ind_pere_nom)) %>% sort()
candidats <- candidats[ind_pere_nom,]
}# Fin if pere
}# Fin du if
candidats
}# End of ecremer()
Finally, to be able to perform the tracking, we use the function trouver_corresp_indice_naissance()
which relies on the results obtained by the function ecremer()
to define if a merger can be performed between several individuals. However, in addition to having a similar name, people must be born in the same department in the same year.
#' trouver_corresp_indice_naissance
#' Pour un numero de ligne du tableau individus
#' retourne sous forme de table les correspondances entre individus.
#' Chaque ligne du resultat correspond a l'identifiant d'une personne
#' et au plus petit identifiant d'une correspondance trouvee dans la base.
trouver_corresp_indice_naissance <- function(i){
personne_cour <- individus_3[i, ]
resul <- data.table(id_personne_num = personne_cour$id_personne_num,
id_personne_num_groupe = personne_cour$id_personne_num)
# Les identifiants des parents
personne_cour_id_num_mere <- personne_cour$id_personne_num_mere
personne_cour_id_num_pere <- personne_cour$id_personne_num_pere
# Si les identifiants des deux parents sont manquants, on ne va pas plus loin
if(!(is.na(personne_cour_id_num_mere) & is.na(personne_cour_id_num_pere))){
# On a un identifiant pour: mere U pere
id_personne_num <- personne_cour$id_personne_num
# Il faut que les candidats soient de la même année
# Et du meme sexe
# Il faut que les candidats soient nes la meme annee et soient du meme sexe
ind <- which(individus_3$date_N_annee %in% personne_cour$date_N_annee &
individus_3$sexe %in% personne_cour$sexe)
candidats <- individus_3[ind]
# individus %>% filter(id_personne == "yvono@le garrec|jean|1") %>% View()
candidats <- candidats[!id_personne_num %in% personne_cour$id_personne_num]
candidats <- ecremer(candidats, personne_cour = personne_cour)
if(nrow(candidats)>0){
id_retenir <- c(id_personne_num, candidats$id_personne_num)
resul <- data.table(id_personne_num = id_retenir, id_personne_num_groupe = my_min(id_retenir))
}# Fin du if()
}# Fin du if sur les parents
# }# Fin du if sur la date de naissance
resul
}# End of trouver_corresp_indice_naissance()
We can use the functions previously introduced to identify individuals to merge. In order not to waste time in code execution for individuals being quickly identified as not being mergeable, we a priori identify these individuals and set them aside.
ind_ne_pas_faire <- which(is.na(individus_2$date_N_annee) |
is.na(individus_2$sexe) |
(is.na(individus_2$id_personne_num_mere) & is.na(individus_2$id_personne_num_pere)))
corresp_naissance_ne_pas_faire <-
individus_2[ind_ne_pas_faire,] %>%
select(id_personne_num) %>%
mutate(id_personne_num_groupe = id_personne_num) %>%
data.table()
It remains to execute the function trouver_corresp_indice_naissance()
for each remaining individual. Again, this execution is parallelized.
ind_faire <- seq(1, nrow(individus_2))
ind_faire <- ind_faire[-ind_ne_pas_faire]
library(parallel)
ncl <- detectCores()-1
(cl <- makeCluster(ncl))
invisible(clusterEvalQ(cl, library(dplyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(tidyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringi, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(data.table, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringdist, warn.conflicts=FALSE, quietly=TRUE)))
individus_3 <- individus_2[ind_faire]
clusterExport(cl, c("individus_3"), envir=environment())
clusterExport(cl, c("calculer_distance", "ecremer", "my_min"), envir=environment())
corresp_naissance <- pblapply(1:nrow(individus_3), trouver_corresp_indice_naissance, cl = cl)
stopCluster(cl)
corresp_naissance <-
corresp_naissance %>%
rbindlist()
corresp_naissance <- unique(corresp_naissance)
corresp_naissance <-
corresp_naissance %>%
bind_rows(corresp_naissance_ne_pas_faire) %>%
unique()
rm(ind_faire, individus_3)
We then update the correspondence table.
corresp_naissance <- corresp_naissance[, list(id_personne_num_groupe = min(id_personne_num_groupe, na.rm=T)), by = id_personne_num] %>% unique()
corresp <-
corresp %>%
left_join(corresp_naissance %>% rename(id_personne_num_groupe_new = id_personne_num_groupe)) %>%
mutate(id_personne_num_groupe = pmin(id_personne_num_groupe, id_personne_num_groupe_new, na.rm=T)) %>%
select(-id_personne_num_groupe_new) %>%
data.table()
Then, we look in this table, iteratively as during the first simplification operation, if it is not possible to carry out other reconciliations according to the related links.
continuer <- TRUE
nb_iterations <- 1
while(continuer){
registre_new <-
registre %>%
tbl_df() %>%
left_join(corresp %>% select(-id_personne) %>% rename(id_personne_num_new = id_personne_num_groupe), by = "id_personne_num") %>%
left_join(
corresp %>% select(-id_personne) %>% rename(id_personne_num_mere = id_personne_num,
id_personne_num_mere_new = id_personne_num_groupe),
by = "id_personne_num_mere"
) %>%
left_join(
corresp %>% select(-id_personne) %>% rename(id_personne_num_pere = id_personne_num,
id_personne_num_pere_new = id_personne_num_groupe),
by = "id_personne_num_pere"
)
registre_new <-
registre_new %>%
mutate(same_id = id_personne_num == id_personne_num_new,
same_id_mother = id_personne_num_mere == id_personne_num_mere_new,
same_id_father = id_personne_num_pere == id_personne_num_pere_new) %>%
mutate(same_id = ifelse(is.na(same_id), yes = TRUE, no = same_id),
same_id_mother = ifelse(is.na(same_id_mother), yes = TRUE, no = same_id_mother),
same_id_father = ifelse(is.na(same_id_father), yes = TRUE, no = same_id_father))
continuer <- any(!registre_new$same_id | !registre_new$same_id_mother | !registre_new$same_id_father)
if(!continuer) next
corresp_meres <-
registre_new %>%
select(id_personne_num_new, id_personne_num_mere) %>%
unique() %>%
arrange(id_personne_num_new, id_personne_num_mere) %>%
filter(!is.na(id_personne_num_mere))
corresp_meres <- data.table(corresp_meres)
corresp_meres[, id_personne_num_mere_2 := my_min(id_personne_num_mere), by = id_personne_num_new]
corresp_meres <-
corresp_meres %>% select(-id_personne_num_new) %>%
rename(id_personne_num = id_personne_num_mere, id_personne_num_groupe = id_personne_num_mere_2) %>%
unique()
corresp_meres[, id_personne_num_groupe := my_min(id_personne_num_groupe), by = id_personne_num]
corresp_peres <-
registre_new %>%
select(id_personne_num_new, id_personne_num_pere) %>%
unique() %>%
arrange(id_personne_num_new, id_personne_num_pere) %>%
filter(!is.na(id_personne_num_pere))
corresp_peres <- data.table(corresp_peres)
corresp_peres[, id_personne_num_pere_2 := my_min(id_personne_num_pere), by = id_personne_num_new]
corresp_peres <-
corresp_peres %>% select(-id_personne_num_new) %>%
rename(id_personne_num = id_personne_num_pere, id_personne_num_groupe = id_personne_num_pere_2) %>%
unique()
corresp_peres[, id_personne_num_groupe := my_min(id_personne_num_groupe), by = id_personne_num]
corresp <-
corresp %>%
tbl_df() %>%
left_join(
corresp_meres %>%
rename(id_personne_num_groupe_new = id_personne_num_groupe),
by = "id_personne_num"
) %>%
tbl_df() %>%
mutate(id_personne_num_groupe = pmin(id_personne_num_groupe, id_personne_num_groupe_new, na.rm=T)) %>%
select(-id_personne_num_groupe_new) %>%
left_join(
corresp_peres %>%
rename(id_personne_num_groupe_new = id_personne_num_groupe),
by = "id_personne_num"
) %>%
mutate(id_personne_num_groupe = pmin(id_personne_num_groupe, id_personne_num_groupe_new, na.rm=T)) %>%
select(-id_personne_num_groupe_new) %>%
unique()
registre_new <-
registre_new %>%
mutate(id_personne_num = id_personne_num_new,
id_personne_num_mere = id_personne_num_mere_new,
id_personne_num_pere = id_personne_num_pere_new) %>%
select(-id_personne_num_new, -id_personne_num_mere_new, -id_personne_num_pere_new,
-same_id, -same_id_mother, -same_id_father)
registre_new <- data.table(registre_new)
registre_new <- registre_new[, list(
id_personne_num_mere = min(id_personne_num_mere, na.rm=T),
id_personne_num_pere = min(id_personne_num_pere, na.rm=T)), by = id_personne_num] %>%
unique()
registre_new$id_personne_num_mere[which(registre_new$id_personne_num_mere == Inf)] <- NA
registre_new$id_personne_num_pere[which(registre_new$id_personne_num_pere == Inf)] <- NA
registre <- copy(registre_new)
nb_iterations <- nb_iterations+1
}# Fin de la boucle while
corresp <- data.table(corresp)
registre <- data.table(registre)
The identification of the new duplicates is done, it remains then to merge the duplicates, as at the time of the first fusion.
individus_simple <- individus_2[, mget(colnames(individus_simple))]
corresp_naissance_nb <-
corresp %>%
tbl_df() %>%
group_by(id_personne_num_groupe) %>%
tally()
groupes_naissance <- corresp_naissance_nb %>% filter(n > 1) %>% magrittr::extract2("id_personne_num_groupe")
library(parallel)
ncl <- detectCores()-1
(cl <- makeCluster(ncl))
invisible(clusterEvalQ(cl, library(dplyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(tidyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringi, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(data.table, warn.conflicts=FALSE, quietly=TRUE)))
clusterExport(cl, c("most_probable_value", "variables_names", "individus_simple"), envir=environment())
individus_simple_naissance <- pblapply(groupes_naissance, simplifier_personne, df_corresp = corresp, cl = cl, weights=TRUE)
individus_simple_naissance <- rbindlist(individus_simple_naissance)
stopCluster(cl)
We then add the individuals set aside previously.
groupes_un <- corresp_naissance_nb %>% filter(n == 1) %>% magrittr::extract2("id_personne_num_groupe")
individus_simple_naissance_un <- individus_simple[id_personne_num %in% groupes_un]
individus_simple_naissance <- rbind(individus_simple_naissance_un, individus_simple_naissance)
individus_simple <- individus_simple_naissance
The third simplification is similar in every way to the second, with the exception of the tracking step, where different conditions are used to indicate whether two individuals are designating the same person. We therefore present only the two functions that change at this stage, in order to facilitate reading.
This third simplification aims to correct information errors in the sex of individuals.
For two individuals to be designated identical, they must have at least one parent in common, be born, married or deceased in the same department. Again, their names must be about the same (we use the ecremer()
function again).
#' trouver_corresp_indice_sexe
#' Pour un numero de ligne du tableau individus
#' retourne sous forme de table les correspondances entre individus.
#' Chaque ligne du resultat correspond a l'identifiant d'une personne
#' et au plus petit identifiant d'une correspondance trouvee dans la base.
# i <- 199 ; i <- 32
trouver_corresp_indice_sexe <- function(i){
personne_cour <- individus_3[i, ]
resul <- data.table(id_personne_num = personne_cour$id_personne_num,
id_personne_num_groupe = personne_cour$id_personne_num)
# Les identifiants des parents
personne_cour_id_num_mere <- personne_cour$id_personne_num_mere
personne_cour_id_num_pere <- personne_cour$id_personne_num_pere
# Si les identifiants des deux parents sont manquants, on ne va pas plus loin
if(!(is.na(personne_cour_id_num_mere) & is.na(personne_cour_id_num_pere))){
# On a un identifiant pour: mere U pere
id_personne_num <- personne_cour$id_personne_num
# Il faut que les candidats soient du meme departement
if(!is.na(personne_cour$date_N)){
ind_N <- which(individus_3$date_N %in% personne_cour$date_N)
}else{
ind_N <- NULL
}
if(!is.na(personne_cour$date_M)){
ind_M <- which(individus_3$date_M %in% personne_cour$date_M)
}else{
ind_M <- NULL
}
if(!is.na(personne_cour$date_D)){
ind_D <- which(individus_3$date_D %in% personne_cour$date_D)
}else{
ind_D <- NULL
}
ind <- c(ind_N, ind_M, ind_D) %>% unique()
candidats <- individus_3[ind]
if(!is.na(personne_cour$dept_N)){
candidats <-
candidats[dept_N %in% personne_cour$dept_N]
}
candidats <- candidats[!id_personne_num %in% personne_cour$id_personne_num]
candidats <- ecremer(candidats, personne_cour = personne_cour, threshold = 0.02)
if(nrow(candidats)>0){
id_retenir <- c(id_personne_num, candidats$id_personne_num)
resul <- data.table(id_personne_num = id_retenir, id_personne_num_groupe = my_min(id_retenir))
}# Fin du if()
}# Fin du if sur les parents
# }# Fin du if sur la date de naissance
resul
}# End of trouver_corresp_indice_sexe()
The rest of the code is similar to the second simplification.
The fourth simplification aims to correct errors concerning dates. Some dates indicate a year, a month and a day. Others are not month or day specific, and instead of the number corresponding to the month or day, the value 00
is indicated. The present simplification attempts to take this aspect into account.
Again, the duplicate tracking function is the changing element here. We add as a constraint for the matching, the fact that the individuals must be born relatively close to each other (<5km) while remaining in the same department (if this last is indicated).
#' trouver_corresp_indice_00
#' Pour un numero de ligne du tableau individus
#' retourne sous forme de table les correspondances entre individus.
#' Chaque ligne du resultat correspond a l'identifiant d'une personne
#' et au plus petit identifiant d'une correspondance trouvee dans la base.
trouver_corresp_indice_00 <- function(i){
personne_cour <- individus_3[i, ]
resul <- data.table(id_personne_num = personne_cour$id_personne_num,
id_personne_num_groupe = personne_cour$id_personne_num)
# Si la date de naissance est entiere, on ne va pas chercher plus loin
if(personne_cour$date_N_mois == 0 | personne_cour$date_N_jour == 0){
# Les identifiants des parents
personne_cour_id_num_mere <- personne_cour$id_personne_num_mere
personne_cour_id_num_pere <- personne_cour$id_personne_num_pere
# Si les identifiants des deux parents sont manquants, on ne va pas plus loin
if(!(is.na(personne_cour_id_num_mere) & is.na(personne_cour_id_num_pere))){
# On a un identifiant pour: mere U pere
id_personne_num <- personne_cour$id_personne_num
# Il faut que les candidats soient de la même année et si possible du même département
candidats <- individus_3[date_N_annee %in% personne_cour$date_N_annee]
if(!is.na(personne_cour$dept_N)){
candidats <-
candidats[dept_N %in% personne_cour$dept_N]
}
candidats <- candidats[!id_personne_num %in% personne_cour$id_personne_num]
if(nrow(candidats)>0){
les_distances <-
geosphere::distGeo(c(as.numeric(personne_cour$long_N), as.numeric(personne_cour$lat_N)),
cbind(as.numeric(candidats$long_N), as.numeric(candidats$lat_N)))
candidats <- candidats[which(les_distances < 5000),]
candidats <- ecremer(candidats, personne_cour = personne_cour, threshold = 0.01, prenoms = TRUE)
}
if(nrow(candidats)>0){
id_retenir <- c(id_personne_num, candidats$id_personne_num)
resul <- data.table(id_personne_num = id_retenir, id_personne_num_groupe = my_min(id_retenir))
}# Fin du if()
}# Fin du if sur les parents
}
# }
resul
}# End of trouver_corresp_indice_00()
In this fifth simplification, we perform the merger by first names, using only the first name, and no longer the last name. The ecremer()
function is replaced by the following :
#' ecremer
#' Retourne un tableau de candidats possibles
#' @candidats: (data.frame) table dans laquelle chercher les candidats
ecremer_2 <- function(candidats, threshold = 0.1, personne_cour){
# Ecremer par le nom
ind_nom <- which(calculer_distance(personne_cour = personne_cour, candidats = candidats, variable = "nom", threshold = threshold))
candidats <- candidats[ind_nom,]
if(nrow(candidats)>0){
# Par le prenom
pren_cour <- personne_cour$prenom
ind_prenom <- which(str_detect(string = candidats$prenoms,
pattern = fixed(pren_cour, ignore_case = TRUE)))
candidats <- candidats[ind_prenom,]
if(nrow(candidats)>0 & !is.na(personne_cour$nom_mere)){
ind_mere_nom <- calculer_distance(personne_cour = personne_cour, candidats = candidats, variable = "nom_mere", threshold = threshold)
ind_mere_nom <- c(which(is.na(ind_mere_nom)), which(ind_mere_nom))
candidats <- candidats[ind_mere_nom,]
}# Fin if mere
if(nrow(candidats)>0 & !is.na(personne_cour$nom_pere)){
ind_pere_nom <- calculer_distance(personne_cour = personne_cour, candidats = candidats, variable = "nom_pere", threshold = threshold)
ind_pere_nom <- c(which(is.na(ind_pere_nom)), which(ind_pere_nom)) %>% sort()
candidats <- candidats[ind_pere_nom,]
}# Fin if pere
}# Fin du if
candidats
}# End of ecremer_2()
The tracking function is modified as follows. People must be born on the same day, not too far from each other (<5km), their parents must have close names.
#' trouver_corresp_indice_fifth
#' Pour un numero de ligne du tableau individus
#' retourne sous forme de table les correspondances entre individus.
#' Chaque ligne du resultat correspond a l'identifiant d'une personne
#' et au plus petit identifiant d'une correspondance trouvee dans la base.
trouver_corresp_indice_fifth <- function(i){
personne_cour <- individus_3[i, ]
resul <- data.table(id_personne_num = personne_cour$id_personne_num,
id_personne_num_groupe = personne_cour$id_personne_num)
id_personne_num <- personne_cour$id_personne_num
# Il faut que les candidats soient nes le meme jour
ind <- which(individus_3$date_N %in% personne_cour$date_N)
candidats <- individus_3[ind]
if(!is.na(personne_cour$dept_N)){
candidats <-
candidats[dept_N %in% personne_cour$dept_N]
}
candidats <- candidats[!id_personne_num %in% personne_cour$id_personne_num]
if(nrow(candidats)>0){
les_distances <-
geosphere::distGeo(c(as.numeric(personne_cour$long_N), as.numeric(personne_cour$lat_N)),
cbind(as.numeric(candidats$long_N), as.numeric(candidats$lat_N)))
candidats <- candidats[which(les_distances < 5000),]
candidats <- ecremer_2(candidats, personne_cour = personne_cour, threshold = 0.01)
}
if(nrow(candidats)>0){
id_retenir <- c(id_personne_num, candidats$id_personne_num)
resul <- data.table(id_personne_num = id_retenir, id_personne_num_groupe = my_min(id_retenir))
}# Fin du if()
# }
resul
}# End of trouver_corresp_indice_fifth()
The last simplification is based on brothers and sisters.
First we add information about the parents of individuals.
individus_2 <- individus_simple
individus_2$id_personne_num_mere <- registre$id_personne_num_mere[match(individus_simple$id_personne_num, registre$id_personne_num)]
individus_2$id_personne_num_pere <- registre$id_personne_num_pere[match(individus_simple$id_personne_num, registre$id_personne_num)]
We define the function trouver_enfants()
which, as its name indicates (French for find_children
), allows to find children identifiers for an individual located at the i
position of the individual_2
array.
trouver_enfants <- function(i){
id_cour <- individus_2$id_personne_num[i]
ind_enfants <- c(which(individus_2$id_personne_num_mere == id_cour),
which(individus_2$id_personne_num_pere == id_cour)) %>%
unique()
# ids_enfants <- individus_2$id_personne_num[ind_enfants] %>% str_c(., collapse = "|") %>%
# str_c("|", ., "|")
ids_enfants <- individus_2$id_personne_num[ind_enfants]
ids_enfants
}# Fin de trouver_enfants()
We apply this function to each observation of the database, by parallelizing the execution of the code.
library(parallel)
ncl <- detectCores()-1
(cl <- makeCluster(ncl))
invisible(clusterEvalQ(cl, library(dplyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(tidyr, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringi, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(data.table, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(stringdist, warn.conflicts=FALSE, quietly=TRUE)))
clusterExport(cl, c("individus_2"), envir=environment())
ids_enfants <- pbsapply(seq(1, nrow(individus_2)), trouver_enfants, cl = cl)
# stopCluster(cl)
Note that we do not close the connection to the clusters, we will reuse them a little further.
We add to each individual of the database the identifiers found for their children. Then, we recreate a variable for the first name of individuals, keeping only the first name. “Marie Chantal Jacqueline Michu” will have the short name “Marie”.
individus_2$ids_enfants <- ids_enfants
individus_2 <-
individus_2 %>%
mutate(date_N_annee = str_sub(date_N, 1, 4) %>% as.numeric(),
date_D_annee = str_sub(date_D, 1, 4) %>% as.numeric(),
prenom_court = str_replace_all(prenoms, "\\((.*?)\\)", ""),
prenom_court = stri_replace_all_regex(prenom_court, "'|\\(|\\)|\\?|_|\\*|>|\\^|\\+|[[:digit:]]|\\.|<|>", "") %>%
str_trim(),
prenom_court = gsub(" .*$", "", prenom_court),
prenom_court = stri_trans_general(prenom_court, "Latin-ASCII"),
prenom_court = str_extract(prenom_court, '\\w*') %>%
stri_trim() %>%
str_to_lower()) %>%
data.table()
We then create the same_siblings()
function which allows to find duplicate people, according to their brothers and sisters.
#' same_siblings
#' Retourne les freres/soeurs en doublons pour un individu de la base
#' i <- ind_faire[55]
#' i <- 2407
#' rm(individu,ids_parents, parents,ids_siblings,res,siblings,prenom_individu,prenom_individu,candidats, i)
same_siblings <- function(i){
individu <- individus_2[i, ]
# Rechercher si cette personne a des freres et soeurs
ids_parents <- c(individu$id_personne_num_mere, individu$id_personne_num_pere)
ids_parents <- ids_parents[!is.na(ids_parents)]
parents <- individus_2[id_personne_num %in% ids_parents]
ids_siblings <- unlist(parents$ids_enfants) %>% unique()
ids_siblings <- ids_siblings[-which(ids_siblings == individu$id_personne_num)]
res <- NULL
# Si la personne a des freres et soeurs
if(length(ids_siblings)>0){
siblings <- individus_2[id_personne_num %in% ids_siblings]
nom_individu <- individu$nom
prenom_court_individu <- individu$prenom_court
# Reperer parmi eux ceux qui ont le meme prenom
candidats <- siblings[prenom_court == prenom_court_individu & nom == nom_individu]
# S'il y a une dates de deces indiquee, elle doit correspondre (meme annee)
# Si la date est manquante, on emet l'hypothese qu'il peut toujours s'agir de la meme personne
if(nrow(candidats)>0){
annee_D_individu <- individu$date_D_annee
if(is.na(annee_D_individu)){
annee_D_potentielle <- NA
}else{
annee_D_potentielle <- c(NA, annee_D_individu)
}
candidats <- candidats[date_D_annee %in% annee_D_potentielle]
if(nrow(candidats)>0){
annee_N_individu <- individu$date_N_annee
if(is.na(annee_N_individu)){
annee_N_potentielle <- NA
}else{
annee_N_potentielle <- c(NA,annee_N_individu)
}
candidats <- candidats[date_N_annee %in% annee_N_potentielle]
# A present, il reste des freres ou soeurs avec le meme prenom et la meme annee de naissance et de deces
if(nrow(candidats)>0){
res <- c(individu$id_personne_num, candidats$id_personne_num)
}
}# Fin du if pour la date de naissance
}# Fin du if pour la date de deces
}# Fin de s'il y a des siblings
res
}# Fin de same_siblings()
All that remains is to apply this function, using clusters again, which we have not revoked.
ind_ne_pas_faire <- c(which(is.na(individus_2$id_personne_num_mere) & is.na(individus_2$id_personne_num_pere)))
ind_faire <- seq(1, nrow(individus_2))
ind_faire <- ind_faire[-ind_ne_pas_faire]
# Calcul en parallele
clusterExport(cl, c("individus_2"), envir=environment())
fusion_siblings <- pblapply(ind_faire, same_siblings, cl = cl)
stopCluster(cl)
# Il faut fusionner ces personnes
fusion_siblings <- fusion_siblings[which(!sapply(fusion_siblings, is.null))]
corresp_siblings <- lapply(fusion_siblings, function(x) data.table(id_personne_num = x, id_personne_num_groupe = min(x))) %>%
rbindlist() %>%
unique()
corresp_siblings <- corresp_siblings[, list(id_personne_num_groupe = min(id_personne_num_groupe, na.rm=T)), by = id_personne_num] %>% unique()
# On repercute ces nouvelles informations sur corresp
corresp <-
corresp %>%
left_join(corresp_siblings %>% rename(id_personne_num_groupe_new = id_personne_num_groupe)) %>%
mutate(id_personne_num_groupe = pmin(id_personne_num_groupe, id_personne_num_groupe_new, na.rm=T)) %>%
select(-id_personne_num_groupe_new) %>%
data.table()
Then it remains to loop on the register and apply the simplification function, as in the previous steps.
In the previous section, we grouped the trees by department. This new section is about pooling the departmental bases in order to create a French base.
Let’s load the list of departments we had created before and add a column with the department numbers.
load("liste_depts.rda")
liste_depts <-
liste_depts %>%
mutate(dep_num = str_sub(dept, 2))
We can add the name of the region, based on data contained in the package raster
.
map_fr_sp <- raster::getData(name = "GADM", country = "FRA", level = 2)
liste_depts <-
liste_depts %>%
left_join(
data.frame(dep_num = map_fr_sp$CCA_2, dep_name = map_fr_sp$NAME_2)
) %>%
tbl_df()
We then load the departmental genealogy data obtained in the previous step.
#' charger_corresp
#' Charger la table de correspondance, pour un département donné
#' @i_departement: (int) ligne de liste_depts à traiter
charger_corresp <- function(i_departement){
departement <- liste_depts$dept[i_departement]
load(str_c("../data/dept/migration/", departement, "/resul.rda"))
resul$corresp
}# Fin de charger_corresp()
# Charger les données
corresp <- pblapply(seq(1, nrow(liste_depts)), charger_corresp)
corresp <- bind_rows(corresp)
Individuals are identified using a numeric value, specific to each department, as well as a string composed of the amateur genealogist’s name, the individual’s surname and given names. While a person will be uniquely identified by his textual identifier, his numerical identifier may be different depending on the department. We indeed worked by department, by defining at this time an identifier for each individual present in the extract of the base. Now that we are bringing the departments together, we need to create a new numeric identifier. Take an example: the individual Pierre Dupond
present in the user tree batman36
has for mother a woman born in 1801 in Finistère and for father a man born in 1800 in Morbihan. Also, this individual received a numerical identifier when we processed Finistère, and another numerical identifier when we processed Morbihan. Its textual identifier is on the other hand identical in both subbases. We will thus assign a new numerical identifier to this Pierre Dupond, by relying on his textual identifier. We are doing this because we will again try to identify associations that could have escaped us on the subbases, and try to complete some information.
corresp_id <- data.frame(id_personne = unique(corresp$id_personne)) %>% tbl_df()
corresp_id <- corresp_id %>% mutate(id_personne_num_new = row_number())
Now that we have new identifiers for individuals, we can load departmental databases of individuals, update their numerical identifier, and join the databases. We do this by using a function that we apply to each department.
#' charger_individus_simple
#' Retourne une liste pour un département contenant :
#' - registre : le registre mis à jour
#' - individus_simple : les informations sur les individus
#' - corresp : le tableau de correspondances mis à jour (à chaque personne présente initialement dans la base, l'identifiant numérique correspondant)
#' - departement : le numéro de département
#' @i_departement: (int) ligne de liste_depts à consulter pour récupérer le numéro de département
charger_individus_simple <- function(i_departement){
departement <- liste_depts$dept[i_departement]
load(str_c("../data/dept/migration/", departement, "/resul.rda"))
# TABLE DE CORRESPONDANCE #
corresp <- resul$corresp %>% tbl_df()
# Rajout des identifiants sous forme de texte dans la table de correspondance
corresp_2 <-
corresp %>%
left_join(
corresp %>%
select(id_personne, id_personne_num) %>%
rename(id_personne_groupe = id_personne, id_personne_num_groupe = id_personne_num) %>%
unique(),
by = "id_personne_num_groupe"
)
corresp_2 <- corresp_2 %>% filter(!is.na(id_personne_groupe))
# Rajout des nouveaux identifiants numeriques
corresp_2 <-
corresp_2 %>% left_join(corresp_id %>% mutate(id_personne = as.character(id_personne)), by = "id_personne")
corresp_2 <-
corresp_2 %>%
left_join(
corresp_id %>% rename(id_personne_groupe = id_personne, id_personne_num_groupe_new = id_personne_num_new) %>%
mutate(id_personne_groupe = as.character(id_personne_groupe)),
by = "id_personne_groupe"
)
# INDIVIDUS #
individus_simple <- resul$individus_simple
individus_simple <- individus_simple[weight != 0,]
individus_simple <-
individus_simple %>% tbl_df() %>%
left_join(
corresp_2 %>%
select(id_personne_num, id_personne_num_new),
by = "id_personne_num"
) %>%
tbl_df()
# REGISTRE DES LIENS DE PARENTE #
registre <- resul$registre %>% data.table()
registre <- registre[id_personne_num %in% individus_simple$id_personne_num] %>%
tbl_df()
registre <-
registre %>%
left_join(corresp_2 %>% select(id_personne_num, id_personne_num_new), by = "id_personne_num") %>%
# Ajout mere
left_join(corresp_2 %>% select(id_personne_num, id_personne_num_new) %>%
rename(id_personne_num_mere = id_personne_num, id_personne_num_mere_new = id_personne_num_new),
by = "id_personne_num_mere") %>%
# Ajout pere
left_join(corresp_2 %>% select(id_personne_num, id_personne_num_new) %>%
rename(id_personne_num_pere = id_personne_num, id_personne_num_pere_new = id_personne_num_new),
by = "id_personne_num_pere")
registre <-
registre %>%
select(id_personne_num_new, id_personne_num_mere_new, id_personne_num_pere_new)
corresp_2 <- corresp_2 %>%
select(id_personne, id_personne_num_new, id_personne_num_groupe_new) %>%
filter(!is.na(id_personne_num_new))
registre <-
registre %>%
rename(id_personne_num = id_personne_num_new,
id_personne_num_mere = id_personne_num_mere_new,
id_personne_num_pere = id_personne_num_pere_new)
individus_simple <-
individus_simple %>%
select(-id_personne_num) %>%
rename(id_personne_num = id_personne_num_new)
corresp_2 <-
corresp_2 %>%
rename(id_personne_num = id_personne_num_new,
id_personne_num_groupe = id_personne_num_groupe_new)
list(registre = registre, individus_simple = individus_simple, corresp = corresp_2, departement = departement)
}# Fin de charger_individus_simple()
res <- pblapply(seq(1, nrow(liste_depts)), charger_individus_simple)
We can extract and attach tables of correspondence between them.
# Table des correspondances
corresp <- pblapply(res, function(x) x$corresp)
corresp <- corresp %>% rbindlist() %>% unique()
corresp <- corresp[, list(id_personne_num_groupe = min(id_personne_num_groupe, na.rm=T)), by = id_personne_num] %>% unique()
corresp_id <-
corresp_id %>%
rename(id_personne_num = id_personne_num_new) %>%
data.table()
corresp <- corresp_id[corresp, on = "id_personne_num"]
The same applies to the tables containing the information relating to each individual.
individus_simple <- pblapply(res, function(x) x$individus_simple)
individus_simple <- individus_simple %>% rbindlist() %>% unique()
individus_simple <- individus_simple[!is.na(id_personne_num)]
We are looking for obvious associations that can be made with these new data. To do this, we look at the textual identifier of individuals and see if we can count duplicate values.
corresp_2 <- corresp[, list(id_personne_num, id_personne_num_groupe)] %>%
rename(id_personne_num_groupe_new = id_personne_num_groupe,
id_personne_num_groupe = id_personne_num)
setkey(corresp, id_personne_num_groupe)
setkey(corresp_2, id_personne_num_groupe)
corresp <- corresp_2[corresp]
corresp[, id_personne_num_groupe := pmin(id_personne_num_groupe, id_personne_num_groupe_new, na.rm=T)]
corresp$id_personne_num_groupe_new <- NULL
rm(corresp_2)
Again we use a function to simplify the values, based on the observed frequency of the values.
#' most_probable_value
#' Find the most probable value for a variable
#' using the values found within all candidates
#' (excluding NAs)
#' @variable_name: (string) name of the variable
#' @df: (data.table)
#' @weights: (logic) should the simplification consider the weights? (default: FALSE)
#' variable_name <- "date_N"
most_probable_value <- function(df, variable_name, weights = FALSE){
valeurs <- df[[variable_name]]
# valeurs <- individu_cour_a_fusionner %>% magrittr::extract2(variable_name)
if(!all(is.na(valeurs))){
if(weights){
poids <- df[["weight"]]
res <- data.frame(valeurs, poids, stringsAsFactors = FALSE) %>%
group_by(valeurs) %>%
summarise(poids = sum(poids)) %>%
ungroup()
# Si la variable est une date
# on va privilegier les informatins relatives aux dates completes
if(variable_name %in% c("date_N", "date_M", "date_D")){
tmp <- res %>%
mutate(mois = str_sub(valeurs, 5, 6),
jour = str_sub(valeurs, 7, 8)) %>%
filter(!(mois == "00" | jour == "00"))
# S'il reste des informations, on se base sur elles
if(nrow(tmp)>0) res <- tmp
}
res <-
res %>%
arrange(desc(poids)) %>%
slice(1) %>%
magrittr::extract2("valeurs")
}else{
table_freq <- sort(table(valeurs))
res <- names(table_freq)[1]
}
}else{
res <- NA
}
res
}# End of most_probable_value()
#' simplifier_personne
#' À partir d'un identifiant numérique, retourne un tableau à une seule ligne contenant les valeurs de chaque variable relative à cette personne.
#' @weights: (logic) should the simplification consider the weights? (default: FALSE)
simplifier_personne <- function(un_groupe_num, weights = TRUE){
groupe_cour <- df_corresp[id_personne_num_groupe %in% un_groupe_num]
individus_cour <- df_individus_simple[id_personne_num %in% groupe_cour$id_personne_num]
weight <- sum(individus_cour$weight)
if(nrow(individus_cour) == 1){
nouvelles_valeurs <- individus_cour[, mget(c(variables_names))]
}else{
# Les valeurs probables pour les variables d'interet
nouvelles_valeurs <-
variables_names %>%
sapply(., most_probable_value, df = individus_cour, weights = weights) %>%
t() %>%
data.table(stringsAsFactors = FALSE)
}
cbind(id_personne_num = un_groupe_num, nouvelles_valeurs, weight = weight)
}# End of simplifier_personne()
Again we use the my_min()
function, which returns the value NA
rather than Inf
when all compared values are missing.
my_min <- function(x) ifelse( !all(is.na(x)), min(x, na.rm=T), NA)
Les variables que nous souhaitons extraire sont les suivantes :
# Les variables qui nous interessent
variables_names <- c("nom", "prenoms",
"sexe",
"lieu_N", "dept_N", "region_N", "pays_N", "lat_N", "long_N", "date_N",
"lieu_M", "dept_M", "region_M", "pays_M", "lat_M", "long_M", "date_M",
"lieu_D", "dept_D", "region_D", "pays_D", "lat_D", "long_D", "date_D",
"prenom")
Then, we simplify the individuals, in the same way as we did in the previous section.
All that remains is to save the result:
res <- list(nb_obs = nb_obs,
corresp = corresp,
registre = registre,
individus_simple = individus_simple)
save(res, file = "migration_1800_04_dep.rda")
The previous steps made it possible to obtain a genealogy database containing information on individuals born in France between 1800 and 1804, their parents and their descendants. We can use this data to create a new basis for studying generations. This section gives the steps used for this purpose and presents the codes used to generate the graphs.
We start by loading some necessary packages.
library(tidyverse)
library(lubridate)
library(stringr)
library(stringi)
library(data.table)
library(pbapply)
library(dplyrExtras)
library(stringdist)
library(sp)
We also load the list of departments and add their names.
load("liste_depts.rda")
liste_depts <-
liste_depts %>%
mutate(dep_num = str_sub(dept, 2))
map_fr_sp <- raster::getData(name = "GADM", country = "FRA", level = 2)
liste_depts <-
liste_depts %>%
left_join(
data.frame(dep_num = map_fr_sp$CCA_2, dep_name = map_fr_sp$NAME_2)
) %>%
tbl_df()
We recover the genealogy data obtained at the end of the previous step.
load("migration_1800_04_dep.rda")
registre <- res$registre
individus_simple <- res$individus_simple
individus_2 <- individus_simple
From the identifiers of the parents present in each row of the individual data table, we can add to each individual the information relating to their parents, if they exist.
# On rajoute les identifiants des parents
individus_2$id_personne_num_mere <- registre$id_personne_num_mere[match(individus_simple$id_personne_num, registre$id_personne_num)]
individus_2$id_personne_num_pere <- registre$id_personne_num_pere[match(individus_simple$id_personne_num, registre$id_personne_num)]
ind_meres <- match(individus_2$id_personne_num_mere, individus_2$id_personne_num)
ind_peres <- match(individus_2$id_personne_num_pere, individus_2$id_personne_num)
# Les meres
individus_meres <-
individus_2[ind_meres,] %>%
magrittr::set_colnames(str_c(colnames(individus_2), "_mere")) %>%
select(-id_personne_num_mere, -id_personne_num_mere_mere, -id_personne_num_pere_mere)
# Les peres
individus_peres <-
individus_2[ind_peres,] %>%
magrittr::set_colnames(str_c(colnames(individus_2), "_pere")) %>%
select(-id_personne_num_pere, -id_personne_num_mere_pere, -id_personne_num_pere_pere)
# On ajoute les informations relatives aux parents
individus_2 <-
individus_2 %>%
cbind(individus_meres) %>%
cbind(individus_peres) %>%
# select(-id_mere, -id_pere) %>%
data.table()
Let us focus on individuals born between 1680 and 2017.
# On retire les individus nes apres 2017 et avant 1680
individus_2 <- individus_2[date_N_annee > 1680 & date_N_annee <= 2017]
There are observations with outliers. We’ll identify them and remove them.
One source of abnormality is the age at which a person can have a child. If we found that a child was born before one of his parents reached the age of 13, we consider it to be an input error (or a merger error, which is more problematic). We consider that an individual is able to reproduce between the ages of 13 and 70.
# On retire les anomalies concernant les dates de naissances
age_min_nenf <- 13
age_max_enf <- 70
individus_2 <-
individus_2 %>%
mutate(date_N_annee = str_sub(date_N, 1, 4) %>% as.numeric(),
date_D_annee = str_sub(date_D, 1, 4) %>% as.numeric(),
date_N_annee_mere = str_sub(date_N_mere, 1, 4) %>% as.numeric(),
date_N_annee_pere = str_sub(date_N_pere, 1, 4) %>% as.numeric()) %>%
# Anomalie si la mere a eu son enfant a moins de 13 ans ou a plus de 70 ans
mutate(anomalie_mere = (date_N_annee <= (date_N_annee_mere+age_min_nenf)) | (date_N_annee >= (date_N_annee_mere + age_max_enf)),
anomalie_mere = ifelse(is.na(anomalie_mere), yes = FALSE, no = anomalie_mere)) %>%
# Idem pour le pere
mutate(anomalie_pere = (date_N_annee <= (date_N_annee_pere+age_min_nenf)) | (date_N_annee >= (date_N_annee_pere + age_max_enf)),
anomalie_pere = ifelse(is.na(anomalie_pere), yes = FALSE, no = anomalie_pere)) %>%
mutate(anomalie = anomalie_mere + anomalie_pere) %>%
filter(anomalie == 0) %>%
select(-anomalie_mere, -anomalie_pere, -anomalie) %>%
data.table()
We remove observations for which negative ages or older than 122 years are reported.
individus_2 <-
individus_2 %>%
# mutate(age = interval(ymd(date_N), ymd(date_D)) / years(1)) %>%
mutate(age = date_D_annee - date_N_annee) %>%
mutate(keep = age >= 0 & age <= 122,
keep = ifelse(is.na(keep), yes = TRUE, no = keep)) %>%
filter(keep == TRUE) %>%
select(-keep)
We remove observations for which the year of death occurs before this year of birth.
individus_2 <-
individus_2 %>%
mutate(keep = (str_sub(date_D, 1, 4) %>% as.numeric) >= (str_sub(date_N, 1, 4) %>% as.numeric),
keep = ifelse(is.na(keep), yes = TRUE, no = keep)) %>%
filter(keep == TRUE) %>%
select(-keep)
Let us filter the individuals in the database individus
by retaining only those that were obtained in individus_2
.
individus_simple <-
individus_simple[id_personne_num %in% individus_2$id_personne_num]
We can now construct the picture of generations, starting with the individuals at the origin of the study, i.e. those born between 1800 and 1804, in France.
gen_0 <- individus_2[date_N_annee %in% seq(1800, 1804)]
gen_0 <- registre[id_personne_num %in% gen_0$id_personne_num]
We will build this table generation by generation. For this purpose, we define a table containing the current generation, which will be modified when we look at children, grandchildren, etc. We have to initialize it.
gen_n <- copy(gen_0)
gen_n$generation <- 0
We remove individuals of the current generation from our table of individuals.
gen_restants <- registre[id_personne_num %in% individus_2$id_personne_num]
gen_restants <- gen_restants[!id_personne_num %in% gen_n$id_personne_num]
We initialize our generation table, composed for the moment of individuals of the current generation, i.e., generation 0.
generations <-
data.frame(id_personne_num = gen_0$id_personne_num) %>%
rename(id_aieul = id_personne_num) %>%
data.table()
Let us now identify the children of the current generation.
peres_potentiels <-
gen_restants %>%
select(id_personne_num, id_personne_num_pere) %>%
rename(id_aieul = id_personne_num_pere, id_enfant = id_personne_num) %>%
filter(!is.na(id_aieul))
meres_potentiels <-
gen_restants %>%
select(id_personne_num, id_personne_num_mere) %>%
rename(id_aieul = id_personne_num_mere, id_enfant = id_personne_num) %>%
filter(!is.na(id_aieul))
parents_potentiels <-
peres_potentiels %>%
rbind(meres_potentiels)
setkey(parents_potentiels, id_aieul)
setkey(generations, id_aieul)
generations <- parents_potentiels[generations]
Then the grandchildren.
peres_potentiels <-
gen_restants %>%
select(id_personne_num, id_personne_num_pere) %>%
rename(id_enfant = id_personne_num_pere, id_p_enfant = id_personne_num) %>%
filter(!is.na(id_enfant))
meres_potentiels <-
gen_restants %>%
select(id_personne_num, id_personne_num_mere) %>%
rename(id_enfant = id_personne_num_mere, id_p_enfant = id_personne_num) %>%
filter(!is.na(id_enfant))
parents_potentiels <-
peres_potentiels %>%
rbind(meres_potentiels)
setkey(parents_potentiels, id_enfant)
setkey(generations, id_enfant)
generations <- parents_potentiels[generations]
And finally, the great-grandchildren.
peres_potentiels <-
gen_restants %>%
select(id_personne_num, id_personne_num_pere) %>%
rename(id_p_enfant = id_personne_num_pere, id_ap_enfant = id_personne_num) %>%
filter(!is.na(id_p_enfant))
meres_potentiels <-
gen_restants %>%
select(id_personne_num, id_personne_num_mere) %>%
rename(id_p_enfant = id_personne_num_mere, id_ap_enfant = id_personne_num) %>%
filter(!is.na(id_p_enfant))
parents_potentiels <-
peres_potentiels %>%
rbind(meres_potentiels)
setkey(parents_potentiels, id_p_enfant)
setkey(generations, id_p_enfant)
generations <- parents_potentiels[generations]
We need to identify in which department some points identified by coordinates fall (longitude, latitude). For this, a simple way is to use the inside.gpc.poly()
function of the package surveillance
. However, it is necessary to provide a polygon of the class gpc.poly()
. For the time being, we have data to plot the departments in the form of data tables. We actually convert these arrays to gpc.poly()
objects.
library(rgeos)
#' group_to_gpc_poly
#' Returns the polygon from a group within the data frame containing
#' the coordinates of regions
#' @df: data frame with the coordinates of the polygons
#' @group: (string) group ID
group_to_gpc_poly <- function(df, group){
group_coords <- df[df$group == group, c("long", "lat")]
as(group_coords, "gpc.poly")
}
#' departement_polygone
#' Returns a French region as a gpc.poly object
#' un_dep: (string) code of the region
departement_polygone <- function(un_dep){
dep_tmp <-
france_1800 %>%
filter(departement == un_dep)
df_tmp_polys <- lapply(unique(dep_tmp$group), group_to_gpc_poly, df = dep_tmp)
df_tmp_polys_tot <- df_tmp_polys[[1]]
if(length(df_tmp_polys)>1){
for(i in 2:length(df_tmp_polys)){
df_tmp_polys_tot <- gpclib::union(df_tmp_polys_tot, df_tmp_polys[[i]])
}
}
df_tmp_polys_tot
}# End of departement_polygone()
france_1800_gpc <- lapply(unique(france_1800$departement), departement_polygone)
names(france_1800_gpc) <- unique(france_1800$departement)
save(france_1800_gpc, file = "france_1800_gpc.rda")
Now that we have the departments as gpc.poly
, we can create a function to indicate if a point (lon,lat) is within a given department.
est_dans_poly_naissance <- function(longitude, latitude, departement){
surveillance::inside.gpc.poly(x = longitude,
y = latitude,
departement,
mode.checked = FALSE)
}
To locate the birth department of individuals, we convert the coordinates into numerical values. They are currently in strings.
individus_simple_reste <- individus_simple
individus_simple_reste <- individus_simple_reste[!is.na(lat_N)]
individus_simple_reste <-
individus_simple_reste %>%
mutate(long_N = as.numeric(long_N),
lat_N = as.numeric(lat_N))
individus_simple_reste <- data.table(individus_simple_reste)
We will loop the departments to determine, at each iteration, which individuals were born in that department. Since we have a large number of individuals and the French territory is relatively large at the departmental level, we filter our data roughly every round. For a given department, we get the coordinates of a rectangle containing it, using the get.bbox()
function. We then filter the individuals according to this rectangle, to exclude those whose coordinates are outside. Then the function est_dans_poly_naissance()
has to be applied to the filtered points, which are much less numerous than originally.
dept_n_indiv <- rep(list(NA), length(france_1800_gpc))
pb <- txtProgressBar(min = 1, max = length(france_1800_gpc), style = 3)
for(i in 1:length(france_1800_gpc)){
poly <- france_1800_gpc[[i]]
nom_dep <- names(france_1800_gpc)[i]
individus_simple_reste_2 <-
individus_simple_reste %>%
select(long_N, lat_N, id_personne_num) %>%
filter(long_N >= get.bbox(poly)$x[1],
long_N <= get.bbox(poly)$x[2],
lat_N >= get.bbox(poly)$y[1],
lat_N <= get.bbox(poly)$y[2]) %>%
rowwise() %>%
mutate(in_poly = est_dans_poly_naissance(long_N, lat_N, poly)) %>%
select(id_personne_num, in_poly) %>%
mutate(dept_N = nom_dep) %>%
filter(in_poly == TRUE) %>%
select(-in_poly)
individus_simple_reste <-
individus_simple_reste[! id_personne_num %in% individus_simple_reste_2$id_personne_num]
dept_n_indiv[[i]] <- individus_simple_reste_2
setTxtProgressBar(pb, i)
rm(individus_simple_reste_2)
# On appelle le garbage collector toutes les 10 itérations
if(i %% 10 == 0) gc()
}
dept_n_indiv <-
dept_n_indiv %>%
rbindlist()
It then remains to update individuals_simple
to reflect the information obtained on the birth department of individuals.
dept_n_indiv <- dept_n_indiv %>% rename(dept_N_2 = dept_N)
setkey(dept_n_indiv, id_personne_num)
setkey(individus_simple, id_personne_num)
individus_simple <- dept_n_indiv[individus_simple]
head(individus_simple)
individus_simple <-
individus_simple %>%
mutate(dept_N = ifelse(!is.na(dept_N_2), yes = dept_N_2, no = dept_N))
individus_simple <- data.table(individus_simple)
We can recalculate the age of individuals.
individus_simple[, date_N_annee := as.numeric(str_sub(date_N, 1, 4))]
individus_simple[, date_D_annee := as.numeric(str_sub(date_D, 1, 4))]
individus_simple[, age := as.numeric(date_D_annee) - as.numeric(date_N_annee)]
For our generation table, each line must allow to follow a descendant. Each line must contain the information of a person and his ancestors up to that born between 1800 and 1804.
var_recup <- c("id_personne_num", "long_N", "lat_N", "sexe", "date_N", "date_D", "date_M", "dept_N", "date_N_annee", "date_D_annee", "age")
infos_aieul <-
individus_simple %>%
s_select(var_recup) %>%
magrittr::set_colnames(str_c(var_recup, "_aieul")) %>%
rename(id_aieul = id_personne_num_aieul)
infos_enfants <-
individus_simple %>%
s_select(var_recup) %>%
magrittr::set_colnames(str_c(var_recup, "_enfants")) %>%
rename(id_enfant = id_personne_num_enfants)
infos_p_enfant <-
individus_simple %>%
s_select(var_recup) %>%
magrittr::set_colnames(str_c(var_recup, "_p_enfants")) %>%
rename(id_p_enfant = id_personne_num_p_enfants)
infos_ap_enfant <-
individus_simple %>%
s_select(var_recup) %>%
magrittr::set_colnames(str_c(var_recup, "_ap_enfants")) %>%
rename(id_ap_enfant = id_personne_num_ap_enfants)
setkey(infos_aieul, id_aieul)
setkey(infos_enfants, id_enfant)
setkey(infos_p_enfant, id_p_enfant)
setkey(infos_ap_enfant, id_ap_enfant)
setkey(generations, id_aieul)
generations <- infos_aieul[generations]
setkey(generations, id_enfant)
generations <- infos_enfants[generations]
setkey(generations, id_p_enfant)
generations <- infos_p_enfant[generations]
setkey(generations, id_ap_enfant)
generations <- infos_ap_enfant[generations]
rm(infos_aieul, infos_enfants, infos_p_enfant, infos_ap_enfant)
generations
Based on the GPS coordinates provided, we are able to calculate the distance between a child and his or her birthplace. First, let’s pass all the coordinate variables in numerical values.
library(geosphere)
generations <-
generations %>%
mutate(long_N_aieul = as.numeric(long_N_aieul),
lat_N_aieul = as.numeric(lat_N_aieul),
long_N_enfants = as.numeric(long_N_enfants),
lat_N_enfants = as.numeric(lat_N_enfants),
long_N_p_enfants = as.numeric(long_N_p_enfants),
lat_N_p_enfants = as.numeric(lat_N_p_enfants),
long_N_ap_enfants = as.numeric(long_N_ap_enfants),
lat_N_ap_enfants = as.numeric(lat_N_ap_enfants))
generations <- data.table(generations)
We will use the distm()
function of the package geosphere
to calculate the distances (in meters) between two points.
calcul_distance <- function(long_1, lat_1, long_2, lat_2){
distm(c(long_1, lat_1), c(long_2, lat_2), fun = distHaversine) %>% "["(1)
}
For each row of the generation table, we calculate the following three distances :
dist_enfants
: distance between a child and an ancestor of 1800 ;dist_p_enfants
: distance between a grandchild and the grandfather of 1800 ;dist_ap_enfants
: distance between a great-grandchild and the grandfather of 1800.distances_l <-
pblapply(1:nrow(generations), function(y){
x <- generations[y,]
dist_enfants = calcul_distance(x$long_N_aieul, x$lat_N_aieul, x$long_N_enfants, x$lat_N_enfants)
dist_p_enfants = calcul_distance(x$long_N_aieul, x$lat_N_aieul, x$long_N_p_enfants, x$lat_N_p_enfants)
dist_ap_enfants = calcul_distance(x$long_N_aieul, x$lat_N_aieul, x$long_N_ap_enfants, x$lat_N_ap_enfants)
data.frame(dist_enfants = dist_enfants,
dist_p_enfants = dist_p_enfants,
dist_ap_enfants = dist_ap_enfants)
}) %>%
rbindlist() %>%
data.table()
It remains to add this information to the generation
table.
generations <-
generations %>%
cbind(distances_l)
We can convert these distances into kilometres.
generations <-
generations %>%
mutate(dist_enfants_km = dist_enfants / 1000,
dist_p_enfants_km = dist_p_enfants / 1000,
dist_ap_enfants_km = dist_ap_enfants / 1000)
Let us group some departments together, to reflect the approximate state of the departments in the 19th century.
liste_depts <-
liste_depts %>%
mutate(dept2 = dept,
dept2 = ifelse(dept2 %in% c("F78", "F91", "F92", "F95"), yes = "F78", no = dept2),
dept2 = ifelse(dept2 %in% c("F93", "F94", "F77"), yes = "F77", no = dept2)
)
Let us do the same on a generational basis.
generations <-
generations %>%
mutate(dept_N_aieul = ifelse(dept_N_aieul %in% c("F78", "F91", "F92", "F95"), yes = "F78", no = dept_N_aieul),
dept_N_enfants = ifelse(dept_N_enfants %in% c("F78", "F91", "F92", "F95"), yes = "F78", no = dept_N_enfants),
dept_N_p_enfants = ifelse(dept_N_p_enfants %in% c("F78", "F91", "F92", "F95"), yes = "F78", no = dept_N_p_enfants),
dept_N_ap_enfants = ifelse(dept_N_ap_enfants %in% c("F78", "F91", "F92", "F95"), yes = "F78", no = dept_N_ap_enfants),
#
dept_N_aieul = ifelse(dept_N_aieul %in% c("F93", "F94", "F77"), yes = "F77", no = dept_N_aieul),
dept_N_enfants = ifelse(dept_N_enfants %in% c("F93", "F94", "F77"), yes = "F77", no = dept_N_enfants),
dept_N_p_enfants = ifelse(dept_N_p_enfants %in% c("F93", "F94", "F77"), yes = "F77", no = dept_N_p_enfants),
dept_N_ap_enfants = ifelse(dept_N_ap_enfants %in% c("F93", "F94", "F77"), yes = "F77", no = dept_N_ap_enfants)
)
generations <- data.table(generations)
We can extract information related to adelphias (siblings and sororities).
The number of children per generation 1 woman (children).
nb_enf_aieux <-
generations %>%
filter(sexe_aieul == 2) %>%
mutate(keep = TRUE) %>%
mutate(keep = ifelse(age_aieul < 13, yes = FALSE, no = keep)) %>%
mutate(keep = ifelse(is.na(dept_M_aieul), yes = FALSE, no = keep)) %>%
filter(keep) %>%
select(id_aieul, id_enfant, dept_N_aieul) %>%
unique() %>%
mutate(nb_enf = ifelse(is.na(id_enfant), yes = 0, no = 1)) %>%
group_by(id_aieul, dept_N_aieul) %>%
summarise(nb_enf = sum(nb_enf))
summary(nb_enf_aieux$nb_enf)
The number of children per generation 2 woman (grandchildren).
nb_enf_enfants <-
generations %>%
filter(sexe_enfants == 2) %>%
mutate(keep = TRUE) %>%
mutate(keep = ifelse(age_enfants < 13, yes = FALSE, no = keep)) %>%
mutate(keep = ifelse(is.na(dept_M_enfant), yes = FALSE, no = keep)) %>%
filter(keep) %>%
select(id_enfant, id_p_enfant, dept_N_p_enfants) %>%
unique() %>%
mutate(nb_enf = ifelse(is.na(id_p_enfant), yes = 0, no = 1)) %>%
group_by(id_enfant, dept_N_p_enfants) %>%
summarise(nb_enf = sum(nb_enf))
summary(nb_enf_enfants$nb_enf)
The number of children per generation 3 woman (rear grandchildren).
nb_enf_p_enfants <-
generations %>%
filter(sexe_p_enfants == 2) %>%
mutate(keep = TRUE) %>%
mutate(keep = ifelse(age_p_enfants < 13, yes = FALSE, no = keep)) %>%
mutate(keep = ifelse(is.na(dept_M_p_enfant), yes = FALSE, no = keep)) %>%
filter(keep) %>%
select(id_p_enfant, id_ap_enfant, dept_N_ap_enfants) %>%
unique() %>%
mutate(nb_enf = ifelse(is.na(id_ap_enfant), yes = 0, no = 1)) %>%
group_by(id_p_enfant, dept_N_ap_enfants) %>%
summarise(nb_enf = sum(nb_enf))
summary(nb_enf_p_enfants$nb_enf)
We are interested in the study of the mobility of individuals between generations. In fact, we must focus on individuals who are able to move around. We place an arbitrary lower age limit on our individuals, 16 years. We hypothesize that individuals will not change villages before the age of 16.
generations_mobilite <-
generations %>%
filter(age_aieul >= 16,
age_enfants >= 16,
age_p_enfants >= 16,
age_ap_enfants >= 16) %>%
data.table()
Finally, we save the two databases generations
and generations_mobilite
.
save(generations, file = "generations_migration.rda")
save(generations_mobilite, file = "generations_mobilite.rda")
Now, we have a cleaned base, although it surely still contains duplicates that the executions of our algorithms could not detect. Then we can explore it. In this section we are interested in some comparisons of descriptive statistics of genealogy data with some results from the literature.
A first element to characterize our sample is its size. We look at the number of births per year by differentiating generations. Then we compare the number of births in the initial sample with official data to get an idea of the representativeness of the generation behind our study.
We load some packages, as well as genealogy data.
library(tidyverse)
library(stringr)
library(data.table)
library(pbapply)
library(dplyrExtras)
library(sp)
load("generations_migration.rda")
For each generation, we calculate the number of births per year.
nb_nais_aieux <-
generations %>%
tbl_df() %>%
select(id_aieul, date_N_aieul) %>%
unique() %>%
mutate(date_N_aieul_annee = str_sub(date_N_aieul, 1, 4) %>% as.numeric()) %>%
group_by(date_N_aieul_annee) %>%
tally() %>%
arrange(date_N_aieul_annee)
nb_nais_enfants <-
generations %>%
tbl_df() %>%
select(id_enfant, date_N_enfants) %>%
unique() %>%
mutate(date_N_enfants_annee = str_sub(date_N_enfants, 1, 4) %>% as.numeric()) %>%
group_by(date_N_enfants_annee) %>%
tally() %>%
arrange(date_N_enfants_annee)
nb_nais_p_enfants <-
generations %>%
tbl_df() %>%
select(id_p_enfant, date_N_p_enfants) %>%
unique() %>%
mutate(date_N_p_enfants_annee = str_sub(date_N_p_enfants, 1, 4) %>% as.numeric()) %>%
group_by(date_N_p_enfants_annee) %>%
tally() %>%
arrange(date_N_p_enfants_annee)
nb_nais_ap_enfants <-
generations %>%
tbl_df() %>%
select(id_ap_enfant, date_N_ap_enfants) %>%
unique() %>%
mutate(date_N_ap_enfants_annee = str_sub(date_N_ap_enfants, 1, 4) %>% as.numeric()) %>%
group_by(date_N_ap_enfants_annee) %>%
tally() %>%
arrange(date_N_ap_enfants_annee)
We pool these observations.
nb_nais <-
nb_nais_aieux %>%
rename(annee_N = date_N_aieul_annee) %>%
mutate(generation = 0) %>%
bind_rows(nb_nais_enfants %>%
rename(annee_N = date_N_enfants_annee) %>%
mutate(generation = 1)
) %>%
bind_rows(
nb_nais_p_enfants %>%
rename(annee_N = date_N_p_enfants_annee) %>%
mutate(generation = 2)
) %>%
bind_rows(
nb_nais_ap_enfants %>%
rename(annee_N = date_N_ap_enfants_annee) %>%
mutate(generation = 3)
) %>%
mutate(generation = factor(generation, levels = seq(0,3), labels = c('Aïeux', "Enfants",
"Petits-enfants", "Arrière-petits-enfants")))
Then, we can draw a histogram, with a log scale for the ordinates, the number of births of ancestors being relatively explosive compared to the number of annual births for the descendants.
p_distrib_naissances_gen <-
ggplot(data = nb_nais, aes(x = annee_N, y = log(n))) +
geom_bar(stat="identity", position = "identity", aes(fill =generation , group = generation), alpha = .5) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10), limits = c(1800, 1940)) +
xlab("Nombre d'individus") + ylab("Année de naissance") +
scale_fill_discrete("")
p_distrib_naissances_gen
To have an idea of the representativeness of the data, we look, by department, how many births are filled in our data and compare the result with the records of the [Statistique Générale de la France] (https://www.insee.fr/fr/statistiques/2591293?sommaire=2591397) (SGF) available on the INSEE website.
We load our data as well as those of the SGF We will be interested only in a few variables among the 228 that the SGF file proposes.
load("generations_migration.rda")
pop_insee <- readxl::read_excel("../data/population_insee/TERR_T86.xls", skip = 8, col_names = FALSE)
pop_insee_col <- readxl::read_excel("../data/population_insee/TERR_T86.xls", skip = 6) %>%
colnames()
colnames(pop_insee) <- pop_insee_col
# V3 : Numéro de département
# V6 : Nom de l'unité d'analyse
# V221 : Naissances totales; légitimes et naturels; garçons, 1800 à 1801
# V222 : Naissances totales; légitimes et naturels: filles 1800 à 1801
A small correction of department number, to be able to match our data to those of the SGF.
pop_insee <-
pop_insee %>%
mutate(V3 = ifelse(V3 == "99", yes = "54", no = V3))
We retain only the variables that interest us.
nb_nais_region <-
pop_insee %>%
select(V3, V221, V222) %>%
rename(num_dep = V3, nb_nais_f = V222, nb_nais_h = V221) %>%
filter(num_dep != "00")
head(nb_nais_region)
# A tibble: 6 x 3
num_dep nb_nais_h nb_nais_f
<chr> <dbl> <dbl>
1 01 4892 4811
2 02 6801 6534
3 03 4108 3883
4 04 2813 2739
5 05 2070 1945
6 07 4496 4288
We sum up births for women and men.
nb_nais_1801 <-
nb_nais_region %>%
summarise(nb_nais_h = sum(nb_nais_h, na.rm=T), nb_nais_f = sum(nb_nais_f, na.rm=T))
head(nb_nais_1801)
# A tibble: 1 x 2
nb_nais_h nb_nais_f
<dbl> <dbl>
1 464562 439126
We now calculate the number of births of girls and boys for each generation, for each year, in the genealogy data.
nb_naiss_par_annee <-
generations %>%
tbl_df() %>%
select(id_aieul, sexe_aieul, date_N_annee_aieul) %>%
unique() %>%
rename(id_personne = id_aieul, sexe = sexe_aieul, date_N_annee = date_N_annee_aieul) %>%
bind_rows(
generations %>%
tbl_df() %>%
select(id_enfant, sexe_enfants, date_N_annee_enfants) %>%
unique() %>%
rename(id_personne = id_enfant, sexe = sexe_enfants, date_N_annee = date_N_annee_enfants)
) %>%
bind_rows(
generations %>%
tbl_df() %>%
select(id_p_enfant, sexe_p_enfants, date_N_annee_p_enfants) %>%
unique() %>%
rename(id_personne = id_p_enfant, sexe = sexe_p_enfants, date_N_annee = date_N_annee_p_enfants)
) %>%
bind_rows(
generations %>%
tbl_df() %>%
select(id_ap_enfant, sexe_ap_enfants, date_N_annee_ap_enfants) %>%
unique() %>%
rename(id_personne = id_ap_enfant, sexe = sexe_ap_enfants, date_N_annee = date_N_annee_ap_enfants)
) %>%
unique() %>%
group_by(sexe, date_N_annee) %>%
tally()
The idea is then to build a table (representativity
) only for the ancestors born in 1801, to compare with the values of the SGF. The table we are representativite shows, for each department, the number of births recorded in the genealogy data, for women and for men. We then add the SGF data contained in the table nb_nais_region
. The next step is to compare the values, making a ratio of the number of births in the genealogy data to the number of births reported by the SGF.
representativite <-
generations %>%
tbl_df() %>%
filter(date_N_annee_aieul == 1801) %>%
select(id_aieul, sexe_aieul, date_N_annee_aieul, dept_N_aieul) %>%
unique() %>%
rename(dept_N = dept_N_aieul,
date_N_annee = date_N_annee_aieul,
sexe = sexe_aieul) %>%
group_by(sexe, date_N_annee, dept_N) %>%
summarise(nb_nais = n()) %>%
left_join(
nb_nais_region %>% gather(sexe, nb_nais_1801, -num_dep) %>%
mutate(sexe = ifelse(sexe == "nb_nais_h", yes = "1", no = "2")) %>%
mutate(dept_N = str_c("F", num_dep))
) %>%
filter(!is.na(sexe)) %>%
mutate(pct_nais = nb_nais / nb_nais_1801 * 100,
pct_nais = round(pct_nais, 2)) %>%
ungroup() %>%
mutate(sexe = factor(sexe, levels = c(2,1), labels = c("Femmes", "Hommes"))) %>%
arrange(date_N_annee, sexe)
head(representativite)
# A tibble: 6 x 7
sexe date_N_annee dept_N nb_nais num_dep nb_nais_1801 pct_nais
<fct> <dbl> <chr> <int> <chr> <dbl> <dbl>
1 Femmes 1801 F01 1332 01 4811 27.7
2 Femmes 1801 F02 2341 02 6534 35.8
3 Femmes 1801 F03 1659 03 3883 42.7
4 Femmes 1801 F04 668 04 2739 24.4
5 Femmes 1801 F05 842 05 1945 43.3
6 Femmes 1801 F06 779 <NA> NA NA
We remove departments for which SGF does not provide data.
representativite <-
representativite %>%
filter(!is.na(dept_N))
We extract the numbers from the department variable.
representativite <-
representativite %>%
select(dept_N, sexe, nb_nais, nb_nais_1801, pct_nais) %>%
mutate(num_dep = str_sub(dept_N, 2))
head(representativite)
# A tibble: 6 x 6
dept_N sexe nb_nais nb_nais_1801 pct_nais num_dep
<chr> <fct> <dbl> <dbl> <dbl> <chr>
1 F01 Femmes 1332 4811 27.7 01
2 F01 Hommes 1474 4892 30.1 01
3 F02 Femmes 2341 6534 35.8 02
4 F02 Hommes 2836 6801 41.7 02
5 F03 Femmes 1659 3883 42.7 03
6 F03 Hommes 1866 4108 45.4 03
We can then propose a graphical representation in the form of a choropleth map of the regional representativity of our sample of ancestors. To do this, we load the data to trace France. More details on obtaining this file are given inappendix.
load("france_1800.rda")
representativite_map <-
france_1800 %>%
left_join(
representativite %>%
filter(!is.na(sexe)) %>%
select(num_dep, sexe, pct_nais) %>%
spread(sexe, pct_nais)
)
p_repres <- ggplot() +
geom_polygon(data = representativite_map %>% gather(sexe, pct_nais, Femmes, Hommes),
aes(x = long, y = lat, group = group, fill = pct_nais), size = .4) +
coord_quickmap() +
scale_fill_gradientn("%", colours = rev(heat.colors(10)), na.value = "grey") +
facet_wrap(~sexe)
p_repres
The percentage of men can also be calculated from the genealogy data by creating a table with all the unique individuals in the sample and then summing the number of individuals by sex.
sexes <- generations %>%
select(id_aieul, sexe_aieul, dept_N_aieul) %>%
unique() %>%
rename(id_personne = id_aieul, sexe = sexe_aieul, dept_N = dept_N_aieul) %>%
mutate(sexe = as.character(sexe)) %>%
bind_rows(
generations %>%
select(id_enfant, sexe_enfants, dept_N_enfants) %>%
unique() %>%
rename(id_personne = id_enfant, sexe = sexe_enfants, dept_N = dept_N_enfants) %>%
mutate(sexe = as.character(sexe))
) %>%
bind_rows(
generations %>%
select(id_p_enfant, sexe_p_enfants, dept_N_p_enfants) %>%
unique() %>%
rename(id_personne = id_p_enfant, sexe = sexe_p_enfants, dept_N = dept_N_p_enfants) %>%
mutate(sexe = as.character(sexe))
) %>%
bind_rows(
generations %>%
select(id_ap_enfant, sexe_ap_enfants,dept_N_ap_enfants ) %>%
unique() %>%
rename(id_personne = id_ap_enfant, sexe = sexe_ap_enfants, dept_N = dept_N_ap_enfants) %>%
mutate(sexe = as.character(sexe))
) %>%
unique()
sexes %>%
tbl_df() %>%
group_by(sexe) %>%
tally() %>%
ungroup() %>%
filter(!is.na(sexe)) %>%
mutate(pct = n/sum(n)) %>%
head()
# A tibble: 2 x 3
sexe n pct
<chr> <int> <dbl>
1 1 1316749 0.538
2 2 1132795 0.462
The masculinity rate can also be calculated:
sexes %>%
tbl_df() %>%
group_by(sexe) %>%
tally() %>%
ungroup() %>%
filter(!is.na(sexe)) %>%
spread(sexe, n) %>%
mutate(sex_ratio = `1` / `2` * 100)
# A tibble: 1 x 3
`1` `2` sex_ratio
<int> <int> <dbl>
1 1316749 1132795 116
It is not complicated to calculate the percentage of men per department:
representativite_fh <-
sexes %>%
filter(str_detect(dept_N, "^F[[:digit:]]{2}")) %>%
filter(!is.na(sexe)) %>%
group_by(dept_N, sexe) %>%
summarise(n = n()) %>%
group_by(dept_N) %>%
mutate(pct = n / sum(n)) %>%
rename(departement = dept_N)
representativite_fh_sex_ratio <-
sexes %>%
filter(str_detect(dept_N, "^F[[:digit:]]{2}")) %>%
filter(!is.na(sexe)) %>%
group_by(dept_N, sexe) %>%
summarise(n = n()) %>%
spread(sexe, n) %>%
mutate(sex_ratio = `1` / `2` * 100)
A data table can then be prepared for a graphical representation:
representativite_map_fh <-
france_1800 %>%
left_join(
representativite_fh %>%
filter(sexe == 1)
)
p_repres_fh <- ggplot() +
geom_polygon(data = representativite_map_fh,
aes(x = long, y = lat, group = group, fill = pct), col = "black", size = .4) +
coord_quickmap() +
scale_fill_gradientn("%", colours = rev(heat.colors(10)), na.value = "grey")
p_repres_fh
We will look at the fertility rate. So we have to calculate the number of children per woman. We do it for each generation.
# Nombre d'enfants par femme : generation 1 (enfants)
nb_enf_aieux <-
generations %>%
tbl_df() %>%
filter(sexe_aieul == 2) %>%
mutate(keep = TRUE) %>%
mutate(keep = ifelse(age_aieul < 15, yes = FALSE, no = keep)) %>%
mutate(keep = ifelse(is.na(dept_M_aieul), yes = FALSE, no = keep)) %>%
filter(keep) %>%
select(id_aieul, id_enfant, dept_N_aieul) %>%
unique() %>%
mutate(nb_enf = ifelse(is.na(id_enfant), yes = 0, no = 1)) %>%
group_by(id_aieul, dept_N_aieul) %>%
summarise(nb_enf = sum(nb_enf))
# Nombre d'enfants par femme : generation 2 (petits enfants)
nb_enf_enfants <-
generations %>%
filter(sexe_enfants == 2) %>%
mutate(keep = TRUE) %>%
mutate(keep = ifelse(age_enfants < 15, yes = FALSE, no = keep)) %>%
mutate(keep = ifelse(is.na(dept_M_enfants), yes = FALSE, no = keep)) %>%
filter(keep) %>%
select(id_enfant, id_p_enfant, dept_N_p_enfants) %>%
unique() %>%
mutate(nb_enf = ifelse(is.na(id_p_enfant), yes = 0, no = 1)) %>%
group_by(id_enfant, dept_N_p_enfants) %>%
summarise(nb_enf = sum(nb_enf))
# Nombre d'enfants par femme : generation 3 (arrieres petits enfants)
nb_enf_p_enfants <-
generations %>%
filter(sexe_p_enfants == 2) %>%
mutate(keep = TRUE) %>%
mutate(keep = ifelse(age_p_enfants < 15, yes = FALSE, no = keep)) %>%
mutate(keep = ifelse(is.na(dept_M_p_enfants), yes = FALSE, no = keep)) %>%
filter(keep) %>%
select(id_p_enfant, id_ap_enfant, dept_N_ap_enfants) %>%
unique() %>%
mutate(nb_enf = ifelse(is.na(id_ap_enfant), yes = 0, no = 1)) %>%
group_by(id_p_enfant, dept_N_ap_enfants) %>%
summarise(nb_enf = sum(nb_enf))
summary(nb_enf_aieux$nb_enf)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 1.000 1.357 2.000 33.000
summary(nb_enf_enfants$nb_enf)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 1.000 1.605 2.000 19.000
summary(nb_enf_p_enfants$nb_enf)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 1.000 1.615 2.000 21.000
Then, we join all these values in a single data table.
meres <-
nb_enf_aieux %>%
ungroup() %>%
rename(id_personne_num = id_aieul, dept_N = dept_N_aieul) %>%
mutate(generation = 0) %>%
bind_rows(
nb_enf_enfants %>%
ungroup() %>%
rename(id_personne_num = id_enfant, dept_N = dept_N_p_enfants) %>%
mutate(generation = 1)
) %>%
bind_rows(
nb_enf_p_enfants %>%
ungroup() %>%
rename(id_personne_num = id_p_enfant, dept_N = dept_N_ap_enfants) %>%
mutate(generation = 2)
) %>%
tbl_df()
The average number of children, regardless of generations:
mean(meres$nb_enf)
[1] 1.467611
The distribution of the number of children by generation (in lines) is given in the table below. The columns indicate the number of children. The column headed 11
refers to the percentage of women in the sample who gave birth to 11 or more children.
meres %>%
mutate(nb_enf = ifelse(nb_enf > 10, yes = 11, no = nb_enf)) %>%
select(generation, nb_enf) %>%
group_by(generation, nb_enf) %>%
summarise(nn = n()) %>%
ungroup () %>%
group_by(generation) %>%
mutate(pct_nn = nn / sum(nn)*100) %>%
select(generation, nb_enf, pct_nn) %>%
spread(nb_enf, pct_nn) %>%
knitr::kable(digits=2)
generation | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 46.98 | 24.54 | 10.26 | 6.14 | 4.02 | 2.62 | 1.89 | 1.20 | 0.85 | 0.53 | 0.36 | 0.63 |
1 | 27.36 | 39.18 | 13.18 | 7.48 | 4.80 | 2.86 | 1.87 | 1.22 | 0.84 | 0.51 | 0.27 | 0.43 |
2 | 25.81 | 40.33 | 13.72 | 7.88 | 4.50 | 2.68 | 1.89 | 1.17 | 0.78 | 0.47 | 0.30 | 0.46 |
In this section, we present the codes developed to study the temporal aspects of the data. We focus mainly on the ancestors of the base, namely individuals born between 1800 and 1804.
We can explore mortality issues through birth and death dates of individuals. We will focus mainly on the ancestors, but the methods can be applied to the whole base.
aieux <-
generations %>%
select(id_aieul, date_N_annee_aieul, date_D_annee_aieul, age_aieul, sexe_aieul, dept_N_aieul) %>%
unique() %>%
filter(!is.na(date_N_annee_aieul), !is.na(date_D_annee_aieul)) %>%
mutate(end_age = date_D_annee_aieul - date_N_annee_aieul) %>%
tbl_df()
We need to load some packages.
library(tidyverse)
library(demography)
library(survival)
library(data.table)
We also need to recover mortality data. We download the death tables from Vallin et al. (2001).
lifetable_fr <- read_csv("lifetable_FRA.csv")
We define a function to extract, for a given cohort, the probability of instantaneous death \(\mu_x\) (which is equivalent to \(q_x\), i.e., the probability that an individual aged \(x\) does not survive in \(x+1\)).
#' qx_annee_age
#' Obtenir la proba qu'un individu âgé de @annee - @annee_naiss_depart
#' ne survive pas à l'âge de x
#' @annee (int) : Année pour laquelle extraire la table de mortalité
#' @annee_naiss_depart (int) : Annee de naissance des individus que l'on suit
# annee = 1806 ; annee_naiss_depart <- 1804
qx_annee_age <- function(annee, annee_naiss_depart, sexe){
# Âge des individus que l'on suit
age_courant <- annee - annee_naiss_depart
# La table de mortalite en France pour l'année annee et le sexe sexe
table_mortalite_annee <- lifetable_fr %>% filter(Year1 == annee, Sex == sexe, TypeLT == 1)
# Extraire la ligne de cette table pour les individus que l'on suit
res <-
table_mortalite_annee %>%
filter(Age == age_courant) %>%
slice(1) %>%
select(`q(x)`, `m(x)`) %>%
rename(qx = `q(x)`, mx = `m(x)`)
# magrittr::extract2("q(x)")
if(nrow(res) == 0) res <- data.frame(qx = NA, mx = NA)
res
}# Fin de qx_annee_age()
We define another function to calculate the probability that an individual aged \(x\) will reach $x+$1, or \(p_x\), and the probability of conditional survival at age \(S_x(t)\). These values are estimated by cohort.
#' survie_diag
#' On parcourt chaque année jusqu'à annee_naiss_depart+105
#' Mais on commence à 1806 (pas de données avant)
#' annee_naiss_depart <- 1804 ; sexe <- 1 ; age_min = 6
survie_diag <- function(annee_naiss_depart, sexe = c(1,2), age_min = 6){
nqx <- lapply(seq(1806, annee_naiss_depart+105), qx_annee_age, annee_naiss_depart = annee_naiss_depart, sexe = sexe) %>%
bind_rows()
ages <- seq(1806, annee_naiss_depart+105) - annee_naiss_depart
res <- nqx
res$age <- ages
res$sexe <- as.character(sexe)
res <- res %>% filter(age >= age_min)
# Calcul de p_x (proba que (x) atteigne x+1)
res$px = 1-res$qx
# Rajout de la survie
res$sx <- cumprod(res$px)
res$cohorte <- annee_naiss_depart
res
}# Fin de survie_diag
The survie_diag()
function is applied to the birth years of individuals in the 1800 to 1804 cohorts, for women, then for men.
survies_diagonale_1800_1804_f <-
pblapply(seq(1800, 1804), survie_diag, sexe = "2") %>%
bind_rows()
survies_diagonale_1800_1804_h <-
pblapply(seq(1800, 1804), survie_diag, sexe = "1") %>%
bind_rows()
survies_diagonale_1800_1804 <-
survies_diagonale_1800_1804_f %>%
bind_rows(survies_diagonale_1800_1804_h)
head(survies_diagonale_1800_1804)
# A tibble: 6 x 7
qx mx age sexe px sx cohorte
<dbl> <dbl> <int> <chr> <dbl> <dbl> <int>
1 0.0138 0.0139 6 2 0.986 0.986 1800
2 0.0118 0.0119 7 2 0.988 0.975 1800
3 0.0101 0.0102 8 2 0.990 0.965 1800
4 0.00808 0.00811 9 2 0.992 0.957 1800
5 0.00666 0.00668 10 2 0.993 0.950 1800
6 0.00591 0.00593 11 2 0.994 0.945 1800
survies_diagonale_1800_1804 %>%
mutate(sexe = factor(sexe, levels = c(2,1), labels = c("Femmes", "Hommes"))) %>%
ggplot(data = .,
aes(x = age, y = sx, colour = factor(cohorte), linetype = sexe)) +
geom_line() +
ggtitle("Fonctions de survie des femmes et des hommes par cohortes")
Estimates are given by cohort, we aggregate them for the 1800-1804 group, taking the average of the probabilities of survival and instantaneous death, by age and sex.
surv_hist <-
survies_diagonale_1800_1804 %>%
group_by(age, sexe) %>%
summarise(sx = mean(sx, na.rm=T),
mx = mean(mx))
head(surv_hist)
# A tibble: 6 x 4
# Groups: age [3]
age sexe sx mx
<int> <chr> <dbl> <dbl>
1 6 1 0.986 0.0137
2 6 2 0.987 0.0135
3 7 1 0.975 0.0112
4 7 2 0.976 0.0112
5 8 1 0.966 0.0102
6 8 2 0.966 0.00983
Now that we have estimates of the probability of survival at each age from mortality table data, we present our estimates from Geneanet genealogy data. We define a function, calcul_survie_geneanet()
, which from a year of birth, estimates the probabilities of survival, instantaneous death for both men and women, for a cohort of individuals born that year.
#' calcul_survie_geneanet
#' Estime les probabilités de survie, de décès instantanné (en log) pour les hommes et les femmes
#' pour des individus d'une cohorte donnée
#' @une_annee (int) : année de naissance des individus à suivre
calcul_survie_geneanet <- function(une_annee){
aieux_annee <- aieux %>%
tbl_df() %>%
filter(date_N_annee_aieul == une_annee) %>%
filter(age_aieul >= 6)
# On compte le nombre de personne encore en vie à chaque année
calcul_exposition <- function(une_annee, sexe){
aieux_annee %>% filter(date_D_annee_aieul >= une_annee, sexe_aieul == sexe) %>% nrow()
}
annees <- seq(une_annee+6, une_annee+110)
exposition_annee_h <- data.frame(annee = annees, exposition = sapply(annees, calcul_exposition, sexe = "1")) %>% tbl_df()
exposition_annee_f <- data.frame(annee = annees, exposition = sapply(annees, calcul_exposition, sexe = "2")) %>% tbl_df()
exposition_annee <-
exposition_annee_h %>% rename(exposition_h = exposition) %>%
left_join(exposition_annee_f %>% rename(exposition_f = exposition), by = "annee") %>%
mutate(age = annees-une_annee)
exposition_annee <-
exposition_annee %>%
mutate(exposition_h = ifelse(exposition_h <= 0, yes = 1, no = exposition_h),
exposition_f = ifelse(exposition_h <= 0, yes = 1, no = exposition_f)) %>%
# Calcul du nombre de décès
# mutate(expo_moins_1_h = lag(exposition_h),
# expo_moins_1_f = lag(exposition_f)) %>%
mutate(expo_plus_1_h = lead(exposition_h),
expo_plus_1_f = lead(exposition_f)) %>%
mutate(deces_h = exposition_h - expo_plus_1_h,
deces_f = exposition_f - expo_plus_1_f) %>%
mutate(sx_h = (exposition_h - deces_h) / exposition_h,
sx_f = (exposition_f - deces_f) / exposition_f) %>%
select(-expo_plus_1_h, -expo_plus_1_f) %>%
# Proba de déceder dans l'année
mutate(qxt_h = deces_h / exposition_h,
qxt_f = deces_f / exposition_f) %>%
mutate(qxt_h = ifelse(qxt_h <= 0, yes = NA, no = qxt_h),
qxt_f = ifelse(qxt_f <= 0, yes = NA, no = qxt_f)) %>%
# Calcul de la force de mortalité
mutate(fm_h = log(qxt_h),
fm_f = log(qxt_f)) %>%
mutate(annee_naissance = une_annee)
# Survie
# exposition_annee$lx_f[1] <- exposition_annee$lx_h[1] <- 1
exposition_annee$sx_h <- cumprod(exposition_annee$sx_h)
exposition_annee$sx_f <- cumprod(exposition_annee$sx_f)
exposition_annee
}# Fin de calcul_survie_geneanet()
The probability of instantaneous survival and death for individuals in cohorts 1800 to 1804 is calculated from genealogy data:
library(pbapply)
surv_geneanet_tot <-
pblapply(1800:1804, calcul_survie_geneanet) %>%
bind_rows()
head(surv_geneanet_tot)
# A tibble: 6 x 13
annee exposition_h exposition_f age deces_h deces_f sx_h sx_f
<int> <int> <int> <int> <int> <int> <dbl> <dbl>
1 1806 72587 61392 6 550 570 0.992 0.991
2 1807 72037 60822 7 492 427 0.986 0.984
3 1808 71545 60395 8 376 397 0.980 0.977
4 1809 71169 59998 9 307 277 0.976 0.973
5 1810 70862 59721 10 245 233 0.973 0.969
6 1811 70617 59488 11 272 263 0.969 0.965
# ... with 5 more variables: qxt_h <dbl>, qxt_f <dbl>, fm_h <dbl>, fm_f
# <dbl>, annee_naissance <int>
As with the data from the mortality tables, we calculate the average by age and sex:
surv_geneanet <-
surv_geneanet_tot %>%
group_by(age) %>%
summarise(fm_h = mean(fm_h, na.rm=T), fm_f = mean(fm_f, na.rm=T),
sx_h = mean(sx_h, na.rm=T), sx_f = mean(sx_f, na.rm=T))
head(surv_geneanet)
# A tibble: 6 x 5
age fm_h fm_f sx_h sx_f
<int> <dbl> <dbl> <dbl> <dbl>
1 6 -4.77 -4.63 0.992 0.990
2 7 -4.96 -4.82 0.985 0.982
3 8 -5.18 -5.07 0.979 0.976
4 9 -5.32 -5.22 0.974 0.971
5 10 -5.44 -5.32 0.970 0.966
6 11 -5.51 -5.46 0.966 0.962
We modify the shape of the last table to get a layout suitable for use by the ggplot2
graphics functions.
surv_geneanet_sx <-
surv_geneanet %>%
select(age, sx_h, sx_f) %>%
gather(sexe, sx, sx_h, sx_f) %>%
mutate(sexe = factor(sexe, levels = c("sx_h", "sx_f"), labels = c(1,2))) %>%
mutate(sexe = as.character(sexe))
We share historical data from the tables with our sample genealogical data.
df_plot_survie <-
surv_hist %>% mutate(type = "Historique") %>%
bind_rows(
surv_geneanet_sx %>%
mutate(type = "Geneanet")
) %>%
mutate(sexe = factor(sexe, levels = c("2","1"), labels = c("Femmes", "Hommes")))
It is then possible to represent the survival functions by sex for each of the two bases.
p <-
ggplot(data = df_plot_survie,
aes(x = age, y = sx, colour = sexe, linetype = type)) +
geom_line() +
scale_colour_manual("", values = c("Femmes" = "red", "Hommes" = "blue")) +
scale_linetype_manual("", values = c("Historique" = "dashed", "Geneanet" = "solid")) +
xlab("âge") + ylab("Probabilité") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
p
This time, calculations can be done for all individuals, not just those at least 6 years old.
#' calcul_survie_geneanet_tot
#' Estime les probabilités de survie, de décès instantanné (en log) pour les hommes et les femmes
#' pour des individus d'une cohorte donnée, dans un département donné
#' @une_annee (int) : année de naissance des individus à suivre
# une_annee <- 1800
calcul_survie_geneanet_tot <- function(une_annee){
aieux_annee <-
aieux %>%
tbl_df() %>%
filter(date_N_annee_aieul == une_annee)
# On compte le nombre de personne encore en vie à chaque année
calcul_exposition <- function(une_annee, sexe){
aieux_annee %>% filter(date_D_annee_aieul >= une_annee, sexe_aieul == sexe) %>% nrow()
}
annees <- seq(une_annee, une_annee+110)
exposition_annee_h <- data.frame(annee = annees, exposition = sapply(annees, calcul_exposition, sexe = "1")) %>% tbl_df()
exposition_annee_f <- data.frame(annee = annees, exposition = sapply(annees, calcul_exposition, sexe = "2")) %>% tbl_df()
exposition_annee <-
exposition_annee_h %>% rename(exposition_h = exposition) %>%
left_join(exposition_annee_f %>% rename(exposition_f = exposition), by = "annee") %>%
mutate(age = annees-une_annee)
exposition_annee <-
exposition_annee %>%
mutate(exposition_h = ifelse(exposition_h <= 0, yes = 1, no = exposition_h),
exposition_f = ifelse(exposition_h <= 0, yes = 1, no = exposition_f)) %>%
# Calcul du nombre de décès
# mutate(expo_moins_1_h = lag(exposition_h),
# expo_moins_1_f = lag(exposition_f)) %>%
mutate(expo_plus_1_h = lead(exposition_h),
expo_plus_1_f = lead(exposition_f)) %>%
mutate(deces_h = exposition_h - expo_plus_1_h,
deces_f = exposition_f - expo_plus_1_f) %>%
mutate(sx_h = (exposition_h - deces_h) / exposition_h,
sx_f = (exposition_f - deces_f) / exposition_f) %>%
select(-expo_plus_1_h, -expo_plus_1_f) %>%
# Proba de déceder dans l'année
mutate(qxt_h = deces_h / exposition_h,
qxt_f = deces_f / exposition_f) %>%
mutate(qxt_h = ifelse(qxt_h <= 0, yes = NA, no = qxt_h),
qxt_f = ifelse(qxt_f <= 0, yes = NA, no = qxt_f)) %>%
# Calcul de la force de mortalité
mutate(fm_h = log(qxt_h),
fm_f = log(qxt_f)) %>%
mutate(annee_naissance = une_annee)
# Survie
# exposition_annee$lx_f[1] <- exposition_annee$lx_h[1] <- 1
exposition_annee$sx_h <- cumprod(exposition_annee$sx_h)
exposition_annee$sx_f <- cumprod(exposition_annee$sx_f)
exposition_annee
}# Fin de calcul_survie_geneanet_tot()
surv_geneanet_2_tot_c <- pblapply(1800:1804, calcul_survie_geneanet_tot)
surv_geneanet_2_tot_c <- bind_rows(surv_geneanet_2_tot_c)
surv_geneanet_2_tot <-
surv_geneanet_2_tot_c %>%
group_by(age) %>%
summarise(fm_h = mean(fm_h, na.rm=T), fm_f = mean(fm_f, na.rm=T),
sx_h = mean(sx_h, na.rm=T), sx_f = mean(sx_f, na.rm=T))
We also want to explore mortality through its regional differences. We adapt the previous code so that estimates are made at the regional level.
#' calcul_survie_geneanet_2
#' Estime les probabilités de survie, de décès instantanné (en log) pour les hommes et les femmes
#' pour des individus d'une cohorte donnée, dans un département donné
#' @une_annee (int) : année de naissance des individus à suivre
#' @un_dep (string) : identifiant du département (lu dans le tableau `aieux`)
calcul_survie_geneanet_2 <- function(une_annee, un_dep){
aieux_annee <-
aieux %>%
tbl_df() %>%
filter(date_N_annee_aieul == une_annee) %>%
filter(dept_N_aieul %in% un_dep)
# On compte le nombre de personne encore en vie à chaque année
calcul_exposition <- function(une_annee, sexe){
aieux_annee %>% filter(date_D_annee_aieul >= une_annee, sexe_aieul == sexe) %>% nrow()
}
annees <- seq(une_annee, une_annee+110)
exposition_annee_h <- data.frame(annee = annees, exposition = sapply(annees, calcul_exposition, sexe = "1")) %>% tbl_df()
exposition_annee_f <- data.frame(annee = annees, exposition = sapply(annees, calcul_exposition, sexe = "2")) %>% tbl_df()
exposition_annee <-
exposition_annee_h %>% rename(exposition_h = exposition) %>%
left_join(exposition_annee_f %>% rename(exposition_f = exposition), by = "annee") %>%
mutate(age = annees-une_annee)
exposition_annee <-
exposition_annee %>%
mutate(exposition_h = ifelse(exposition_h <= 0, yes = 1, no = exposition_h),
exposition_f = ifelse(exposition_h <= 0, yes = 1, no = exposition_f)) %>%
# Calcul du nombre de décès
mutate(expo_plus_1_h = lead(exposition_h),
expo_plus_1_f = lead(exposition_f)) %>%
mutate(deces_h = exposition_h - expo_plus_1_h,
deces_f = exposition_f - expo_plus_1_f) %>%
mutate(sx_h = (exposition_h - deces_h) / exposition_h,
sx_f = (exposition_f - deces_f) / exposition_f) %>%
select(-expo_plus_1_h, -expo_plus_1_f) %>%
# Proba de déceder dans l'année
mutate(qxt_h = deces_h / exposition_h,
qxt_f = deces_f / exposition_f) %>%
mutate(qxt_h = ifelse(qxt_h <= 0, yes = NA, no = qxt_h),
qxt_f = ifelse(qxt_f <= 0, yes = NA, no = qxt_f)) %>%
# Calcul de la force de mortalité
mutate(fm_h = log(qxt_h),
fm_f = log(qxt_f)) %>%
mutate(annee_naissance = une_annee)
# Survie
exposition_annee$sx_h <- cumprod(exposition_annee$sx_h)
exposition_annee$sx_f <- cumprod(exposition_annee$sx_f)
exposition_annee <-
exposition_annee %>%
mutate(dept = un_dep)
exposition_annee
}# Fin de calcul_survie_geneanet_2()
We will go through each of the available departments. We list them.
les_depts <- unique(aieux$dept_N_aieul)
les_depts <- les_depts[!is.na(les_depts)]
We go through each of these departments, and for each department, we go through each cohort to apply the function calcul_survie_geneanet_2()
.
surv_geneanet_2_depts_c <-
pblapply(les_depts, function(un_dep){
lapply(1800:1804, calcul_survie_geneanet_2, un_dep = un_dep) %>%
bind_rows()
})
surv_geneanet_2_depts_c <- surv_geneanet_2_depts_c %>% bind_rows()
head(surv_geneanet_2_depts_c)
# A tibble: 6 x 14
annee exposition_h exposition_f age deces_h deces_f sx_h sx_f
<int> <dbl> <int> <int> <dbl> <int> <dbl> <dbl>
1 1800 2231 2268 0 216 161 0.903 0.929
2 1801 2015 2107 1 67.0 90 0.873 0.889
3 1802 1948 2017 2 31.0 34 0.859 0.874
4 1803 1917 1983 3 29.0 42 0.846 0.856
5 1804 1888 1941 4 22.0 29 0.836 0.843
6 1805 1866 1912 5 13.0 15 0.831 0.836
# ... with 6 more variables: qxt_h <dbl>, qxt_f <dbl>, fm_h <dbl>, fm_f
# <dbl>, annee_naissance <int>, dept <chr>
Then, we aggregate by age and department, calculating the means for mortality strength and survival function.
surv_geneanet_2_depts <-
surv_geneanet_2_depts_c %>%
group_by(age, dept) %>%
summarise(fm_h = mean(fm_h, na.rm=T), fm_f = mean(fm_f, na.rm=T),
sx_h = mean(sx_h, na.rm=T), sx_f = mean(sx_f, na.rm=T))
We prepare the data so that the format corresponds to what is expected by the graphics functions of the package ggplot2
.
surv_geneanet_2_dept_sx <-
surv_geneanet_2_depts %>%
select(dept, age, sx_h, sx_f) %>%
gather(sexe, sx, sx_h, sx_f) %>%
mutate(sexe = factor(sexe, levels = c("sx_h", "sx_f"), labels = c(1,2))) %>%
mutate(sexe = as.character(sexe))
df_plot_survie_2 <-
surv_geneanet_2_dept_sx %>%
mutate(sexe = factor(sexe, levels = c("2","1"), labels = c("Femmes", "Hommes")))
p <-
ggplot(data = df_plot_survie_2,
aes(x = age, y = sx, colour = sexe, group = interaction(dept, sexe))) +
geom_line(alpha = .2) +
xlab("âge") + ylab("Probabilité") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
p
From the estimates made in the previous section, we can look at the force of mortality, that is, the probability, for an individual having reached a given age, to die during the course of the year.
df_plot_mx <-
surv_hist %>%
select(age, sexe, mx) %>%
mutate(mx = log(mx)) %>%
mutate(type = "Historique") %>%
bind_rows(
surv_geneanet %>%
select(age, fm_h, fm_f) %>%
gather(sexe, mx, fm_h, fm_f) %>%
mutate(type = "Geneanet") %>%
mutate(sexe = ifelse(sexe == "fm_h", yes = "1", no = sexe),
sexe = ifelse(sexe == "fm_f", yes = "2", no = sexe))
) %>%
mutate(sexe = factor(sexe, levels = c("2","1"), labels = c("Femmes", "Hommes")))
head(df_plot_mx)
# A tibble: 6 x 4
# Groups: age [3]
age sexe mx type
<int> <fct> <dbl> <chr>
1 6 Hommes -4.29 Historique
2 6 Femmes -4.31 Historique
3 7 Hommes -4.49 Historique
4 7 Femmes -4.49 Historique
5 8 Hommes -4.58 Historique
6 8 Femmes -4.62 Historique
p_mx <-
ggplot(data = df_plot_mx, aes(x = age, y = mx, colour = sexe, linetype = type)) +
geom_line() +
scale_colour_manual("", values = c("Femmes" = "red", "Hommes" = "blue")) +
scale_linetype_manual("", values = c("Historique" = "dashed", "Geneanet" = "solid")) +
xlab("âge") + ylab("log(mu_x)") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
p_mx
And finally, we can draw these two graphs on a single figure:
df_plot <-
df_plot_survie %>% select(-mx) %>% rename(value = sx) %>% mutate(variable = "sx") %>%
bind_rows(df_plot_mx %>% rename(value = mx) %>% mutate(variable = "mx"))
p_survie <-
ggplot(data = df_plot %>% mutate(variable = factor(variable, levels = c("sx", "mx"),
labels = c("Probabilité de survie", "Force de mortalité (en log)"))),
aes(x = age, y = value, colour = sexe, linetype = type)) +
geom_line() +
scale_colour_manual("", values = c("Femmes" = "red", "Hommes" = "blue")) +
scale_linetype_manual("", values = c("Historique" = "dashed", "Geneanet" = "solid")) +
xlab("age") + ylab(NULL) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
facet_wrap(~variable, scales = "free_y", nrow = 1)
p_survie
df_plot_mx_2 <-
surv_geneanet_2_depts %>%
select(dept, age, fm_h, fm_f) %>%
gather(sexe, mx, fm_h, fm_f) %>%
mutate(sexe = ifelse(sexe == "fm_h", yes = "1", no = sexe),
sexe = ifelse(sexe == "fm_f", yes = "2", no = sexe)) %>%
mutate(sexe = factor(sexe, levels = c("2","1"), labels = c("Femmes", "Hommes")))
p_mx_2 <-
ggplot(data = df_plot_mx_2, aes(x = age, y = mx, colour = sexe, group = interaction(dept, sexe))) +
geom_line(alpha = .2) +
scale_colour_manual("", values = c("Femmes" = "red", "Hommes" = "blue")) +
xlab("\\^age") + ylab("$log(\\mu\\_x$)") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
p_mx_2+theme_paper()
# Les deux graphiques combinés
df_plot_2 <-
df_plot_survie_2 %>% rename(value = sx) %>% mutate(variable = "sx") %>%
bind_rows(df_plot_mx_2 %>% rename(value = mx) %>% mutate(variable = "mx"))
p_survie_2 <-
ggplot(data = df_plot_2 %>%
mutate(variable = factor(variable, levels = c("sx", "mx"),
labels = c("Probabilit\\'e de survie", "Force de mortalit\\'e (en log)"))),
aes(x = age, y = value, colour = sexe, group = interaction(dept, sexe))) +
geom_line(alpha=.1) +
scale_colour_manual("", values = c("Femmes" = "red", "Hommes" = "blue")) +
xlab("\\^age") + ylab(NULL) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
facet_wrap(~variable, scales = "free_y", nrow = 1)
p_survie_2+theme_paper()
df_plot_mx_2 <-
surv_geneanet_2_depts %>%
select(dept, age, fm_h, fm_f) %>%
gather(sexe, mx, fm_h, fm_f) %>%
mutate(sexe = ifelse(sexe == "fm_h", yes = "1", no = sexe),
sexe = ifelse(sexe == "fm_f", yes = "2", no = sexe)) %>%
mutate(sexe = factor(sexe, levels = c("2","1"), labels = c("Femmes", "Hommes")))
p_mx_2 <-
ggplot(data = df_plot_mx_2, aes(x = age, y = mx, colour = sexe, group = interaction(dept, sexe))) +
geom_line(alpha = .2) +
scale_colour_manual("", values = c("Femmes" = "red", "Hommes" = "blue")) +
xlab("âge") + ylab("log(mu_x)") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10))
p_mx_2
By combining the two graphs:
df_plot_2 <-
df_plot_survie_2 %>% rename(value = sx) %>% mutate(variable = "sx") %>%
bind_rows(df_plot_mx_2 %>% rename(value = mx) %>% mutate(variable = "mx"))
p_survie_2 <-
ggplot(data = df_plot_2 %>%
mutate(variable = factor(variable, levels = c("sx", "mx"),
labels = c("Probabilité de survie", "Force de mortalité (en log)"))),
aes(x = age, y = value, colour = sexe, group = interaction(dept, sexe))) +
geom_line(alpha=.1) +
scale_colour_manual("", values = c("Femmes" = "red", "Hommes" = "blue")) +
xlab("âge") + ylab(NULL) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
facet_wrap(~variable, scales = "free_y", nrow = 1)
p_survie_2
Using estimates of survival functions for women and men, it is easy to calculate individuals’ life expectancies at birth.
Let us calculate life expectancy at birth using data from Vallin and Meslé.
# Espérance de vie à la naissance pour la cohorte de 1806, données de Vallin et Meslé.
vallin_1806 <-
survie_diag(annee_naiss_depart = 1806, sexe = "1", age_min = 0) %>%
bind_rows(survie_diag(annee_naiss_depart = 1806, sexe = "2", age_min = 0))
vallin_1806 %>%
group_by(sexe) %>%
summarise(e = sum(cumprod(exp(-mx)), na.rm=T))
We build a function that performs the calculation.
calcul_esperance_vie_naissance_aieux <- function(df, summ = FALSE, age_min = 0){
res <-
df %>%
filter(age_min >= age_min) %>%
left_join(
df %>%
# group_by(annee_naissance, dept) %>%
filter(age == 0) %>%
select(annee_naissance, exposition_h, exposition_f) %>%
rename(exposition_f1 = exposition_f, exposition_h1 = exposition_h),
by = c("annee_naissance")
) %>%
mutate(e_h = exposition_h / exposition_h1,
e_f = exposition_f / exposition_f1) %>%
group_by(annee_naissance) %>%
summarise(e_h = sum(e_h)-0.5,
e_f = sum(e_f)-0.5) %>%
ungroup()
if(summ){
res <- res %>% summarise(e_h = mean(e_h), e_f = mean(e_f))
}
res
}
We apply this function to our mortality table for all ancestors.
esperance_vie_aieux <-
calcul_esperance_vie_naissance_aieux(surv_geneanet_2_tot_c)
esperance_vie_aieux %>%
knitr::kable(digit=2)
annee_naissance | e_h | e_f |
---|---|---|
1800 | 45.27 | 43.31 |
1801 | 43.66 | 41.88 |
1802 | 43.06 | 41.32 |
1803 | 42.60 | 40.93 |
1804 | 43.12 | 41.62 |
esperance_vie_aieux_mean <-
esperance_vie_aieux %>%
summarise(e_h = mean(e_h), e_f = mean(e_f))
esperance_vie_aieux_mean
# A tibble: 1 x 2
e_h e_f
<dbl> <dbl>
1 43.5 41.8
calcul_esperance_vie_naissance <- function(df, summ = FALSE, age_min = 0){
res <-
df %>%
filter(age_min >= age_min) %>%
left_join(
df %>%
# group_by(annee_naissance, dept) %>%
filter(age == 0) %>%
select(annee_naissance, dept, exposition_h, exposition_f) %>%
rename(exposition_f1 = exposition_f, exposition_h1 = exposition_h),
by = c("annee_naissance", "dept")
) %>%
mutate(e_h = exposition_h / exposition_h1,
e_f = exposition_f / exposition_f1) %>%
group_by(annee_naissance, dept) %>%
summarise(e_h = sum(e_h)-0.5,
e_f = sum(e_f)-0.5) %>%
ungroup()
if(summ){
res <- res %>% summarise(e_h = mean(e_h), e_f = mean(e_f))
}
res
}
Life expectancies at birth for each cohort are obtained as follows:
esperance_vie <-
calcul_esperance_vie_naissance(surv_geneanet_2_depts_c)
head(esperance_vie)
# A tibble: 6 x 4
annee_naissance dept e_h e_f
<int> <chr> <dbl> <dbl>
1 1800 F01 42.9 40.3
2 1800 F02 43.8 42.5
3 1800 F03 44.7 38.2
4 1800 F04 44.6 38.3
5 1800 F05 39.2 32.3
6 1800 F06 44.0 48.0
It is then enough to carry out the average by department to obtain an estimate for all the cohorts from 1800 to 1804:
esperance_vie_mean <-
esperance_vie %>%
group_by(dept) %>%
summarise(e_h = mean(e_h), e_f = mean(e_f))
head(esperance_vie_mean)
# A tibble: 6 x 3
dept e_h e_f
<chr> <dbl> <dbl>
1 F01 41.9 40.5
2 F02 41.1 39.9
3 F03 41.3 36.7
4 F04 42.7 37.3
5 F05 36.5 31.6
6 F06 42.7 42.5
A representation by means of a choropleth map is possible, using again the map of France france_1800
whose creation is detailed in appendix.
esperance_vie_map <-
france_1800 %>%
left_join(esperance_vie_mean, by = c("departement" = "dept"))
p_map <-
ggplot(esperance_vie_map %>%
gather(sexe, esperance, e_h, e_f) %>%
mutate(sexe = factor(sexe, levels = c("e_f", "e_h"), labels = c("Femmes", "Hommes"))),
aes(x = long, y = lat, group = group, fill = esperance)) +
geom_polygon() +
facet_grid(~sexe) +
scale_x_continuous(limits = c(-6, 10)) +
scale_y_continuous(limits = c(41, 51.5)) +
scale_colour_manual("Sexe", values = c("Femmes" = "blue", "Hommes" = "red"), na.value = "#377eb8") +
scale_fill_gradient(low = "yellow", high = "red") +
coord_quickmap() +
theme(legend.position = "bottom", text = element_text(size = 6),
legend.key.height = unit(1, "line"),
legend.key.width = unit(1.5, "line"))
p_map
It is interesting to compare the values with those that exist in the literature. We report the values found in Walle (1973).
esperances_vandewalle <-
c('F01', 'Ain', '32.6',
'F02', 'Aisne', '35.9',
'F03', 'Allier', '31.7',
'F04', 'Alpes-de-Haute-Provence', '34.2',
'F05', 'Hautes-Alpes', '38.3',
'F06', 'Alpes-Maritimes', NA,
'F07', 'Ardèche', '41.9',
'F08', 'Ardennes', '40.7',
'F09', 'Ariège', '45.2',
'F10', 'Aube', '34.2',
'F11', 'Aude', '39.4',
'F12', 'Aveyron', '44.6',
'F13', 'Bouches-du-Rhône', NA,
'F14', 'Calvados', '47.7',
'F15', 'Cantal', '43.0',
'F16', 'Charente', '35.8',
'F17', 'Charente-Maritime', '32.8',
'F18', 'Cher', '32.3',
'F19', 'Corrèze', '34.8',
'F20', 'Corse-du-Sud', '39',
'F21', "Côte-d'Or", '35.4',
'F22', "Côtes-d'Armor", '37.9',
'F23', 'Creuse', '35.2',
'F24', 'Dordogne', '34.3',
'F25', 'Doubs', '39.6',
'F26', 'Drôme', '37.3',
'F27', 'Eure', '40.6',
'F28', 'Eure-et-Loir', '30.1',
'F29', 'Finistère', '29.2',
'F30', 'Gard', '36.9',
'F31', 'Haute-Garonne', '38.0',
'F32', 'Gers', '40.4',
'F33', 'Gironde', '37.7',
'F34', 'Hérault', '37.8',
'F35', 'Ille-et-Vilaine', '32.4',
'F36', 'Indre', '29.1',
'F37', 'Indre-et-Loire', '35.2',
'F38', 'Isère', '37.9',
'F39', 'Jura', '37.5',
'F40', 'Landes', '30.7',
'F41', 'Loir-et-Cher', '24.9',
'F42', 'Loire', '37.0',
'F43', 'Haute-Loire', '38.8',
'F44', 'Loire-Atlantique', '39.4',
'F45', 'Loiret', '24.7',
'F46', 'Lot', '35.8',
'F47', 'Lot-et-Garonne', '41.6',
'F48', 'Lozère', '45.3',
'F49', 'Maine-et-Loire', '38.4',
'F50', 'Manche', '45.9',
'F51', 'Marne', '38.1',
'F52', 'Haute-Marne', '33.2',
'F53', 'Mayenne', '34.7',
'F54', 'Meurthe-et-Moselle', '36.0',
'F55', 'Meuse', '35.0',
'F56', 'Morbihan', '32.1',
'F57', 'Moselle', '39.2',
'F58', 'Nièvre', '26.9',
'F59', 'Nord', '36.3',
'F60', 'Oise', '35.8',
'F61', 'Orne', '39.1',
'F62', 'Pas-de-Calais', '38.0',
'F63', 'Puy-de-Dôme', '35.9',
'F64', 'Pyrénées-Atlantiques', '44.2',
'F65', 'Hautes-Pyrénées', '46.5',
'F66', 'Pyrénées-Orientales', '32.1',
'F67', 'Bas-Rhin', '37.1',
'F68', 'Haut-Rhin', '39.9',
'F69', 'Rhône', NA,
'F70', 'Haute-Saône', '34.8',
'F71', 'Saône-et-Loire', '32.8',
'F72', 'Sarthe', '32.4',
'F73', 'Savoie', NA,
'F74', 'Haute-Savoie', NA,
'F75', 'Paris', NA,
'F76', 'Seine-Maritime', '44.0',
'F77', 'Seine-et-Marne', '28.6',
'F78', 'Essonne', NA,
'F79', 'Deux-Sèvres', '41.0',
'F80', 'Somme', '41.0',
'F81', 'Tarn', '39.6',
'F82', 'Tarn-et-Garonne', '37.3',
'F83', 'Var', '33.9',
'F84', 'Vaucluse', '33.4',
'F85', 'Vendée', '36.6',
'F86', 'Vienne', '46.0',
'F87', 'Haute-Vienne', '29.5',
'F88', 'Vosges', '38.3',
'F89', 'Yonne', '29.2') %>%
matrix(ncol = 3, byrow = TRUE) %>%
data.frame(stringsAsFactors = FALSE) %>%
tbl_df()
colnames(esperances_vandewalle) <- c("dept", "nom_sp", "e_f_walle")
esperances_vandewalle <-
esperances_vandewalle %>%
mutate(e_f_walle = as.numeric(e_f_walle))
We join our regional estimates with those of Etienne van de Walle.
esperance_vie_mean_2 <-
esperance_vie_mean %>%
left_join(esperances_vandewalle)
head(esperance_vie_mean_2)
# A tibble: 6 x 5
dept e_h e_f nom_sp e_f_walle
<chr> <dbl> <dbl> <chr> <dbl>
1 F01 41.9 40.5 Ain 32.6
2 F02 41.1 39.9 Aisne 35.9
3 F03 41.3 36.7 Allier 31.7
4 F04 42.7 37.3 Alpes-de-Haute-Provence 34.2
5 F05 36.5 31.6 Hautes-Alpes 38.3
6 F06 42.7 42.5 Alpes-Maritimes NA
The average life expectancy at birth for women is the same for both our sample and van de Walle’s sample:
mean(esperance_vie_mean_2$e_f, na.rm=T)
[1] 42.20441
mean(esperance_vie_mean_2$e_f_walle, na.rm=T)
[1] 36.67805
Our estimated average value is higher. However, we would like to see if the same geographic effects appear with our data. We then calculate the deviations per department with respect to the average value of the departments, for our data, as well as for those of the literature.
esperance_vie_mean_2 <-
esperance_vie_mean_2 %>%
mutate(e_f_3 = e_f - mean(e_f, na.rm=T),
e_f_walle_3 = e_f_walle - mean(e_f_walle, na.rm=T))
head(esperance_vie_mean_2)
# A tibble: 6 x 7
dept e_h e_f nom_sp e_f_walle e_f_3 e_f_walle_3
<chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 F01 41.9 40.5 Ain 32.6 - 1.70 - 4.08
2 F02 41.1 39.9 Aisne 35.9 - 2.30 - 0.778
3 F03 41.3 36.7 Allier 31.7 - 5.49 - 4.98
4 F04 42.7 37.3 Alpes-de-Haute-Provence 34.2 - 4.92 - 2.48
5 F05 36.5 31.6 Hautes-Alpes 38.3 -10.6 1.62
6 F06 42.7 42.5 Alpes-Maritimes NA 0.325 NA
Then we represent the results on choropleth maps.
esperance_vie_map <-
france_1800 %>%
left_join(esperance_vie_mean_2 %>%
select(dept, e_f_3, e_f_walle_3), by = c("departement" = "dept"))
p_map <-
ggplot(esperance_vie_map %>%
gather(type, esperance, e_f_3, e_f_walle_3) %>%
mutate(type = factor(type, levels = c("e_f_3", "e_f_walle_3"), labels = c("Geneanet", "van de Walle"))),
aes(x = long, y = lat, group = group, fill = esperance)) +
geom_polygon(col = "grey") +
facet_wrap(~type) +
scale_x_continuous(limits = c(-6, 10)) +
scale_y_continuous(limits = c(41, 51.5)) +
scale_fill_gradient2("Écart à la moyenne départementale (années)", low = "red", mid = "white", high = "#71BC78", midpoint = 0, na.value = "grey",
guide = guide_colourbar(direction = "horizontal",
barwidth = 30,
title.position = "bottom",
title.hjust = .5)) +
coord_quickmap() +
theme(legend.position = "bottom", text = element_text(size = 6),
legend.key.height = unit(1, "line"),
legend.key.width = unit(1.5, "line")) +
theme(text = element_text(size = 16), legend.position = "bottom")
p_map
We will estimate confidence intervals for life expectancies at birth. Two methods are envisaged. A first is to use what Carlo Giovanni Camarda proposes on his website (https://sites.google.com/site/carlogiovannicamarda/r-stuff/life-expectancy-confidence-interval). A second is strongly inspired by this method, but proves simpler to implement. Both methods are based on bootstrap.
We get the lifetable.qx()
function used to construct a mortality table from the probabilities of death.
## function for constructing a lifetable starting from probabilities
lifetable.qx <- function(x, qx, sex="M", ax=NULL, last.ax=5.5){
m <- length(x)
n <- c(diff(x), NA)
qx[is.na(qx)] <- 0
if(is.null(ax)){
ax <- rep(0,m)
if(x[1]!=0 | x[2]!=1){
ax <- n/2
ax[m] <- last.ax
}else{
if(sex=="F"){
if(qx[1]>=0.1){
ax[1] <- 0.350
}else{
ax[1] <- 0.05 + 3*qx[1]
}
}
if(sex=="M"){
if(qx[1]>=0.1){
ax[1] <- 0.33
}else{
ax[1] <- 0.0425 + 2.875*qx[1]
}
}
ax[-1] <- n[-1]/2
ax[m] <- last.ax
}
}
px <- 1-qx
lx <- cumprod(c(1,px))*100000
dx <- -diff(lx)
Lx <- n*lx[-1] + ax*dx
lx <- lx[-(m+1)]
Lx[m] <- lx[m]*last.ax
Lx[is.na(Lx)] <- 0 ## in case of NA values
Lx[is.infinite(Lx)] <- 0 ## in case of Inf values
Tx <- rev(cumsum(rev(Lx)))
ex <- Tx/lx
data.frame(x, n, ax, qx, px, lx, dx, Lx, Tx, ex)
}
This function is called by the following one, which calculates confidence intervals for life expectancy at a given age. It is slightly adapted from the codes proposed by Camarda.
#' ex_ci
#' @description Compute a confidence interval for life expectancy
#' https://sites.google.com/site/carlogiovannicamarda/r-stuff/life-expectancy-confidence-interval
#' @param LT Life table
#' @param which.x Which age for the life expectancy
#' @param level level of the confidence interval
#' @param ns number of samples
#' @param sex 'F' (female) of 'M' (male)
#' @param year_birth year of birth of the cohort
#' @param dept_birth departement of birth
#' which.x <- 0; level <- 0.95; ns <- 1000; sex="M"; year_birth <- 1800 ; dept_birth <- "F29"
ex_ci <- function(LT, which.x=0, level=0.95, ns=1000, sex="M", year_birth = 1800, dept_birth = "F29"){
## ages
x <- LT$age
if(sex == "M"){
## number of deaths
Dx <- LT$deces_h
## estimated probs
qx <- LT$qxt_h
}else{
## number of deaths
Dx <- LT$deces_f
## estimated probs
qx <- LT$qxt_f
}
## number of ages
m <- nrow(LT)
## trials for binomial, rounded
Ntil <- round(Dx/qx)
## ax for last age
# last.ax <- LT$ax[m]
## simulated death counts
## from Binomial distribution
Y <- suppressWarnings(matrix(rbinom(m*ns,
Ntil,
qx),
m,ns))
## simulated probabilities
QX <- Y/Ntil
## which age?
wh <- which(x==which.x)
## for all replicates, compute life expectancy
fun.ex <- function(qx){
return(lifetable.qx(x=x, qx, sex=sex)$ex[wh])
}
exsim <- apply(QX, 2, fun.ex)
## confidence interval
CI <- quantile(exsim,
probs = c((1-level)/2,
1 - (1-level)/2))
data.frame(ex_mean = mean(exsim),
ex_inf = CI[[1]],
ex_sup = CI[[2]],
age = which.x,
sexe = sex,
annee_naissance = year_birth,
dept = dept_birth,
stringsAsFactors = FALSE)
} # Fin de ex_ci()
We also introduce a function to use the second method. This method consists of considering a population \(E_0\) born in a given year (e.g., 1800) with a probability of dying in the year (\(\widehat{q}_x\), estimated on our data for this cohort). It is an iterative process. At age \(x\), if \(E_x\) people are alive, the number of deaths \(D_x\) is drawn according to a binomial law \(\mathcal{B}(E_x,\widehat{q}_x)\). Next, we ask \(E_{x+1}=E_x - D_x\), that is, we consider the number of people alive at the next age as the number of people alive at the \(x\) age from which we subtract the number of deaths that have just been drawn. We then draw a new number of deaths for the next age, and so on. These iterative steps are independently reproduced in \(1,000\) samples. In each of these samples, we calculate life expectancy at birth based on the prints made. The empirical quantiles of \(0.025\) and \(0.975\) then become the bounds of the \(95\%\) confidence interval for the life expectancy at birth of the individuals in the cohort studied. The simu()
function below is used to make a draw. This will involve using simu()
iteratively to obtain different samples.
#' simu
#' @description Estime des IC par bootstrap pour les esperances de vie a la naissance
#' a partir d'une table de vie
#' @param i valeur seed
#' @param LT Table de mortalite
simu <- function(i, LT){
set.seed(i)
d_h=d_f=rep(0,111)
e_h=LT$exposition_h
e_f=LT$exposition_f
q_h=LT$qxt_h
q_f=LT$qxt_f
# Last not NA value :
ind_max_not_na_h <- max(which(!is.na(q_h)))
ind_na_h <- which(is.na(q_h))
q_h[ind_na_h[ind_na_h> ind_max_not_na_h]]=1
ind_max_not_na_f <- max(which(!is.na(q_f)))
ind_na_f <- which(is.na(q_f))
q_f[ind_na_f[ind_na_f> ind_max_not_na_f]]=1
if(any(is.na(q_h))){
loess_h <- loess(q_h~age, data = data.frame(q_h = q_h, age = 1:length(q_h)))
ind_missing_h <- which(is.na(q_h))
predicted_h <- predict(loess_h, newdata = data.frame(age = ind_missing_h))
predicted_h[predicted_h>1] <- 1
predicted_h[predicted_h<0] <- 0
q_h[ind_missing_h] <- predicted_h
}
if(any(is.na(q_f))){
loess_f <- loess(q_f~age, data = data.frame(q_f = q_f, age = 1:length(q_f)))
ind_missing_f <- which(is.na(q_f))
predicted_f <- predict(loess_f, newdata = data.frame(age = ind_missing_f))
predicted_f[predicted_f>1] <- 1
predicted_f[predicted_f<0] <- 0
q_f[ind_missing_f] <- predicted_f
}
for(t in 1:110){
D_h = rbinom(1,size=e_h[t],prob=q_h[t])
d_h[t] = D_h
e_h[t+1] = e_h[t]-d_h[t]
D_f = rbinom(1,size=e_f[t],prob=q_f[t])
d_f[t] = D_f
e_f[t+1] = e_f[t]-d_f[t]
}
s_h=e_h/e_h[1]
s_f=e_f/e_f[1]
list(E_h=sum(s_h)-.5,E_f=sum(s_f)-.5)
}# Fin de simu()
LT_h <- lifetable_fr %>% filter(Year1 == 1806, Sex == 1, TypeLT == 1)
LT_f <- lifetable_fr %>% filter(Year1 == 1806, Sex == 2, TypeLT == 1)
#'
#'@param LT Life table
#'@param ns number of samples to draw
#'@param which.x age for which to compute life expectancy
#'@param level level for the confidence interval
ex_ci_vallin <- function(LT, ns=1000, which.x=0, level=0.95){
## sex
sex <- LT$Sex %>% unique()
## ages
x <- LT$Age
## number of deaths
Dx <- LT$`d(x)`
## estimated probs
qx <- LT$`q(x)`
## number of ages
m <- nrow(LT)
## trials for binomial, rounded
Ntil <- round(Dx/qx)
## ax for last age
# last.ax <- LT$ax[m]
## simulated death counts
## from Binomial distribution
Y <- suppressWarnings(matrix(rbinom(m*ns,
Ntil,
qx),
m,ns))
## simulated probabilities
QX <- Y/Ntil
## which age?
wh <- which(x==which.x)
## for all replicates, compute life expectancy
fun.ex <- function(qx){
return(lifetable.qx(x=x, qx, sex=sex)$ex[wh])
}
exsim <- apply(QX, 2, fun.ex)
## confidence interval
CI <- quantile(exsim,
probs = c((1-level)/2,
1 - (1-level)/2))
data.frame(ex_mean = mean(exsim),
ex_inf = CI[[1]],
ex_sup = CI[[2]],
age = which.x,
sexe = sex,
annee_naissance = which.x,
stringsAsFactors = FALSE)
}# End of ex_ci_vallin()
ex_vallin <- ex_ci_vallin(LT = LT_h) %>%
bind_rows(
ex_ci_vallin(LT = LT_f)
)
ex_vallin
We propose a function, e_ic_cohorte()
, which from a year of birth, bootstrap estimates the confidence intervals for life expectancy at birth of the cohort of individuals born this year.
#' e_ic_cohorte
#' @param year_birth annee de naissance de la cohorte
#' @param ns nombre d'échantillons
#' #' @param level niveau de l'intervalle de confiance
e_ic_cohorte <- function(year_birth, ns=1000, level=0.95){
LT <- surv_geneanet_2_tot_c %>% filter(annee_naissance == year_birth)
E=lapply(1:ns,simu, LT=LT)
BE=matrix(unlist(E),nrow=2)
ex_mean <- apply(BE,1,mean)
ex_inf <- apply(BE,1,function(x) quantile(x,(1-level)/2))
ex_sup <- apply(BE,1,function(x) quantile(x,1 - (1-level)/2))
data.frame(ex_mean = ex_mean,
ex_inf = ex_inf,
ex_sup = ex_sup,
sexe = c("M", "F"),
annee_naissance = year_birth,
stringsAsFactors = FALSE)
}# Fin de e_ic_cohorte()
ex_aieux <- pblapply(1800:1804, e_ic_cohorte)
ex_aieux <- bind_rows(ex_aieux)
ex_aieux_avg <-
ex_aieux %>%
group_by(sexe) %>%
summarise(ex_mean = mean(ex_mean),
ex_inf = mean(ex_inf),
ex_sup = mean(ex_sup))
ex_aieux_avg
# A tibble: 2 x 4
sexe ex_mean ex_inf ex_sup
<chr> <dbl> <dbl> <dbl>
1 F 41.8 41.6 42.0
2 M 43.5 43.3 43.7
Here, we propose to use both methods for estimating confidence intervals.
We apply the method of Carlo Giovanni Camarda (https://sites.google.com/site/carlogiovannicamarda/r-stuff/life-expectancy-confidence-interval).
We loop on the years of birth of the ancestors, for each sex, and for each department.
annees <- 1800:1804
sexes <- c("F", "M")
depts <- sort(unique(surv_geneanet_2_depts_c$dept))
n_tot <- length(annees)*length(sexes)*length(depts)
pb <- txtProgressBar(min = 1, max = n_tot, style = 3)
i <- 1
ex_depts <- NULL
for(annee in annees){
for(sexe in sexes) {
for(dept in depts){
LT <- surv_geneanet_2_depts_c %>% filter(annee_naissance == annee, dept == dept)
tmp <- ex_ci(LT=LT, year_birth = annee, sex = sexe, dept_birth = dept)
ex_depts <- bind_rows(ex_depts, tmp)
rm(tmp, LT)
setTxtProgressBar(pb, i)
i <- i+1
}
}
}
head(ex_depts)
ex_mean ex_inf ex_sup age sexe annee_naissance dept
1 40.27501 38.32497 42.34023 0 F 1800 F01
2 42.42850 40.84798 43.98664 0 F 1800 F02
3 38.24195 36.38821 40.12682 0 F 1800 F03
4 38.34529 34.82395 41.78034 0 F 1800 F04
5 32.26114 29.72819 35.05179 0 F 1800 F05
6 47.84448 44.62942 51.02613 0 F 1800 F06
Using our method now.
#' ex_ci
#' @description Compute a confidence interval for life expectancy
#' From an initial population $E_0$ born in `year_birth`, et a probability of dying within the current year
#' $\widehat{q}_x$ previously estimated, proceeds as follows:
#' For age $x$, if $E_x$ people are still alive, the number of deaths $D_x$ is drawn from a Binomial $B(E_x,q_x)$ distribution
#' and we then posit $E_{x+1}=E_x-D_x$.
#' @param level level of the confidence interval
#' @param ns number of samples
#' @param year_birth year of birth of the cohort
#' @param dept_birth departement of birth
#' which.x <- 0; level <- 0.95; ns <- 1000; sex="M"; year_birth <- 1800 ; dept_birth <- "F29"
ex_ci_2 <- function(level=0.95, ns=1000, year_birth = 1800, dept_birth = "F29"){
LT <- surv_geneanet_2_depts_c %>% filter(annee_naissance == year_birth, dept == dept_birth)
E=lapply(1:ns,simu, LT=LT)
BE=matrix(unlist(E),nrow=2)
ex_mean <- apply(BE,1,mean)
ex_inf <- apply(BE,1,function(x) quantile(x,(1-level)/2))
ex_sup <- apply(BE,1,function(x) quantile(x,1 - (1-level)/2))
data.frame(ex_mean = ex_mean,
ex_inf = ex_inf,
ex_sup = ex_sup,
sexe = c("M", "F"),
annee_naissance = year_birth,
dept = dept_birth,
stringsAsFactors = FALSE)
} # Fin de ex_ci_2()
Since this method performs estimates for each sex in the simu()
function, we iterate only on birth years and departments.
n_tot <- length(annees)*length(depts)
pb <- txtProgressBar(min = 1, max = n_tot, style = 3)
i <- 1
ex_depts_2 <- NULL
for(annee in annees){
for(dept in depts){
tmp <- ex_ci_2(year_birth = annee, dept_birth = dept)
ex_depts_2 <- bind_rows(ex_depts_2, tmp)
rm(tmp)
setTxtProgressBar(pb, i)
i <- i+1
}
}
head(ex_depts_2)
ex_mean ex_inf ex_sup sexe annee_naissance dept
1 41.02150 39.07937 42.96377 M 1800 F01
2 39.66530 37.69183 41.63957 F 1800 F01
3 43.74457 42.35724 45.04229 M 1800 F02
4 42.40019 40.97947 43.99872 F 1800 F02
5 44.09319 42.46796 45.83167 M 1800 F03
6 37.91267 36.07012 39.75813 F 1800 F03
ex_depts_avg <-
ex_depts_2 %>%
group_by(dept, sexe) %>%
summarise(ex_mean = mean(ex_mean),
ex_inf = mean(ex_inf),
ex_sup = mean(ex_sup))
head(ex_depts_avg)
# A tibble: 6 x 5
# Groups: dept [3]
dept sexe ex_mean ex_inf ex_sup
<chr> <chr> <dbl> <dbl> <dbl>
1 F01 F 39.7 37.6 41.8
2 F01 M 40.9 38.9 42.9
3 F02 F 39.9 38.3 41.5
4 F02 M 41.0 39.5 42.4
5 F03 F 35.0 33.0 37.1
6 F03 M 39.4 37.4 41.4
We can then graph estimates of life expectancies at birth with confidence intervals.
esperance_vie_mean_ic <-
esperance_vie_mean_2 %>%
left_join(
ex_depts_avg %>%
filter(sexe == "F") %>%
rename(ex_mean_F = ex_mean, ex_inf_F = ex_inf, ex_sup_F = ex_sup) %>%
select(-sexe) %>%
left_join(
ex_depts_avg %>%
filter(sexe == "M") %>%
rename(ex_mean_M = ex_mean, ex_inf_M = ex_inf, ex_sup_M = ex_sup) %>%
select(-sexe)
)
)
esperance_vie_mean_ic %>%
mutate(outside = e_f_walle <= ex_inf_F,
inside = e_f_walle > ex_inf_F & e_f_walle < ex_sup_F) %>%
summarise(outside = sum(outside, na.rm=T),
inside = sum(inside, na.rm=T))
p_error_bar_e <-
esperance_vie_mean_ic %>%
# mutate(departement = str_c(nom_sp, " (", str_sub(dept, 2), ")")) %>%
mutate(departement = str_sub(dept, 2)) %>%
select(departement, e_h, e_f, ex_inf_M, ex_sup_M, ex_inf_F, ex_sup_F) %>%
unite(values_f, e_f, ex_inf_F, ex_sup_F, sep = "@") %>%
unite(values_h, e_h, ex_inf_M, ex_sup_M, sep = "@") %>%
gather(sexe, value, values_h, values_f) %>%
separate(value, c("e", "e_inf", "e_sup"), sep = "@", convert = TRUE) %>%
mutate(echantillon = "Geneanet") %>%
filter(!is.na(departement)) %>%
ggplot(data = ., aes(x = departement, y = e, colour = sexe)) +
geom_point() +
geom_errorbar(aes(ymin = e_inf, ymax = e_sup), width=1, size = .8) +
scale_colour_manual("Sexe", values = c("values_f" = "red", "values_h" = "blue"),
labels = c("values_f" = "Femmes", "values_h" = "Hommes")) +
geom_point(data =
esperance_vie_mean_ic %>%
# mutate(departement = str_c(nom_sp, " (", str_sub(dept, 2), ")")) %>%
mutate(departement = str_sub(dept, 2)) %>%
select(departement, e_f_walle) %>%
mutate(sexe = "values_f") %>%
rename(e = e_f_walle) %>%
mutate(echantillon = "van de Walle"),
aes(x = departement, y = e, shape = echantillon), colour = "black") +
scale_shape_manual("", values = c("van de Walle" = 17)) +
theme(axis.text.x = element_text(angle = 90, size = .6*20)) +
xlab("Département") + ylab("Espérance de vie à la naissance")
p_error_bar_e
We also produce a map representation.
esperance_vie_map_ic <-
france_1800 %>%
left_join(esperance_vie_mean_2 %>%
select(dept, e_f, e_h), by = c("departement" = "dept"))
To be able to add the confidence intervals relative to the estimates of each department, we get the coordinates of barycentres of each department.
# Barycentres
library(sp)
obtenir_spa_poly_sp <- function(un_dep){
dep_tmp <- france_1800 %>%
filter(departement == un_dep)
obtenir_poly_groupe <- function(un_groupe){
coords_group <- dep_tmp[which(dep_tmp$group %in% un_groupe), c("long", "lat")]
return(Polygons(list(Polygon(coords_group)), un_groupe))
}
dep_polys <- lapply(unique(dep_tmp$group), obtenir_poly_groupe)
dep_polys_sp <- SpatialPolygons(dep_polys, seq(1, length(dep_polys)))
dep_polys_sp
}
france_1800_sp <- lapply(unique(france_1800$departement), obtenir_spa_poly_sp)
names(france_1800_sp) <- unique(france_1800$departement)
france_1800_barycentres <- lapply(france_1800_sp, function(x) rgeos::gCentroid(x))
france_1800_barycentres <- do.call("rbind", lapply(france_1800_barycentres, function(x) x@coords))
france_1800_barycentres <- cbind(france_1800_barycentres, departement = unique(france_1800$departement))
france_1800_barycentres <- data.frame(france_1800_barycentres, stringsAsFactors = FALSE) %>% tbl_df()
rownames(france_1800_barycentres) <- NULL
france_1800_barycentres <-
france_1800_barycentres %>%
mutate(long = as.numeric(x), lat = as.numeric(y)) %>%
select(-x,-y)
The base map is as follows:
p_map <-
ggplot() +
geom_polygon(data = esperance_vie_map_ic %>%
gather(sexe, esperance, e_f, e_h) %>%
mutate(sexe = factor(sexe, levels = c("e_f", "e_h"), labels = c("Femmes", "Hommes"))),
aes(x = long, y = lat, group = group, fill = esperance)) +
facet_grid(~sexe) +
scale_x_continuous(limits = c(-6, 10)) +
scale_y_continuous(limits = c(41, 51.5)) +
scale_fill_gradient("Espérance de vie à la naissance (années)", low = "yellow", high = "red") +
coord_quickmap() +
theme(legend.position = "bottom", text = element_text(size = 6),
legend.key.height = unit(1, "line"),
legend.key.width = unit(1.5, "line"))
We add confidence intervals:
val_longueur <- 1
val_corresp <- .1
longueur_ic <-
ex_depts_avg %>%
left_join(france_1800_barycentres, by = c("dept" = "departement")) %>%
mutate(longueur_ic_tx = ex_sup - ex_inf) %>%
mutate(
lat_beg = lat - (val_corresp * longueur_ic_tx/val_longueur)/2,
lat_2 = lat + (val_corresp * longueur_ic_tx/val_longueur)/2) %>%
mutate(sexe = factor(sexe, levels = c("F", "M"), labels = c("Femmes", "Hommes")))
legende_x <- -4.5
legende_y <- 43
legende_min <- 5
legende_max <- 10
p_map <-
p_map +
geom_errorbar(data = longueur_ic,
aes(x = long, ymin = lat_beg, ymax = lat_2), colour = "black", size = 1.5, width=0.2) +
geom_errorbar(data = longueur_ic,
aes(x = long, ymin = lat_beg, ymax = lat_2), colour = "white", size = 1, width=0.1) +
geom_errorbar(data = data.frame(long = legende_x, lat = legende_y, longueur_ic_tx = c(legende_min, legende_max)) %>%
mutate(lat_2 = lat + val_corresp * longueur_ic_tx/val_longueur),
aes(x = long, ymin = lat, ymax = lat_2), colour = "black", size = 1.5, width=0.2) +
geom_errorbar(data = data.frame(long = legende_x, lat = legende_y, longueur_ic_tx = c(legende_min, legende_max)) %>%
mutate(lat_2 = lat + val_corresp * longueur_ic_tx/val_longueur),
aes(x = long, ymin = lat, ymax = lat_2), colour = "white", size = 1, width=0.1) +
geom_text(data = data.frame(long = legende_x + .5, lat = legende_y, longueur_ic_tx = c(legende_min, legende_max)) %>%
mutate(lat_2 = lat + val_corresp * longueur_ic_tx/val_longueur),
aes(x = long, y = lat_2, label = longueur_ic_tx))
p_map
We can explore the spatial aspects of genealogy data in more detail. We start by loading some packages, as well as maps of France and genealogy data specific to the sub-sample of individuals likely to be of migrating age.
library(tidyverse)
library(stringr)
library(data.table)
library(pbapply)
library(dplyrExtras)
library(sp)
load("liste_depts.rda")
load("france.rda")
load("france_1800.rda")
liste_depts <-
liste_depts %>%
mutate(dept2 = dept,
dept2 = ifelse(dept2 %in% c("F78", "F91", "F92", "F95"), yes = "F78", no = dept2),
dept2 = ifelse(dept2 %in% c("F93", "F94", "F77"), yes = "F77", no = dept2)
)
# Les données de généalogie
load("generations_mobilite.rda")
generations <- generations_mobilite
rm(generations_mobilite)
We will need to define in which department certain points marked by their coordinates (longitude, latitude) are located. We redefine two functions to transform data tables containing departmental contours into polygons.
library(rgeos)
#' group_to_gpc_poly
#' Returns the polygon from a group within the data frame containing
#' the coordinates of regions
#' @df: data frame with the coordinates of the polygons
#' @group: (string) group ID
group_to_gpc_poly <- function(df, group){
group_coords <- df[df$group == group, c("long", "lat")]
as(group_coords, "gpc.poly")
}
#' departement_polygone
#' Returns a French region as a gpc.poly object
#' un_dep: (string) code of the region
departement_polygone <- function(un_dep){
dep_tmp <-
france_1800 %>%
filter(departement == un_dep)
df_tmp_polys <- lapply(unique(dep_tmp$group), group_to_gpc_poly, df = dep_tmp)
df_tmp_polys_tot <- df_tmp_polys[[1]]
if(length(df_tmp_polys)>1){
for(i in 2:length(df_tmp_polys)){
df_tmp_polys_tot <- gpclib::union(df_tmp_polys_tot, df_tmp_polys[[i]])
}
}
df_tmp_polys_tot
}# End of departement_polygone()
france_1800_gpc <- lapply(unique(france_1800$departement), departement_polygone)
names(france_1800_gpc) <- unique(france_1800$departement)
save(france_1800_gpc, file = "france_1800_gpc.rda")
We create a function based on another, namely inside.gpc.poly()
of the package surveillance
, to determine, from coordinate vectors (longitude, latitude) and a department in polygon form, which are the points within the department.
est_dans_poly_naissance <- function(longitude, latitude, departement){
surveillance::inside.gpc.poly(x = longitude,
y = latitude,
departement,
mode.checked = FALSE)
}
generations %>%
filter(dist_enfants_km >0) %>%
.$dist_enfants_km %>%
summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.004 6.452 23.126 201.593 219.673 16895.501
generations %>%
filter(dist_p_enfants_km >0) %>%
.$dist_p_enfants_km %>%
summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.020 6.523 27.075 269.284 242.784 19190.069
generations %>%
filter(dist_ap_enfants_km >0) %>%
.$dist_ap_enfants_km %>%
summary()
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.031 7.881 40.540 316.859 273.826 19190.069
We define two variables relating to the distance between the ancestors and their descendants: one indicating if the distance separating the places of birth between an ancestor and his descendants is less than 10 km or not, and another to know if this distance is less than 20 km or not.
generations <-
generations %>%
mutate(moins_10_enfants = dist_enfants_km <= 10,
moins_20_enfants = dist_enfants_km <= 20,
moins_10_p_enfants = dist_p_enfants_km <= 10,
moins_20_p_enfants = dist_p_enfants_km <= 20,
moins_10_ap_enfants = dist_ap_enfants_km <= 10,
moins_20_ap_enfants = dist_ap_enfants_km <= 20)
generations <- generations %>% data.table()
We look, for each generation, at the percentage of descendants born within 10 or 20km of their ancestor’s birthplace.
tableau_distances_1 <-
generations %>%
select(id_aieul, id_enfant, moins_10_enfants, moins_20_enfants, dist_enfants_km) %>%
rename(moins_10 = moins_10_enfants,
moins_20 = moins_20_enfants,
dist_km = dist_enfants_km) %>%
unique() %>%
summarise(
moins_10_not_na = sum(!is.na(moins_10)),
moins_10 = sum(moins_10, na.rm=T),
moins_20_not_na = sum(!is.na(moins_20)),
moins_20 = sum(moins_20, na.rm=T),
nas = sum(is.na(dist_km), na.rm=T),
nb = n()
) %>%
mutate(generation = 1,
moins_10_pct = (moins_10 / moins_10_not_na * 100) %>% round(2),
moins_20_pct = (moins_20 / moins_20_not_na * 100) %>% round(2))
tableau_distances_2 <-
generations %>%
select(id_aieul, id_p_enfant, moins_10_p_enfants, moins_20_p_enfants, dist_p_enfants_km) %>%
rename(moins_10 = moins_10_p_enfants,
moins_20 = moins_20_p_enfants,
dist_km = dist_p_enfants_km) %>%
unique() %>%
summarise(
moins_10_not_na = sum(!is.na(moins_10)),
moins_10 = sum(moins_10, na.rm=T),
moins_20_not_na = sum(!is.na(moins_20)),
moins_20 = sum(moins_20, na.rm=T),
nas = sum(is.na(dist_km), na.rm=T),
nb = n()
) %>%
mutate(generation = 2,
moins_10_pct = (moins_10 / moins_10_not_na * 100) %>% round(2),
moins_20_pct = (moins_20 / moins_20_not_na * 100) %>% round(2))
tableau_distances_3 <-
generations %>%
select(id_aieul, id_ap_enfant, moins_10_ap_enfants, moins_20_ap_enfants, dist_ap_enfants_km) %>%
rename(moins_10 = moins_10_ap_enfants,
moins_20 = moins_20_ap_enfants,
dist_km = dist_ap_enfants_km) %>%
unique() %>%
summarise(
moins_10_not_na = sum(!is.na(moins_10)),
moins_10 = sum(moins_10, na.rm=T),
moins_20_not_na = sum(!is.na(moins_20)),
moins_20 = sum(moins_20, na.rm=T),
nas = sum(is.na(dist_km), na.rm=T),
nb = n()
) %>%
mutate(generation = 3,
moins_10_pct = (moins_10 / moins_10_not_na * 100) %>% round(2),
moins_20_pct = (moins_20 / moins_20_not_na * 100) %>% round(2))
We aggregate these three tables into one:
tableau_distances <-
tableau_distances_1 %>%
bind_rows(tableau_distances_2) %>%
bind_rows(tableau_distances_3)
tableau_distances
moins_10_not_na moins_10 moins_20_not_na moins_20 nas nb generation
1: 26384 20596 26384 21754 1132 27516 1
2: 32793 19265 32793 21899 1705 34498 2
3: 66878 29065 66878 34990 7038 73916 3
moins_10_pct moins_20_pct
1: 78.06 82.45
2: 58.75 66.78
3: 43.46 52.32
To have an idea of the distribution of the distances separating the places of birth of the descendants from that of their ancestors, we prepare tables, for each generation, indicating for each individual the distances in kilometers, the sex, as well as the generation to which each individual belongs.
enfants <-
generations %>%
select(id_enfant, long_N_enfants, lat_N_enfants, sexe_enfants,
dept_N_enfants, moins_10_enfants, moins_20_enfants,
dist_enfants_km) %>%
mutate(generation = 1) %>%
unique()
colnames(enfants) <- str_replace(colnames(enfants), "\\_enfants?", "")
enfants <- data.table(enfants)
p_enfants <-
generations %>%
select(id_p_enfant, long_N_p_enfants, lat_N_p_enfants, sexe_p_enfants,
dept_N_p_enfants, moins_10_p_enfants, moins_20_p_enfants,
dist_p_enfants_km) %>%
mutate(generation = 2) %>%
unique()
colnames(p_enfants) <- str_replace(colnames(p_enfants), "\\_p_enfants?", "")
p_enfants <- data.table(p_enfants)
ap_enfants <-
generations %>%
select(id_ap_enfant, long_N_ap_enfants, lat_N_ap_enfants, sexe_ap_enfants,
dept_N_ap_enfants, moins_10_ap_enfants, moins_20_ap_enfants,
dist_ap_enfants_km) %>%
mutate(generation = 3) %>%
unique()
colnames(ap_enfants) <- str_replace(colnames(ap_enfants), "\\_ap_enfants?", "")
ap_enfants <- data.table(ap_enfants)
We merge these three paintings into one:
generations_2 <-
enfants %>%
bind_rows(p_enfants) %>%
bind_rows(ap_enfants)
We remove the missing values. In addition, we create a distance variable expressed in logarithms. This gives an idea of the distribution of distances for those who migrate, with the overwhelming majority of sedentary people.
generations_2 <- generations_2[!is.na(dist_km)]
generations_2 <- generations_2[, dist_km_2 := dist_km+1]
generations_2 <- generations_2[, dist_km_log := log(dist_km+1)]
We format the data table so that it can be used by the functions of the package ggplot2
.
stat_des_distances <-
generations_2 %>%
select(generation, dist_km) %>%
tbl_df() %>%
mutate(type = ifelse(dist_km == 0, yes = "Sédentaire", no = NA),
type = ifelse(dist_km<=20 & dist_km >0, yes = "Courte", no = type),
type = ifelse(dist_km>20 , yes = "Longue", no = type))
An overview of the distribution between sedentary, short-distance and long-distance migrants can be obtained using the following instructions:
stat_des_distances %>%
group_by(type) %>%
summarise(nb_obs = n()) %>%
mutate(pct = nb_obs / sum(nb_obs)*100) %>%
knitr::kable(digits=2)
type | nb_obs | pct |
---|---|---|
Courte | 30207 | 25.62 |
Longue | 45496 | 38.59 |
Sédentaire | 42203 | 35.79 |
And differentiating according to the generation (in columns):
stat_des_distances %>%
filter(!is.na(type)) %>%
group_by(generation, type) %>%
summarise(nb_obs = n()) %>%
group_by(generation) %>%
mutate(pct = nb_obs / sum(nb_obs) * 100) %>%
select(-nb_obs) %>%
spread(generation, pct) %>%
knitr::kable(digits=2)
type | 1 | 2 | 3 |
---|---|---|---|
Courte | 19.43 | 27.70 | 27.06 |
Longue | 18.40 | 34.24 | 48.77 |
Sédentaire | 62.17 | 38.06 | 24.17 |
We want to visualize the distribution of distances, by generation. First, we rely on the scaling parameters used when calling the hist()
function of the graphics
package.
hist_dist <- hist(generations_2$dist_km_log, plot=FALSE)
In a second step, we create a function returning for each bar of the histogram, the values in abscissa and ordinate, these first reflecting the distances, these seconds representing the percentage of observations for each given distance.
#' hist_gen
#' Valeurs pour l'histogramme des distances, pour une generation
#' @une_gen (int) : numéro de la génération
hist_gen <- function(une_gen){
hist_tmp <-
generations_2 %>% filter(generation == une_gen) %>%
magrittr::extract2("dist_km_log") %>%
hist(plot=F, breaks = hist_dist$breaks)
data.frame(x = hist_tmp$mids, counts = hist_tmp$counts) %>%
mutate(pct = counts / sum(counts),
generation = une_gen) %>%
tbl_df()
}# Fin de hist_gen()
We apply this function to each of the three generations at our disposal.
distances_gen_hist <-
lapply(unique(generations_2$generation), hist_gen) %>%
bind_rows()
Finally, we can create and display histograms.
p_dist_histo <-
ggplot(data = distances_gen_hist %>%
mutate(generation = factor(generation,
levels = c(0, 1, 2, 3),
labels = c('Aïeux', "Enfants", "Petits-enfants", "Arrière-petits-enfants"))),
aes(x = exp(x), y = pct)) +
geom_bar(stat = "identity") +
facet_wrap(~generation, nrow = 2, scales = "free_x") +
scale_x_log10(breaks=c(0, 1, 2, 5, 10, 25, 50, 100, 500, 2500, 10000)) +
ylab(NULL) + xlab("Distance (km)")
p_dist_histo
We can also look at intergenerational travel in France according to gender. To do this, we propose to look at this on a table indicating distances in rows, generations and sex in columns.
# Définition des classes de distances
cut_dist <- c(0:25, 50, 75, 100, 150, 200, +Inf)
cut(seq(0, 5), cut_dist, ordered_result = TRUE, include.lowest = TRUE)
[1] [0,1] [0,1] (1,2] (2,3] (3,4] (4,5]
31 Levels: [0,1] < (1,2] < (2,3] < (3,4] < (4,5] < (5,6] < ... < (200,Inf]
generations_dist_table <-
generations_2 %>%
filter(!is.na(dist_km), !is.na(sexe), !is.na(generation)) %>%
mutate(dist_km_rounded = round(dist_km)) %>%
select(generation, sexe, dist_km_rounded) %>%
mutate(dist_km_categ = cut(dist_km_rounded, cut_dist, ordered_result = TRUE, include.lowest=TRUE)) %>%
tbl_df() %>%
group_by(generation, sexe, dist_km_categ) %>%
tally() %>%
group_by(generation, sexe) %>%
mutate(pct = n/sum(n) * 100,
pct_cumul = cumsum(pct)) %>%
mutate(pct = round(pct, 2),
pct_cumul = round(pct_cumul, 2))
generations_dist_table_tex <-
generations_dist_table %>%
select(generation, sexe, dist_km_categ, pct, pct_cumul) %>%
ungroup() %>%
mutate(sexe = factor(sexe, levels = c(2,1), labels = c("Femmes", "Hommes"))) %>%
mutate(value = str_c(pct, " (", pct_cumul, ")")) %>%
select(-pct, -pct_cumul) %>%
dcast(., dist_km_categ~generation + sexe, value.var = "value")
generations_dist_table_tex %>%
knitr::kable(digits=2)
dist_km_categ | 1_Femmes | 1_Hommes | 2_Femmes | 2_Hommes | 3_Femmes | 3_Hommes |
---|---|---|---|---|---|---|
[0,1] | 63.74 (63.74) | 62.24 (62.24) | 39.07 (39.07) | 38.54 (38.54) | 25.21 (25.21) | 24.32 (24.32) |
(1,2] | 1.82 (65.55) | 1.55 (63.78) | 1.88 (40.94) | 1.88 (40.42) | 1.82 (27.03) | 1.64 (25.96) |
(2,3] | 2.15 (67.7) | 2.26 (66.04) | 2.71 (43.66) | 2.66 (43.08) | 2.26 (29.29) | 2.2 (28.16) |
(3,4] | 2.41 (70.11) | 2.15 (68.19) | 2.89 (46.55) | 3.19 (46.27) | 2.66 (31.95) | 2.72 (30.88) |
(4,5] | 2.08 (72.19) | 2.1 (70.28) | 2.67 (49.21) | 2.7 (48.97) | 2.39 (34.33) | 2.59 (33.47) |
(5,6] | 1.51 (73.7) | 1.88 (72.16) | 2.57 (51.78) | 2.39 (51.35) | 2.34 (36.67) | 2.38 (35.85) |
(6,7] | 1.37 (75.07) | 1.5 (73.66) | 1.91 (53.69) | 2.18 (53.54) | 2.05 (38.72) | 2.04 (37.89) |
(7,8] | 1.25 (76.31) | 1.24 (74.9) | 1.61 (55.29) | 1.75 (55.28) | 1.8 (40.52) | 1.62 (39.52) |
(8,9] | 0.92 (77.23) | 1.09 (75.99) | 1.46 (56.76) | 1.54 (56.82) | 1.6 (42.12) | 1.55 (41.07) |
(9,10] | 0.82 (78.05) | 1.11 (77.1) | 1.3 (58.05) | 1.53 (58.36) | 1.47 (43.59) | 1.29 (42.36) |
(10,11] | 0.64 (78.69) | 0.78 (77.88) | 1.16 (59.21) | 1.14 (59.5) | 1.29 (44.88) | 1.34 (43.7) |
(11,12] | 0.69 (79.39) | 0.49 (78.36) | 0.91 (60.12) | 0.97 (60.47) | 1.11 (45.99) | 1.01 (44.71) |
(12,13] | 0.49 (79.87) | 0.52 (78.88) | 0.98 (61.1) | 0.93 (61.4) | 1.14 (47.13) | 1.03 (45.74) |
(13,14] | 0.35 (80.22) | 0.42 (79.3) | 0.8 (61.9) | 0.95 (62.35) | 0.84 (47.97) | 0.89 (46.63) |
(14,15] | 0.27 (80.49) | 0.44 (79.74) | 0.67 (62.57) | 0.79 (63.14) | 0.85 (48.82) | 0.81 (47.44) |
(15,16] | 0.43 (80.92) | 0.39 (80.13) | 0.8 (63.37) | 0.6 (63.74) | 0.89 (49.71) | 0.78 (48.22) |
(16,17] | 0.33 (81.25) | 0.38 (80.51) | 0.77 (64.14) | 0.73 (64.47) | 0.69 (50.4) | 0.68 (48.9) |
(17,18] | 0.35 (81.6) | 0.32 (80.83) | 0.57 (64.71) | 0.63 (65.11) | 0.66 (51.07) | 0.73 (49.63) |
(18,19] | 0.26 (81.86) | 0.31 (81.14) | 0.5 (65.21) | 0.67 (65.78) | 0.65 (51.71) | 0.69 (50.31) |
(19,20] | 0.27 (82.13) | 0.32 (81.45) | 0.49 (65.69) | 0.5 (66.28) | 0.64 (52.35) | 0.57 (50.88) |
(20,21] | 0.26 (82.39) | 0.22 (81.67) | 0.45 (66.14) | 0.43 (66.71) | 0.5 (52.85) | 0.55 (51.43) |
(21,22] | 0.24 (82.64) | 0.25 (81.92) | 0.46 (66.6) | 0.35 (67.06) | 0.44 (53.29) | 0.52 (51.95) |
(22,23] | 0.24 (82.88) | 0.28 (82.19) | 0.34 (66.94) | 0.34 (67.4) | 0.41 (53.7) | 0.54 (52.49) |
(23,24] | 0.16 (83.04) | 0.29 (82.48) | 0.46 (67.4) | 0.33 (67.73) | 0.43 (54.13) | 0.4 (52.89) |
(24,25] | 0.09 (83.13) | 0.19 (82.67) | 0.28 (67.67) | 0.25 (67.98) | 0.35 (54.48) | 0.32 (53.22) |
(25,50] | 3 (86.13) | 2.51 (85.19) | 5.36 (73.03) | 5.07 (73.05) | 6.97 (61.46) | 7.07 (60.29) |
(50,75] | 1.27 (87.41) | 1.57 (86.75) | 2.7 (75.74) | 2.84 (75.89) | 3.95 (65.41) | 3.68 (63.97) |
(75,100] | 1.11 (88.51) | 1 (87.75) | 1.82 (77.56) | 1.84 (77.72) | 2.71 (68.11) | 2.63 (66.61) |
(100,150] | 1.59 (90.1) | 1.72 (89.47) | 2.78 (80.34) | 2.83 (80.55) | 4.08 (72.19) | 4.23 (70.83) |
(150,200] | 1.34 (91.44) | 1.48 (90.95) | 2.49 (82.83) | 2.82 (83.38) | 3.5 (75.7) | 3.75 (74.58) |
(200,Inf] | 8.56 (100) | 9.05 (100) | 17.17 (100) | 16.62 (100) | 24.3 (100) | 25.42 (100) |
We will take a look at migration between cities, depending on the type of city. The General Statistics of France (SGF) provides statistics for certain communes: https://www.insee.fr/fr/statistiques/2591293?sommaire=2591397. We decide to divide the communes into several categories, using the principle of Fleury & Henry (1958) and Blayo (1975), which consists in segmenting the communes according to their number of inhabitants. We use the population statistics of 1836 to define, among the communes present in the SGF data:
We consider that municipalities that are not included in the SGF data should be classified in a separate category, which we call “very small towns”.
We load data from SGF:
library(stringi)
pop_chef_lieux <- readxl::read_excel("../data/population_insee/TERR_T116.xls", skip=7)
pop_chef_lieux <-
pop_chef_lieux %>%
filter(`Code de l'unité géographique` == 3) %>%
select(`Code du département`, `Nom de l'unité d'analyse`,
`Population totale des villes chefs-lieux d'arrondissement en 1836`) %>%
rename(code_dep = `Code du département`, nom_insee = `Nom de l'unité d'analyse`,
pop_1836 = `Population totale des villes chefs-lieux d'arrondissement en 1836`) %>%
mutate(nom = stri_replace_all_regex(nom_insee, "\\((.*?)\\)", "") %>%
stri_trans_general(., "Latin-ASCII") %>%
stri_replace_all_regex(., "'|\\(|\\)|[[:blank:]]|\\?|_|\\*", "") %>%
stri_trim(.) %>%
str_to_lower()) %>%
mutate(code_dep = ifelse(code_dep %in% c("78", "91", "92", "95"), yes = "78", no = code_dep),
code_dep = ifelse(code_dep %in% c("93", "94", "77"), yes = "77", no = code_dep),
code_dep = ifelse(code_dep %in% c("2A", "2B"), yes = "20", no = code_dep))
head(pop_chef_lieux)
# A tibble: 6 x 4
code_dep nom_insee pop_1836 nom
<chr> <chr> <dbl> <chr>
1 01 BELLEY 3970 belley
2 01 BOURG 9528 bourg
3 01 NANTUA 3696 nantua
4 01 TREVOUX 2559 trevoux
5 02 CHATEAU-THIERRY 4761 chateau-thierry
6 02 LAON 8230 laon
We add Nice by hand…
pop_chef_lieux <-
pop_chef_lieux %>%
bind_rows(data.frame(code_dep = "06", nom_insee = "NICE", pop_1836 = 33811, nom = "nice", stringsAsFactors = FALSE))
We will add additional data, which could then be used, in particular the surface of the communes. These data are proposed by Open Street Map (OSM), an appendix makes it possible to understand how we obtain them.
load("../shp_OSM/communes_sans_cercle.rda")
communes_osm <- lapply(communes_sans_cercle, function(x) x@data) %>% bind_rows() %>% tbl_df()
communes_osm <-
communes_osm %>%
mutate(code_dep = str_sub(insee, 1, 2)) %>%
rename(nom_osm = nom) %>%
mutate(nom = stri_replace_all_regex(nom_osm, "\\((.*?)\\)", "") %>%
stri_trans_general(., "Latin-ASCII") %>%
stri_replace_all_regex(., "'|\\(|\\)|[[:blank:]]|\\?|_|\\*", "") %>%
stri_trim(.) %>%
str_to_lower()) %>%
mutate(code_dep = ifelse(code_dep %in% c("78", "91", "92", "95"), yes = "78", no = code_dep),
code_dep = ifelse(code_dep %in% c("93", "94", "77"), yes = "77", no = code_dep),
code_dep = ifelse(code_dep %in% c("2A", "2B"), yes = "20", no = code_dep))
head(communes_osm)
The thorny point here is to match the SGF bases with those of OSM. The simplest would be to use INSEE codes, but they are not provided by SGF. We then decide to use the department code and the name of the commune. However, the matching is not complete, some spellings differ depending on the source of the data.
pop_chef_lieux %>%
anti_join(communes_osm) %>%
head()
# A tibble: 6 x 4
code_dep nom_insee pop_1836 nom
<chr> <chr> <dbl> <chr>
1 01 BOURG 9528 bourg
2 03 MOULINS-SUR-ALLIER 15231 moulins-sur-allier
3 04 DIGNE 6365 digne
4 07 TOURNON 4174 tournon
5 08 MEZIERES 4083 mezieres
6 08 ROCROY 3682 rocroy
We then create a practical function to identify potential matches.
chercher_match_commune <- function(i){
(tmp <- pop_chef_lieux %>%
anti_join(communes_osm) %>%
slice(i)
)
cat(str_c("Je cherche pour : ", tmp$nom, "\n"))
print(tmp)
res_tmp <-
communes_osm %>%
filter(code_dep == tmp$code_dep) %>%
filter(str_sub(nom, 1, 4) == str_sub(tmp$nom, 1, 4)) %>%
arrange(nom)
if(nrow(res_tmp) == 0){
print("Pas trouvé... Je cherche plus loin\n")
res_tmp <- communes_osm %>%
filter(code_dep == tmp$code_dep) %>%
filter(str_detect(nom, tmp$nom)) %>%
arrange(nom)
}
res_tmp
}
Using this function is simple:
chercher_match_commune(1)
Je cherche pour : bourg
# A tibble: 1 x 4
code_dep nom_insee pop_1836 nom
<chr> <chr> <dbl> <chr>
1 01 BOURG 9528 bourg
# A tibble: 2 x 6
insee nom_osm wikipedia surf_ha code_dep nom
<chr> <chr> <chr> <dbl> <chr> <chr>
1 01053 Bourg-en-Bresse fr:Bourg-en-Br… 2393 01 bourg-en-…
2 01054 Bourg-Saint-Christophe fr:Bourg-Saint… 900 01 bourg-sai…
We make the following corrections, using successively the function chercher_match_commune()
:
a_remplacer_noms <-
c("bourg", "bourg-en-bresse",
"moulins-sur-allier", "moulins",
"digne", "digne-les-bains",
"tournon", "tournon-sur-rhone",
"mezieres", "charleville-mezieres",
"rocroy", "rocroi",
"milhau", "millau",
"aix", "aix-en-provence",
"arles-sur-rhone", "arles",
"vire", "virenormandie",
"barbezieux", "barbezieux-saint-hilaire",
"st-jean-dangely", "saint-jean-dangely",
"st-amand-montrond", "saint-amand-montrond",
"brive", "brive-la-gaillarde",
"semur", "semur-en-auxois",
"guincamp", "guingamp",
"sarlat", "sarlat-la-caneda",
"beaume-les-dames", "baume-les-dames",
"montelimart", "montelimar",
"alais", "ales",
"lesparre", "lesparre-medoc",
"saint-pons", "saint-pons-de-thomieres",
"montfort", "montfort-sur-meu",
"romorantin", "romorantin-lanthenay",
"lepuy", "lepuy-en-velay",
"yssengeaux", "yssingeaux",
"florac", "florac-trois-rivieres",
"bauge", "bauge-en-anjou",
"beaupreau", "beaupreau-en-mauges",
"segre", "segre-en-anjou-bleu",
"cherbourg", "cherbourg-en-cotentin",
"mortain", "mortain-bocage",
"chalons-sur-marne", "chalons-en-champagne",
"vitry", "vitry-le-francois",
"vassy", "wassy",
# "briey", "valdebriey",
"cosne", "cosne-cours-sur-loire",
"avesnes", "avesnes-sur-helpe",
"domfront", "domfront-en-poiraie",
"mortagne", "mortagne-au-perche",
"montreuil-sur-mer", "montreuil",
"saint-pol", "saint-pol-sur-ternoise",
"mauleon", "mauleon-licharre",
"oloron", "oloron-sainte-marie",
"argeles", "argeles-gazost",
"bagneres", "bagneres-de-bigorre",
"schelestadt", "selestat",
"mantes", "mantes-la-jolie",
"castel-sarrazin", "castelsarrasin",
"saint-yrieix", "saint-yrieix-la-perche",
"saint-die", "saint-die-des-vosges",
"corbeil", "corbeil-essonnes",
"briey", "valdebriey"
) %>%
matrix(byrow=TRUE, ncol=2) %>%
data.frame(stringsAsFactors = FALSE)
colnames(a_remplacer_noms) <- c("nom", "nouveau_nom")
And we reflect these modifications on pop_chef_lieux
:
pop_chef_lieux <-
pop_chef_lieux %>%
left_join(a_remplacer_noms) %>%
mutate(nom = ifelse(!is.na(nouveau_nom), yes = nouveau_nom, no = nom)) %>%
select(-nouveau_nom)
We are also changing the information for some recently created departments.
a_remplacer_noms_dep <-
c("briey", "54",
"saint-denis", "77",
"sceaux", "78",
"corbeil", "78",
"etampes", "78",
"pontoise", "78",
"grasse", "06",
"chateau-salins", "57",
"luneville", "54",
"nancy", "54",
"sarrebourg", "57",
"toul", "54") %>%
matrix(byrow=TRUE, ncol=2) %>%
data.frame(stringsAsFactors = FALSE)
colnames(a_remplacer_noms_dep) <- c("nom", "nouveau_dep")
pop_chef_lieux <-
pop_chef_lieux %>%
left_join(a_remplacer_noms_dep) %>%
mutate(code_dep = ifelse(!is.na(nouveau_dep), yes = nouveau_dep, no = code_dep)) %>%
select(-nouveau_dep)
Matching between OSM and SGF data can thus be done:
villes_francaises <-
pop_chef_lieux %>%
left_join(communes_osm) %>%
filter(!is.na(pop_1836))
villes_francaises %>% arrange(-pop_1836) %>% data.frame() %>% slice(1:12)
# A tibble: 12 x 8
code_dep nom_insee pop_1836 nom insee nom_osm wikipedia surf_ha
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 75 PARIS 909126 paris 75056 Paris fr:Paris 10537
2 69 LYON 150814 lyon 69123 Lyon fr:Lyon 4798
3 13 MARSEILLE 146239 marseille 13055 Marsei… fr:Marse… 24189
4 33 BORDEAUX 98705 bordeaux 33063 Bordea… fr:Borde… 4984
5 76 ROUEN 92083 rouen 76540 Rouen fr:Rouen 2139
6 31 TOULOUSE 77372 toulouse 31555 Toulou… fr:Toulo… 11802
7 44 NANTES 75895 nantes 44109 Nantes fr:Nantes 6579
8 59 LILLE 72005 lille 59350 Lille fr:Lille 3483
9 67 STRASBOURG 57885 strasbourg 67482 Strasb… fr:Stras… 7828
10 80 AMIENS 46129 amiens 80021 Amiens fr:Amiens 5007
11 30 NIMES 43036 nimes 30189 Nîmes fr:Nîmes 16170
12 57 METZ 42793 metz 57463 Metz fr:Metz 4176
We calculate the population density (number of inhabitants per square kilometre), and define the three types of cities among the municipalities cited by the SGF.
villes_francaises <-
villes_francaises %>%
# Densite au km2
mutate(densite_pop_1836 = pop_1836 / (surf_ha * 0.01)) %>%
mutate(type_commune = "petite",
type_commune = ifelse(pop_1836 >= 10000, yes = "moyenne", no = type_commune),
type_commune = ifelse(pop_1836 >= 50000, yes = "grande", no = type_commune)
) %>%
filter(!is.na(insee))
We use this classification to segment the communes of birth of the individuals composing our sample:
# Aïeux
types_communes_aieul <-
villes_francaises %>%
select(insee, type_commune) %>%
rename(code_insee_N_aieul = insee, type_commune_aieul = type_commune) %>%
data.table()
setkey(types_communes_aieul, code_insee_N_aieul)
setkey(generations, code_insee_N_aieul)
generations <- types_communes_aieul[generations]
# Enfants
types_communes_enfants <-
villes_francaises %>%
select(insee, type_commune) %>%
rename(code_insee_N_enfants = insee, type_commune_enfants = type_commune) %>%
data.table()
setkey(types_communes_enfants, code_insee_N_enfants)
setkey(generations, code_insee_N_enfants)
generations <- types_communes_enfants[generations]
# Petits-enfants
types_communes_p_enfants <-
villes_francaises %>%
select(insee, type_commune) %>%
rename(code_insee_N_p_enfants = insee, type_commune_p_enfants = type_commune) %>%
data.table()
setkey(types_communes_p_enfants, code_insee_N_p_enfants)
setkey(generations, code_insee_N_p_enfants)
generations <- types_communes_p_enfants[generations]
# Arrière-petits-enfants
types_communes_ap_enfants <-
villes_francaises %>%
select(insee, type_commune) %>%
rename(code_insee_N_ap_enfants = insee, type_commune_ap_enfants = type_commune)%>%
data.table()
setkey(types_communes_ap_enfants, code_insee_N_ap_enfants)
setkey(generations, code_insee_N_ap_enfants)
generations <- types_communes_ap_enfants[generations]
Then, we add the modality “very small town” to the municipalities not mentioned in the SGF data.
generations$type_commune_aieul[which(is.na(generations$type_commune_aieul))] <- "autres"
generations$type_commune_enfants[which(is.na(generations$type_commune_enfants))] <- "autres"
generations$type_commune_p_enfants[which(is.na(generations$type_commune_p_enfants))] <- "autres"
generations$type_commune_ap_enfants[which(is.na(generations$type_commune_ap_enfants))] <- "autres"
generations <-
generations %>% tbl_df() %>%
mutate(type_commune_aieul = factor(type_commune_aieul, levels = c("grande", "moyenne", "petite", "autres")),
type_commune_enfants = factor(type_commune_enfants, levels = c("grande", "moyenne", "petite", "autres")),
type_commune_p_enfants = factor(type_commune_p_enfants, levels = c("grande", "moyenne", "petite", "autres")),
type_commune_ap_enfants = factor(type_commune_ap_enfants, levels = c("grande", "moyenne", "petite", "autres"))) %>%
data.table()
So we can start to come up with some statistics. We look at the proportion of individuals in each type of city, by generation:
prop_villes_gen <-
generations %>%
tbl_df() %>%
select(id_aieul, type_commune_aieul) %>%
unique() %>%
group_by(type_commune_aieul) %>%
tally() %>%
mutate(generation = "0") %>%
mutate(pct = n / sum(n)) %>%
rename(type_commune = type_commune_aieul) %>%
bind_rows(
generations %>%
tbl_df() %>%
select(id_enfant, type_commune_enfants) %>%
unique() %>%
group_by(type_commune_enfants) %>%
tally() %>%
mutate(generation = "1") %>%
mutate(pct = n / sum(n)) %>%
rename(type_commune = type_commune_enfants)
) %>%
bind_rows(
generations %>%
tbl_df() %>%
select(id_p_enfant, type_commune_p_enfants) %>%
unique() %>%
group_by(type_commune_p_enfants) %>%
tally() %>%
mutate(generation = "2") %>%
mutate(pct = n / sum(n)) %>%
rename(type_commune = type_commune_p_enfants)
) %>%
bind_rows(
generations %>%
tbl_df() %>%
select(id_ap_enfant, type_commune_ap_enfants) %>%
unique() %>%
group_by(type_commune_ap_enfants) %>%
tally() %>%
mutate(generation = "3") %>%
mutate(pct = n / sum(n)) %>%
rename(type_commune = type_commune_ap_enfants)
)
The result is as follows:
prop_villes_gen %>% select(-n) %>%
mutate(pct = pct*100) %>% spread(generation, pct) %>%
knitr::kable(digits=2)
type_commune | 0 | 1 | 2 | 3 |
---|---|---|---|---|
grande | 3.40 | 3.92 | 6.31 | 7.63 |
moyenne | 5.18 | 5.25 | 6.55 | 7.86 |
petite | 3.43 | 3.56 | 3.73 | 3.81 |
autres | 87.99 | 87.27 | 83.41 | 80.70 |
We then look at conditional transient movements between small, medium and large cities, in percentages.
# Aieux --> enfants
villes_aieul_enfant <-
generations %>%
select(id_aieul, id_enfant, type_commune_aieul, type_commune_enfants) %>%
unique()
villes_aieul_enfant <-
xtabs(~type_commune_aieul + type_commune_enfants, villes_aieul_enfant)
# Aieux --> petits-enfants
villes_aieul_p_enfant <-
generations %>%
select(id_aieul, id_p_enfant, type_commune_aieul, type_commune_p_enfants) %>%
unique()
villes_aieul_p_enfant <-
xtabs(~type_commune_aieul + type_commune_p_enfants, villes_aieul_p_enfant)
# Aieux --> arrière-petits-enfants
villes_aieul_ap_enfant <-
generations %>%
select(id_aieul, id_ap_enfant, type_commune_aieul, type_commune_ap_enfants) %>%
unique()
villes_aieul_ap_enfant <-
xtabs(~type_commune_aieul + type_commune_ap_enfants, villes_aieul_ap_enfant)
table_villes_aieul_enfant <- ftable(villes_aieul_enfant)
table_villes_aieul_enfant <- table_villes_aieul_enfant/rowSums(table_villes_aieul_enfant) * 100
table_villes_aieul_p_enfant <- ftable(villes_aieul_p_enfant)
table_villes_aieul_p_enfant <- table_villes_aieul_p_enfant/rowSums(table_villes_aieul_p_enfant) * 100
table_villes_aieul_ap_enfant <- ftable(villes_aieul_ap_enfant)
table_villes_aieul_ap_enfant <- table_villes_aieul_ap_enfant/rowSums(table_villes_aieul_ap_enfant) * 100
Percentages are read per line.
table_villes_aieul_enfant
type_commune_enfants Paris Couronne Autre
type_commune_aieul
Paris 74.794521 3.561644 21.643836
Couronne 4.360465 79.651163 15.988372
Autre NaN NaN NaN
table_villes_aieul_p_enfant
type_commune_p_enfants Paris Couronne Autre
type_commune_aieul
Paris 55.040323 9.072581 35.887097
Couronne 6.766917 68.421053 24.812030
Autre NaN NaN NaN
table_villes_aieul_ap_enfant
type_commune_ap_enfants Paris Couronne Autre
type_commune_aieul
Paris 43.447038 8.976661 47.576302
Couronne 11.786600 58.188586 30.024814
Autre NaN NaN NaN
We can look more closely at the Parisian case. To do this, we create a base including the descendants born in the department of Paris as well as their ancestors. The ancestors were not necessarily born in the department of Paris, and this is what we will study to begin with.
enfants_75 <-
generations %>%
filter(dept_N_enfants == "F75") %>%
select(id_aieul, id_enfant, dept_N_aieul) %>%
mutate(generation = 1) %>%
unique() %>%
select(dept_N_aieul, generation)
p_enfants_75 <-
generations %>%
filter(dept_N_p_enfants == "F75") %>%
select(id_aieul, id_p_enfant, dept_N_aieul) %>%
mutate(generation = 2) %>%
unique() %>%
select(dept_N_aieul, generation)
ap_enfants_75 <-
generations %>%
filter(dept_N_ap_enfants == "F75") %>%
select(id_aieul, id_ap_enfant, dept_N_aieul) %>%
mutate(generation = 3) %>%
unique() %>%
select(dept_N_aieul, generation)
generations_paris <-
enfants_75 %>%
bind_rows(p_enfants_75) %>%
bind_rows(ap_enfants_75) %>%
tbl_df()
We calculate, for each generation of descendants (children, grandchildren and great-grandchildren), the number of ancestors born in each of the French departments. Then, we calculate the relative share of each birth department of the ancestors, excluding Paris, to see if certain regions are more represented than others in the origin of the descendants born in Paris.
nb_obs_depts <-
generations_paris %>%
filter(str_detect(dept_N_aieul, "^F[[:digit:]]{2}")) %>%
filter(dept_N_aieul != "F75") %>%
group_by(generation, dept_N_aieul) %>%
summarise(nb_obs = n()) %>%
group_by(generation) %>%
mutate(pct = nb_obs / sum(nb_obs) * 100)
nb_obs_depts %>% arrange(-pct) %>% head()
# A tibble: 6 x 4
# Groups: generation [3]
generation dept_N_aieul nb_obs pct
<dbl> <chr> <int> <dbl>
1 1.00 F78 23 8.98
2 3.00 F67 267 8.54
3 2.00 F67 82 8.34
4 1.00 F77 17 6.64
5 2.00 F57 62 6.31
6 2.00 F77 59 6.00
We want to offer a graphic representation of these values on a choropleth map.
france_paris <-
france_1800 %>%
left_join(
nb_obs_depts %>%
select(generation, pct, dept_N_aieul) %>%
spread(generation, pct) %>%
mutate(num_dep = str_sub(dept_N_aieul, 2)),
by = "num_dep")
france_paris <-
france_paris %>%
gather(generation, pct, `1`,`2`, `3`) %>%
mutate(pct = ifelse(num_dep == "75", yes = NA, no = pct)) %>%
mutate(generation = factor(generation,
levels = c(0, 1, 2, 3),
labels = c('Aïeux', "Enfants", "Petits-enfants", "Arrière-petits-enfants")))
max_pct <- france_paris$pct %>% max(na.rm=T)
max_pct <- ceiling(max_pct)
p <- ggplot() +
geom_polygon(data = france_paris, aes(x = long, y = lat, group = group, fill = pct), col = "grey", size = .4) +
geom_polygon(data = france_paris %>% filter(departement == "F75"),
aes(x = long, y = lat, group = group), fill = NA, size = .4, col = "grey50", linetype = "dashed") +
coord_quickmap() +
scale_fill_gradientn(
"Pourcentage",
colours = c("#FFFFFF", rev(heat.colors(10))), na.value = "white",
limits = c(0, max_pct),
guide = guide_colourbar(direction = "horizontal",
barwidth = 30,
title.position = "bottom",
title.hjust = .5)) +
facet_wrap(~generation) +
theme(text = element_text(size = 16), legend.position = "bottom")
p
We can also look at the percentage of descendants, in each department, to be born in Paris.
pct_naiss_enfants <-
generations %>%
select(id_aieul, id_enfant, dept_N_aieul, dept_N_enfants) %>%
filter(!is.na(dept_N_aieul), !is.na(dept_N_enfants)) %>%
unique() %>%
tbl_df() %>%
mutate(naiss_paris = dept_N_enfants == "F75") %>%
group_by(dept_N_aieul) %>%
summarise(n = n(), n_paris = sum(naiss_paris), pct_paris = n_paris / n * 100)
pct_naiss_p_enfants <-
generations %>%
select(id_aieul, id_p_enfant, dept_N_aieul, dept_N_p_enfants) %>%
filter(!is.na(dept_N_aieul), !is.na(dept_N_p_enfants)) %>%
unique() %>%
tbl_df() %>%
mutate(naiss_paris = dept_N_p_enfants == "F75") %>%
group_by(dept_N_aieul) %>%
summarise(n = n(), n_paris = sum(naiss_paris), pct_paris = n_paris / n * 100)
pct_naiss_ap_enfants <-
generations %>%
select(id_aieul, id_ap_enfant, dept_N_aieul, dept_N_ap_enfants) %>%
filter(!is.na(dept_N_aieul), !is.na(dept_N_ap_enfants)) %>%
unique() %>%
tbl_df() %>%
mutate(naiss_paris = dept_N_ap_enfants == "F75") %>%
group_by(dept_N_aieul) %>%
summarise(n = n(), n_paris = sum(naiss_paris), pct_paris = n_paris / n * 100)
pct_naiss_paris <-
pct_naiss_enfants %>% mutate(generation = 1) %>%
bind_rows(pct_naiss_p_enfants %>% mutate(generation = 2)) %>%
bind_rows(pct_naiss_ap_enfants %>% mutate(generation = 3))
stat_des_pct_naiss_paris <-
pct_naiss_paris %>%
group_by(generation) %>%
summarise(pct_paris_avg = mean(pct_paris),
pct_paris_min = min(pct_paris),
pct_paris_max = max(pct_paris)
)
stat_des_pct_naiss_paris
# A tibble: 3 x 4
generation pct_paris_avg pct_paris_min pct_paris_max
<dbl> <dbl> <dbl> <dbl>
1 1.00 1.69 0 73.3
2 2.00 3.30 0 53.6
3 3.00 5.11 0.256 42.2
The map, which shows for each department, the percentage of descendants to be born in Paris, for each department, is obtained as follows:
france_paris <-
france_1800 %>%
left_join(
pct_naiss_paris %>%
select(generation, pct_paris, dept_N_aieul) %>%
spread(generation, pct_paris) %>%
mutate(num_dep = str_sub(dept_N_aieul, 2)),
by = "num_dep")
france_paris <-
france_paris %>%
gather(generation, pct, `1`,`2`, `3`) %>%
mutate(pct = ifelse(num_dep == "75", yes = NA, no = pct)) %>%
mutate(generation = factor(generation,
levels = c(0, 1, 2, 3),
labels = c('A\\"\\\'eux', "Enfants", "Petits-enfants", "Arri\\`ere-petits-enfants")))
max_pct <- france_paris$pct %>% max(na.rm=T)
max_pct <- ceiling(max_pct)
p <- ggplot() +
geom_polygon(data = france_paris, aes(x = long, y = lat, group = group, fill = pct), col = "grey", size = .4) +
geom_polygon(data = france_paris %>% filter(departement == "F75"),
aes(x = long, y = lat, group = group), fill = NA, size = .4, col = "grey50", linetype = "dashed") +
coord_quickmap() +
scale_fill_gradientn(
"Pourcentage",
colours = c("#FFFFFF", rev(heat.colors(10))), na.value = "white",
limits = c(0, max_pct),
guide = guide_colourbar(direction = "horizontal",
barwidth = 30,
title.position = "bottom",
title.hjust = .5)) +
facet_wrap(~generation) +
theme(text = element_text(size = 16), legend.position = "bottom")
p
The map showing the proportions of unborn descendants in Paris seems to reflect a distance effect: the further away the department of origin of the ancestors is from Paris, the lower the percentage of unborn descendants seems to be. We then dig in this direction.
We need a measure of the distance between each department and Paris. We retain the one that separates the barycentres. We must de facto obtain the barycentre of each region.
library(rgeos)
obtenir_spa_poly_sp <- function(un_dep){
dep_tmp <- france_1800 %>%
filter(departement == un_dep)
obtenir_poly_groupe <- function(un_groupe){
coords_group <- dep_tmp[which(dep_tmp$group %in% un_groupe), c("long", "lat")]
return(Polygons(list(Polygon(coords_group)), un_groupe))
}
dep_polys <- lapply(unique(dep_tmp$group), obtenir_poly_groupe)
dep_polys_sp <- SpatialPolygons(dep_polys, seq(1, length(dep_polys)))
dep_polys_sp
}
france_1800_sp <- lapply(unique(france_1800$departement), obtenir_spa_poly_sp)
names(france_1800_sp) <- unique(france_1800$departement)
france_1800_barycentres <- lapply(france_1800_sp, function(x) gCentroid(x))
france_1800_barycentres <- do.call("rbind", lapply(france_1800_barycentres, function(x) x@coords))
france_1800_barycentres <- cbind(france_1800_barycentres, departement = unique(france_1800$departement))
france_1800_barycentres <- data.frame(france_1800_barycentres, stringsAsFactors = FALSE) %>% tbl_df()
rownames(france_1800_barycentres) <- NULL
france_1800_barycentres <-
france_1800_barycentres %>%
mutate(long = as.numeric(x), lat = as.numeric(y)) %>%
select(-x,-y)
In particular, the coordinates of the Paris department barycentre are as follows:
(coords_paris <- france_1800_barycentres %>% filter(departement == "F75"))
# A tibble: 1 x 3
departement long lat
<chr> <dbl> <dbl>
1 F75 2.37 48.9
We encapsulate the distm()
function of the package geosphere
in another, to be able to calculate the distance separating two points of the globe.
calcul_distance <- function(long_1, lat_1, long_2, lat_2){
geosphere::distm(c(long_1, lat_1), c(long_2, lat_2), fun = geosphere::distHaversine) %>% "["(1)
}
We apply the newly created function to obtain the distances (in km) separating each department from Paris:
distances_paris <-
france_1800_barycentres %>%
mutate(long_paris = coords_paris$long, lat_paris = coords_paris$lat) %>%
rowwise() %>%
mutate(distance_paris = calcul_distance(long, lat, long_paris, lat_paris) / 1000) %>%
ungroup() %>%
select(departement, distance_paris)
head(distances_paris)
# A tibble: 6 x 2
departement distance_paris
<chr> <dbl>
1 F01 380
2 F02 118
3 F03 282
4 F04 607
5 F06 657
6 F07 482
Then we add these distances to our data representing the percentage of descendants in each department to be born in Paris.
pct_naiss_paris <-
pct_naiss_paris %>%
left_join(distances_paris, by = c("dept_N_aieul" = "departement")) %>%
filter(dept_N_aieul != "F75") %>%
mutate(generation = factor(generation,
levels = c(0, 1, 2, 3),
labels = c('Aïeux', "Enfants", "Petits-enfants", "Arrière-petits-enfants")))
To assign colors to each generation according to those previously used, we use a function to find them (these are the colors that are automatically used by the package ggplot2
functions).
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
couleurs <- gg_color_hue(4)
p_prop_paris_distance <- ggplot(data = pct_naiss_paris,
aes(x = distance_paris, y = pct_paris,
colour = generation, group = generation
)) +
geom_point() +
geom_smooth(se = FALSE) +
xlab("Distance à Paris (km)") + ylab("Pourcentage") +
scale_colour_manual("", values = c('AÏeux' = couleurs[1], "Enfants" = couleurs[2],
"Petits-enfants" = couleurs[3], "Arrière-petits-enfants" = couleurs[4])) +
scale_linetype_manual("", values = c("Enfants" = "solid",
"Petits-enfants" = "dotted",
"Arrière-petits-enfants" = "dashed")) +
scale_shape_manual("", values = c("Enfants" = 15,
"Petits-enfants" = 16,
"Arrière-petits-enfants" = 17))
p_prop_paris_distance
We explore the movements between the commune of Paris and its crown. We recover the codes of the communes within a radius of 20km from Paris, which we consider as the crown. To do this, we reuse the object communes_proches_20km
, whose obtaining is detailed in annexe.
codes_insee_proches_20km <- communes_proches_20km[["75056"]]$limitrophes_20
codes_retenir <- c(codes_insee_proches_20km, "75056")
For each generation, we calculate the percentage of descendants of Parisian ancestors to be born in Paris, in its crown or elsewhere.
# Aieux --> enfants
villes_aieul_enfant <-
generations %>%
filter(code_insee_N_aieul %in% codes_retenir) %>%
filter(!is.na(code_insee_N_aieul), !is.na(code_insee_N_enfants)) %>%
mutate(type_commune_aieul = ifelse(code_insee_N_aieul == "75056", yes = "Paris", no = "Autre")) %>%
mutate(type_commune_aieul = ifelse(code_insee_N_aieul %in% codes_insee_proches_20km, yes = "Couronne", no = type_commune_aieul)) %>%
mutate(type_commune_enfants = ifelse(code_insee_N_enfants == "75056", yes = "Paris", no = "Autre")) %>%
mutate(type_commune_enfants = ifelse(code_insee_N_enfants %in% codes_insee_proches_20km, yes = "Couronne", no = type_commune_enfants)) %>%
select(id_aieul, id_enfant, type_commune_aieul, type_commune_enfants) %>%
unique() %>%
mutate(type_commune_aieul = factor(type_commune_aieul, levels = c("Paris", "Couronne", "Autre")),
type_commune_enfants = factor(type_commune_enfants, levels = c("Paris", "Couronne", "Autre")))
villes_aieul_enfant <-
xtabs(~type_commune_aieul + type_commune_enfants, villes_aieul_enfant)
# Aieux --> petits-enfants
villes_aieul_p_enfant <-
generations %>%
filter(code_insee_N_aieul %in% codes_retenir) %>%
mutate(type_commune_aieul = ifelse(code_insee_N_aieul == "75056", yes = "Paris", no = "Autre")) %>%
mutate(type_commune_aieul = ifelse(code_insee_N_aieul %in% codes_insee_proches_20km, yes = "Couronne", no = type_commune_aieul)) %>%
mutate(type_commune_p_enfants = ifelse(code_insee_N_p_enfants == "75056", yes = "Paris", no = "Autre")) %>%
mutate(type_commune_p_enfants = ifelse(code_insee_N_p_enfants %in% codes_insee_proches_20km, yes = "Couronne", no = type_commune_p_enfants)) %>%
select(id_aieul, id_p_enfant, type_commune_aieul, type_commune_p_enfants, code_insee_N_p_enfants) %>%
unique() %>%
mutate(type_commune_aieul = factor(type_commune_aieul, levels = c("Paris", "Couronne", "Autre")),
type_commune_p_enfants = factor(type_commune_p_enfants, levels = c("Paris", "Couronne", "Autre")))
villes_aieul_p_enfant <-
xtabs(~type_commune_aieul + type_commune_p_enfants, villes_aieul_p_enfant)
# Aieux --> arrière-petits-enfants
villes_aieul_ap_enfant <-
generations %>%
filter(code_insee_N_aieul %in% codes_retenir) %>%
mutate(type_commune_aieul = ifelse(code_insee_N_aieul == "75056", yes = "Paris", no = "Autre")) %>%
mutate(type_commune_aieul = ifelse(code_insee_N_aieul %in% codes_insee_proches_20km, yes = "Couronne", no = type_commune_aieul)) %>%
mutate(type_commune_ap_enfants = ifelse(code_insee_N_ap_enfants == "75056", yes = "Paris", no = "Autre")) %>%
mutate(type_commune_ap_enfants = ifelse(code_insee_N_ap_enfants %in% codes_insee_proches_20km, yes = "Couronne", no = type_commune_ap_enfants)) %>%
select(id_aieul, id_ap_enfant, type_commune_aieul, type_commune_ap_enfants, code_insee_N_ap_enfants) %>%
unique() %>%
mutate(type_commune_aieul = factor(type_commune_aieul, levels = c("Paris", "Couronne", "Autre")),
type_commune_ap_enfants = factor(type_commune_ap_enfants, levels = c("Paris", "Couronne", "Autre")))
villes_aieul_ap_enfant <-
xtabs(~type_commune_aieul + type_commune_ap_enfants, villes_aieul_ap_enfant)
table_villes_aieul_enfant <- ftable(villes_aieul_enfant)
table_villes_aieul_enfant <- table_villes_aieul_enfant/rowSums(table_villes_aieul_enfant) * 100
table_villes_aieul_p_enfant <- ftable(villes_aieul_p_enfant)
table_villes_aieul_p_enfant <- table_villes_aieul_p_enfant/rowSums(table_villes_aieul_p_enfant) * 100
table_villes_aieul_ap_enfant <- ftable(villes_aieul_ap_enfant)
table_villes_aieul_ap_enfant <- table_villes_aieul_ap_enfant/rowSums(table_villes_aieul_ap_enfant) * 100
To obtain a (rough) map of France in 1800, we first rely on the current routes, provided by the package maps
.
france <- map_data(map = "france") %>% rename(nom_map = region) %>% tbl_df()
head(france)
We also retrieve traces from the database GADM. The package raster
offers a function to download the data.
map_fr_sp <- raster::getData(name = "GADM", country = "FRA", level = 2)
The data in the GADM database makes it possible to obtain better spelled names than those in the package maps
. We extract them, and proceed to some manipulations in order to carry out the pairing between the two sources.
noms_departements <- map_fr_sp@data %>% select(CCA_2, NAME_2) %>% unique() %>% tbl_df() %>%
rename(num_dep = CCA_2, nom_sp = NAME_2) %>%
mutate(nom_jointure = iconv(nom_sp, to='ASCII//TRANSLIT')) %>%
mutate(nom_jointure = str_replace_all(nom_jointure, "[^[:alnum:]]", "")) %>%
mutate(nom_jointure = str_to_lower(nom_jointure)) %>%
arrange(nom_jointure)
We can therefore obtain a nice database to draw the borders of the French departments, which we store.
france <-
france %>%
mutate(nom_jointure = str_to_lower(nom_map)) %>%
mutate(nom_jointure = str_replace_all(nom_jointure, "[^[:alnum:]]", "")) %>%
tbl_df() %>%
arrange(nom_jointure) %>%
left_join(noms_departements)
save(france, file="france.rda")
All that remains is to merge a few departments to better reflect the state of France in the 19th century. We must group together a few departments, mainly those recently created, notably in the Paris basin.
We create a function allowing to transform a department represented by the coordinates of its polygon into gpc.poly
object. This step is useful to be able to use the polygon union functions later.
#' group_to_gpc_poly
#' Returns the polygon from a group within the data frame containing
#' the coordinates of regions
#' @df: data frame with the coordinates of the polygons
#' @group: (string) group ID
group_to_gpc_poly <- function(df, group){
group_coords <- df[df$group == group, c("long", "lat")]
as(group_coords, "gpc.poly")
}
We create another function to merge departments into one. This function proceeds in the following way: we put in the form of polygon the departments to merge, create the union of these polygons, then recover the coordinates of the union of the polygons.
#' fusion_depts
#' A partir d'un tableau contenant les coordonnées des frontières des départements et d'une liste d'identifiants
#' de départements, fusionne les départements cités en un seul polygone.
#' @depts_a_grouper (string) : vecteur de caractères indiquant les identifiants des départements à fusionner
#' @nouveau_nom (string) : nouveau nom du département issu de la fusion
#' @nouveau_departement (string) : nouvel identifiant du département issu de la fusion
#' @df : tableau à double entrée contenant les coordonnées des frontières des départements
fusion_depts <- function(depts_a_grouper, nouveau_nom, nouveau_departement, df){
df_tmp <- df %>% filter(departement %in% depts_a_grouper)
df_tmp_polys <- lapply(unique(df_tmp$group), group_to_gpc_poly, df = df_tmp)
df_tmp_polys_tot <- df_tmp_polys[[1]]
if(length(df_tmp_polys)>1){
for(i in 2:length(df_tmp_polys)){
df_tmp_polys_tot <- gpclib::union(df_tmp_polys_tot, df_tmp_polys[[i]])
}
}
coords <- data.frame(long = df_tmp_polys_tot@pts[[1]]$x, lat = df_tmp_polys_tot@pts[[1]]$y)
noms_col <- colnames(df_tmp)
res <-
df_tmp %>%
select(-long, -lat) %>%
slice(1)
res <-
cbind(coords, res) %>% tbl_df() %>%
s_select(noms_col) %>%
mutate(nom_map = nouveau_nom,
departement = nouveau_departement)
res
}# Fin de fusion_depts()
We carry out the following three mergers:
seine_oise <- fusion_depts(c("F78", "F91", "F92", "F95"), nouveau_nom = "Seine-et-Oise", nouveau_departement = "F78", df = france)
seine_oise$num_dep <- "78"
seine_marne <- fusion_depts(c("F93", "F94", "F77"), nouveau_nom = "Seine-et-Marne", nouveau_departement = "F77", df = france)
seine_marne$num_dep <- "77"
corse <- fusion_depts(c("F2A", "F2B"), nouveau_nom = "Corse", nouveau_departement = "F20", df = france)
corse$num_dep <- "20"
We remove these departments from the initial data table, then add the merge coordinates, and save the result.
france_1800 <-
france %>%
filter(! departement %in% c("F78", "F91", "F92", "F95", "F93", "F94", "F77", "F2A", "F2B")) %>%
bind_rows(seine_oise) %>%
bind_rows(seine_marne) %>%
bind_rows(corse) %>%
mutate(num_dep)
save(france_1800, file="france_1800.rda")
l_fusion <- c("F78", "F91", "F92", "F95", "F93", "F94", "F77", "F2A", "F2B")
france %>%
mutate(dept_a_fusionner = departement %in% l_fusion) %>%
ggplot(data = .,
aes(x=long,y=lat,group=group)) +
geom_polygon(col = "white", aes(fill = dept_a_fusionner)) +
coord_quickmap()
l_fusionnes <- c("F78", "F77", "F20")
france_1800 %>%
mutate(depts_fusionnes = departement %in% l_fusionnes) %>%
ggplot(data = ., aes(x=long,y=lat,group=group)) +
geom_polygon(col = "white", aes(fill = depts_fusionnes)) +
coord_quickmap()
This annex proposes to indicate how to recover the geographical limits of the communes from the data of Open Street Map (OSM), then to reuse them to obtain an extension of the borders of the communes in a given radius. Obtaining border extensions makes it possible, among other things, to identify which communes are located in a neighbourhood of \(x\) kilometres from another commune. By choosing a low value of \(x\)x, this makes it possible to identify the neighbouring communes.
We recover OSM data via the open platform of French public data data.gouv.fr. In particular, we download the most recent shapefile of municipalities (January 2018 at the time of download). The sequel happens with R
.
First, we load some packages.
library(tidyverse)
library(lubridate)
library(stringr)
library(stringi)
library(data.table)
library(pbapply)
library(dplyrExtras)
library(stringdist)
library(sp)
library(rgeos)
library(maptools)
library(rgdal)
We load the data:
communes <- readOGR(dsn="communes-20180101-shp/", layer="communes-20180101")
communes <- spTransform(communes, CRS( "+init=epsg:2154" ))
We will extract the information of each commune from the communes
object. In this case, it is necessary to pay attention to a technical detail that concerns the size of each object containing the information of a single commune. If we simply extract a commune as it is, in the current state, the object created will occupy a heavy place in memory, since it will contain many (very many) useless information. Indeed, the slot data
of the communes
object contains information: communes INSEE codes, names and Wikipedia links. However, this information is stored as factors: R
therefore considers each value as integers, and refers to a dictionary indicating the corresponding levels. When we extract a factor from a vector of factors in R
, we get a sub-element of this vector… and the complete dictionary! Also, when this dictionary is very large, you lose effectiveness. Here, as each line of the slot data
contains a unique INSEE code, a unique name and a unique wikipedia link, it is sub-optimal to store this information in the form of factors, simple strings of characters are enough and prove to be clearly more efficient afterwards, when extracting communes.
codes_insee <- unique(communes$insee) %>% as.character()
communes@data <-
communes@data %>%
mutate(insee = as.character(insee),
nom = as.character(nom),
wikipedia = as.character(wikipedia))
We will enlarge the polygons limiting each commune, according to a given distance. To do this, we use the gBuffer()
function of the rgeos
package. We choose to extend the borders of the municipalities by 20 km.
distance <- 20000 # en metres
We create a function which will be in charge of enlarging the borders of a commune, for a given distance. This function returns a list containing 4 objects: the coordinates of a rectangle delimiting the limits of the commune, those of a rectangle delimiting the extended limits of the commune, then the spatial objects containing the coordinates of the commune and the extended commune, respectively.
#' communes_buffer
#' Obtenir la surface etendue de la commune
#' avec une distance donnee
#' @code_insee: (string) code INSEE de la commune
#' @distance: (num) distance pour etendre
communes_buffer <- function(code_insee, distance){
tmp <- communes[communes$insee == code_insee,]
tmp_buffer <- gBuffer(tmp, width = distance, byid = TRUE)
bbox_commune <- bbox(tmp)
bbox_commune_buffer <- bbox(tmp_buffer)
tmp_buffer <- spTransform(tmp_buffer, CRS("+proj=longlat +datum=WGS84"))
tmp <- spTransform(tmp, CRS("+proj=longlat +datum=WGS84"))
list(bbox_commune = bbox_commune, bbox_commune_buffer = bbox_commune_buffer, tmp = tmp, tmp_buffer = tmp_buffer)
}# Fin de communes_buffer()
Here is an example of the result for a particular commune, Rennes, with an enlargement factor of 1km.
res_rennes <- communes_buffer(code_insee = "35238", distance = 1000)
The coordinates of the frame delimiting the commune, and those of the frame delimiting the extended commune:
res_rennes$bbox_commune
min max
x 346353.8 356295.1
y 6785457.4 6793920.0
res_rennes$bbox_commune_buffer
min max
x 345356.3 357295
y 6784457.4 6794920
The limits of the commune and the expansion:
plot(res_rennes$tmp_buffer, border = "red")
plot(res_rennes$tmp, add=TRUE)
To allow the computer to manage smaller objects, we split the 36 000 INSEE codes into 20 sections, apply the communes_buffer()
function on each INSEE code of the sections, and save the result of each section.
chunk2 <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
a_parcourir <- chunk2(1:length(codes_insee), 20)
if(!(dir.exists(str_c("communes/")))) dir.create(str_c("communes/"), recursive = TRUE)
for(i in 1:length(a_parcourir)){
communes_cercles_tmp <-
pblapply(codes_insee[a_parcourir[[i]]], communes_buffer, distance = distance)
save(communes_cercles_tmp, file = str_c("communes/communes_cercles_tmp_", i, ".rda"))
rm(communes_cercles_tmp)
}
All that remains is to load the 20 intermediate results, to obtain the extended limits of each commune:
communes_cercles <-
lapply(1:length(a_parcourir), function(i){
load(str_c("communes/communes_cercles_tmp_", i, ".rda"))
lapply(communes_cercles_tmp, function(x) x$tmp_buffer)
})
communes_cercles <- unlist(communes_cercles)
names(communes_cercles) <- codes_insee
Then those of the not extended communes:
communes_sans_cercle <-
lapply(1:length(a_parcourir), function(i){
load(str_c("communes/communes_cercles_tmp_", i, ".rda"))
lapply(communes_cercles_tmp, function(x) x$tmp)
})
communes_sans_cercle <- unlist(communes_sans_cercle)
names(communes_sans_cercle) <- codes_insee
And finally the rectangles delimiting the borders of the communes and the extended communes:
communes_cercles_bbox <-
lapply(1:length(a_parcourir), function(i){
load(str_c("communes/communes_cercles_tmp_", i, ".rda"))
lapply(communes_cercles_tmp, function(x) x$bbox_commune_buffer)
})
communes_cercles_bbox <- unlist(communes_cercles_bbox, recursive=FALSE)
names(communes_cercles_bbox) <- codes_insee
communes_bbox <-
lapply(1:length(a_parcourir), function(i){
load(str_c("communes/communes_cercles_tmp_", i, ".rda"))
lapply(communes_cercles_tmp, function(x) x$bbox_commune)
})
communes_bbox <- unlist(communes_bbox, recursive=FALSE)
names(communes_bbox) <- codes_insee
We then transform the spatial objects containing the boundaries of the communes into data tables.
options(warn=-1)
communes_cercles_df <-
pblapply(communes_cercles, function(x){
suppressMessages(broom::tidy(x, region = "insee"))
}) %>%
bind_rows()
options(warn=1)
We do the same for the communes:
communes <- spTransform(communes, CRS("+proj=longlat +datum=WGS84"))
communes_df <- broom::tidy(communes, region = "insee")
communes_df <- tbl_df(communes_df)
Now, we can use the boundaries of the extended communes to identify, for each of the communes, the other close ones within a radius of 20km. We create a function that works in two stages, for a given commune. In a first step, we use the bounding box of the communes to quickly skim the communes potentially close to our reference commune. This step aims to accelerate the second step which consists in using the gIntersects()
function of the package rgeos
. This function, which is not the fastest to execute, indicates if two polygons intersect. It thus allows us to identify the communes at the intersection with the commune whose limits have been widened by 20km.
#' trouver_intersection_commune
#' Pour la commune i de communes_cercles, retourne
#' l'indice Insee de cette commune et les indices Insee des
#' communes dans un rayon de 20km de cette commune
#' @i (int) : indice de la commune
trouver_intersection_commune <- function(i){
comm_courante <- communes_cercles[[i]]
comm_restantes <- communes_sans_cercle[-i]
# On fait un premier ecremage à l'aide des box
bbox_courante <- communes_cercles_bbox[[i]]
bbox_restantes <- communes_bbox[-i]
box_se_touchent <- function(x){
# Est-ce que les box se touchent
touche <-
bbox_courante["x", "min"] <= x["x", "max"] & bbox_courante["x", "max"] >= x["x", "min"] &
bbox_courante["y", "min"] <= x["y", "max"] & bbox_courante["y", "max"] >= x["y", "min"]
touche
}# Fin de box_se_touchent()
touchent <- sapply(bbox_restantes, box_se_touchent)
inter <- sapply(comm_restantes[touchent], function(x){
gIntersects(x, comm_courante)
})
insee_intersection <- names(comm_restantes)[which(touchent)[which(inter)]]
list(insee = names(communes_cercles[i]), limitrophes_20 = insee_intersection)
}
We apply this function to all municipalities. To speed things up, we parallel execution.
library(parallel)
ncl <- detectCores()-1
(cl <- makeCluster(ncl))
invisible(clusterEvalQ(cl, library(tidyverse, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(geosphere, warn.conflicts=FALSE, quietly=TRUE)))
invisible(clusterEvalQ(cl, library(rgeos, warn.conflicts=FALSE, quietly=TRUE)))
clusterExport(cl, c("communes_cercles", "communes_sans_cercle"), envir=environment())
clusterExport(cl, c("communes_cercles_bbox", "communes_bbox"), envir=environment())
communes_proches_20km <- pblapply(1:length(communes_cercles), trouver_intersection_commune, cl = cl)
names(communes_proches_20km) <- names(communes_cercles)
stopCluster(cl)
Let us take the example of the commune of Rennes to observe the result.
ind_rennes <- which(names(communes_cercles) == "35238")
proche_rennes_20 <- trouver_intersection_commune(i = ind_rennes)
proche_rennes_20
$insee
[1] "35238"
$limitrophes_20
[1] "35289" "35149" "35312" "35340" "35188" "35012" "35221" "35231"
[9] "35212" "35030" "35322" "35218" "35333" "35066" "35266" "35206"
[17] "35352" "35055" "35334" "35001" "35039" "35203" "35143" "35144"
[25] "35146" "35346" "35265" "35233" "35193" "35079" "35274" "35197"
[33] "35296" "35251" "35007" "35003" "35315" "35067" "35085" "35286"
[41] "35165" "35338" "35264" "35283" "35087" "35252" "35229" "35052"
[49] "35141" "35057" "35175" "35127" "35155" "35099" "35176" "35173"
[57] "35069" "35060" "35169" "35309" "35282" "35208" "35035" "35196"
[65] "35198" "35242" "35164" "35022" "35139" "35135" "35245" "35180"
[73] "35134" "35089" "35258"
[ reached getOption("max.print") -- omitted 102 entries ]
Let us look at the result on a map:
map_rennes <-
ggplot(data = communes_df %>%
filter(id %in% unlist(proche_rennes_20)) %>%
mutate(limitrophe = ifelse(id %in% proche_rennes_20$limitrophes_20, yes = "limitrophe", no = "non"),
limitrophe = ifelse(id == proche_rennes_20$insee, yes = "focus", no = limitrophe))) +
geom_polygon(data = map_data("france"), aes(x = long, y = lat, group = group), fill = NA, col = "white") +
geom_polygon(aes(x = long, y= lat, group = group, fill = limitrophe)) +
geom_polygon(data = communes_cercles[[ind_rennes]] %>%
broom::tidy(region = "insee") %>%
tbl_df(),
aes(x = long, y=lat, group = group), fill = NA, col = "red", linetype = "dashed") +
scale_fill_manual("", values = c("limitrophe" = "dodgerblue", "non" = "white", "focus" = "red"), guide = FALSE) +
coord_quickmap(xlim = c(-5,0),
ylim = c(47.5,48.5))
map_rennes
We can save the results.
save(communes, file = "communes.rda")
save(communes_df, file = "communes_df.rda")
save(communes_cercles, file = "communes_cercles.rda")
save(communes_sans_cercle, file = "communes_sans_cercle.rda")
save(communes_cercles_bbox, file = "communes_cercles_bbox.rda")
save(communes_bbox, file = "communes_bbox.rda")
save(communes_proches_20km, file = "communes_proches_20km.rda")
Adding rivers to some maps can be helpful. This appendix proposes a method to do this, based on data from Open Street Map (OSM), which can be extracted on the open platform for French public data. Watercourse data are available at https://www.data.gouv.fr/fr/datasets/carte-des-departements-2/. As they are proposed by department, we will scrape the links to the files of each department, then download them using a function.
library(tidyverse)
library(lubridate)
library(stringr)
library(stringi)
library(data.table)
library(pbapply)
library(dplyrExtras)
library(stringdist)
library(sp)
library(rgeos)
library(maptools)
library(rgdal)
library(rvest)
if(!(dir.exists(paste0("osm_datagouv")))) dir.create(paste0("osm_datagouv"), recursive = TRUE)
page_datagrouv <- read_html("https://www.data.gouv.fr/fr/datasets/carte-des-departements-2/")
liens <-
page_datagrouv %>%
html_nodes(".resources-list") %>%
.[[1]] %>%
html_nodes("h4 a") %>%
html_attr("href")
telecharger_fichiers <- function(lien){
nom_fichier <- str_replace(string = lien, pattern = "^.*?departement-", replacement = "")
nom_dep <- str_replace(string = nom_fichier, "\\.zip", "")
download.file(lien, destfile = str_c("osm_datagouv/", nom_fichier))
unzip(str_c("osm_datagouv/", nom_fichier), exdir = str_c("osm_datagouv/", nom_dep))
file.remove(str_c("osm_datagouv/", nom_fichier))
}
We download the files with the following instruction:
pblapply(liens[1:2], telecharger_fichiers)
We will store the data, by department, in a specific folder:
if(!(dir.exists(paste0("rivieres_osm")))) dir.create(paste0("rivieres_osm"), recursive = TRUE)
We get the list of OSM files:
dossiers_departements <- list.files("osm_datagouv", full.names = TRUE)
Then we create a function that imports the recovered shapefile files, simplifies them slightly and returns the result as a data table.
riviere_rda <- function(un_dep){
num_dep <- str_replace(un_dep,"osm_datagouv/", "") %>%
str_replace("-(.*?)$", "")
chemin <- str_c(un_dep, "/departement-", num_dep)
rivieres <- readOGR(dsn=chemin, layer="waterways")
rivieres <- gSimplify(rivieres, tol = 0.0001)
rivieres@data$osm_id <- as.character(rivieres@data$osm_id)
rivieres@data$name <- as.character(rivieres@data$name)
rivieres@data$type <- as.character(rivieres@data$type)
rivieres@data$width <- as.character(rivieres@data$width)
rivieres@data$id <- rownames(rivieres@data)
rivieres@data$dep <- num_dep
rivieres_df <- broom::tidy(rivieres, region = "id")
rivieres_df <- rivieres_df %>% left_join(rivieres@data)
save(rivieres_df, file = str_c("rivieres_osm/", num_dep, ".rda"))
}
Finally, we apply the riviere_rda()
function to handle all departments.
pblapply(dossiers_departements, riviere_rda)
Blayo, Y. (1975). Mouvement naturel de la population française de 1740 a 1829. Population, 30, 15. https://doi.org/10.2307/1530644
Fleury, M., & Henry, L. (1958). Pour connaitre la population de la France depuis Louis XIV. Plan de travaux par sondage. Population, 13(4), 663. https://doi.org/10.2307/1525088
Walle, E. van de. (1973). La mortalité des départements français ruraux au XIXe siècle. Annales de démographie historique, 1973(1), 581‑589.