#' Process Generation Connections
#'
#' This function processes connections between each two generations in a pedigree simulation.
#' It marks individuals as parents, sons, or daughters based on their generational position and relationships.
#' The function also handles the assignment of couple IDs, manages single and coupled individuals,
#' and establishes parent-offspring links across generations.
#' @param df_Fam A data frame containing the simulated pedigree information up to the current generation.
#' Must include columns for family ID, individual ID, generation number, spouse ID (spID),
#' and sex. This data frame is updated in place to include flags for parental status (ifparent),
#' son status (ifson), and daughter status (ifdau), as well as couple IDs.
#' @inheritParams simulatePedigree
#' @inheritParams createGenDataFrame
#'
#' @details
#' The function iterates through each generation, starting from the second, to establish connections based on mating and parentage.
#' For the first generation, it sets the parental status directly. For subsequent generations, it calculates the number of couples,
#' the expected number of offspring, and assigns offspring to parents. It handles gender-based assignments for sons and daughters,
#' and deals with the nuances of single individuals and couple formation. The function relies on external functions `assignCoupleIds`
#' and `adjustKidsPerCouple` to handle specific tasks related to couple ID assignment and offspring number adjustments, respectively.
#'
#' @return The function updates the `df_Fam` data frame in place, adding or modifying columns related to parental and offspring status,
#' as well as assigning unique couple IDs. It does not return a value explicitly.
#'
buildBtwnGenerations <- function(df_Fam, Ngen, sizeGens, verbose = FALSE, marR, sexR, kpc,
rd_kpc, personID = "ID",
momID = "momID",
dadID = "dadID",
code_male = "M", code_female = "F", beta = FALSE) {
# Normalize string aliases to logical values for downstream functions
use_optimized <- FALSE
if (beta %in% c("index", "indexed")) {
stop("The 'index' or 'indexed' option for parameter 'beta' is not yet implemented.")
} else if (isTRUE(beta) || identical(beta, "optimized")) {
use_optimized <- TRUE
} else if (isFALSE(beta) || beta %in% c("base", "original") || is.null(beta)) {
use_optimized <- FALSE
} else {
stop("Invalid value for parameter 'beta'. Accepted values are TRUE, FALSE, 'optimized', 'base', 'original', or 'index'/'indexed'.")
}
if (use_optimized) {
df_Fam <- buildBtwnGenerations_opt(
df_Fam = df_Fam,
Ngen = Ngen,
sizeGens = sizeGens,
verbose = verbose,
marR = marR,
sexR = sexR,
kpc = kpc,
rd_kpc = rd_kpc,
personID = personID,
momID = momID,
dadID = dadID,
code_male = code_male,
code_female = code_female,
beta = TRUE
)
} else {
df_Fam <- buildBtwnGenerations_base(
df_Fam = df_Fam,
Ngen = Ngen,
sizeGens = sizeGens,
verbose = verbose,
marR = marR,
sexR = sexR,
kpc = kpc,
rd_kpc = rd_kpc,
personID = personID,
momID = momID,
dadID = dadID,
code_male = code_male,
code_female = code_female,
beta = FALSE
)
}
df_Fam
}
buildBtwnGenerations_base <- function(df_Fam,
Ngen,
sizeGens,
verbose = FALSE,
marR, sexR, kpc,
rd_kpc, personID = "ID",
momID = "momID",
dadID = "dadID",
code_male = "M",
code_female = "F",
beta = FALSE) {
# Initialize flags for the full pedigree data frame.
# These are used throughout linkage and get overwritten per-generation as needed.
df_Fam$ifparent <- FALSE
df_Fam$ifson <- FALSE
df_Fam$ifdau <- FALSE
# Precompute row indices per generation once.
# This avoids repeated df_Fam$gen == i scans inside loops.
gen_rows <- split(seq_len(nrow(df_Fam)), df_Fam$gen)
# Loop across generations 1..Ngen.
for (i in seq_len(Ngen)) {
# -------------------------------------------------------------------------
# Generation 1: base case
# Generation 1 individuals are founders and are treated as "parents" by design.
# They do not have assigned mother/father, so we just set flags and continue.
# -------------------------------------------------------------------------
if (i == 1) {
rows_i <- gen_rows[[as.character(i)]]
df_Ngen <- df_Fam[rows_i, , drop = FALSE]
# Mark everyone in generation 1 as parents (founder couple logic occurs earlier).
df_Ngen$ifparent <- TRUE
df_Ngen$ifson <- FALSE
df_Ngen$ifdau <- FALSE
df_Fam[rows_i, ] <- df_Ngen
# Write back into the main df_Fam.
} else {
# calculate the number of couples in the i-1 th generation
rows_i <- gen_rows[[as.character(i)]]
rows_prev <- gen_rows[[as.character(i - 1)]]
# -------------------------------------------------------------------------
# Step A: Determine how many couples exist in generation i-1
#
# In your representation, each coupled individual has a non-NA spID, and each couple
# appears twice (one row per spouse). Therefore:
# number_of_couples = (number_of_non_single_individuals) / 2
# where number_of_non_single_individuals = sizeGens[i-1] - count(NA spID)
# -------------------------------------------------------------------------
N_couples <- (sizeGens[i - 1] - sum(is.na(df_Fam$spID[rows_prev]))) * 0.5
# Expected number of offspring linked to those couples (before sex split).
N_LinkedMem <- N_couples * kpc
# Split linked offspring into female and male counts using sexR,
# where sexR is the proportion male, so (1 - sexR) is the proportion female.
N_LinkedFemale <- round(N_LinkedMem * (1 - sexR))
N_LinkedMale <- N_LinkedMem - N_LinkedFemale
# -------------------------------------------------------------------------
# Step B: Prepare generation i data, assign couple IDs, and mark potential children
# -------------------------------------------------------------------------
# get the df for the i the generation
df_Ngen <- df_Fam[rows_i, , drop = FALSE]
# Reset per-generation fields that will be recomputed.
df_Ngen$ifparent <- FALSE
df_Ngen$ifson <- FALSE
df_Ngen$ifdau <- FALSE
df_Ngen$coupleId <- NA_character_
# Randomly permute generation i rows so selection is not tied to row order.
df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE]
# Start to connect children with mother and father
if (verbose == TRUE) {
message(
"Step 2.1: mark a group of potential sons and daughters in the i th generation"
)
}
# count the number of couples in the i th gen
if (verbose == TRUE) {
countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5
}
# Assign couple IDs within generation i.
df_Ngen <- assignCoupleIds(df_Ngen, beta = beta)
# Identify singles in generation i (no spouse).
IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)]
# Count singles by sex; these affect how many "linked" children can come from couples.
SingleF <- sum(df_Ngen$sex == code_female & is.na(df_Ngen$spID))
SingleM <- sum(df_Ngen$sex == code_male & is.na(df_Ngen$spID))
# Number of linked females that must come from couples after excluding single females.
# This value is passed into markPotentialChildren, which decides who becomes ifson/ifdau.
CoupleF <- N_LinkedFemale - SingleF
# Mark potential sons and daughters within generation i.
# This writes ifson/ifdau into the returned data frame
df_Fam[rows_i, ] <- markPotentialChildren(
df_Ngen = df_Ngen,
i = i,
Ngen = Ngen,
sizeGens = sizeGens,
CoupleF = CoupleF,
code_male = code_male,
code_female = code_female,
beta = beta
)
# -------------------------------------------------------------------------
# Step C: Mark a subset of generation i-1 couples as parents (ifparent)
#
# Goal: choose enough married couples (based on marR) to be parents.
# We walk through a randomized order of generation i-1, and whenever we select
# an individual who has a spouse, we mark both spouses as ifparent.
# -------------------------------------------------------------------------
if (verbose == TRUE) {
message(
"Step 2.2: mark a group of potential parents in the i-1 th generation"
)
}
df_Ngen <- df_Fam[rows_prev, , drop = FALSE]
# Reset flags within i-1 before reselecting parent couples.
df_Ngen$ifparent <- FALSE
df_Ngen$ifson <- FALSE
df_Ngen$ifdau <- FALSE
# Randomize order so parent selection is not tied to row ordering.
df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE]
# Boolean vector that tracks which rows in df_prev are selected as parents.
# Start all FALSE.
isUsedParent <- df_Ngen$ifparent
# Loop over up to sizeGens[i-1] positions.
# Stop early once the parent selection proportion reaches marR.
nrow_df_Ngen <- nrow(df_Ngen)
for (k in seq_len(sizeGens[i - 1])) {
# Proportion of individuals currently marked as parents in df_prev.
# Since we always mark spouses together, this moves in steps of 2.
if (sum(isUsedParent) / nrow_df_Ngen >= marR) {
df_Ngen$ifparent <- isUsedParent
break
} else {
# Only select someone as a parent if:
# 1) they are not already used as a parent, and
# 2) they have a spouse (spID not NA), because singles cannot form a parent couple.
if (!(isUsedParent[k]) && !is.na(df_Ngen$spID[k])) { # Mark this individual as parent.
isUsedParent[k] <- TRUE
# Mark their spouse row as parent too.
# This works because spouse IDs are unique within a generation in this simulation.
isUsedParent[df_Ngen$spID == df_Ngen$id[k]] <- TRUE
} else {
next
}
}
}
df_Ngen$ifparent <- isUsedParent
# Restore original row order for df_prev before writing back into df_Fam.
df_Ngen <- df_Ngen[order(as.numeric(rownames(df_Ngen))), , drop = FALSE]
df_Fam[rows_prev, ] <- df_Ngen
if (verbose == TRUE) {
message(
"Step 2.3: connect the i and i-1 th generation"
)
}
if (i == 1) {
next
} else {
# Pull the two generations together.
df_Ngen <- df_Fam[df_Fam$gen %in% c(i, i - 1), , drop = FALSE]
sizeI <- sizeGens[i - 1]
sizeII <- sizeGens[i]
# Collect IDs of marked sons and daughters in generation i.
IdSon <- df_Ngen$id[df_Ngen$ifson == TRUE & df_Ngen$gen == i]
IdDau <- df_Ngen$id[df_Ngen$ifdau == TRUE & df_Ngen$gen == i]
# Interleave sons and daughters to get an offspring list.
IdOfp <- evenInsert(IdSon, IdDau)
# nMates is number of parent couples selected (ifparent rows are individuals).
nMates <- sum(df_Ngen$ifparent) / 2
# If no mates or no offspring were selected for linkage, skip linkage.
if (nMates <= 0 || length(IdOfp) == 0) {
df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
next
}
# generate link kids to the couples
random_numbers <- adjustKidsPerCouple(
nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc,
rd_kpc = rd_kpc,
beta = beta
)
# Guard: adjustKidsPerCouple returned nothing usable
if (length(random_numbers) == 0 || all(is.na(random_numbers))) {
df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
next
}
# -------------------------------------------------------------------------
# Step E: Build parent assignment vectors IdMa and IdPa
#
# The goal is to expand couples into per-child vectors of mother IDs and father IDs,
# where each couple contributes random_numbers[couple_index] children.
#
# Important: df_Ngen contains both generations. We only want parent generation rows.
# -------------------------------------------------------------------------
# Identify rows in df_Ngen that belong to generation i-1 (parent generation).
rows_prev_in_pair <- which(df_Ngen$gen == (i - 1))
# Extract parent generation into a smaller frame to make operations faster and clearer.
prev <- df_Ngen[rows_prev_in_pair, , drop = FALSE]
# Keep only those rows that are marked ifparent and are actually paired (non-NA spID).
parent_rows <- which(prev$ifparent == TRUE & !is.na(prev$spID))
# If no usable parent couples remain, skip linkage.
if (length(parent_rows) == 0) {
df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
next
}
# Create a symmetric couple key so we can keep only one row per couple.
a <- pmin(prev$id, prev$spID)
b <- pmax(prev$id, prev$spID)
couple_key <- paste(a, b, sep = "_")
# Keep only the first row for each couple among the parent rows.
parent_rows <- parent_rows[!duplicated(couple_key[parent_rows])]
# Determine whether each kept row corresponds to the female member of the couple.
# If the kept row is female: mother = id, father = spID
# If the kept row is male: father = id, mother = spID
is_female_row <- prev$sex[parent_rows] == code_female
# One mother ID per couple.
ma_ids <- ifelse(is_female_row, prev$id[parent_rows], prev$spID[parent_rows])
# One father ID per couple.
pa_ids <- ifelse(is_female_row, prev$spID[parent_rows], prev$id[parent_rows])
# Align lengths between couples and random_numbers.
# If random_numbers is longer than couples, truncate random_numbers.
nCouples <- length(parent_rows)
if (length(random_numbers) > nCouples) {
random_numbers <- random_numbers[seq_len(nCouples)]
}
# Expand from "one mother/father per couple" to "one mother/father per child".
# rep.int is used to avoid extra overhead.
IdMa <- rep.int(ma_ids, times = random_numbers)
IdPa <- rep.int(pa_ids, times = random_numbers)
# -------------------------------------------------------------------------
# Step F: Ensure IdMa/IdPa length matches the number of offspring IdOfp
#
# Two mismatch cases:
# 1) Too many parent slots relative to offspring: drop excess parent slots.
# 2) Too many offspring relative to parent slots: drop some offspring.
#
# drop singles first (IdSingle) when reducing offspring.
# -------------------------------------------------------------------------
if (length(IdPa) - length(IdOfp) > 0) {
if (verbose == TRUE) {
message("length of IdPa", length(IdPa), "\n")
}
# Excess parent slots: randomly remove that many entries from IdPa and IdMa.
excess <- length(IdPa) - length(IdOfp)
if (length(IdPa) > 0 && excess > 0) {
IdRm <- sample.int(length(IdPa), size = excess)
IdPa <- IdPa[-IdRm]
IdMa <- IdMa[-IdRm]
}
} else if (length(IdPa) - length(IdOfp) < 0) {
if (verbose == TRUE) {
message("length of IdOfp", length(IdOfp), "\n")
message("length of IdPa", length(IdPa), "\n")
message("length of IdSingle", length(IdMa), "\n")
}
# harden the resample call when IdSingle is empty:
# Need to drop some offspring because we do not have enough parent slots.
need_drop <- length(IdOfp) - length(IdPa)
if (need_drop > 0) {
if (length(IdSingle) > 0) {
# Preferentially remove offspring IDs that correspond to singles.
# resample is expected to return a vector of IDs to remove.
IdRm <- resample(IdSingle, size = need_drop)
IdOfp <- IdOfp[!(IdOfp %in% IdRm)]
} else {
# If there are no singles to target, drop arbitrary offspring indices.
drop_idx <- sample.int(length(IdOfp), size = need_drop)
IdOfp <- IdOfp[-drop_idx]
}
}
}
# -------------------------------------------------------------------------
# Step G: Assign pat/mat into df_Ngen for the selected offspring.
#
# Replaces the old loop:
# for (m in seq_along(IdOfp)) df_Ngen[df_Ngen$id == IdOfp[m], "pat"] <- ...
# Using match avoids repeated scanning over df_Ngen$id.
# -------------------------------------------------------------------------
# Find row positions in df_Ngen corresponding to offspring IDs.
child_rows <- match(IdOfp, df_Ngen$id)
# Only keep rows that matched successfully.
ok <- !is.na(child_rows)
if (any(ok)) {
# Assign father IDs and mother IDs to offspring rows.
df_Ngen$pat[child_rows[ok]] <- IdPa[ok]
df_Ngen$mat[child_rows[ok]] <- IdMa[ok]
}
# -------------------------------------------------------------------------
# Step H: Write the two generations back into df_Fam using the precomputed indices.
# -------------------------------------------------------------------------
df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
}
}
}
df_Fam
}
buildBtwnGenerations_opt <- function(df_Fam,
Ngen,
sizeGens,
verbose = FALSE,
marR, sexR, kpc,
rd_kpc, personID = "ID",
momID = "momID",
dadID = "dadID",
code_male = "M",
code_female = "F",
beta = TRUE) {
# Initialize flags for the full pedigree data frame.
# These are used throughout linkage and get overwritten per-generation as needed.
df_Fam$ifparent <- FALSE
df_Fam$ifson <- FALSE
df_Fam$ifdau <- FALSE
# Precompute row indices per generation once.
# This avoids repeated df_Fam$gen == i scans inside loops.
gen_rows <- split(seq_len(nrow(df_Fam)), df_Fam$gen)
# Loop across generations 1..Ngen.
for (i in seq_len(Ngen)) {
# -------------------------------------------------------------------------
# Generation 1: base case
# Generation 1 individuals are founders and are treated as "parents" by design.
# They do not have assigned mother/father, so we just set flags and continue.
# -------------------------------------------------------------------------
if (i == 1) {
rows_i <- gen_rows[[as.character(i)]]
df_Ngen <- df_Fam[rows_i, , drop = FALSE]
# Mark everyone in generation 1 as parents (founder couple logic occurs earlier).
df_Ngen$ifparent <- TRUE
df_Ngen$ifson <- FALSE
df_Ngen$ifdau <- FALSE
df_Fam[rows_i, ] <- df_Ngen
# Write back into the main df_Fam.
} else {
# calculate the number of couples in the i-1 th generation
rows_i <- gen_rows[[as.character(i)]]
rows_prev <- gen_rows[[as.character(i - 1)]]
# -------------------------------------------------------------------------
# Step A: Determine how many couples exist in generation i-1
#
# In your representation, each coupled individual has a non-NA spID, and each couple
# appears twice (one row per spouse). Therefore:
# number_of_couples = (number_of_non_single_individuals) / 2
# where number_of_non_single_individuals = sizeGens[i-1] - count(NA spID)
# -------------------------------------------------------------------------
N_couples <- (sizeGens[i - 1] - sum(is.na(df_Fam$spID[rows_prev]))) * 0.5
# Expected number of offspring linked to those couples (before sex split).
N_LinkedMem <- N_couples * kpc
# Split linked offspring into female and male counts using sexR,
# where sexR is the proportion male, so (1 - sexR) is the proportion female.
N_LinkedFemale <- round(N_LinkedMem * (1 - sexR))
N_LinkedMale <- N_LinkedMem - N_LinkedFemale
# -------------------------------------------------------------------------
# Step B: Prepare generation i data, assign couple IDs, and mark potential children
# -------------------------------------------------------------------------
# get the df for the i the generation
df_Ngen <- df_Fam[rows_i, , drop = FALSE]
# Reset per-generation fields that will be recomputed.
df_Ngen$ifparent <- FALSE
df_Ngen$ifson <- FALSE
df_Ngen$ifdau <- FALSE
df_Ngen$coupleId <- NA_character_
# Randomly permute generation i rows so selection is not tied to row order.
df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE]
# Start to connect children with mother and father
if (verbose == TRUE) {
message(
"Step 2.1: mark a group of potential sons and daughters in the i th generation"
)
}
# count the number of couples in the i th gen
countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5
# Assign couple IDs within generation i.
df_Ngen <- assignCoupleIds(df_Ngen, beta = beta)
# Identify singles in generation i (no spouse).
IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)]
# Count singles by sex; these affect how many "linked" children can come from couples.
SingleF <- sum(df_Ngen$sex == code_female & is.na(df_Ngen$spID))
SingleM <- sum(df_Ngen$sex == code_male & is.na(df_Ngen$spID))
# Number of linked females that must come from couples after excluding single females.
# This value is passed into markPotentialChildren, which decides who becomes ifson/ifdau.
CoupleF <- N_LinkedFemale - SingleF
# Mark potential sons and daughters within generation i.
# This writes ifson/ifdau into the returned data frame
df_Fam[rows_i, ] <- markPotentialChildren(
df_Ngen = df_Ngen,
i = i,
Ngen = Ngen,
sizeGens = sizeGens,
CoupleF = CoupleF,
code_male = code_male,
code_female = code_female,
beta = beta
)
# -------------------------------------------------------------------------
# Step C: Mark a subset of generation i-1 couples as parents (ifparent)
#
# OPTIMIZATION: Instead of looping through individuals and doing linear
# spouse lookups (O(n²)), we pre-identify all couples using vectorized
# operations, sample the needed number of couples directly, and mark both
# spouses in one vectorized operation (O(n)).
#
# Goal: choose enough married couples (based on marR) to be parents.
# -------------------------------------------------------------------------
if (verbose == TRUE) {
message(
"Step 2.2: mark a group of potential parents in the i-1 th generation"
)
}
df_Ngen <- df_Fam[rows_prev, , drop = FALSE]
# Reset flags within i-1 before reselecting parent couples.
df_Ngen$ifparent <- FALSE
df_Ngen$ifson <- FALSE
df_Ngen$ifdau <- FALSE
# Randomize order so parent selection is not tied to row ordering.
# This matches the base version and ensures similar random behavior.
df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE]
# OPTIMIZED: Fully vectorized parent couple selection
# Process all couples at once instead of looping through individuals
# Identify individuals with spouses
has_spouse <- !is.na(df_Ngen$spID)
if (any(has_spouse)) {
# Create symmetric couple keys for ALL rows (NA for singles)
couple_keys_all <- ifelse(
has_spouse,
paste(
pmin(df_Ngen$id, df_Ngen$spID),
pmax(df_Ngen$id, df_Ngen$spID),
sep = "_"
),
NA_character_
)
# Find first occurrence of each couple using !duplicated()
# This gives us unique couples in the order they appear (after randomization)
first_occurrence <- !duplicated(couple_keys_all) & has_spouse
# Get the unique couple keys in order
unique_couples_ordered <- couple_keys_all[first_occurrence]
# Calculate how many couples to select
# Target: marR proportion of individuals = (marR * n) / 2 couples
n_couples_target <- ceiling(sizeGens[i - 1] * marR / 2)
n_couples_target <- min(n_couples_target, length(unique_couples_ordered))
# Select first n couples (in randomized order from the shuffling above)
selected_couples <- unique_couples_ordered[seq_len(n_couples_target)]
# Mark all individuals in selected couples as parents (vectorized)
df_Ngen$ifparent <- couple_keys_all %in% selected_couples
} else {
df_Ngen$ifparent <- FALSE
}
df_Fam[rows_prev, ] <- df_Ngen
if (verbose == TRUE) {
message(
"Step 2.3: connect the i and i-1 th generation"
)
}
if (i == 1) {
next
} else {
# Pull the two generations together.
# OPTIMIZATION: Use pre-computed row indices instead of df_Fam$gen %in% c(i, i-1)
df_Ngen <- df_Fam[c(rows_prev, rows_i), , drop = FALSE]
sizeI <- sizeGens[i - 1]
sizeII <- sizeGens[i]
# Collect IDs of marked sons and daughters in generation i.
IdSon <- df_Ngen$id[df_Ngen$ifson == TRUE & df_Ngen$gen == i]
IdDau <- df_Ngen$id[df_Ngen$ifdau == TRUE & df_Ngen$gen == i]
# Interleave sons and daughters to get an offspring list.
IdOfp <- evenInsert(IdSon, IdDau)
# nMates is number of parent couples selected (ifparent rows are individuals).
nMates <- sum(df_Ngen$ifparent) / 2
# If no mates or no offspring were selected for linkage, skip linkage.
if (nMates <= 0 || length(IdOfp) == 0) {
df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
next
}
# generate link kids to the couples
random_numbers <- adjustKidsPerCouple(
nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc,
rd_kpc = rd_kpc,
beta = beta
)
# Guard: adjustKidsPerCouple returned nothing usable
if (length(random_numbers) == 0 || all(is.na(random_numbers))) {
df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
next
}
# -------------------------------------------------------------------------
# Step E: Build parent assignment vectors IdMa and IdPa
#
# The goal is to expand couples into per-child vectors of mother IDs and father IDs,
# where each couple contributes random_numbers[couple_index] children.
#
# Important: df_Ngen contains both generations. We only want parent generation rows.
# -------------------------------------------------------------------------
# Identify rows in df_Ngen that belong to generation i-1 (parent generation).
rows_prev_in_pair <- which(df_Ngen$gen == (i - 1))
# Extract parent generation into a smaller frame to make operations faster and clearer.
prev <- df_Ngen[rows_prev_in_pair, , drop = FALSE]
# Keep only those rows that are marked ifparent and are actually paired (non-NA spID).
parent_rows <- which(prev$ifparent == TRUE & !is.na(prev$spID))
# If no usable parent couples remain, skip linkage.
if (length(parent_rows) == 0) {
df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
next
}
# Create a symmetric couple key so we can keep only one row per couple.
a <- pmin(prev$id, prev$spID)
b <- pmax(prev$id, prev$spID)
couple_key <- paste(a, b, sep = "_")
# Keep only the first row for each couple among the parent rows.
parent_rows <- parent_rows[!duplicated(couple_key[parent_rows])]
# Determine whether each kept row corresponds to the female member of the couple.
# If the kept row is female: mother = id, father = spID
# If the kept row is male: father = id, mother = spID
is_female_row <- prev$sex[parent_rows] == code_female
# One mother ID per couple.
ma_ids <- ifelse(is_female_row, prev$id[parent_rows], prev$spID[parent_rows])
# One father ID per couple.
pa_ids <- ifelse(is_female_row, prev$spID[parent_rows], prev$id[parent_rows])
# Align lengths between couples and random_numbers.
# If random_numbers is longer than couples, truncate random_numbers.
nCouples <- length(parent_rows)
if (length(random_numbers) > nCouples) {
random_numbers <- random_numbers[seq_len(nCouples)]
}
# Expand from "one mother/father per couple" to "one mother/father per child".
# rep.int is used to avoid extra overhead.
IdMa <- rep.int(ma_ids, times = random_numbers)
IdPa <- rep.int(pa_ids, times = random_numbers)
# -------------------------------------------------------------------------
# Step F: Ensure IdMa/IdPa length matches the number of offspring IdOfp
#
# Two mismatch cases:
# 1) Too many parent slots relative to offspring: drop excess parent slots.
# 2) Too many offspring relative to parent slots: drop some offspring.
#
# drop singles first (IdSingle) when reducing offspring.
# -------------------------------------------------------------------------
if (length(IdPa) - length(IdOfp) > 0) {
if (verbose == TRUE) {
message("length of IdPa", length(IdPa), "\n")
}
# Excess parent slots: randomly remove that many entries from IdPa and IdMa.
excess <- length(IdPa) - length(IdOfp)
if (length(IdPa) > 0 && excess > 0) {
IdRm <- sample.int(length(IdPa), size = excess)
IdPa <- IdPa[-IdRm]
IdMa <- IdMa[-IdRm]
}
} else if (length(IdPa) - length(IdOfp) < 0) {
if (verbose == TRUE) {
message("length of IdOfp", length(IdOfp), "\n")
message("length of IdPa", length(IdPa), "\n")
message("length of IdSingle", length(IdMa), "\n")
}
# harden the resample call when IdSingle is empty:
# Need to drop some offspring because we do not have enough parent slots.
need_drop <- length(IdOfp) - length(IdPa)
if (need_drop > 0) {
if (length(IdSingle) > 0) {
# Preferentially remove offspring IDs that correspond to singles.
# resample is expected to return a vector of IDs to remove.
IdRm <- resample(IdSingle, size = need_drop)
IdOfp <- IdOfp[!(IdOfp %in% IdRm)]
} else {
# If there are no singles to target, drop arbitrary offspring indices.
drop_idx <- sample.int(length(IdOfp), size = need_drop)
IdOfp <- IdOfp[-drop_idx]
}
}
}
# -------------------------------------------------------------------------
# Step G: Assign pat/mat into df_Ngen for the selected offspring.
#
# Replaces the old loop:
# for (m in seq_along(IdOfp)) df_Ngen[df_Ngen$id == IdOfp[m], "pat"] <- ...
# Using match avoids repeated scanning over df_Ngen$id.
# -------------------------------------------------------------------------
# Find row positions in df_Ngen corresponding to offspring IDs.
child_rows <- match(IdOfp, df_Ngen$id)
# Only keep rows that matched successfully.
ok <- !is.na(child_rows)
if (any(ok)) {
# Assign father IDs and mother IDs to offspring rows.
df_Ngen$pat[child_rows[ok]] <- IdPa[ok]
df_Ngen$mat[child_rows[ok]] <- IdMa[ok]
}
# -------------------------------------------------------------------------
# Step H: Write the two generations back into df_Fam using the precomputed indices.
# -------------------------------------------------------------------------
df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
}
}
}
df_Fam
}
#' Simulate Pedigrees
#' This function simulates "balanced" pedigrees based on a group of parameters:
#' 1) k - Kids per couple;
#' 2) G - Number of generations;
#' 3) p - Proportion of males in offspring;
#' 4) r - Mating rate.
#'
#' @importFrom stats runif
#' @param kpc Number of kids per couple. An integer >= 2 that determines how
#' many kids each fertilized mated couple will have in the pedigree. Default
#' value is 3. Returns an error when kpc equals 1.
#' @param Ngen Number of generations. An integer >= 2 that determines how many
#' generations the simulated pedigree will have. The first generation is always
#' a fertilized couple. The last generation has no mated individuals.
#' @param sexR Sex ratio of offspring. A numeric value ranging from 0 to 1 that
#' determines the proportion of males in all offspring in this pedigree. For
#' instance, 0.4 means 40 percent of the offspring will be male.
#' @param marR Mating rate. A numeric value ranging from 0 to 1 which determines
#' the proportion of mated (fertilized) couples in the pedigree within each
#' generation. For instance, marR = 0.5 suggests 50 percent of the offspring in
#' a specific generation will be mated and have their offspring.
#' @param rd_kpc logical. If TRUE, the number of kids per mate will be randomly
#' generated from a poisson distribution with mean kpc. If FALSE, the number of
#' kids per mate will be fixed at kpc.
#' @param balancedSex Not fully developed yet. Always \code{TRUE} in the
#' current version.
#' @param balancedMar Not fully developed yet. Always \code{TRUE} in the
#' current version.
#' @param verbose logical If TRUE, message progress through stages of algorithm
#' @param code_male The value to use for males. Default is "M"
#' @param code_female The value to use for females. Default is "F"
#' @param fam_shift An integer to shift the person ID. Default is 1L.
#' This is useful when simulating multiple pedigrees to avoid ID conflicts.
#' @param beta logical or character. Controls which algorithm version to use:
#' \itemize{
#' \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm.
#' Slower but ensures exact reproducibility with set.seed().}
#' \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup.
#' Produces statistically equivalent results but not identical to base version
#' due to different random number consumption. Recommended for large simulations
#' where speed matters more than exact reproducibility.}
#' }
#' Note: Both versions are mathematically correct and produce valid pedigrees with the
#' same statistical properties (sex ratios, mating rates, etc.). The optimized version
#' uses vectorized operations instead of loops, making it much faster for large pedigrees.
#' @param ... Additional arguments to be passed to other functions.
#' @inheritParams ped2fam
#' @param spouseID The name of the column that will contain the spouse ID in the output data frame. Default is "spID".
#' @return A \code{data.frame} with each row representing a simulated individual. The columns are as follows:
#' \itemize{
#' \item{fam: The family id of each simulated individual. It is 'fam1' in a single simulated pedigree.}
#' \item{ID: The unique personal ID of each simulated individual. The first digit is the fam id; the fourth digit is the generation the individual is in; the following digits represent the order of the individual within their pedigree. For example, 100411 suggests this individual has a family id of 1, is in the 4th generation, and is the 11th individual in the 4th generation.}
#' \item{gen: The generation the simulated individual is in.}
#' \item{dadID: Personal ID of the individual's father.}
#' \item{momID: Personal ID of the individual's mother.}
#' \item{spID: Personal ID of the individual's mate.}
#' \item{sex: Biological sex of the individual. F - female; M - male.}
#' }
#' @export
#' @examples
#' set.seed(5)
#' df_ped <- simulatePedigree(
#' kpc = 4,
#' Ngen = 4,
#' sexR = .5,
#' marR = .7
#' )
#' summary(df_ped)
simulatePedigree <- function(kpc = 3,
Ngen = 4,
sexR = .5,
marR = 2 / 3,
rd_kpc = FALSE,
balancedSex = TRUE,
balancedMar = TRUE,
verbose = FALSE,
personID = "ID",
momID = "momID",
dadID = "dadID",
spouseID = "spouseID",
code_male = "M",
code_female = "F",
fam_shift = 1L,
beta = FALSE) {
# SexRatio: ratio of male over female in the offspring setting; used in the between generation combinations
# SexRatio <- sexR / (1 - sexR)
# Calculate the expected family size in each generations
sizeGens <- allGens(kpc = kpc, Ngen = Ngen, marR = marR)
# famSizeIndex <- 1:sum(sizeGens)
if (verbose == TRUE) {
message(
"Step 1: Let's build the connection within each generation first"
)
}
df_Fam <- buildWithinGenerations(
sizeGens = sizeGens,
Ngen = Ngen,
sexR = sexR,
marR = marR,
verbose = verbose,
personID = personID,
momID = momID,
dadID = dadID,
code_male = code_male,
code_female = code_female,
fam_shift = fam_shift,
beta = beta
)
if (verbose == TRUE) {
message(
"Step 2: Let's try to build connection between each two generations"
)
}
df_Fam <- buildBtwnGenerations(
df_Fam = df_Fam,
Ngen = Ngen,
sizeGens = sizeGens,
verbose = verbose,
marR = marR,
sexR = sexR,
kpc = kpc,
rd_kpc = rd_kpc,
personID = personID,
momID = momID,
dadID = dadID,
code_male = code_male,
code_female = code_female,
beta = beta
)
df_Fam <- df_Fam[, 1:7]
df_Fam <- df_Fam[!(is.na(df_Fam$pat) & is.na(df_Fam$mat) & is.na(df_Fam$spID)), ]
colnames(df_Fam)[c(2, 4, 5)] <- c(personID, dadID, momID)
# connect the detached members
df_Fam[is.na(df_Fam[[momID]]) & is.na(df_Fam[[dadID]]) & df_Fam$gen > 1, ]
df_Fam
}
#' @rdname simulatePedigree
#' @export
SimPed <- function(...) { # nolint: object_name_linter.
warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.")
simulatePedigree(...)
}