R/simulatePedigree.R at main - R-Computing-Lab/BGmisc

main
BGmisc
List view
R\simulatePedigree.R

0 Duplication
31 Churn
982 Lines
476 Lines of Code
#' Process Generation Connections
#'
#' This function processes connections between each two generations in a pedigree simulation.
#' It marks individuals as parents, sons, or daughters based on their generational position and relationships.
#' The function also handles the assignment of couple IDs, manages single and coupled individuals,
#' and establishes parent-offspring links across generations.
#' @param df_Fam A data frame containing the simulated pedigree information up to the current generation.
#'               Must include columns for family ID, individual ID, generation number, spouse ID (spID),
#'               and sex. This data frame is updated in place to include flags for parental status (ifparent),
#'               son status (ifson), and daughter status (ifdau), as well as couple IDs.
#' @inheritParams simulatePedigree
#' @inheritParams createGenDataFrame
#'
#' @details
#' The function iterates through each generation, starting from the second, to establish connections based on mating and parentage.
#' For the first generation, it sets the parental status directly. For subsequent generations, it calculates the number of couples,
#' the expected number of offspring, and assigns offspring to parents. It handles gender-based assignments for sons and daughters,
#' and deals with the nuances of single individuals and couple formation. The function relies on external functions `assignCoupleIds`
#' and `adjustKidsPerCouple` to handle specific tasks related to couple ID assignment and offspring number adjustments, respectively.
#'
#' @return The function updates the `df_Fam` data frame in place, adding or modifying columns related to parental and offspring status,
#'         as well as assigning unique couple IDs. It does not return a value explicitly.
#'

buildBtwnGenerations <- function(df_Fam, Ngen, sizeGens, verbose = FALSE, marR, sexR, kpc,
                                    rd_kpc, personID = "ID",
                                    momID = "momID",
                                    dadID = "dadID",
                                    code_male = "M", code_female = "F", beta = FALSE) {
  # Normalize string aliases to logical values for downstream functions
  use_optimized <- FALSE

  if (beta %in% c("index", "indexed")) {
    stop("The 'index' or 'indexed' option for parameter 'beta' is not yet implemented.")
  } else if (isTRUE(beta) || identical(beta, "optimized")) {
    use_optimized <- TRUE
  } else if (isFALSE(beta) || beta %in% c("base", "original") || is.null(beta)) {
    use_optimized <- FALSE
  } else {
    stop("Invalid value for parameter 'beta'. Accepted values are TRUE, FALSE, 'optimized', 'base', 'original', or 'index'/'indexed'.")
  }

  if (use_optimized) {
    df_Fam <- buildBtwnGenerations_opt(
      df_Fam = df_Fam,
      Ngen = Ngen,
      sizeGens = sizeGens,
      verbose = verbose,
      marR = marR,
      sexR = sexR,
      kpc = kpc,
      rd_kpc = rd_kpc,
      personID = personID,
      momID = momID,
      dadID = dadID,
      code_male = code_male,
      code_female = code_female,
      beta = TRUE
    )
  } else {
    df_Fam <- buildBtwnGenerations_base(
      df_Fam = df_Fam,
      Ngen = Ngen,
      sizeGens = sizeGens,
      verbose = verbose,
      marR = marR,
      sexR = sexR,
      kpc = kpc,
      rd_kpc = rd_kpc,
      personID = personID,
      momID = momID,
      dadID = dadID,
      code_male = code_male,
      code_female = code_female,
      beta = FALSE
    )
  }
  df_Fam
}


buildBtwnGenerations_base <- function(df_Fam,
                                         Ngen,
                                         sizeGens,
                                         verbose = FALSE,
                                         marR, sexR, kpc,
                                         rd_kpc, personID = "ID",
                                         momID = "momID",
                                         dadID = "dadID",
                                         code_male = "M",
                                         code_female = "F",
                                         beta = FALSE) {
  # Initialize flags for the full pedigree data frame.
  # These are used throughout linkage and get overwritten per-generation as needed.

  df_Fam$ifparent <- FALSE
  df_Fam$ifson <- FALSE
  df_Fam$ifdau <- FALSE

  # Precompute row indices per generation once.
  # This avoids repeated df_Fam$gen == i scans inside loops.
  gen_rows <- split(seq_len(nrow(df_Fam)), df_Fam$gen)

  # Loop across generations 1..Ngen.

  for (i in seq_len(Ngen)) {
    # -------------------------------------------------------------------------
    # Generation 1: base case
    # Generation 1 individuals are founders and are treated as "parents" by design.
    # They do not have assigned mother/father, so we just set flags and continue.
    # -------------------------------------------------------------------------

    if (i == 1) {
      rows_i <- gen_rows[[as.character(i)]]
      df_Ngen <- df_Fam[rows_i, , drop = FALSE]

      # Mark everyone in generation 1 as parents (founder couple logic occurs earlier).
      df_Ngen$ifparent <- TRUE
      df_Ngen$ifson <- FALSE
      df_Ngen$ifdau <- FALSE
      df_Fam[rows_i, ] <- df_Ngen
      # Write back into the main df_Fam.
    } else {
      # calculate the number of couples in the i-1 th generation
      rows_i <- gen_rows[[as.character(i)]]
      rows_prev <- gen_rows[[as.character(i - 1)]]

      # -------------------------------------------------------------------------
      # Step A: Determine how many couples exist in generation i-1
      #
      # In your representation, each coupled individual has a non-NA spID, and each couple
      # appears twice (one row per spouse). Therefore:
      #   number_of_couples = (number_of_non_single_individuals) / 2
      # where number_of_non_single_individuals = sizeGens[i-1] - count(NA spID)
      # -------------------------------------------------------------------------

      N_couples <- (sizeGens[i - 1] - sum(is.na(df_Fam$spID[rows_prev]))) * 0.5

      # Expected number of offspring linked to those couples (before sex split).

      N_LinkedMem <- N_couples * kpc
      # Split linked offspring into female and male counts using sexR,
      # where sexR is the proportion male, so (1 - sexR) is the proportion female.

      N_LinkedFemale <- round(N_LinkedMem * (1 - sexR))
      N_LinkedMale <- N_LinkedMem - N_LinkedFemale


      # -------------------------------------------------------------------------
      # Step B: Prepare generation i data, assign couple IDs, and mark potential children
      # -------------------------------------------------------------------------

      # get the df for the i the generation
      df_Ngen <- df_Fam[rows_i, , drop = FALSE]


      # Reset per-generation fields that will be recomputed.
      df_Ngen$ifparent <- FALSE
      df_Ngen$ifson <- FALSE
      df_Ngen$ifdau <- FALSE
      df_Ngen$coupleId <- NA_character_

      # Randomly permute generation i rows so selection is not tied to row order.
      df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE]

      # Start to connect children with mother and father

      if (verbose == TRUE) {
        message(
          "Step 2.1: mark a group of potential sons and daughters in the i th generation"
        )
      }


      # count the number of couples in the i th gen
      if (verbose == TRUE) {
        countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5
      }
      # Assign couple IDs within generation i.
      df_Ngen <- assignCoupleIds(df_Ngen, beta = beta)

      # Identify singles in generation i (no spouse).

      IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)]

      # Count singles by sex; these affect how many "linked" children can come from couples.
      SingleF <- sum(df_Ngen$sex == code_female & is.na(df_Ngen$spID))
      SingleM <- sum(df_Ngen$sex == code_male & is.na(df_Ngen$spID))

      # Number of linked females that must come from couples after excluding single females.
      # This value is passed into markPotentialChildren, which decides who becomes ifson/ifdau.

      CoupleF <- N_LinkedFemale - SingleF

      # Mark potential sons and daughters within generation i.
      # This writes ifson/ifdau into the returned data frame
      df_Fam[rows_i, ] <- markPotentialChildren(
        df_Ngen = df_Ngen,
        i = i,
        Ngen = Ngen,
        sizeGens = sizeGens,
        CoupleF = CoupleF,
        code_male = code_male,
        code_female = code_female,
        beta = beta
      )

      # -------------------------------------------------------------------------
      # Step C: Mark a subset of generation i-1 couples as parents (ifparent)
      #
      # Goal: choose enough married couples (based on marR) to be parents.
      # We walk through a randomized order of generation i-1, and whenever we select
      # an individual who has a spouse, we mark both spouses as ifparent.
      # -------------------------------------------------------------------------

      if (verbose == TRUE) {
        message(
          "Step 2.2: mark a group of potential parents in the i-1 th generation"
        )
      }
      df_Ngen <- df_Fam[rows_prev, , drop = FALSE]

      # Reset flags within i-1 before reselecting parent couples.
      df_Ngen$ifparent <- FALSE
      df_Ngen$ifson <- FALSE
      df_Ngen$ifdau <- FALSE

      # Randomize order so parent selection is not tied to row ordering.
      df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE]


      # Boolean vector that tracks which rows in df_prev are selected as parents.
      # Start all FALSE.
      isUsedParent <- df_Ngen$ifparent

      # Loop over up to sizeGens[i-1] positions.
      # Stop early once the parent selection proportion reaches marR.
      nrow_df_Ngen <- nrow(df_Ngen)

      for (k in seq_len(sizeGens[i - 1])) {
        # Proportion of individuals currently marked as parents in df_prev.
        # Since we always mark spouses together, this moves in steps of 2.
        if (sum(isUsedParent) / nrow_df_Ngen >= marR) {
          df_Ngen$ifparent <- isUsedParent
          break
        } else {
          # Only select someone as a parent if:
          # 1) they are not already used as a parent, and
          # 2) they have a spouse (spID not NA), because singles cannot form a parent couple.


          if (!(isUsedParent[k]) && !is.na(df_Ngen$spID[k])) { # Mark this individual as parent.

            isUsedParent[k] <- TRUE
            # Mark their spouse row as parent too.
            # This works because spouse IDs are unique within a generation in this simulation.
            isUsedParent[df_Ngen$spID == df_Ngen$id[k]] <- TRUE
          } else {
            next
          }
        }
      }

      df_Ngen$ifparent <- isUsedParent

      # Restore original row order for df_prev before writing back into df_Fam.

      df_Ngen <- df_Ngen[order(as.numeric(rownames(df_Ngen))), , drop = FALSE]

      df_Fam[rows_prev, ] <- df_Ngen

      if (verbose == TRUE) {
        message(
          "Step 2.3: connect the i and i-1 th generation"
        )
      }


      if (i == 1) {
        next
      } else {
        # Pull the two generations together.
        df_Ngen <- df_Fam[df_Fam$gen %in% c(i, i - 1), , drop = FALSE]

        sizeI <- sizeGens[i - 1]
        sizeII <- sizeGens[i]

        # Collect IDs of marked sons and daughters in generation i.
        IdSon <- df_Ngen$id[df_Ngen$ifson == TRUE & df_Ngen$gen == i]
        IdDau <- df_Ngen$id[df_Ngen$ifdau == TRUE & df_Ngen$gen == i]

        # Interleave sons and daughters to get an offspring list.
        IdOfp <- evenInsert(IdSon, IdDau)

        # nMates is number of parent couples selected (ifparent rows are individuals).
        nMates <- sum(df_Ngen$ifparent) / 2

        # If no mates or no offspring were selected for linkage, skip linkage.
        if (nMates <= 0 || length(IdOfp) == 0) {
          df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
          df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
          next
        }

        # generate link kids to the couples
        random_numbers <- adjustKidsPerCouple(
          nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc,
          rd_kpc = rd_kpc,
          beta = beta
        )

        # Guard: adjustKidsPerCouple returned nothing usable
        if (length(random_numbers) == 0 || all(is.na(random_numbers))) {
          df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
          df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
          next
        }

        # -------------------------------------------------------------------------
        # Step E: Build parent assignment vectors IdMa and IdPa
        #
        # The goal is to expand couples into per-child vectors of mother IDs and father IDs,
        # where each couple contributes random_numbers[couple_index] children.
        #
        # Important: df_Ngen contains both generations. We only want parent generation rows.
        # -------------------------------------------------------------------------

        # Identify rows in df_Ngen that belong to generation i-1 (parent generation).
        rows_prev_in_pair <- which(df_Ngen$gen == (i - 1))

        # Extract parent generation into a smaller frame to make operations faster and clearer.
        prev <- df_Ngen[rows_prev_in_pair, , drop = FALSE]

        # Keep only those rows that are marked ifparent and are actually paired (non-NA spID).
        parent_rows <- which(prev$ifparent == TRUE & !is.na(prev$spID))

        # If no usable parent couples remain, skip linkage.
        if (length(parent_rows) == 0) {
          df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
          df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
          next
        }
        # Create a symmetric couple key so we can keep only one row per couple.
        a <- pmin(prev$id, prev$spID)
        b <- pmax(prev$id, prev$spID)
        couple_key <- paste(a, b, sep = "_")

        # Keep only the first row for each couple among the parent rows.
        parent_rows <- parent_rows[!duplicated(couple_key[parent_rows])]

        # Determine whether each kept row corresponds to the female member of the couple.
        # If the kept row is female: mother = id, father = spID
        # If the kept row is male:   father = id, mother = spID
        is_female_row <- prev$sex[parent_rows] == code_female
        # One mother ID per couple.
        ma_ids <- ifelse(is_female_row, prev$id[parent_rows], prev$spID[parent_rows])

        # One father ID per couple.
        pa_ids <- ifelse(is_female_row, prev$spID[parent_rows], prev$id[parent_rows])

        # Align lengths between couples and random_numbers.
        # If random_numbers is longer than couples, truncate random_numbers.
        nCouples <- length(parent_rows)

        if (length(random_numbers) > nCouples) {
          random_numbers <- random_numbers[seq_len(nCouples)]
        }

        # Expand from "one mother/father per couple" to "one mother/father per child".
        # rep.int is used to avoid extra overhead.
        IdMa <- rep.int(ma_ids, times = random_numbers)
        IdPa <- rep.int(pa_ids, times = random_numbers)

        # -------------------------------------------------------------------------
        # Step F: Ensure IdMa/IdPa length matches the number of offspring IdOfp
        #
        # Two mismatch cases:
        # 1) Too many parent slots relative to offspring: drop excess parent slots.
        # 2) Too many offspring relative to parent slots: drop some offspring.
        #
        # drop singles first (IdSingle) when reducing offspring.
        # -------------------------------------------------------------------------


        if (length(IdPa) - length(IdOfp) > 0) {
          if (verbose == TRUE) {
            message("length of IdPa", length(IdPa), "\n")
          }
          # Excess parent slots: randomly remove that many entries from IdPa and IdMa.

          excess <- length(IdPa) - length(IdOfp)
          if (length(IdPa) > 0 && excess > 0) {
            IdRm <- sample.int(length(IdPa), size = excess)
            IdPa <- IdPa[-IdRm]
            IdMa <- IdMa[-IdRm]
          }
        } else if (length(IdPa) - length(IdOfp) < 0) {
          if (verbose == TRUE) {
            message("length of IdOfp", length(IdOfp), "\n")
            message("length of IdPa", length(IdPa), "\n")
            message("length of IdSingle", length(IdMa), "\n")
          }


          # harden the resample call when IdSingle is empty:
          # Need to drop some offspring because we do not have enough parent slots.
          need_drop <- length(IdOfp) - length(IdPa)

          if (need_drop > 0) {
            if (length(IdSingle) > 0) {
              # Preferentially remove offspring IDs that correspond to singles.
              # resample is expected to return a vector of IDs to remove.

              IdRm <- resample(IdSingle, size = need_drop)
              IdOfp <- IdOfp[!(IdOfp %in% IdRm)]
            } else {
              # If there are no singles to target, drop arbitrary offspring indices.
              drop_idx <- sample.int(length(IdOfp), size = need_drop)
              IdOfp <- IdOfp[-drop_idx]
            }
          }
        }

        # -------------------------------------------------------------------------
        # Step G: Assign pat/mat into df_Ngen for the selected offspring.
        #
        # Replaces the old loop:
        #   for (m in seq_along(IdOfp)) df_Ngen[df_Ngen$id == IdOfp[m], "pat"] <- ...
        # Using match avoids repeated scanning over df_Ngen$id.
        # -------------------------------------------------------------------------

        # Find row positions in df_Ngen corresponding to offspring IDs.
        child_rows <- match(IdOfp, df_Ngen$id)
        # Only keep rows that matched successfully.

        ok <- !is.na(child_rows)

        if (any(ok)) {
          # Assign father IDs and mother IDs to offspring rows.

          df_Ngen$pat[child_rows[ok]] <- IdPa[ok]
          df_Ngen$mat[child_rows[ok]] <- IdMa[ok]
        }
        # -------------------------------------------------------------------------
        # Step H: Write the two generations back into df_Fam using the precomputed indices.
        # -------------------------------------------------------------------------

        df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
        df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
      }
    }
  }
  df_Fam
}

buildBtwnGenerations_opt <- function(df_Fam,
                                              Ngen,
                                              sizeGens,
                                              verbose = FALSE,
                                              marR, sexR, kpc,
                                              rd_kpc, personID = "ID",
                                              momID = "momID",
                                              dadID = "dadID",
                                              code_male = "M",
                                              code_female = "F",
                                              beta = TRUE) {
  # Initialize flags for the full pedigree data frame.
  # These are used throughout linkage and get overwritten per-generation as needed.

  df_Fam$ifparent <- FALSE
  df_Fam$ifson <- FALSE
  df_Fam$ifdau <- FALSE

  # Precompute row indices per generation once.
  # This avoids repeated df_Fam$gen == i scans inside loops.
  gen_rows <- split(seq_len(nrow(df_Fam)), df_Fam$gen)

  # Loop across generations 1..Ngen.

  for (i in seq_len(Ngen)) {
    # -------------------------------------------------------------------------
    # Generation 1: base case
    # Generation 1 individuals are founders and are treated as "parents" by design.
    # They do not have assigned mother/father, so we just set flags and continue.
    # -------------------------------------------------------------------------

    if (i == 1) {
      rows_i <- gen_rows[[as.character(i)]]
      df_Ngen <- df_Fam[rows_i, , drop = FALSE]

      # Mark everyone in generation 1 as parents (founder couple logic occurs earlier).
      df_Ngen$ifparent <- TRUE
      df_Ngen$ifson <- FALSE
      df_Ngen$ifdau <- FALSE
      df_Fam[rows_i, ] <- df_Ngen
      # Write back into the main df_Fam.
    } else {
      # calculate the number of couples in the i-1 th generation
      rows_i <- gen_rows[[as.character(i)]]
      rows_prev <- gen_rows[[as.character(i - 1)]]

      # -------------------------------------------------------------------------
      # Step A: Determine how many couples exist in generation i-1
      #
      # In your representation, each coupled individual has a non-NA spID, and each couple
      # appears twice (one row per spouse). Therefore:
      #   number_of_couples = (number_of_non_single_individuals) / 2
      # where number_of_non_single_individuals = sizeGens[i-1] - count(NA spID)
      # -------------------------------------------------------------------------

      N_couples <- (sizeGens[i - 1] - sum(is.na(df_Fam$spID[rows_prev]))) * 0.5

      # Expected number of offspring linked to those couples (before sex split).

      N_LinkedMem <- N_couples * kpc
      # Split linked offspring into female and male counts using sexR,
      # where sexR is the proportion male, so (1 - sexR) is the proportion female.

      N_LinkedFemale <- round(N_LinkedMem * (1 - sexR))
      N_LinkedMale <- N_LinkedMem - N_LinkedFemale


      # -------------------------------------------------------------------------
      # Step B: Prepare generation i data, assign couple IDs, and mark potential children
      # -------------------------------------------------------------------------

      # get the df for the i the generation
      df_Ngen <- df_Fam[rows_i, , drop = FALSE]


      # Reset per-generation fields that will be recomputed.
      df_Ngen$ifparent <- FALSE
      df_Ngen$ifson <- FALSE
      df_Ngen$ifdau <- FALSE
      df_Ngen$coupleId <- NA_character_

      # Randomly permute generation i rows so selection is not tied to row order.
      df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE]

      # Start to connect children with mother and father

      if (verbose == TRUE) {
        message(
          "Step 2.1: mark a group of potential sons and daughters in the i th generation"
        )
      }


      # count the number of couples in the i th gen
      countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5

      # Assign couple IDs within generation i.
      df_Ngen <- assignCoupleIds(df_Ngen, beta = beta)

      # Identify singles in generation i (no spouse).

      IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)]

      # Count singles by sex; these affect how many "linked" children can come from couples.
      SingleF <- sum(df_Ngen$sex == code_female & is.na(df_Ngen$spID))
      SingleM <- sum(df_Ngen$sex == code_male & is.na(df_Ngen$spID))

      # Number of linked females that must come from couples after excluding single females.
      # This value is passed into markPotentialChildren, which decides who becomes ifson/ifdau.

      CoupleF <- N_LinkedFemale - SingleF

      # Mark potential sons and daughters within generation i.
      # This writes ifson/ifdau into the returned data frame
      df_Fam[rows_i, ] <- markPotentialChildren(
        df_Ngen = df_Ngen,
        i = i,
        Ngen = Ngen,
        sizeGens = sizeGens,
        CoupleF = CoupleF,
        code_male = code_male,
        code_female = code_female,
        beta = beta
      )

      # -------------------------------------------------------------------------
      # Step C: Mark a subset of generation i-1 couples as parents (ifparent)
      #
      # OPTIMIZATION: Instead of looping through individuals and doing linear
      # spouse lookups (O(n²)), we pre-identify all couples using vectorized
      # operations, sample the needed number of couples directly, and mark both
      # spouses in one vectorized operation (O(n)).
      #
      # Goal: choose enough married couples (based on marR) to be parents.
      # -------------------------------------------------------------------------

      if (verbose == TRUE) {
        message(
          "Step 2.2: mark a group of potential parents in the i-1 th generation"
        )
      }
      df_Ngen <- df_Fam[rows_prev, , drop = FALSE]

      # Reset flags within i-1 before reselecting parent couples.
      df_Ngen$ifparent <- FALSE
      df_Ngen$ifson <- FALSE
      df_Ngen$ifdau <- FALSE

      # Randomize order so parent selection is not tied to row ordering.
      # This matches the base version and ensures similar random behavior.
      df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE]

      # OPTIMIZED: Fully vectorized parent couple selection
      # Process all couples at once instead of looping through individuals

      # Identify individuals with spouses
      has_spouse <- !is.na(df_Ngen$spID)

      if (any(has_spouse)) {
        # Create symmetric couple keys for ALL rows (NA for singles)
        couple_keys_all <- ifelse(
          has_spouse,
          paste(
            pmin(df_Ngen$id, df_Ngen$spID),
            pmax(df_Ngen$id, df_Ngen$spID),
            sep = "_"
          ),
          NA_character_
        )

        # Find first occurrence of each couple using !duplicated()
        # This gives us unique couples in the order they appear (after randomization)
        first_occurrence <- !duplicated(couple_keys_all) & has_spouse

        # Get the unique couple keys in order
        unique_couples_ordered <- couple_keys_all[first_occurrence]

        # Calculate how many couples to select
        # Target: marR proportion of individuals = (marR * n) / 2 couples
        n_couples_target <- ceiling(sizeGens[i - 1] * marR / 2)
        n_couples_target <- min(n_couples_target, length(unique_couples_ordered))

        # Select first n couples (in randomized order from the shuffling above)
        selected_couples <- unique_couples_ordered[seq_len(n_couples_target)]

        # Mark all individuals in selected couples as parents (vectorized)
        df_Ngen$ifparent <- couple_keys_all %in% selected_couples
      } else {
        df_Ngen$ifparent <- FALSE
      }

      df_Fam[rows_prev, ] <- df_Ngen

      if (verbose == TRUE) {
        message(
          "Step 2.3: connect the i and i-1 th generation"
        )
      }


      if (i == 1) {
        next
      } else {
        # Pull the two generations together.
        # OPTIMIZATION: Use pre-computed row indices instead of df_Fam$gen %in% c(i, i-1)
        df_Ngen <- df_Fam[c(rows_prev, rows_i), , drop = FALSE]

        sizeI <- sizeGens[i - 1]
        sizeII <- sizeGens[i]

        # Collect IDs of marked sons and daughters in generation i.
        IdSon <- df_Ngen$id[df_Ngen$ifson == TRUE & df_Ngen$gen == i]
        IdDau <- df_Ngen$id[df_Ngen$ifdau == TRUE & df_Ngen$gen == i]

        # Interleave sons and daughters to get an offspring list.
        IdOfp <- evenInsert(IdSon, IdDau)

        # nMates is number of parent couples selected (ifparent rows are individuals).
        nMates <- sum(df_Ngen$ifparent) / 2

        # If no mates or no offspring were selected for linkage, skip linkage.
        if (nMates <= 0 || length(IdOfp) == 0) {
          df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
          df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
          next
        }

        # generate link kids to the couples
        random_numbers <- adjustKidsPerCouple(
          nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc,
          rd_kpc = rd_kpc,
          beta = beta
        )

        # Guard: adjustKidsPerCouple returned nothing usable
        if (length(random_numbers) == 0 || all(is.na(random_numbers))) {
          df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
          df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
          next
        }

        # -------------------------------------------------------------------------
        # Step E: Build parent assignment vectors IdMa and IdPa
        #
        # The goal is to expand couples into per-child vectors of mother IDs and father IDs,
        # where each couple contributes random_numbers[couple_index] children.
        #
        # Important: df_Ngen contains both generations. We only want parent generation rows.
        # -------------------------------------------------------------------------

        # Identify rows in df_Ngen that belong to generation i-1 (parent generation).
        rows_prev_in_pair <- which(df_Ngen$gen == (i - 1))

        # Extract parent generation into a smaller frame to make operations faster and clearer.
        prev <- df_Ngen[rows_prev_in_pair, , drop = FALSE]

        # Keep only those rows that are marked ifparent and are actually paired (non-NA spID).
        parent_rows <- which(prev$ifparent == TRUE & !is.na(prev$spID))

        # If no usable parent couples remain, skip linkage.
        if (length(parent_rows) == 0) {
          df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
          df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
          next
        }
        # Create a symmetric couple key so we can keep only one row per couple.
        a <- pmin(prev$id, prev$spID)
        b <- pmax(prev$id, prev$spID)
        couple_key <- paste(a, b, sep = "_")

        # Keep only the first row for each couple among the parent rows.
        parent_rows <- parent_rows[!duplicated(couple_key[parent_rows])]

        # Determine whether each kept row corresponds to the female member of the couple.
        # If the kept row is female: mother = id, father = spID
        # If the kept row is male:   father = id, mother = spID
        is_female_row <- prev$sex[parent_rows] == code_female
        # One mother ID per couple.
        ma_ids <- ifelse(is_female_row, prev$id[parent_rows], prev$spID[parent_rows])

        # One father ID per couple.
        pa_ids <- ifelse(is_female_row, prev$spID[parent_rows], prev$id[parent_rows])

        # Align lengths between couples and random_numbers.
        # If random_numbers is longer than couples, truncate random_numbers.
        nCouples <- length(parent_rows)

        if (length(random_numbers) > nCouples) {
          random_numbers <- random_numbers[seq_len(nCouples)]
        }

        # Expand from "one mother/father per couple" to "one mother/father per child".
        # rep.int is used to avoid extra overhead.
        IdMa <- rep.int(ma_ids, times = random_numbers)
        IdPa <- rep.int(pa_ids, times = random_numbers)

        # -------------------------------------------------------------------------
        # Step F: Ensure IdMa/IdPa length matches the number of offspring IdOfp
        #
        # Two mismatch cases:
        # 1) Too many parent slots relative to offspring: drop excess parent slots.
        # 2) Too many offspring relative to parent slots: drop some offspring.
        #
        # drop singles first (IdSingle) when reducing offspring.
        # -------------------------------------------------------------------------


        if (length(IdPa) - length(IdOfp) > 0) {
          if (verbose == TRUE) {
            message("length of IdPa", length(IdPa), "\n")
          }
          # Excess parent slots: randomly remove that many entries from IdPa and IdMa.

          excess <- length(IdPa) - length(IdOfp)
          if (length(IdPa) > 0 && excess > 0) {
            IdRm <- sample.int(length(IdPa), size = excess)
            IdPa <- IdPa[-IdRm]
            IdMa <- IdMa[-IdRm]
          }
        } else if (length(IdPa) - length(IdOfp) < 0) {
          if (verbose == TRUE) {
            message("length of IdOfp", length(IdOfp), "\n")
            message("length of IdPa", length(IdPa), "\n")
            message("length of IdSingle", length(IdMa), "\n")
          }


          # harden the resample call when IdSingle is empty:
          # Need to drop some offspring because we do not have enough parent slots.
          need_drop <- length(IdOfp) - length(IdPa)

          if (need_drop > 0) {
            if (length(IdSingle) > 0) {
              # Preferentially remove offspring IDs that correspond to singles.
              # resample is expected to return a vector of IDs to remove.

              IdRm <- resample(IdSingle, size = need_drop)
              IdOfp <- IdOfp[!(IdOfp %in% IdRm)]
            } else {
              # If there are no singles to target, drop arbitrary offspring indices.
              drop_idx <- sample.int(length(IdOfp), size = need_drop)
              IdOfp <- IdOfp[-drop_idx]
            }
          }
        }

        # -------------------------------------------------------------------------
        # Step G: Assign pat/mat into df_Ngen for the selected offspring.
        #
        # Replaces the old loop:
        #   for (m in seq_along(IdOfp)) df_Ngen[df_Ngen$id == IdOfp[m], "pat"] <- ...
        # Using match avoids repeated scanning over df_Ngen$id.
        # -------------------------------------------------------------------------

        # Find row positions in df_Ngen corresponding to offspring IDs.
        child_rows <- match(IdOfp, df_Ngen$id)
        # Only keep rows that matched successfully.

        ok <- !is.na(child_rows)

        if (any(ok)) {
          # Assign father IDs and mother IDs to offspring rows.

          df_Ngen$pat[child_rows[ok]] <- IdPa[ok]
          df_Ngen$mat[child_rows[ok]] <- IdMa[ok]
        }
        # -------------------------------------------------------------------------
        # Step H: Write the two generations back into df_Fam using the precomputed indices.
        # -------------------------------------------------------------------------

        df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ]
        df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ]
      }
    }
  }
  df_Fam
}

#' Simulate Pedigrees
#' This function simulates "balanced" pedigrees based on a group of parameters:
#' 1) k - Kids per couple;
#' 2) G - Number of generations;
#' 3) p - Proportion of males in offspring;
#' 4) r - Mating rate.
#'
#' @importFrom stats runif
#' @param kpc Number of kids per couple. An integer >= 2 that determines how
#' many kids each fertilized mated couple will have in the pedigree. Default
#' value is 3. Returns an error when kpc equals 1.
#' @param Ngen Number of generations. An integer >= 2 that determines how many
#' generations the simulated pedigree will have. The first generation is always
#' a fertilized couple. The last generation has no mated individuals.
#' @param sexR Sex ratio of offspring. A numeric value ranging from 0 to 1 that
#' determines the proportion of males in all offspring in this pedigree. For
#' instance, 0.4 means 40 percent of the offspring will be male.
#' @param marR Mating rate. A numeric value ranging from 0 to 1 which determines
#' the proportion of mated (fertilized) couples in the pedigree within each
#' generation. For instance, marR  = 0.5 suggests 50 percent of the offspring in
#'  a specific generation will be  mated and have their offspring.
#' @param rd_kpc logical. If TRUE, the number of kids per mate will be randomly
#' generated from a poisson distribution with mean kpc. If FALSE, the number of
#' kids per mate will be fixed at kpc.
#' @param balancedSex Not fully developed yet. Always \code{TRUE} in the
#' current version.
#' @param balancedMar Not fully developed yet. Always \code{TRUE} in the
#' current version.
#' @param verbose logical  If TRUE, message progress through stages of algorithm
#' @param code_male The value to use for males. Default is "M"
#' @param code_female The value to use for females. Default is "F"
#' @param fam_shift An integer to shift the person ID. Default is 1L.
#' This is useful when simulating multiple pedigrees to avoid ID conflicts.
#' @param beta logical or character. Controls which algorithm version to use:
#'   \itemize{
#'     \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm.
#'           Slower but ensures exact reproducibility with set.seed().}
#'     \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup.
#'           Produces statistically equivalent results but not identical to base version
#'           due to different random number consumption. Recommended for large simulations
#'           where speed matters more than exact reproducibility.}
#'   }
#'   Note: Both versions are mathematically correct and produce valid pedigrees with the
#'   same statistical properties (sex ratios, mating rates, etc.). The optimized version
#'   uses vectorized operations instead of loops, making it much faster for large pedigrees.
#' @param ... Additional arguments to be passed to other functions.
#' @inheritParams ped2fam
#' @param spouseID The name of the column that will contain the spouse ID in the output data frame. Default is "spID".
#' @return A \code{data.frame} with each row representing a simulated individual. The columns are as follows:
#' \itemize{
#'   \item{fam: The family id of each simulated individual. It is 'fam1' in a single simulated pedigree.}
#'   \item{ID: The unique personal ID of each simulated individual. The first digit is the fam id; the fourth digit is the generation the individual is in; the following digits represent the order of the individual within their  pedigree. For example, 100411 suggests this individual has a family id of 1, is in the 4th generation, and is the 11th individual in the 4th generation.}
#'   \item{gen: The generation the simulated individual is in.}
#'   \item{dadID: Personal ID of the individual's father.}
#'   \item{momID: Personal ID of the individual's mother.}
#'   \item{spID: Personal ID of the individual's mate.}
#'   \item{sex: Biological sex of the individual. F - female; M - male.}
#' }
#' @export
#' @examples
#' set.seed(5)
#' df_ped <- simulatePedigree(
#'   kpc = 4,
#'   Ngen = 4,
#'   sexR = .5,
#'   marR = .7
#' )
#' summary(df_ped)
simulatePedigree <- function(kpc = 3,
                             Ngen = 4,
                             sexR = .5,
                             marR = 2 / 3,
                             rd_kpc = FALSE,
                             balancedSex = TRUE,
                             balancedMar = TRUE,
                             verbose = FALSE,
                             personID = "ID",
                             momID = "momID",
                             dadID = "dadID",
                             spouseID = "spouseID",
                             code_male = "M",
                             code_female = "F",
                             fam_shift = 1L,
                             beta = FALSE) {
  # SexRatio: ratio of male over female in the offspring setting; used in the between generation combinations
  # SexRatio <- sexR / (1 - sexR)

  # Calculate the expected family size in each generations
  sizeGens <- allGens(kpc = kpc, Ngen = Ngen, marR = marR)
  #  famSizeIndex <- 1:sum(sizeGens)
  if (verbose == TRUE) {
    message(
      "Step 1: Let's build the connection within each generation first"
    )
  }
  df_Fam <- buildWithinGenerations(
    sizeGens = sizeGens,
    Ngen = Ngen,
    sexR = sexR,
    marR = marR,
    verbose = verbose,
    personID = personID,
    momID = momID,
    dadID = dadID,
    code_male = code_male,
    code_female = code_female,
    fam_shift = fam_shift,
    beta = beta
  )
  if (verbose == TRUE) {
    message(
      "Step 2: Let's try to build connection between each two generations"
    )
  }

  df_Fam <- buildBtwnGenerations(
    df_Fam = df_Fam,
    Ngen = Ngen,
    sizeGens = sizeGens,
    verbose = verbose,
    marR = marR,
    sexR = sexR,
    kpc = kpc,
    rd_kpc = rd_kpc,
    personID = personID,
    momID = momID,
    dadID = dadID,
    code_male = code_male,
    code_female = code_female,
    beta = beta
  )

  df_Fam <- df_Fam[, 1:7]
  df_Fam <- df_Fam[!(is.na(df_Fam$pat) & is.na(df_Fam$mat) & is.na(df_Fam$spID)), ]

  colnames(df_Fam)[c(2, 4, 5)] <- c(personID, dadID, momID)

  # connect the detached members
  df_Fam[is.na(df_Fam[[momID]]) & is.na(df_Fam[[dadID]]) & df_Fam$gen > 1, ]

  df_Fam
}

#' @rdname simulatePedigree
#' @export
SimPed <- function(...) { # nolint: object_name_linter.
  warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.")
  simulatePedigree(...)
}