diff --git a/R/buildComponent.R b/R/buildComponent.R index cff3268e..71f28929 100644 --- a/R/buildComponent.R +++ b/R/buildComponent.R @@ -19,7 +19,7 @@ #' @param isChild_method character. The method to use for computing the isChild matrix. Options are "classic" or "partialparent" #' @param adjBeta_method numeric The method to use for computing the building the adjacency_method matrix when using the "beta" build #' @param compress logical. If TRUE, use compression when saving the checkpoint files. Defaults to TRUE. -#' @param mz_twins logical. If TRUE, treat MZ twins as having an additional parent-child relationship for the purposes of computing the relatedness matrix. Defaults to FALSE. +#' @param mz_twins logical. If TRUE, merge MZ co-twin columns in the r2 matrix before tcrossprod so that MZ twins are coded with relatedness 1 instead of 0.5. Twin pairs are identified from the \code{twinID} column. When a \code{zygosity} column is also present, only pairs where both members have \code{zygosity == "MZ"} are used; otherwise all \code{twinID} pairs are assumed to be MZ. Defaults to FALSE. #' @param ... additional arguments to be passed to \code{\link{ped2com}} #' @details The algorithms and methodologies used in this function are further discussed and exemplified in the vignette titled "examplePedigreeFunctions". For more advanced scenarios and detailed explanations, consult this vignette. #' @export @@ -123,9 +123,9 @@ ped2com <- function(ped, component, ped <- standardizeColnames(ped, verbose = config$verbose) } + mz_pairs <- NULL if (mz_twins == TRUE && "twinID" %in% colnames(ped)) { - # TODO - # ped <- addMZtwins(ped, verbose = config$verbose) + mz_pairs <- findMZtwins(ped, verbose = config$verbose) } @@ -289,6 +289,22 @@ ped2com <- function(ped, component, compress = config$compress ) + # --- Step 3b: Temporarily merge MZ twin columns in r2 --- + # MZ twins share the same genetic source. We absorb twin2's column into + # twin1's before tcrossprod so all path-traced relatedness flows through a + # single source. After tcrossprod we copy twin1's row/col back to twin2. + if (!is.null(mz_pairs) && length(mz_pairs) > 0) { + for (pair in mz_pairs) { + idx1 <- pair[1] + idx2 <- pair[2] + r2[, idx1] <- r2[, idx1] + r2[, idx2] + r2[, idx2] <- 0 + } + if (config$verbose == TRUE) { + message("Merged ", length(mz_pairs), " MZ twin pair column(s) in r2") + } + } + # --- Step 4: T crossproduct --- if (config$resume == TRUE && file.exists(checkpoint_files$tcrossprod_checkpoint) && @@ -308,6 +324,20 @@ ped2com <- function(ped, component, } } + # --- Step 4b: Restore MZ twins --- + # Copy twin1's row/col to twin2 so both twins appear in the final matrix. + if (!is.null(mz_pairs) && length(mz_pairs) > 0) { + for (pair in mz_pairs) { + idx1 <- pair[1] + idx2 <- pair[2] + r[idx2, ] <- r[idx1, ] + r[, idx2] <- r[, idx1] + } + if (config$verbose == TRUE) { + message("Restored ", length(mz_pairs), " MZ twin pair(s) in relatedness matrix") + } + } + if (config$component %in% c("mitochondrial", "mtdna", "mitochondria")) { r@x <- rep(1, length(r@x)) # Assign 1 to all nonzero elements for mitochondrial component @@ -343,6 +373,7 @@ ped2add <- function(ped, max_gen = 25, sparse = TRUE, verbose = FALSE, save_rate_parlist = 100000 * save_rate, save_path = "checkpoint/", compress = TRUE, + mz_twins = FALSE, ...) { ped2com( ped = ped, @@ -361,6 +392,7 @@ ped2add <- function(ped, max_gen = 25, sparse = TRUE, verbose = FALSE, save_rate_parlist = save_rate_parlist, save_path = save_path, compress = compress, + mz_twins = mz_twins, ... ) } diff --git a/R/constructAdjacency.R b/R/constructAdjacency.R index 3053fb5b..d779937b 100644 --- a/R/constructAdjacency.R +++ b/R/constructAdjacency.R @@ -559,3 +559,70 @@ isChild <- function(isChild_method, ped) { }) } } + + +#' Find MZ twin pairs in a pedigree +#' +#' Identifies MZ twin pairs from the \code{twinID} column and returns their +#' row indices. These indices are used later to merge the twins' columns in +#' the \code{r2} matrix before \code{tcrossprod}, which correctly produces +#' relatedness 1 between MZ co-twins with no diagonal or downstream artifacts. +#' +#' @param ped A pedigree data.frame with columns \code{ID} and \code{twinID}. +#' Optionally a \code{zygosity} column; when present only pairs where both +#' members have \code{zygosity == "MZ"} are used. +#' @param verbose logical. If TRUE, print progress messages. +#' @return A list of length-2 integer vectors \code{c(idx1, idx2)} giving the +#' row indices of each MZ pair in the pedigree, or \code{NULL} if none found. +#' @keywords internal +findMZtwins <- function(ped, verbose = FALSE) { + if (!"twinID" %in% colnames(ped)) { + return(NULL) + } + + twin_rows <- which(!is.na(ped$twinID)) + + # If zygosity column exists, restrict to MZ pairs + if ("zygosity" %in% colnames(ped)) { + twin_rows <- twin_rows[!is.na(ped$zygosity[twin_rows]) & + ped$zygosity[twin_rows] %in% c("mz", "MZ")] + } + + if (length(twin_rows) == 0) { + return(NULL) + } + + processed <- c() + pairs <- list() + + for (idx in twin_rows) { + twin_id <- ped$ID[idx] + co_twin_id <- ped$twinID[idx] + + # Skip if already processed this pair + if (twin_id %in% processed || co_twin_id %in% processed) next + + idx1 <- which(ped$ID == twin_id) + idx2 <- which(ped$ID == co_twin_id) + + if (length(idx1) != 1 || length(idx2) != 1) next + + # Always put the lower index first for consistency + if (idx1 > idx2) { + tmp <- idx1 + idx1 <- idx2 + idx2 <- tmp + } + + processed <- c(processed, twin_id, co_twin_id) + pairs[[length(pairs) + 1]] <- c(idx1, idx2) + + if (verbose) { + message("MZ twin pair found: ", twin_id, " (row ", idx1, + ") and ", co_twin_id, " (row ", idx2, ")") + } + } + + if (length(pairs) == 0) return(NULL) + return(pairs) +} diff --git a/man/adjustKidsPerCouple.Rd b/man/adjustKidsPerCouple.Rd index bf5b8192..dbe7e500 100644 --- a/man/adjustKidsPerCouple.Rd +++ b/man/adjustKidsPerCouple.Rd @@ -21,7 +21,18 @@ value is 3. Returns an error when kpc equals 1.} generated from a poisson distribution with mean kpc. If FALSE, the number of kids per mate will be fixed at kpc.} -\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} } \value{ A numeric vector with the generated or adjusted number of kids per couple. diff --git a/man/buildBetweenGenerations.Rd b/man/buildBetweenGenerations.Rd index fa87604e..f28622b2 100644 --- a/man/buildBetweenGenerations.Rd +++ b/man/buildBetweenGenerations.Rd @@ -62,7 +62,18 @@ kids per mate will be fixed at kpc.} \item{code_female}{The value to use for females. Default is "F"} -\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} } \value{ The function updates the `df_Fam` data frame in place, adding or modifying columns related to parental and offspring status, diff --git a/man/buildWithinGenerations.Rd b/man/buildWithinGenerations.Rd index 905e42ad..cbddf27a 100644 --- a/man/buildWithinGenerations.Rd +++ b/man/buildWithinGenerations.Rd @@ -20,7 +20,18 @@ buildWithinGenerations( ) } \arguments{ -\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} \item{sizeGens}{A numeric vector containing the sizes of each generation within the pedigree.} diff --git a/man/findMZtwins.Rd b/man/findMZtwins.Rd new file mode 100644 index 00000000..a265c4d2 --- /dev/null +++ b/man/findMZtwins.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/constructAdjacency.R +\name{findMZtwins} +\alias{findMZtwins} +\title{Find MZ twin pairs in a pedigree} +\usage{ +findMZtwins(ped, verbose = FALSE) +} +\arguments{ +\item{ped}{A pedigree data.frame with columns \code{ID} and \code{twinID}. +Optionally a \code{zygosity} column; when present only pairs where both +members have \code{zygosity == "MZ"} are used.} + +\item{verbose}{logical. If TRUE, print progress messages.} +} +\value{ +A list of length-2 integer vectors \code{c(idx1, idx2)} giving the + row indices of each MZ pair in the pedigree, or \code{NULL} if none found. +} +\description{ +Identifies MZ twin pairs from the \code{twinID} column and returns their +row indices. These indices are used later to merge the twins' columns in +the \code{r2} matrix before \code{tcrossprod}, which correctly produces +relatedness 1 between MZ co-twins with no diagonal or downstream artifacts. +} +\keyword{internal} diff --git a/man/ped2add.Rd b/man/ped2add.Rd index dc2fee88..a7e83f8d 100644 --- a/man/ped2add.Rd +++ b/man/ped2add.Rd @@ -21,6 +21,7 @@ ped2add( save_rate_parlist = 1e+05 * save_rate, save_path = "checkpoint/", compress = TRUE, + mz_twins = FALSE, ... ) } @@ -57,6 +58,8 @@ ped2add( \item{compress}{logical. If TRUE, use compression when saving the checkpoint files. Defaults to TRUE.} +\item{mz_twins}{logical. If TRUE, merge MZ co-twin columns in the r2 matrix before tcrossprod so that MZ twins are coded with relatedness 1 instead of 0.5. Twin pairs are identified from the \code{twinID} column. When a \code{zygosity} column is also present, only pairs where both members have \code{zygosity == "MZ"} are used; otherwise all \code{twinID} pairs are assumed to be MZ. Defaults to FALSE.} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/ped2com.Rd b/man/ped2com.Rd index f34d6022..27014a25 100644 --- a/man/ped2com.Rd +++ b/man/ped2com.Rd @@ -25,6 +25,7 @@ ped2com( save_path = "checkpoint/", adjBeta_method = NULL, compress = TRUE, + mz_twins = FALSE, ... ) } @@ -69,6 +70,8 @@ ped2com( \item{compress}{logical. If TRUE, use compression when saving the checkpoint files. Defaults to TRUE.} +\item{mz_twins}{logical. If TRUE, merge MZ co-twin columns in the r2 matrix before tcrossprod so that MZ twins are coded with relatedness 1 instead of 0.5. Twin pairs are identified from the \code{twinID} column. When a \code{zygosity} column is also present, only pairs where both members have \code{zygosity == "MZ"} are used; otherwise all \code{twinID} pairs are assumed to be MZ. Defaults to FALSE.} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/ped2fam.Rd b/man/ped2fam.Rd index 3052e568..b3cf1e07 100644 --- a/man/ped2fam.Rd +++ b/man/ped2fam.Rd @@ -10,6 +10,7 @@ ped2fam( momID = "momID", dadID = "dadID", famID = "famID", + twinID = "twinID", ... ) } @@ -24,6 +25,8 @@ ped2fam( \item{famID}{character. Name of the column to be created in ped for the family ID variable} +\item{twinID}{character. Name of the column in ped for the twin ID variable, if applicable} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \value{ diff --git a/man/ped2graph.Rd b/man/ped2graph.Rd index 5e7ac7b1..022af3c7 100755 --- a/man/ped2graph.Rd +++ b/man/ped2graph.Rd @@ -9,6 +9,7 @@ ped2graph( personID = "ID", momID = "momID", dadID = "dadID", + twinID = "twinID", directed = TRUE, adjacent = c("parents", "mothers", "fathers"), ... @@ -23,6 +24,8 @@ ped2graph( \item{dadID}{character. Name of the column in ped for the father ID variable} +\item{twinID}{character. Name of the column in ped for the twin ID variable, if applicable} + \item{directed}{Logical scalar. Default is TRUE. Indicates whether or not to create a directed graph.} \item{adjacent}{Character. Relationship that defines adjacency in the graph: parents, mothers, or fathers} diff --git a/man/ped2maternal.Rd b/man/ped2maternal.Rd index 03e02311..1f3bcb2e 100755 --- a/man/ped2maternal.Rd +++ b/man/ped2maternal.Rd @@ -10,6 +10,7 @@ ped2maternal( momID = "momID", dadID = "dadID", matID = "matID", + twinID = "twinID", ... ) } @@ -24,6 +25,8 @@ ped2maternal( \item{matID}{Character. Maternal line ID variable to be created and added to the pedigree} +\item{twinID}{character. Name of the column in ped for the twin ID variable, if applicable} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/ped2paternal.Rd b/man/ped2paternal.Rd index e893ec03..7fac27b9 100755 --- a/man/ped2paternal.Rd +++ b/man/ped2paternal.Rd @@ -10,6 +10,7 @@ ped2paternal( momID = "momID", dadID = "dadID", patID = "patID", + twinID = "twinID", ... ) } @@ -24,6 +25,8 @@ ped2paternal( \item{patID}{Character. Paternal line ID variable to be created and added to the pedigree} +\item{twinID}{character. Name of the column in ped for the twin ID variable, if applicable} + \item{...}{additional arguments to be passed to \code{\link{ped2com}}} } \description{ diff --git a/man/simulatePedigree.Rd b/man/simulatePedigree.Rd index fa87632e..d56ab72c 100644 --- a/man/simulatePedigree.Rd +++ b/man/simulatePedigree.Rd @@ -76,7 +76,18 @@ current version.} \item{fam_shift}{An integer to shift the person ID. Default is 1L. This is useful when simulating multiple pedigrees to avoid ID conflicts.} -\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} \item{...}{Additional arguments to be passed to other functions.} } diff --git a/tests/testthat/test-buildComponent.R b/tests/testthat/test-buildComponent.R index b00fa039..d23249d9 100644 --- a/tests/testthat/test-buildComponent.R +++ b/tests/testthat/test-buildComponent.R @@ -1,3 +1,117 @@ +test_that("MZ twins coded at relatedness 1 via twinID column", { + # Simple pedigree: two parents and two MZ twin children + ped <- potter + + # Without mz_twins: siblings get 0.5 + r_no_mz <- ped2add(ped, mz_twins = FALSE, sparse = FALSE) + expect_equal(r_no_mz["12", "13"], 0.5) + expect_equal(r_no_mz["13", "12"], 0.5) + + # With mz_twins: MZ twins get 1.0 + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + expect_equal(r_mz["12", "13"], 1.0) + expect_equal(r_mz["13", "12"], 1.0) + + # Self-relatedness should still be 1 + expect_equal(r_mz["12", "12"], 1.0) + expect_equal(r_mz["13", "13"], 1.0) + + # Parent-child relatedness unchanged + +}) + + + + +test_that("MZ twins coded at relatedness 1 via twinID column", { + # Simple pedigree: two parents and two MZ twin children + ped <- data.frame( + ID = c(1, 2, 3, 4), + momID = c(NA, NA, 2, 2), + dadID = c(NA, NA, 1, 1), + sex = c("M", "F", "M", "M"), + twinID = c(NA, NA, 4, 3), + zygosity = c(NA, NA, "MZ", "MZ") + ) + + # Without mz_twins: siblings get 0.5 + r_no_mz <- ped2add(ped, mz_twins = FALSE, sparse = FALSE) + expect_equal(r_no_mz["3", "4"], 0.5) + expect_equal(r_no_mz["4", "3"], 0.5) + + # With mz_twins: MZ twins get 1.0 + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + expect_equal(r_mz["3", "4"], 1.0) + expect_equal(r_mz["4", "3"], 1.0) + + # Self-relatedness should still be 1 + expect_equal(r_mz["3", "3"], 1.0) + expect_equal(r_mz["4", "4"], 1.0) + + # Parent-child relatedness unchanged + expect_equal(r_mz["3", "1"], 0.5) + expect_equal(r_mz["4", "1"], 0.5) + expect_equal(r_mz["3", "2"], 0.5) + expect_equal(r_mz["4", "2"], 0.5) +}) + +test_that("MZ twins without zygosity column assumes all twinID pairs are MZ", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + momID = c(NA, NA, 2, 2), + dadID = c(NA, NA, 1, 1), + sex = c("M", "F", "M", "M"), + twinID = c(NA, NA, 4, 3) + ) + + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + expect_equal(r_mz["3", "4"], 1.0) + expect_equal(r_mz["4", "3"], 1.0) +}) + +test_that("DZ twins with zygosity column are NOT modified", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + momID = c(NA, NA, 2, 2), + dadID = c(NA, NA, 1, 1), + sex = c("M", "F", "M", "F"), + twinID = c(NA, NA, 4, 3), + zygosity = c(NA, NA, "DZ", "DZ") + ) + + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + # DZ twins remain at sibling relatedness = 0.5 + expect_equal(r_mz["3", "4"], 0.5) + expect_equal(r_mz["4", "3"], 0.5) +}) + +test_that("MZ twins: downstream child relatedness is correct", { + # 3-generation pedigree: parents -> MZ twins -> twin2 has a child + ped <- data.frame( + ID = c(1, 2, 3, 4, 5, 6), + momID = c(NA, NA, 2, 2, NA, 4), + dadID = c(NA, NA, 1, 1, NA, 5), + sex = c("M", "F", "M", "M", "F", "M"), + twinID = c(NA, NA, 4, 3, NA, NA), + zygosity = c(NA, NA, "MZ", "MZ", NA, NA) + ) + + r_mz <- ped2add(ped, mz_twins = TRUE, sparse = FALSE) + + # MZ twins at 1.0 + expect_equal(r_mz["3", "4"], 1.0) + + # Child of twin2 (ID=4) should be 0.5 to twin2 (parent) + expect_equal(r_mz["6", "4"], 0.5) + + # Child of twin2 should ALSO be 0.5 to twin1 (genetically identical to parent) + expect_equal(r_mz["6", "3"], 0.5) + + # Diagonal for both twins should be clean (no inflation) + expect_equal(r_mz["3", "3"], 1.0) + expect_equal(r_mz["4", "4"], 1.0) +}) + test_that(".assignParentValue works", { expect_equal(.assignParentValue("generation"), .5) expect_equal(.assignParentValue("additive"), .5) diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 381443ec..a33d1314 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -9,7 +9,7 @@ test_that("simulated pedigree generates expected data structure", { sex_tolerance <- .035 base_length <- 57 base_length_tol <- 0.2 * base_length - beta_match_base <- T + beta_match_base <- F # beta_options <- T for (beta in beta_options) { set.seed(seed) @@ -20,7 +20,7 @@ test_that("simulated pedigree generates expected data structure", { if (isFALSE(beta) || (isTRUE(beta) && beta_match_base)) { expect_equal(length(results$ID), base_length, tolerance = strict_tolerance) } else { - expect_true(length(results$ID) >= base_length-base_length_tol*base_length && length(results$ID) <= base_length_tol*base_length+base_length, + expect_true(length(results$ID) >= base_length - base_length_tol * base_length && length(results$ID) <= base_length_tol * base_length + base_length, info = paste0("Beta=TRUE: Expected 45-70 individuals, got ", length(results$ID)) ) } @@ -54,7 +54,7 @@ test_that("simulated pedigree generates expected data structure when sexR is imb sex_tolerance <- .03 base_length <- 154 base_length_tol <- 0.2 * base_length - beta_match_base <- T + beta_match_base <- F # beta_options <- T for (beta in beta_options) { set.seed(seed) @@ -104,7 +104,7 @@ test_that("simulated pedigree generates expected data structure when sexR is imb base_length <- 424 base_length_tol <- 0.2 * base_length - beta_match_base <- T + beta_match_base <- F # beta_options <- T for (beta in beta_options) { @@ -157,7 +157,7 @@ test_that("simulated pedigree generates expected data structure but supply var n # beta_options <- T base_length <- 57 base_length_tol <- 0.2 * base_length - beta_match_base <- T + beta_match_base <- F for (beta in beta_options) { set.seed(seed)