Difference between revisions of "SMHS PartialCorrelation"
(→Applications) |
(→Problems and Exercises) |
||
| (22 intermediate revisions by the same user not shown) | |||
| Line 84: | Line 84: | ||
# R function implementing all three methods | # R function implementing all three methods | ||
calculate_partial_correlation <- function(X, Y, Z, method = "inversion") { | calculate_partial_correlation <- function(X, Y, Z, method = "inversion") { | ||
| + | # Input validation | ||
| + | if (!is.vector(X) || !is.vector(Y)) { | ||
| + | stop("X and Y must be numeric vectors.") | ||
| + | } | ||
n <- length(X) | n <- length(X) | ||
| + | if (length(Y) != n) stop("X and Y must have the same length.") | ||
| + | |||
| + | # Handle NULL Z (shouldn't happen in your loop, but safe) | ||
| + | if (is.null(Z)) { | ||
| + | return(cor(X, Y)) | ||
| + | } | ||
| + | |||
| + | # Ensure Z is a data.frame (handles vector, matrix, or data.frame) | ||
| + | if (is.vector(Z)) { | ||
| + | Z <- data.frame(Z1 = Z) | ||
| + | } else if (is.matrix(Z)) { | ||
| + | Z <- as.data.frame(Z) | ||
| + | } else if (!is.data.frame(Z)) { | ||
| + | stop("Z must be a vector, matrix, or data.frame.") | ||
| + | } | ||
| + | |||
| + | # Check that Z has same number of rows | ||
| + | if (nrow(Z) != n) stop("Z must have the same number of rows as X and Y.") | ||
if (method == "regression") { | if (method == "regression") { | ||
| − | # | + | # Combine into a data frame with explicit names |
| − | + | df_X <- data.frame(response = X, Z) | |
| − | + | df_Y <- data.frame(response = Y, Z) | |
| + | |||
| + | # Fit models using all columns in Z as predictors | ||
| + | # Use 'response ~ .' so all Z variables are included | ||
| + | mod_X <- lm(response ~ ., data = df_X) | ||
| + | mod_Y <- lm(response ~ ., data = df_Y) | ||
| + | |||
| + | res_X <- resid(mod_X) | ||
| + | res_Y <- resid(mod_Y) | ||
| + | |||
return(cor(res_X, res_Y)) | return(cor(res_X, res_Y)) | ||
} else if (method == "recursive") { | } else if (method == "recursive") { | ||
| − | # | + | # Only valid for a single control variable |
| − | if ( | + | if (ncol(Z) == 1) { |
| + | Z_vec <- Z[[1]] | ||
r_xy <- cor(X, Y) | r_xy <- cor(X, Y) | ||
| − | r_xz <- cor(X, | + | r_xz <- cor(X, Z_vec) |
| − | r_yz <- cor(Y, | + | r_yz <- cor(Y, Z_vec) |
| − | + | numerator <- r_xy - r_xz * r_yz | |
| + | denominator <- sqrt((1 - r_xz^2) * (1 - r_yz^2)) | ||
| + | if (denominator == 0) return(NA) | ||
| + | return(numerator / denominator) | ||
} else { | } else { | ||
| − | + | warning("Recursive method only supports a single control variable. Using inversion method instead.") | |
| − | + | method <- "inversion" | |
| − | |||
| − | |||
} | } | ||
| − | + | } | |
| − | + | ||
| − | # | + | if (method == "inversion") { |
| − | data_matrix <- cbind(X, Y, Z) | + | # Combine all variables |
| + | data_matrix <- cbind(X, Y, as.matrix(Z)) | ||
sigma <- cov(data_matrix) | sigma <- cov(data_matrix) | ||
| + | # Handle singular covariance (e.g., perfect collinearity) | ||
| + | if (det(sigma) == 0) { | ||
| + | warning("Covariance matrix is singular; partial correlation may be undefined.") | ||
| + | return(NA) | ||
| + | } | ||
omega <- solve(sigma) | omega <- solve(sigma) | ||
| − | + | pcor_val <- -omega[1, 2] / sqrt(omega[1, 1] * omega[2, 2]) | |
| − | return( | + | return(pcor_val) |
} | } | ||
} | } | ||
| Line 330: | Line 369: | ||
controls_list <- list( | controls_list <- list( | ||
"None" = NULL, | "None" = NULL, | ||
| − | "Age only" = medical_data | + | "Age only" = medical_data$age, # vector |
| − | "BMI only" = medical_data | + | "BMI only" = medical_data$bmi, # vector |
| − | "Age + BMI" = medical_data[ | + | "Age + BMI" = medical_data[c("age", "bmi")] # data.frame (OK now) |
) | ) | ||
| Line 516: | Line 555: | ||
===Advanced Topics=== | ===Advanced Topics=== | ||
| − | + | ====Regularized Partial Correlation==== | |
| + | |||
<pre> | <pre> | ||
# Regularized estimation for high-dimensional data | # Regularized estimation for high-dimensional data | ||
| Line 563: | Line 603: | ||
</pre> | </pre> | ||
| − | + | ====Bayesian Partial Correlation==== | |
| + | |||
<pre> | <pre> | ||
# Bayesian approach to partial correlation | # Bayesian approach to partial correlation | ||
| Line 569: | Line 610: | ||
bayesian_partial_correlation <- function(X, Y, Z, n_iter = 2000) { | bayesian_partial_correlation <- function(X, Y, Z, n_iter = 2000) { | ||
| − | + | if (is.null(dim(Z))) { | |
| + | Z <- matrix(Z, ncol = 1) | ||
| + | K <- 1 | ||
| + | } else { | ||
| + | Z <- as.matrix(Z) | ||
| + | K <- ncol(Z) | ||
| + | } | ||
| + | |||
stan_data <- list( | stan_data <- list( | ||
N = length(X), | N = length(X), | ||
| + | K = K, | ||
X = X, | X = X, | ||
Y = Y, | Y = Y, | ||
| − | Z = | + | Z = Z |
| − | |||
) | ) | ||
| − | |||
stan_model_code <- " | stan_model_code <- " | ||
data { | data { | ||
int<lower=1> N; | int<lower=1> N; | ||
| + | int<lower=1> K; | ||
vector[N] X; | vector[N] X; | ||
vector[N] Y; | vector[N] Y; | ||
matrix[N, K] Z; | matrix[N, K] Z; | ||
| − | |||
} | } | ||
parameters { | parameters { | ||
| Line 597: | Line 644: | ||
} | } | ||
model { | model { | ||
| − | |||
beta_X0 ~ normal(0, 10); | beta_X0 ~ normal(0, 10); | ||
beta_Y0 ~ normal(0, 10); | beta_Y0 ~ normal(0, 10); | ||
| Line 606: | Line 652: | ||
sigma_Y ~ cauchy(0, 2.5); | sigma_Y ~ cauchy(0, 2.5); | ||
| − | |||
for (n in 1:N) { | for (n in 1:N) { | ||
real mu_X = beta_X0 + Z[n] * beta_XZ; | real mu_X = beta_X0 + Z[n] * beta_XZ; | ||
| Line 616: | Line 661: | ||
} | } | ||
generated quantities { | generated quantities { | ||
| − | |||
real partial_corr = rho_resid; | real partial_corr = rho_resid; | ||
} | } | ||
" | " | ||
| − | # | + | # Show progress every ~10% of iterations per chain |
| − | fit <- stan(model_code = stan_model_code, | + | refresh_freq <- max(10, floor(n_iter / 10)) |
| − | + | ||
| − | + | cat("Running Bayesian partial correlation...\n") | |
| − | + | cat("Iterations per chain:", n_iter, "| Chains: 4 | Refresh every", refresh_freq, "iterations\n\n") | |
| + | |||
| + | fit <- stan( | ||
| + | model_code = stan_model_code, | ||
| + | data = stan_data, | ||
| + | iter = n_iter, | ||
| + | chains = 4, | ||
| + | refresh = refresh_freq, | ||
| + | verbose = FALSE | ||
| + | ) | ||
| − | |||
samples <- extract(fit) | samples <- extract(fit) | ||
| Line 637: | Line 689: | ||
)) | )) | ||
} | } | ||
| + | |||
# Example | # Example | ||
| Line 650: | Line 703: | ||
</pre> | </pre> | ||
| − | + | ====Causal Inference and Partial Correlation==== | |
| + | |||
<pre> | <pre> | ||
| − | # | + | # NEED TO INSTALL: R package pcalg and BiocManager::install(c("graph", "RBGL", "Rgraphviz")) |
| + | |||
| + | # if (!require("BiocManager", quietly = TRUE)) | ||
| + | # install.packages("BiocManager") | ||
| + | # BiocManager::install("Rgraphviz") | ||
| + | |||
| + | # Causal analysis with partial correlation | ||
library(pcalg) | library(pcalg) | ||
| + | library(ppcor) | ||
| − | + | causal_analysis_with_partial_correlation <- function(data, alpha = 0.05) { | |
| − | + | cat("=== Causal Analysis with Partial Correlation ===\n") | |
| − | |||
| − | |||
| − | |||
| − | # | + | # 1. Estimate causal skeleton using PC algorithm |
| + | suffStat <- list(C = cov(data), n = nrow(data)) | ||
| + | pc_fit <- pc(suffStat, | ||
| + | indepTest = gaussCItest, | ||
| + | p = ncol(data), | ||
| + | alpha = alpha) | ||
| + | |||
| + | # Convert to adjacency matrix | ||
adj_matrix <- as(pc_fit, "matrix") | adj_matrix <- as(pc_fit, "matrix") | ||
| + | colnames(adj_matrix) <- rownames(adj_matrix) <- colnames(data) | ||
| − | # Compute partial correlations | + | cat("\n1. Estimated Causal Skeleton (PC Algorithm):\n") |
| + | print(adj_matrix) | ||
| + | |||
| + | # 2. Compute full partial correlation matrix | ||
| + | cat("\n2. Full Partial Correlation Matrix (conditioning on all other variables):\n") | ||
| + | full_pcor_matrix <- pcor(data)$estimate | ||
| + | print(round(full_pcor_matrix, 3)) | ||
| + | |||
| + | # 3. Compare with simple correlation matrix | ||
| + | cat("\n3. Simple Correlation Matrix (for comparison):\n") | ||
| + | simple_cor_matrix <- cor(data) | ||
| + | print(round(simple_cor_matrix, 3)) | ||
| + | |||
| + | # 4. Extract edges from PC algorithm and their partial correlations | ||
| + | cat("\n4. Edges in Estimated Causal Graph with Partial Correlations:\n") | ||
p <- ncol(data) | p <- ncol(data) | ||
| − | + | edges_info <- data.frame() | |
for (i in 1:(p-1)) { | for (i in 1:(p-1)) { | ||
for (j in (i+1):p) { | for (j in (i+1):p) { | ||
if (adj_matrix[i, j] != 0) { | if (adj_matrix[i, j] != 0) { | ||
| − | # | + | # Get separating set from PC algorithm |
sep_set <- pc_fit@sepset[[i]][[j]] | sep_set <- pc_fit@sepset[[i]][[j]] | ||
| − | if (length(sep_set) | + | |
| − | + | # Compute appropriate correlation | |
| − | + | if (is.null(sep_set) || length(sep_set) == 0) { | |
| − | + | cor_type <- "Simple" | |
| + | cor_val <- cor(data[, i], data[, j]) | ||
| + | sepset_str <- "None" | ||
| + | } else { | ||
| + | cor_type <- "Partial" | ||
| + | pcor_result <- pcor.test(data[, i], data[, j], | ||
| + | data[, sep_set, drop = FALSE]) | ||
| + | cor_val <- pcor_result$estimate | ||
| + | sepset_str <- paste(colnames(data)[sep_set], collapse = ", ") | ||
} | } | ||
| + | |||
| + | edges_info <- rbind(edges_info, data.frame( | ||
| + | From = colnames(data)[i], | ||
| + | To = colnames(data)[j], | ||
| + | Correlation_Type = cor_type, | ||
| + | Value = round(cor_val, 3), | ||
| + | Separating_Set = sepset_str | ||
| + | )) | ||
} | } | ||
} | } | ||
| + | } | ||
| + | |||
| + | if (nrow(edges_info) > 0) { | ||
| + | print(edges_info) | ||
| + | } else { | ||
| + | cat("No edges found in the estimated graph.\n") | ||
| + | } | ||
| + | |||
| + | # 5. Simplified visualizations without qgraph | ||
| + | cat("\n5. Visualizations:\n") | ||
| + | |||
| + | try({ | ||
| + | # Reset par if needed | ||
| + | old_par <- par(no.readonly = TRUE) | ||
| + | on.exit(par(old_par)) | ||
| + | |||
| + | # Plot 1: Heatmap of partial correlations | ||
| + | par(mfrow = c(2, 2), mar = c(5, 4, 4, 2) + 0.1) | ||
| + | |||
| + | # Heatmap of partial correlations | ||
| + | image(1:ncol(full_pcor_matrix), 1:ncol(full_pcor_matrix), | ||
| + | full_pcor_matrix, | ||
| + | xlab = "", ylab = "", | ||
| + | main = "Partial Correlation Heatmap", | ||
| + | col = colorRampPalette(c("blue", "white", "red"))(100)) | ||
| + | axis(1, at = 1:ncol(full_pcor_matrix), labels = colnames(data), las = 2) | ||
| + | axis(2, at = 1:ncol(full_pcor_matrix), labels = colnames(data), las = 1) | ||
| + | |||
| + | # Plot 2: Network visualization (simplified using igraph) | ||
| + | if (require(igraph, quietly = TRUE)) { | ||
| + | # Create graph from adjacency matrix | ||
| + | g <- graph_from_adjacency_matrix(adj_matrix, mode = "undirected") | ||
| + | |||
| + | # Simple plot | ||
| + | plot(g, | ||
| + | main = "Estimated Causal Graph", | ||
| + | vertex.size = 30, | ||
| + | vertex.color = "lightblue", | ||
| + | vertex.label = colnames(data), | ||
| + | edge.color = "black", | ||
| + | layout = layout_with_fr) | ||
| + | } else { | ||
| + | plot.new() | ||
| + | text(0.5, 0.5, "Install 'igraph' for network visualization", | ||
| + | cex = 1.2) | ||
| + | title(main = "Estimated Causal Graph") | ||
| + | } | ||
| + | |||
| + | # Plot 3: Scatter plot of simple vs partial correlations | ||
| + | # Extract lower triangular matrices | ||
| + | simple_lower <- simple_cor_matrix[lower.tri(simple_cor_matrix)] | ||
| + | pcor_lower <- full_pcor_matrix[lower.tri(full_pcor_matrix)] | ||
| + | |||
| + | plot(simple_lower, pcor_lower, | ||
| + | xlab = "Simple Correlation", | ||
| + | ylab = "Partial Correlation", | ||
| + | main = "Simple vs Partial Correlation", | ||
| + | pch = 19, col = "blue", | ||
| + | xlim = c(-1, 1), ylim = c(-1, 1)) | ||
| + | abline(h = 0, v = 0, lty = 2, col = "gray") | ||
| + | abline(a = 0, b = 1, lty = 2, col = "red") # y = x line | ||
| + | |||
| + | # Plot 4: Bar plot of correlation differences | ||
| + | diff_cor <- simple_lower - pcor_lower | ||
| + | hist(diff_cor, | ||
| + | main = "Distribution of Correlation Differences", | ||
| + | xlab = "Simple - Partial Correlation", | ||
| + | col = "lightgreen", | ||
| + | border = "darkgreen") | ||
| + | abline(v = 0, lty = 2, col = "red", lwd = 2) | ||
| + | |||
| + | par(mfrow = c(1, 1)) | ||
| + | }, silent = TRUE) | ||
| + | |||
| + | # 6. Additional analysis: Test specific conditional independences | ||
| + | cat("\n6. Testing Specific Conditional Independences:\n") | ||
| + | |||
| + | # Find the strongest edge in the graph | ||
| + | max_edge <- which(abs(full_pcor_matrix) == max(abs(full_pcor_matrix[lower.tri(full_pcor_matrix)])), | ||
| + | arr.ind = TRUE)[1, ] | ||
| + | var1 <- colnames(data)[max_edge[1]] | ||
| + | var2 <- colnames(data)[max_edge[2]] | ||
| + | |||
| + | cat("Strongest partial correlation between:", var1, "and", var2, "\n") | ||
| + | cat("Partial correlation:", round(full_pcor_matrix[max_edge[1], max_edge[2]], 3), "\n") | ||
| + | |||
| + | # Get separating set for this edge | ||
| + | sep_set <- pc_fit@sepset[[max_edge[1]]][[max_edge[2]]] | ||
| + | if (!is.null(sep_set) && length(sep_set) > 0) { | ||
| + | cat("Separating set:", paste(colnames(data)[sep_set], collapse = ", "), "\n") | ||
} | } | ||
return(list( | return(list( | ||
| − | + | pc_fit = pc_fit, | |
adjacency = adj_matrix, | adjacency = adj_matrix, | ||
| − | partial_correlations = | + | simple_correlations = simple_cor_matrix, |
| + | partial_correlations = full_pcor_matrix, | ||
| + | edges_info = edges_info | ||
)) | )) | ||
} | } | ||
| − | # | + | # Test with the causal data |
set.seed(2023) | set.seed(2023) | ||
| − | n_causal <- | + | n_causal <- 200 # Larger sample size for stability |
| + | |||
causal_data <- data.frame( | causal_data <- data.frame( | ||
A = rnorm(n_causal), | A = rnorm(n_causal), | ||
| − | B = 0.5 | + | B = 0.5 * A + rnorm(n_causal, sd = 0.3), |
| − | C = 0.4 * rnorm(n_causal) | + | C = 0.3 * A + 0.4 * B + rnorm(n_causal, sd = 0.3), |
| − | + | D = 0.2 * B + 0.3 * C + rnorm(n_causal, sd = 0.3), | |
| + | E = rnorm(n_causal), | ||
| + | F = 0.1 * A + rnorm(n_causal, sd = 0.5) | ||
) | ) | ||
| − | + | result <- causal_analysis_with_partial_correlation(causal_data) | |
| − | |||
| − | |||
| − | |||
| − | |||
| − | cat("\ | + | # Additional interactive analysis |
| − | + | cat("\n7. Interactive Analysis:\n") | |
| + | cat("You can explore specific partial correlations using:\n") | ||
| + | cat(" pcor.test(causal_data$A, causal_data$B, causal_data[, c('C', 'D')])\n") | ||
| + | cat(" pcor.test(causal_data$A, causal_data$C, causal_data[, c('B', 'D')])\n") | ||
| + | cat("\nTo visualize the full partial correlation network:\n") | ||
| + | cat(" library(corrplot)\n") | ||
| + | cat(" corrplot(result$partial_correlations, method = 'circle', type = 'full')\n") | ||
</pre> | </pre> | ||
| Line 738: | Line 931: | ||
pcor_result <- pcor.test(X, Y, Z) | pcor_result <- pcor.test(X, Y, Z) | ||
cat("\n2. Partial correlation:", round(pcor_result$estimate, 4), "\n") | cat("\n2. Partial correlation:", round(pcor_result$estimate, 4), "\n") | ||
| + | |||
if (inference) { | if (inference) { | ||
cat(" p-value:", format.pval(pcor_result$p.value, digits = 4), "\n") | cat(" p-value:", format.pval(pcor_result$p.value, digits = 4), "\n") | ||
| + | |||
| + | # Compute confidence interval using Fisher's z-transform | ||
| + | n <- length(X) | ||
| + | k <- ifelse(is.null(Z), 0, ncol(Z)) | ||
| + | r <- pcor_result$estimate | ||
| + | |||
| + | # Fisher's z-transform | ||
| + | z <- 0.5 * log((1 + r) / (1 - r)) | ||
| + | se <- 1 / sqrt(n - k - 3) | ||
| + | |||
| + | # 95% CI for z | ||
| + | z_lower <- z - qnorm(0.975) * se | ||
| + | z_upper <- z + qnorm(0.975) * se | ||
| + | |||
| + | # Transform back to correlation scale | ||
| + | r_lower <- (exp(2 * z_lower) - 1) / (exp(2 * z_lower) + 1) | ||
| + | r_upper <- (exp(2 * z_upper) - 1) / (exp(2 * z_upper) + 1) | ||
| + | |||
cat(" 95% CI: [", | cat(" 95% CI: [", | ||
| − | round( | + | round(r_lower, 4), ", ", |
| − | round( | + | round(r_upper, 4), "]\n", sep = "") |
} | } | ||
} else { | } else { | ||
| Line 771: | Line 983: | ||
# 4. Visualization | # 4. Visualization | ||
if (visualization) { | if (visualization) { | ||
| + | # Save current par settings | ||
| + | old_par <- par(no.readonly = TRUE) | ||
| + | on.exit(par(old_par)) | ||
| + | |||
par(mfrow = c(2, 2)) | par(mfrow = c(2, 2)) | ||
| Line 780: | Line 996: | ||
if (!is.null(Z)) { | if (!is.null(Z)) { | ||
# Residuals after controlling | # Residuals after controlling | ||
| − | + | formula_x <- as.formula(paste(var1, "~", paste(controls, collapse = "+"))) | |
| − | + | res_X <- resid(lm(formula_x, data = data)) | |
res_Y <- resid(lm(formula_y, data = data)) | res_Y <- resid(lm(formula_y, data = data)) | ||
| Line 800: | Line 1,016: | ||
) | ) | ||
| − | barplot( | + | # Remove NA values for plotting |
| − | ylim = c(-1, 1), col = c("blue", "red", "green"), | + | cor_types_complete <- cor_types[!is.na(cor_types$Value), ] |
| + | |||
| + | barplot(cor_types_complete$Value, | ||
| + | names.arg = cor_types_complete$Type, | ||
| + | ylim = c(-1, 1), | ||
| + | col = c("blue", "red", "green")[1:nrow(cor_types_complete)], | ||
main = "Comparison of Correlation Measures", | main = "Comparison of Correlation Measures", | ||
ylab = "Correlation Coefficient") | ylab = "Correlation Coefficient") | ||
| Line 807: | Line 1,028: | ||
} | } | ||
| − | # Network visualization if multiple variables | + | # Network visualization if multiple variables (with fallback) |
if (ncol(data) >= 3) { | if (ncol(data) >= 3) { | ||
| − | pcor_matrix <- pcor(data)$estimate | + | tryCatch({ |
| − | + | pcor_matrix <- pcor(data)$estimate | |
| − | + | ||
| − | + | # Try different visualization methods | |
| − | + | if (require(corrplot, quietly = TRUE)) { | |
| + | # Use corrplot for correlation matrix visualization | ||
| + | corrplot(pcor_matrix, | ||
| + | method = "color", | ||
| + | type = "full", | ||
| + | tl.col = "black", | ||
| + | tl.srt = 45, | ||
| + | title = "Partial Correlation Matrix", | ||
| + | mar = c(0, 0, 2, 0)) | ||
| + | } else if (require(qgraph, quietly = TRUE)) { | ||
| + | # Use qgraph for network visualization (with edge.labels = FALSE) | ||
| + | qgraph(pcor_matrix, | ||
| + | layout = "spring", | ||
| + | labels = colnames(data), | ||
| + | title = "Partial Correlation Network", | ||
| + | maximum = 0.8, | ||
| + | edge.labels = FALSE, # Avoid edge label errors | ||
| + | vsize = 8) | ||
| + | } else { | ||
| + | # Fallback to base R heatmap | ||
| + | heatmap(pcor_matrix, | ||
| + | main = "Partial Correlation Heatmap", | ||
| + | col = colorRampPalette(c("blue", "white", "red"))(100)) | ||
| + | } | ||
| + | }, error = function(e) { | ||
| + | plot.new() | ||
| + | text(0.5, 0.5, "Visualization failed", cex = 1.2) | ||
| + | title(main = "Partial Correlation Network") | ||
| + | }) | ||
} | } | ||
| − | |||
| − | |||
} | } | ||
| Line 826: | Line 1,073: | ||
p_value = if(exists("pcor_result")) pcor_result$p.value else NA | p_value = if(exists("pcor_result")) pcor_result$p.value else NA | ||
) | ) | ||
| + | |||
| + | if (inference && exists("pcor_result")) { | ||
| + | results$confidence_interval <- c(r_lower, r_upper) | ||
| + | } | ||
return(results) | return(results) | ||
} | } | ||
| + | |||
# Example usage with mtcars dataset | # Example usage with mtcars dataset | ||
| Line 848: | Line 1,100: | ||
===Common Issues and Solutions=== | ===Common Issues and Solutions=== | ||
| − | + | ====Multicollinearity Among Controls==== | |
| + | |||
<pre> | <pre> | ||
# Check for multicollinearity in control variables | # Check for multicollinearity in control variables | ||
check_multicollinearity <- function(Z) { | check_multicollinearity <- function(Z) { | ||
| − | if (is.null(dim(Z))) Z <- as.matrix(Z) | + | # Handle vector input |
| + | if (is.null(dim(Z))) { | ||
| + | Z <- as.matrix(Z) | ||
| + | } else if (is.data.frame(Z)) { | ||
| + | Z <- as.matrix(Z) # or keep as data.frame but use in data= | ||
| + | } | ||
| + | |||
| + | # Ensure Z is numeric matrix | ||
| + | Z <- as.matrix(Z) | ||
# Variance Inflation Factor (VIF) | # Variance Inflation Factor (VIF) | ||
| − | if ( | + | if (requireNamespace("car", quietly = TRUE)) { |
| − | # Create a | + | # Create a temporary data frame |
| − | + | df <- data.frame(dummy_y = rnorm(nrow(Z)), Z) | |
| − | dummy_lm <- lm(dummy_y ~ | + | |
| − | vif_values <- vif(dummy_lm) | + | # Fit model using all columns in Z as predictors |
| + | dummy_lm <- lm(dummy_y ~ ., data = df) | ||
| + | |||
| + | vif_values <- car::vif(dummy_lm) | ||
cat("Variance Inflation Factors (VIF):\n") | cat("Variance Inflation Factors (VIF):\n") | ||
print(vif_values) | print(vif_values) | ||
| − | + | if (any(vif_values > 10, na.rm = TRUE)) { | |
| − | |||
| − | if (any( | ||
cat("\nWarning: High multicollinearity detected in control variables.\n") | cat("\nWarning: High multicollinearity detected in control variables.\n") | ||
cat("Consider removing or combining variables with VIF > 10.\n") | cat("Consider removing or combining variables with VIF > 10.\n") | ||
| Line 874: | Line 1,136: | ||
# Condition number | # Condition number | ||
if (ncol(Z) > 1) { | if (ncol(Z) > 1) { | ||
| − | + | # Use correlation matrix | |
| + | cor_mat <- cor(Z) | ||
| + | # Handle potential non-positive-definite due to collinearity | ||
| + | kappa_value <- kappa(cor_mat, exact = TRUE) | ||
cat("\nCondition number of correlation matrix:", round(kappa_value, 2), "\n") | cat("\nCondition number of correlation matrix:", round(kappa_value, 2), "\n") | ||
if (kappa_value > 30) { | if (kappa_value > 30) { | ||
| Line 886: | Line 1,151: | ||
</pre> | </pre> | ||
| − | + | ====Missing Data Handling==== | |
<pre> | <pre> | ||
# Multiple approaches for missing data in partial correlation | # Multiple approaches for missing data in partial correlation | ||
| Line 940: | Line 1,205: | ||
1. '''Conceptual Problems''': | 1. '''Conceptual Problems''': | ||
| − | + | ||
| − | + | : a) Prove that for jointly Gaussian variables, zero partial correlation implies conditional independence | |
| − | + | : b) Derive the relationship between partial correlation and regression coefficients | |
| + | : c) Show that \(|r_{X(Y\cdot Z)}| \leq |r_{XY\cdot Z}| \leq |r_{XY}|\) | ||
2. '''Applied Problems''': | 2. '''Applied Problems''': | ||
| − | + | ||
| − | + | : a) Analyze the relationship between `mpg` and `qsec` in the `mtcars` dataset, controlling for `wt` and `hp` | |
| − | + | : b) Compute the partial correlation matrix for the `iris` dataset (sepal and petal measurements) | |
| + | : c) Conduct a power analysis for detecting partial correlation ρ = 0.3 with 3 control variables, α = 0.05, power = 0.80 | ||
3. '''Simulation Study''': | 3. '''Simulation Study''': | ||
| Line 957: | Line 1,224: | ||
estimates <- numeric(n_sim) | estimates <- numeric(n_sim) | ||
p_values <- numeric(n_sim) | p_values <- numeric(n_sim) | ||
| − | coverage <- | + | coverage <- logical(n_sim) # better as logical |
for (i in 1:n_sim) { | for (i in 1:n_sim) { | ||
| − | |||
# Generate control variables | # Generate control variables | ||
Z <- matrix(rnorm(n * n_controls), n, n_controls) | Z <- matrix(rnorm(n * n_controls), n, n_controls) | ||
| Line 980: | Line 1,246: | ||
p_values[i] <- result$p.value | p_values[i] <- result$p.value | ||
| − | # Check coverage of 95% CI | + | # Check coverage of 95% CI — safely |
ci <- result$conf.int | ci <- result$conf.int | ||
| − | coverage[i] <- (true_rho >= ci[1] & true_rho <= ci[2]) | + | if (length(ci) >= 2) { |
| + | coverage[i] <- (true_rho >= ci[1] & true_rho <= ci[2]) | ||
| + | } else { | ||
| + | # CI not computable — treat as NA or skip | ||
| + | coverage[i] <- NA | ||
| + | } | ||
} | } | ||
| + | |||
| + | # Remove NA coverage values for mean (or report proportion) | ||
| + | coverage_rate <- mean(coverage, na.rm = TRUE) | ||
cat("Simulation Results (n =", n_sim, "):\n") | cat("Simulation Results (n =", n_sim, "):\n") | ||
| Line 991: | Line 1,265: | ||
cat("RMSE:", sqrt(mean((estimates - true_rho)^2)), "\n") | cat("RMSE:", sqrt(mean((estimates - true_rho)^2)), "\n") | ||
cat("Type I error rate (for true_rho = 0):", mean(p_values < 0.05), "\n") | cat("Type I error rate (for true_rho = 0):", mean(p_values < 0.05), "\n") | ||
| − | cat("Coverage of 95% CI:", | + | cat("Coverage of 95% CI (excluding failed CI):", coverage_rate, "\n") |
| + | # # For more control, compute CI manually | ||
| + | # r <- result$estimate | ||
| + | # df <- n - n_controls - 2 # for partial correlation | ||
| + | # # Or use Fisher z: | ||
| + | # if (abs(r) < 1) { | ||
| + | # z <- atanh(r) | ||
| + | # se <- 1 / sqrt(n - n_controls - 3) | ||
| + | # z_ci <- z + c(-1, 1) * qnorm(0.975) * se | ||
| + | # ci <- tanh(z_ci) | ||
| + | # } else { | ||
| + | # ci <- c(NA, NA) | ||
| + | # } | ||
| + | |||
return(data.frame( | return(data.frame( | ||
estimate = estimates, | estimate = estimates, | ||
Latest revision as of 15:27, 9 December 2025
Contents
- 1 Scientific Methods for Health Sciences - Partial Correlation
- 1.1 Overview
- 1.2 Motivation
- 1.3 Theory
- 1.4 Applications
- 1.5 Advanced Topics
- 1.6 Software Implementation
- 1.7 Common Issues and Solutions
- 1.8 Problems and Exercises
- 1.9 References
- 1.10 Online Resources
Scientific Methods for Health Sciences - Partial Correlation
Overview
Partial correlation measures the degree of association between two random variables after removing the linear effects of one or more controlling variables. It quantifies the unique relationship between two variables while statistically controlling for potential confounding factors. Partial correlation is fundamental in: - Identifying direct relationships in multivariate systems - Controlling for confounding variables in observational studies - Network analysis and graphical models - Time series analysis (partial autocorrelation)
Motivation
Consider a study examining the relationship between exercise frequency and cholesterol levels. Age is known to affect both variables: older individuals tend to exercise less and have higher cholesterol. Simple correlation between exercise and cholesterol would be confounded by age. Partial correlation addresses this by:
1. Removing spurious correlations due to common causes
2. Identifying direct relationships in complex systems
3. Testing conditional independence in graphical models
4. Decomposing multivariate relationships into direct and indirect effects
Theory
1) Mathematical Foundations
Definition
The partial correlation between variables \(X\) and \(Y\) given a set of controlling variables \(Z = \{Z_1, Z_2, \ldots, Z_n\}\) is defined as\[ \rho_{XY \cdot Z} = \frac{\rho_{XY} - \rho_{XZ}\rho_{YZ}}{\sqrt{(1-\rho_{XZ}^2)(1-\rho_{YZ}^2)}} \] for a single controlling variable \(Z\), and more generally as\[ \rho_{XY \cdot Z} = \frac{\text{Cov}(X_{\perp Z}, Y_{\perp Z})}{\sqrt{\text{Var}(X_{\perp Z})\text{Var}(Y_{\perp Z})}} \] where \(X_{\perp Z}\) and \(Y_{\perp Z}\) are the residuals from regressing \(X\) and \(Y\) on \(Z\).
Matrix Formulation
For a set of variables with covariance matrix \(\boldsymbol{\Sigma}\), let \(\boldsymbol{\Omega} = \boldsymbol{\Sigma}^{-1}\) be the precision matrix. The partial correlation between \(X_i\) and \(X_j\) controlling for all other variables is\[ \rho_{X_i X_j \cdot V\setminus\{X_i,X_j\}} = -\frac{\omega_{ij}}{\sqrt{\omega_{ii}\omega_{jj}}} \] where \(\omega_{ij}\) are elements of \(\boldsymbol{\Omega}\).
This relationship reveals that zero partial correlation implies conditional independence for jointly Gaussian variables\[ \rho_{X_i X_j \cdot V\setminus\{X_i,X_j\}} = 0 \iff X_i \perp\!\!\!\perp X_j \mid V\setminus\{X_i,X_j\} \]
Geometric Interpretation
In vector space terminology, partial correlation corresponds to the cosine of the angle between the residual vectors after projecting \(X\) and \(Y\) onto the subspace spanned by \(Z\)\[ \rho_{XY\cdot Z} = \cos(\theta_{R_X R_Y}) \] where \(R_X = X - P_Z(X)\) and \(R_Y = Y - P_Z(Y)\) are residuals, and \(P_Z\) denotes projection onto the span of \(Z\).
2) Calculation Methods
Method 1: Residual Regression
1. Regress \(X\) on \(Z\): \(X = \boldsymbol{\beta}_X^\top Z + \varepsilon_X\) 2. Regress \(Y\) on \(Z\): \(Y = \boldsymbol{\beta}_Y^\top Z + \varepsilon_Y\) 3. Compute correlation between residuals: \(\rho_{XY\cdot Z} = \text{Cor}(\varepsilon_X, \varepsilon_Y)\)
The sample estimate is\[ \hat{\rho}_{XY\cdot Z} = \frac{\sum_{i=1}^n \hat{\varepsilon}_{X,i}\hat{\varepsilon}_{Y,i}}{\sqrt{\sum_{i=1}^n \hat{\varepsilon}_{X,i}^2 \sum_{i=1}^n \hat{\varepsilon}_{Y,i}^2}} \]
Method 2: Recursive Formula
For nested sets of controlling variables \(Z \subset W\)\[ \rho_{XY\cdot W} = \frac{\rho_{XY\cdot Z} - \rho_{XW_n\cdot Z\setminus\{W_n\}}\rho_{YW_n\cdot Z\setminus\{W_n\}}}{\sqrt{(1-\rho_{XW_n\cdot Z\setminus\{W_n\}}^2)(1-\rho_{YW_n\cdot Z\setminus\{W_n\}}^2)}} \] where \(W_n\) is any variable in \(W\setminus Z\).
This allows efficient computation with time complexity \(O(p^3)\) for \(p\) variables using dynamic programming.
Method 3: Matrix Inversion
From the precision matrix \(\boldsymbol{\Omega} = \boldsymbol{\Sigma}^{-1}\)\[ \hat{\rho}_{X_i X_j \cdot V\setminus\{X_i,X_j\}} = -\frac{\hat{\omega}_{ij}}{\sqrt{\hat{\omega}_{ii}\hat{\omega}_{jj}}} \] This method is particularly efficient for computing all pairwise partial correlations simultaneously.
# R function implementing all three methods
calculate_partial_correlation <- function(X, Y, Z, method = "inversion") {
# Input validation
if (!is.vector(X) || !is.vector(Y)) {
stop("X and Y must be numeric vectors.")
}
n <- length(X)
if (length(Y) != n) stop("X and Y must have the same length.")
# Handle NULL Z (shouldn't happen in your loop, but safe)
if (is.null(Z)) {
return(cor(X, Y))
}
# Ensure Z is a data.frame (handles vector, matrix, or data.frame)
if (is.vector(Z)) {
Z <- data.frame(Z1 = Z)
} else if (is.matrix(Z)) {
Z <- as.data.frame(Z)
} else if (!is.data.frame(Z)) {
stop("Z must be a vector, matrix, or data.frame.")
}
# Check that Z has same number of rows
if (nrow(Z) != n) stop("Z must have the same number of rows as X and Y.")
if (method == "regression") {
# Combine into a data frame with explicit names
df_X <- data.frame(response = X, Z)
df_Y <- data.frame(response = Y, Z)
# Fit models using all columns in Z as predictors
# Use 'response ~ .' so all Z variables are included
mod_X <- lm(response ~ ., data = df_X)
mod_Y <- lm(response ~ ., data = df_Y)
res_X <- resid(mod_X)
res_Y <- resid(mod_Y)
return(cor(res_X, res_Y))
} else if (method == "recursive") {
# Only valid for a single control variable
if (ncol(Z) == 1) {
Z_vec <- Z[[1]]
r_xy <- cor(X, Y)
r_xz <- cor(X, Z_vec)
r_yz <- cor(Y, Z_vec)
numerator <- r_xy - r_xz * r_yz
denominator <- sqrt((1 - r_xz^2) * (1 - r_yz^2))
if (denominator == 0) return(NA)
return(numerator / denominator)
} else {
warning("Recursive method only supports a single control variable. Using inversion method instead.")
method <- "inversion"
}
}
if (method == "inversion") {
# Combine all variables
data_matrix <- cbind(X, Y, as.matrix(Z))
sigma <- cov(data_matrix)
# Handle singular covariance (e.g., perfect collinearity)
if (det(sigma) == 0) {
warning("Covariance matrix is singular; partial correlation may be undefined.")
return(NA)
}
omega <- solve(sigma)
pcor_val <- -omega[1, 2] / sqrt(omega[1, 1] * omega[2, 2])
return(pcor_val)
}
}
3) Statistical Inference
Hypothesis Testing
Test \(H_0: \rho_{XY\cdot Z} = 0\) vs \(H_1: \rho_{XY\cdot Z} \neq 0\).
Fisher's z-transform\[ z = \frac{1}{2}\ln\left(\frac{1+\hat{\rho}}{1-\hat{\rho}}\right) \sim N\left(\frac{1}{2}\ln\left(\frac{1+\rho}{1-\rho}\right), \frac{1}{n-|Z|-3}\right) \]
Test statistic\[ T = \frac{z - \frac{1}{2}\ln\left(\frac{1+\rho_0}{1-\rho_0}\right)}{\sqrt{1/(n-|Z|-3)}} \sim N(0,1) \] Under \(H_0: \rho = 0\), this simplifies to \(T = z\sqrt{n-|Z|-3}\).
Exact t-test (for normally distributed data)\[ t = \hat{\rho}\sqrt{\frac{n-|Z|-2}{1-\hat{\rho}^2}} \sim t_{n-|Z|-2} \]
Confidence Intervals
Using Fisher's transform\[ \text{CI}_{1-\alpha} = \left[\tanh\left(z - \frac{z_{1-\alpha/2}}{\sqrt{n-|Z|-3}}\right), \tanh\left(z + \frac{z_{1-\alpha/2}}{\sqrt{n-|Z|-3}}\right)\right] \] where \(\tanh(x) = \frac{e^{2x}-1}{e^{2x}+1}\) is the hyperbolic tangent.
Power Analysis
The required sample size to detect partial correlation \(\rho\) with power \(1-\beta\) at level \(\alpha\)\[ n = |Z| + 3 + \left(\frac{z_{1-\alpha/2} + z_{1-\beta}}{z(\rho)}\right)^2 \] where \(z(\rho) = \frac{1}{2}\ln\left(\frac{1+\rho}{1-\rho}\right)\).
# R functions for inference
partial_correlation_test <- function(X, Y, Z, method = "fisher") {
n <- length(X)
k <- if(is.null(dim(Z))) 1 else ncol(Z)
r <- calculate_partial_correlation(X, Y, Z, method = "regression")
if (method == "fisher") {
# Fisher's z-transform
z <- 0.5 * log((1 + r) / (1 - r))
se <- 1 / sqrt(n - k - 3)
z_score <- z / se
p_value <- 2 * pnorm(-abs(z_score))
# Confidence interval
z_lower <- z - qnorm(0.975) * se
z_upper <- z + qnorm(0.975) * se
ci <- c(tanh(z_lower), tanh(z_upper))
} else if (method == "t_test") {
# Exact t-test (assumes normality)
t_stat <- r * sqrt((n - k - 2) / (1 - r^2))
p_value <- 2 * pt(-abs(t_stat), df = n - k - 2)
ci <- cor.test(X, Y)$conf.int # Approximation
}
return(list(
estimate = r,
statistic = if(method == "fisher") z_score else t_stat,
p_value = p_value,
confidence_interval = ci,
n = n,
df = n - k - 2
))
}
# Power calculation
partial_correlation_power <- function(rho, n, k, alpha = 0.05) {
# rho: true partial correlation
# n: sample size
# k: number of controlling variables
z_rho <- 0.5 * log((1 + rho) / (1 - rho))
se <- 1 / sqrt(n - k - 3)
# Non-centrality parameter
lambda <- z_rho / se
# Critical value under H0
z_crit <- qnorm(1 - alpha/2)
# Power
power <- pnorm(-z_crit - lambda) + 1 - pnorm(z_crit - lambda)
return(power)
}
4) Semi-Partial (Part) Correlation
The semi-partial correlation between \(X\) and \(Y\) controlling for \(Z\) from \(X\) only is\[ r_{X(Y\cdot Z)} = \frac{r_{XY} - r_{XZ}r_{YZ}}{\sqrt{1 - r_{YZ}^2}} \] Equivalently, it's the correlation between \(X\) and the residuals of \(Y\) regressed on \(Z\)\[ r_{X(Y\cdot Z)} = \text{Cor}(X, Y - \hat{Y}_Z) \]
Key differences from partial correlation:
1. Denominator: Semi-partial uses total variance of \(Y\), not residual variance
2. Interpretation: Proportion of total variance in \(Y\) uniquely explained by \(X\)
3. Range: \(|r_{X(Y\cdot Z)}| \leq |r_{XY\cdot Z}| \leq |r_{XY}|\)
4. Asymmetry: \(r_{X(Y\cdot Z)} \neq r_{Y(X\cdot Z)}\) generally
5) Partial Autocorrelation in Time Series
The partial autocorrelation function (PACF) at lag \(h\) is\[ \phi(h) = \rho_{X_t X_{t+h} \cdot \{X_{t+1}, \ldots, X_{t+h-1}\}} \]
For an AR(\(p\)) process, \(\phi(h) = 0\) for \(h > p\). The PACF is estimated via: 1. Durbin-Levinson algorithm: Recursive computation 2. Regression approach: \(\phi(h) = \) coefficient of \(X_{t-h}\) in regression of \(X_t\) on \(X_{t-1}, \ldots, X_{t-h}\) 3. Matrix inversion: Using the autocovariance matrix
Hypothesis testing: Under \(H_0: \phi(h) = 0\), \( \sqrt{n}\hat{\phi}(h) \sim N(0,1) \) Approximate 95% confidence bands: \(\pm 1.96/\sqrt{n}\).
Applications
Example 1: Medical Research - Controlling for Confounders
# Load necessary libraries
library(ppcor)
library(ggplot2)
library(GGally)
# Simulate medical data: Cholesterol (Y), Exercise (X), Age (Z1), BMI (Z2)
set.seed(123)
n <- 200
age <- rnorm(n, mean = 50, sd = 10)
bmi <- rnorm(n, mean = 25, sd = 4)
exercise <- 5 - 0.05*age - 0.1*bmi + rnorm(n, sd = 2)
cholesterol <- 200 + 0.8*age + 0.5*bmi - 0.7*exercise + rnorm(n, sd = 15)
medical_data <- data.frame(cholesterol, exercise, age, bmi)
cat("=== Medical Research Example ===\n")
cat("Research question: Relationship between exercise and cholesterol,\n")
cat("controlling for age and BMI.\n\n")
# 1. Simple correlations
cat("1. Simple (Marginal) Correlations:\n")
cor_matrix <- cor(medical_data)
print(cor_matrix)
# 2. Partial correlation (exercise ~ cholesterol | age, bmi)
cat("\n2. Partial Correlation Analysis:\n")
pcor_result <- pcor.test(medical_data$exercise, medical_data$cholesterol,
medical_data[, c("age", "bmi")])
print(pcor_result)
# 3. Compare with semi-partial correlation
cat("\n3. Semi-partial Correlation:\n")
# Regress cholesterol on age and BMI
lm_chol <- lm(cholesterol ~ age + bmi, data = medical_data)
residuals_chol <- resid(lm_chol)
semi_partial <- cor(medical_data$exercise, residuals_chol)
cat("Semi-partial correlation (exercise with cholesterol residuals):",
round(semi_partial, 4), "\n")
# 4. Visualization
par(mfrow = c(2, 2))
# Scatterplot matrix
plot(medical_data[, 1:4], main = "Scatterplot Matrix")
# Partial correlation network
if (require(qgraph)) {
pcor_network <- pcor(medical_data)$estimate
qgraph(pcor_network, layout = "spring",
labels = colnames(medical_data),
title = "Partial Correlation Network")
}
# Comparison of correlations
cor_types <- data.frame(
Type = c("Simple", "Partial", "Semi-partial"),
Value = c(cor(medical_data$exercise, medical_data$cholesterol),
pcor_result$estimate,
semi_partial)
)
ggplot(cor_types, aes(x = Type, y = Value, fill = Type)) +
geom_bar(stat = "identity") +
ylim(-1, 1) +
labs(title = "Comparison of Correlation Measures",
subtitle = "Exercise vs Cholesterol",
y = "Correlation Coefficient") +
theme_minimal()
par(mfrow = c(1, 1))
# 5. Sensitivity analysis: How partial correlation changes with different controls
cat("\n4. Sensitivity Analysis:\n")
cat("Partial correlations with different sets of controls:\n")
controls_list <- list(
"None" = NULL,
"Age only" = medical_data$age, # vector
"BMI only" = medical_data$bmi, # vector
"Age + BMI" = medical_data[c("age", "bmi")] # data.frame (OK now)
)
for (name in names(controls_list)) {
if (is.null(controls_list[[name]])) {
r <- cor(medical_data$exercise, medical_data$cholesterol)
} else {
r <- calculate_partial_correlation(
medical_data$exercise,
medical_data$cholesterol,
controls_list[[name]],
method = "regression"
)
}
cat(sprintf("%-15s: r = %.4f\n", name, r))
}
- Example 2: Gene Expression Network Analysis
# Using gene expression data to demonstrate partial correlation in networks
library(huge)
# Simulate gene expression data with network structure
set.seed(456)
n_genes <- 20
n_samples <- 100
# Generate precision matrix with sparse structure
prec_matrix <- diag(n_genes)
for(i in 1:(n_genes-1)) {
for(j in (i+1):n_genes) {
if(runif(1) < 0.1) { # 10% connections
prec_matrix[i,j] <- prec_matrix[j,i] <- 0.3
}
}
}
# Ensure positive definiteness
diag(prec_matrix) <- abs(min(eigen(prec_matrix)$values)) + 1.1
# Generate multivariate normal data
cov_matrix <- solve(prec_matrix)
gene_data <- MASS::mvrnorm(n_samples, mu = rep(0, n_genes), Sigma = cov_matrix)
colnames(gene_data) <- paste0("Gene_", 1:n_genes)
cat("=== Gene Expression Network Analysis ===\n")
# 1. Simple correlation network
simple_cor <- cor(gene_data)
cat("\n1. Simple correlation density:",
mean(abs(simple_cor[upper.tri(simple_cor)])), "\n")
# 2. Partial correlation network (Graphical Lasso)
if (require(huge)) {
huge_result <- huge(gene_data, method = "glasso")
huge_select <- huge.select(huge_result, criterion = "ebic")
# Estimated precision matrix
omega_est <- as.matrix(huge_select$opt.icov)
# Convert to partial correlations
pcor_network <- -cov2cor(omega_est)
diag(pcor_network) <- 1
cat("2. Partial correlation density:",
mean(abs(pcor_network[upper.tri(pcor_network)])), "\n")
# Compare networks
cat("\n3. Network Comparison:\n")
cat("Number of edges (simple correlation > 0.3):",
sum(abs(simple_cor[upper.tri(simple_cor)]) > 0.3), "\n")
cat("Number of edges (partial correlation > 0.3):",
sum(abs(pcor_network[upper.tri(pcor_network)]) > 0.3), "\n")
# Visualize both networks
par(mfrow = c(1, 2))
# Simple correlation network
qgraph(simple_cor, layout = "spring",
maximum = 0.5, minimum = -0.5,
title = "Simple Correlation Network")
# Partial correlation network
qgraph(pcor_network, layout = "spring",
maximum = 0.5, minimum = -0.5,
title = "Partial Correlation Network")
par(mfrow = c(1, 1))
# 3. Test specific partial correlations
cat("\n4. Hypothesis Testing for Specific Gene Pairs:\n")
# Test Gene_1 and Gene_2 controlling for others
test_result <- pcor.test(gene_data[, "Gene_1"],
gene_data[, "Gene_2"],
gene_data[, setdiff(colnames(gene_data),
c("Gene_1", "Gene_2"))])
print(test_result)
}
- Example 3: Time Series - Partial Autocorrelation Function
# Time series analysis with PACF
cat("=== Time Series Analysis: Partial Autocorrelation ===\n")
# Generate AR(2) process: X_t = 0.5*X_{t-1} - 0.3*X_{t-2} + ε_t
set.seed(789)
n <- 500
epsilon <- rnorm(n)
x <- numeric(n)
x[1] <- epsilon[1]
x[2] <- 0.5*x[1] + epsilon[2]
for(t in 3:n) {
x[t] <- 0.5*x[t-1] - 0.3*x[t-2] + epsilon[t]
}
# 1. Compute PACF using built-in function
pacf_result <- pacf(x, lag.max = 20, plot = FALSE)
cat("\n1. PACF values (lags 1-10):\n")
print(pacf_result$acf[1:10])
# 2. Manual calculation via regression (for understanding)
cat("\n2. Manual PACF calculation via regression:\n")
manual_pacf <- function(x, lag) {
if (lag == 1) {
return(cor(x[-1], x[-length(x)]))
} else {
# Create design matrix
n <- length(x)
X <- matrix(NA, nrow = n - lag, ncol = lag)
for (i in 1:lag) {
X[, i] <- x[(lag+1-i):(n-i)]
}
# Fit regression and get coefficient for most recent lag
y <- x[(lag+1):n]
coefs <- lm(y ~ X)$coefficients
return(coefs[lag + 1]) # +1 for intercept
}
}
for (h in 1:5) {
cat(sprintf("Lag %d: PACF = %.4f (built-in) vs %.4f (manual)\n",
h, pacf_result$acf[h], manual_pacf(x, h)))
}
# 3. Visual comparison
par(mfrow = c(2, 2))
# Time series plot
plot(x, type = "l", main = "AR(2) Time Series",
xlab = "Time", ylab = "Value")
# ACF plot
acf(x, lag.max = 20, main = "Autocorrelation Function")
# PACF plot
pacf(x, lag.max = 20, main = "Partial Autocorrelation Function")
# Compare ACF and PACF
plot(1:20, acf(x, lag.max = 20, plot = FALSE)$acf[-1],
type = "h", col = "blue", lwd = 2,
xlab = "Lag", ylab = "Correlation",
main = "ACF vs PACF", ylim = c(-0.5, 0.6))
points(1:20 + 0.2, pacf_result$acf,
type = "h", col = "red", lwd = 2)
legend("topright", legend = c("ACF", "PACF"),
col = c("blue", "red"), lwd = 2)
par(mfrow = c(1, 1))
# 4. Model identification
cat("\n3. Model Identification:\n")
cat("Based on PACF cutoff after lag 2, suggest AR(2) model.\n")
cat("PACF significant at lags 1 and 2, insignificant thereafter.\n")
Advanced Topics
Regularized Partial Correlation
# Regularized estimation for high-dimensional data
library(glasso)
regularized_partial_correlation <- function(data, lambda = 0.1) {
# Graphical Lasso for sparse precision matrix estimation
S <- cov(data)
glasso_result <- glasso(S, rho = lambda)
# Convert to partial correlations
omega <- glasso_result$wi
pcor_matrix <- -cov2cor(omega)
diag(pcor_matrix) <- 1
return(list(
precision_matrix = omega,
partial_correlation = pcor_matrix,
lambda = lambda
))
}
# Example with high-dimensional data
set.seed(101)
p <- 50 # Number of variables
n <- 30 # Number of observations (n < p)
high_dim_data <- matrix(rnorm(n * p), n, p)
cat("=== Regularized Partial Correlation (n < p) ===\n")
cat("Dimensions:", n, "samples ×", p, "variables\n")
# Standard correlation matrix is singular
try_cor <- try(cor(high_dim_data), silent = TRUE)
if (inherits(try_cor, "try-error")) {
cat("Standard correlation matrix is singular (n < p).\n")
}
# Regularized estimation
reg_result <- regularized_partial_correlation(high_dim_data, lambda = 0.5)
cat("\nRegularized partial correlation matrix computed successfully.\n")
cat("Sparsity:",
mean(reg_result$partial_correlation[upper.tri(reg_result$partial_correlation)] == 0),
"proportion of zero partial correlations.\n")
Bayesian Partial Correlation
# Bayesian approach to partial correlation
library(rstan)
bayesian_partial_correlation <- function(X, Y, Z, n_iter = 2000) {
if (is.null(dim(Z))) {
Z <- matrix(Z, ncol = 1)
K <- 1
} else {
Z <- as.matrix(Z)
K <- ncol(Z)
}
stan_data <- list(
N = length(X),
K = K,
X = X,
Y = Y,
Z = Z
)
stan_model_code <- "
data {
int<lower=1> N;
int<lower=1> K;
vector[N] X;
vector[N] Y;
matrix[N, K] Z;
}
parameters {
real beta_X0;
real beta_Y0;
vector[K] beta_XZ;
vector[K] beta_YZ;
real<lower=-1, upper=1> rho_resid;
real<lower=0> sigma_X;
real<lower=0> sigma_Y;
}
model {
beta_X0 ~ normal(0, 10);
beta_Y0 ~ normal(0, 10);
beta_XZ ~ normal(0, 5);
beta_YZ ~ normal(0, 5);
rho_resid ~ uniform(-1, 1);
sigma_X ~ cauchy(0, 2.5);
sigma_Y ~ cauchy(0, 2.5);
for (n in 1:N) {
real mu_X = beta_X0 + Z[n] * beta_XZ;
real mu_Y = beta_Y0 + Z[n] * beta_YZ;
[X[n], Y[n]]' ~ multi_normal([mu_X, mu_Y]',
[[sigma_X^2, rho_resid*sigma_X*sigma_Y],
[rho_resid*sigma_X*sigma_Y, sigma_Y^2]]);
}
}
generated quantities {
real partial_corr = rho_resid;
}
"
# Show progress every ~10% of iterations per chain
refresh_freq <- max(10, floor(n_iter / 10))
cat("Running Bayesian partial correlation...\n")
cat("Iterations per chain:", n_iter, "| Chains: 4 | Refresh every", refresh_freq, "iterations\n\n")
fit <- stan(
model_code = stan_model_code,
data = stan_data,
iter = n_iter,
chains = 4,
refresh = refresh_freq,
verbose = FALSE
)
samples <- extract(fit)
return(list(
posterior_mean = mean(samples$partial_corr),
posterior_median = median(samples$partial_corr),
credible_interval = quantile(samples$partial_corr, c(0.025, 0.975)),
posterior_samples = samples$partial_corr
))
}
# Example
cat("\n=== Bayesian Partial Correlation ===\n")
bayes_result <- bayesian_partial_correlation(
medical_data$exercise,
medical_data$cholesterol,
medical_data[, c("age", "bmi")],
n_iter = 1000
)
print(bayes_result)
Causal Inference and Partial Correlation
# NEED TO INSTALL: R package pcalg and BiocManager::install(c("graph", "RBGL", "Rgraphviz"))
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("Rgraphviz")
# Causal analysis with partial correlation
library(pcalg)
library(ppcor)
causal_analysis_with_partial_correlation <- function(data, alpha = 0.05) {
cat("=== Causal Analysis with Partial Correlation ===\n")
# 1. Estimate causal skeleton using PC algorithm
suffStat <- list(C = cov(data), n = nrow(data))
pc_fit <- pc(suffStat,
indepTest = gaussCItest,
p = ncol(data),
alpha = alpha)
# Convert to adjacency matrix
adj_matrix <- as(pc_fit, "matrix")
colnames(adj_matrix) <- rownames(adj_matrix) <- colnames(data)
cat("\n1. Estimated Causal Skeleton (PC Algorithm):\n")
print(adj_matrix)
# 2. Compute full partial correlation matrix
cat("\n2. Full Partial Correlation Matrix (conditioning on all other variables):\n")
full_pcor_matrix <- pcor(data)$estimate
print(round(full_pcor_matrix, 3))
# 3. Compare with simple correlation matrix
cat("\n3. Simple Correlation Matrix (for comparison):\n")
simple_cor_matrix <- cor(data)
print(round(simple_cor_matrix, 3))
# 4. Extract edges from PC algorithm and their partial correlations
cat("\n4. Edges in Estimated Causal Graph with Partial Correlations:\n")
p <- ncol(data)
edges_info <- data.frame()
for (i in 1:(p-1)) {
for (j in (i+1):p) {
if (adj_matrix[i, j] != 0) {
# Get separating set from PC algorithm
sep_set <- pc_fit@sepset[[i]][[j]]
# Compute appropriate correlation
if (is.null(sep_set) || length(sep_set) == 0) {
cor_type <- "Simple"
cor_val <- cor(data[, i], data[, j])
sepset_str <- "None"
} else {
cor_type <- "Partial"
pcor_result <- pcor.test(data[, i], data[, j],
data[, sep_set, drop = FALSE])
cor_val <- pcor_result$estimate
sepset_str <- paste(colnames(data)[sep_set], collapse = ", ")
}
edges_info <- rbind(edges_info, data.frame(
From = colnames(data)[i],
To = colnames(data)[j],
Correlation_Type = cor_type,
Value = round(cor_val, 3),
Separating_Set = sepset_str
))
}
}
}
if (nrow(edges_info) > 0) {
print(edges_info)
} else {
cat("No edges found in the estimated graph.\n")
}
# 5. Simplified visualizations without qgraph
cat("\n5. Visualizations:\n")
try({
# Reset par if needed
old_par <- par(no.readonly = TRUE)
on.exit(par(old_par))
# Plot 1: Heatmap of partial correlations
par(mfrow = c(2, 2), mar = c(5, 4, 4, 2) + 0.1)
# Heatmap of partial correlations
image(1:ncol(full_pcor_matrix), 1:ncol(full_pcor_matrix),
full_pcor_matrix,
xlab = "", ylab = "",
main = "Partial Correlation Heatmap",
col = colorRampPalette(c("blue", "white", "red"))(100))
axis(1, at = 1:ncol(full_pcor_matrix), labels = colnames(data), las = 2)
axis(2, at = 1:ncol(full_pcor_matrix), labels = colnames(data), las = 1)
# Plot 2: Network visualization (simplified using igraph)
if (require(igraph, quietly = TRUE)) {
# Create graph from adjacency matrix
g <- graph_from_adjacency_matrix(adj_matrix, mode = "undirected")
# Simple plot
plot(g,
main = "Estimated Causal Graph",
vertex.size = 30,
vertex.color = "lightblue",
vertex.label = colnames(data),
edge.color = "black",
layout = layout_with_fr)
} else {
plot.new()
text(0.5, 0.5, "Install 'igraph' for network visualization",
cex = 1.2)
title(main = "Estimated Causal Graph")
}
# Plot 3: Scatter plot of simple vs partial correlations
# Extract lower triangular matrices
simple_lower <- simple_cor_matrix[lower.tri(simple_cor_matrix)]
pcor_lower <- full_pcor_matrix[lower.tri(full_pcor_matrix)]
plot(simple_lower, pcor_lower,
xlab = "Simple Correlation",
ylab = "Partial Correlation",
main = "Simple vs Partial Correlation",
pch = 19, col = "blue",
xlim = c(-1, 1), ylim = c(-1, 1))
abline(h = 0, v = 0, lty = 2, col = "gray")
abline(a = 0, b = 1, lty = 2, col = "red") # y = x line
# Plot 4: Bar plot of correlation differences
diff_cor <- simple_lower - pcor_lower
hist(diff_cor,
main = "Distribution of Correlation Differences",
xlab = "Simple - Partial Correlation",
col = "lightgreen",
border = "darkgreen")
abline(v = 0, lty = 2, col = "red", lwd = 2)
par(mfrow = c(1, 1))
}, silent = TRUE)
# 6. Additional analysis: Test specific conditional independences
cat("\n6. Testing Specific Conditional Independences:\n")
# Find the strongest edge in the graph
max_edge <- which(abs(full_pcor_matrix) == max(abs(full_pcor_matrix[lower.tri(full_pcor_matrix)])),
arr.ind = TRUE)[1, ]
var1 <- colnames(data)[max_edge[1]]
var2 <- colnames(data)[max_edge[2]]
cat("Strongest partial correlation between:", var1, "and", var2, "\n")
cat("Partial correlation:", round(full_pcor_matrix[max_edge[1], max_edge[2]], 3), "\n")
# Get separating set for this edge
sep_set <- pc_fit@sepset[[max_edge[1]]][[max_edge[2]]]
if (!is.null(sep_set) && length(sep_set) > 0) {
cat("Separating set:", paste(colnames(data)[sep_set], collapse = ", "), "\n")
}
return(list(
pc_fit = pc_fit,
adjacency = adj_matrix,
simple_correlations = simple_cor_matrix,
partial_correlations = full_pcor_matrix,
edges_info = edges_info
))
}
# Test with the causal data
set.seed(2023)
n_causal <- 200 # Larger sample size for stability
causal_data <- data.frame(
A = rnorm(n_causal),
B = 0.5 * A + rnorm(n_causal, sd = 0.3),
C = 0.3 * A + 0.4 * B + rnorm(n_causal, sd = 0.3),
D = 0.2 * B + 0.3 * C + rnorm(n_causal, sd = 0.3),
E = rnorm(n_causal),
F = 0.1 * A + rnorm(n_causal, sd = 0.5)
)
result <- causal_analysis_with_partial_correlation(causal_data)
# Additional interactive analysis
cat("\n7. Interactive Analysis:\n")
cat("You can explore specific partial correlations using:\n")
cat(" pcor.test(causal_data$A, causal_data$B, causal_data[, c('C', 'D')])\n")
cat(" pcor.test(causal_data$A, causal_data$C, causal_data[, c('B', 'D')])\n")
cat("\nTo visualize the full partial correlation network:\n")
cat(" library(corrplot)\n")
cat(" corrplot(result$partial_correlations, method = 'circle', type = 'full')\n")
Software Implementation
# Comprehensive partial correlation analysis function
run_partial_correlation_analysis <- function(data, var1, var2, controls,
method = "standard",
inference = TRUE,
visualization = TRUE) {
cat("=== PARTIAL CORRELATION ANALYSIS ===\n\n")
cat("Variable 1:", var1, "\n")
cat("Variable 2:", var2, "\n")
cat("Control variables:", paste(controls, collapse = ", "), "\n")
cat("Method:", method, "\n\n")
# Extract variables
X <- data[[var1]]
Y <- data[[var2]]
Z <- if(length(controls) > 0) data[, controls, drop = FALSE] else NULL
# 1. Simple correlation
simple_cor <- cor(X, Y)
cat("1. Simple correlation:", round(simple_cor, 4), "\n")
# 2. Partial correlation
if (method == "standard") {
if (!is.null(Z)) {
pcor_result <- pcor.test(X, Y, Z)
cat("\n2. Partial correlation:", round(pcor_result$estimate, 4), "\n")
if (inference) {
cat(" p-value:", format.pval(pcor_result$p.value, digits = 4), "\n")
# Compute confidence interval using Fisher's z-transform
n <- length(X)
k <- ifelse(is.null(Z), 0, ncol(Z))
r <- pcor_result$estimate
# Fisher's z-transform
z <- 0.5 * log((1 + r) / (1 - r))
se <- 1 / sqrt(n - k - 3)
# 95% CI for z
z_lower <- z - qnorm(0.975) * se
z_upper <- z + qnorm(0.975) * se
# Transform back to correlation scale
r_lower <- (exp(2 * z_lower) - 1) / (exp(2 * z_lower) + 1)
r_upper <- (exp(2 * z_upper) - 1) / (exp(2 * z_upper) + 1)
cat(" 95% CI: [",
round(r_lower, 4), ", ",
round(r_upper, 4), "]\n", sep = "")
}
} else {
cat("\nNo control variables specified.\n")
}
} else if (method == "regularized") {
# Regularized partial correlation
all_data <- cbind(X, Y, Z)
reg_result <- regularized_partial_correlation(all_data, lambda = 0.1)
pcor_value <- reg_result$partial_correlation[1, 2]
cat("\n2. Regularized partial correlation:", round(pcor_value, 4), "\n")
cat(" Regularization parameter lambda:", reg_result$lambda, "\n")
}
# 3. Semi-partial correlation
if (!is.null(Z)) {
# Regress Y on Z
formula_y <- as.formula(paste(var2, "~", paste(controls, collapse = "+")))
lm_y <- lm(formula_y, data = data)
residuals_y <- resid(lm_y)
semi_partial <- cor(X, residuals_y)
cat("\n3. Semi-partial correlation:", round(semi_partial, 4), "\n")
cat(" (Correlation of", var1, "with residuals of", var2, "after controlling for Z)\n")
}
# 4. Visualization
if (visualization) {
# Save current par settings
old_par <- par(no.readonly = TRUE)
on.exit(par(old_par))
par(mfrow = c(2, 2))
# Scatterplot
plot(X, Y, main = paste("Scatterplot:", var1, "vs", var2),
xlab = var1, ylab = var2, pch = 19, col = "blue")
# Residual plots if controls specified
if (!is.null(Z)) {
# Residuals after controlling
formula_x <- as.formula(paste(var1, "~", paste(controls, collapse = "+")))
res_X <- resid(lm(formula_x, data = data))
res_Y <- resid(lm(formula_y, data = data))
plot(res_X, res_Y,
main = paste("Residual Scatterplot\n(Controlling for",
paste(controls, collapse = ", "), ")"),
xlab = paste("Residuals of", var1),
ylab = paste("Residuals of", var2),
pch = 19, col = "red")
# Comparison plot
cor_types <- data.frame(
Type = factor(c("Simple", "Partial", "Semi-partial"),
levels = c("Simple", "Partial", "Semi-partial")),
Value = c(simple_cor,
if(exists("pcor_result")) pcor_result$estimate else NA,
semi_partial)
)
# Remove NA values for plotting
cor_types_complete <- cor_types[!is.na(cor_types$Value), ]
barplot(cor_types_complete$Value,
names.arg = cor_types_complete$Type,
ylim = c(-1, 1),
col = c("blue", "red", "green")[1:nrow(cor_types_complete)],
main = "Comparison of Correlation Measures",
ylab = "Correlation Coefficient")
abline(h = 0, lty = 2)
}
# Network visualization if multiple variables (with fallback)
if (ncol(data) >= 3) {
tryCatch({
pcor_matrix <- pcor(data)$estimate
# Try different visualization methods
if (require(corrplot, quietly = TRUE)) {
# Use corrplot for correlation matrix visualization
corrplot(pcor_matrix,
method = "color",
type = "full",
tl.col = "black",
tl.srt = 45,
title = "Partial Correlation Matrix",
mar = c(0, 0, 2, 0))
} else if (require(qgraph, quietly = TRUE)) {
# Use qgraph for network visualization (with edge.labels = FALSE)
qgraph(pcor_matrix,
layout = "spring",
labels = colnames(data),
title = "Partial Correlation Network",
maximum = 0.8,
edge.labels = FALSE, # Avoid edge label errors
vsize = 8)
} else {
# Fallback to base R heatmap
heatmap(pcor_matrix,
main = "Partial Correlation Heatmap",
col = colorRampPalette(c("blue", "white", "red"))(100))
}
}, error = function(e) {
plot.new()
text(0.5, 0.5, "Visualization failed", cex = 1.2)
title(main = "Partial Correlation Network")
})
}
}
# Return results
results <- list(
simple_correlation = simple_cor,
partial_correlation = if(exists("pcor_result")) pcor_result$estimate else NA,
semi_partial_correlation = if(exists("semi_partial")) semi_partial else NA,
p_value = if(exists("pcor_result")) pcor_result$p.value else NA
)
if (inference && exists("pcor_result")) {
results$confidence_interval <- c(r_lower, r_upper)
}
return(results)
}
# Example usage with mtcars dataset
cat("\n=== EXAMPLE: Partial Correlation with mtcars ===\n")
data(mtcars)
# Analyze relationship between mpg and hp, controlling for wt and cyl
results <- run_partial_correlation_analysis(
data = mtcars,
var1 = "mpg",
var2 = "hp",
controls = c("wt", "cyl"),
method = "standard",
inference = TRUE,
visualization = TRUE
)
Common Issues and Solutions
Multicollinearity Among Controls
# Check for multicollinearity in control variables
check_multicollinearity <- function(Z) {
# Handle vector input
if (is.null(dim(Z))) {
Z <- as.matrix(Z)
} else if (is.data.frame(Z)) {
Z <- as.matrix(Z) # or keep as data.frame but use in data=
}
# Ensure Z is numeric matrix
Z <- as.matrix(Z)
# Variance Inflation Factor (VIF)
if (requireNamespace("car", quietly = TRUE)) {
# Create a temporary data frame
df <- data.frame(dummy_y = rnorm(nrow(Z)), Z)
# Fit model using all columns in Z as predictors
dummy_lm <- lm(dummy_y ~ ., data = df)
vif_values <- car::vif(dummy_lm)
cat("Variance Inflation Factors (VIF):\n")
print(vif_values)
if (any(vif_values > 10, na.rm = TRUE)) {
cat("\nWarning: High multicollinearity detected in control variables.\n")
cat("Consider removing or combining variables with VIF > 10.\n")
}
}
# Condition number
if (ncol(Z) > 1) {
# Use correlation matrix
cor_mat <- cor(Z)
# Handle potential non-positive-definite due to collinearity
kappa_value <- kappa(cor_mat, exact = TRUE)
cat("\nCondition number of correlation matrix:", round(kappa_value, 2), "\n")
if (kappa_value > 30) {
cat("Warning: High condition number indicates multicollinearity.\n")
}
}
}
# Example
check_multicollinearity(mtcars[, c("wt", "cyl", "disp")])
Missing Data Handling
# Multiple approaches for missing data in partial correlation
partial_correlation_missing <- function(X, Y, Z, method = "complete") {
if (method == "complete") {
# Listwise deletion
complete_cases <- complete.cases(cbind(X, Y, Z))
X_comp <- X[complete_cases]
Y_comp <- Y[complete_cases]
Z_comp <- if(!is.null(dim(Z))) Z[complete_cases, ] else Z[complete_cases]
return(pcor.test(X_comp, Y_comp, Z_comp))
} else if (method == "pairwise") {
# Pairwise deletion (not recommended for partial correlation)
cat("Warning: Pairwise deletion can lead to inconsistent results in partial correlation.\n")
} else if (method == "multiple_imputation") {
# Multiple imputation
if (require(mice)) {
data_complete <- cbind(X, Y, Z)
imp <- mice(data_complete, m = 5, printFlag = FALSE)
# Fit model on each imputed dataset
fits <- with(imp, pcor.test(X, Y, Z))
# Pool results
pooled <- pool(fits)
return(summary(pooled))
}
}
}
# Example with missing data
set.seed(303)
n_missing <- 50
X_miss <- rnorm(n_missing)
Y_miss <- 0.5 * X_miss + rnorm(n_missing, sd = 0.5)
Z_miss <- 0.3 * X_miss + 0.4 * Y_miss + rnorm(n_missing, sd = 0.3)
# Introduce missing values
X_miss[sample(1:n_missing, 10)] <- NA
Y_miss[sample(1:n_missing, 10)] <- NA
Z_miss[sample(1:n_missing, 10)] <- NA
cat("=== Partial Correlation with Missing Data ===\n")
cat("Complete cases method:\n")
print(partial_correlation_missing(X_miss, Y_miss, Z_miss, method = "complete"))
Problems and Exercises
1. Conceptual Problems:
- a) Prove that for jointly Gaussian variables, zero partial correlation implies conditional independence
- b) Derive the relationship between partial correlation and regression coefficients
- c) Show that \(|r_{X(Y\cdot Z)}| \leq |r_{XY\cdot Z}| \leq |r_{XY}|\)
2. Applied Problems:
- a) Analyze the relationship between `mpg` and `qsec` in the `mtcars` dataset, controlling for `wt` and `hp`
- b) Compute the partial correlation matrix for the `iris` dataset (sepal and petal measurements)
- c) Conduct a power analysis for detecting partial correlation ρ = 0.3 with 3 control variables, α = 0.05, power = 0.80
3. Simulation Study:
# Simulation to understand partial correlation properties
simulate_partial_correlation <- function(n_sim = 1000, n = 100,
true_rho = 0.5, n_controls = 2) {
estimates <- numeric(n_sim)
p_values <- numeric(n_sim)
coverage <- logical(n_sim) # better as logical
for (i in 1:n_sim) {
# Generate control variables
Z <- matrix(rnorm(n * n_controls), n, n_controls)
# Generate residuals with specified correlation
sigma <- matrix(c(1, true_rho, true_rho, 1), 2, 2)
residuals <- MASS::mvrnorm(n, mu = c(0, 0), Sigma = sigma)
# Generate X and Y
beta_X <- rnorm(n_controls)
beta_Y <- rnorm(n_controls)
X <- Z %*% beta_X + residuals[, 1]
Y <- Z %*% beta_Y + residuals[, 2]
# Estimate partial correlation
result <- pcor.test(X, Y, Z)
estimates[i] <- result$estimate
p_values[i] <- result$p.value
# Check coverage of 95% CI — safely
ci <- result$conf.int
if (length(ci) >= 2) {
coverage[i] <- (true_rho >= ci[1] & true_rho <= ci[2])
} else {
# CI not computable — treat as NA or skip
coverage[i] <- NA
}
}
# Remove NA coverage values for mean (or report proportion)
coverage_rate <- mean(coverage, na.rm = TRUE)
cat("Simulation Results (n =", n_sim, "):\n")
cat("True partial correlation:", true_rho, "\n")
cat("Mean estimate:", mean(estimates), "\n")
cat("Bias:", mean(estimates) - true_rho, "\n")
cat("RMSE:", sqrt(mean((estimates - true_rho)^2)), "\n")
cat("Type I error rate (for true_rho = 0):", mean(p_values < 0.05), "\n")
cat("Coverage of 95% CI (excluding failed CI):", coverage_rate, "\n")
# # For more control, compute CI manually
# r <- result$estimate
# df <- n - n_controls - 2 # for partial correlation
# # Or use Fisher z:
# if (abs(r) < 1) {
# z <- atanh(r)
# se <- 1 / sqrt(n - n_controls - 3)
# z_ci <- z + c(-1, 1) * qnorm(0.975) * se
# ci <- tanh(z_ci)
# } else {
# ci <- c(NA, NA)
# }
return(data.frame(
estimate = estimates,
p_value = p_values,
coverage = coverage
))
}
# Run simulation
sim_results <- simulate_partial_correlation(n_sim = 500, n = 50,
true_rho = 0.3, n_controls = 2)
References
1. Whittaker, J. (1990). *Graphical Models in Applied Multivariate Statistics*. Wiley.
2. Edwards, D. (2000). *Introduction to Graphical Modelling* (2nd ed.). Springer.
3. Pearl, J. (2009). *Causality: Models, Reasoning, and Inference* (2nd ed.). Cambridge University Press.
4. Koller, D., & Friedman, N. (2009). *Probabilistic Graphical Models: Principles and Techniques*. MIT Press.
5. Bühlmann, P., & van de Geer, S. (2011). *Statistics for High-Dimensional Data: Methods, Theory and Applications*. Springer.
Online Resources
- Multivariate Statistics in R
- ppcor Package Documentation
- SOCR Resources and Datasets
- Bayesian Network Learning
- SOCR Home page: https://www.socr.umich.edu
Translate this page: