Difference between revisions of "SOCR Simulated HELP Data Activity"
(→Sorting and subsetting) |
(→Multiple imputation) |
||
| (53 intermediate revisions by the same user not shown) | |||
| Line 11: | Line 11: | ||
options(width=80) # narrows output to stay in the grey box | options(width=80) # narrows output to stay in the grey box | ||
| − | + | help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA")) | |
# note that we specify all of these values as indicating missing data ("",".","NA") | # note that we specify all of these values as indicating missing data ("",".","NA") | ||
| − | attach( | + | attach(help_sim_data) |
| − | rownames( | + | rownames(help_sim_data) # print row and column names |
| − | colnames( | + | colnames(help_sim_data) |
| − | summary( | + | summary(help_sim_data) |
| − | fivenum( | + | fivenum(help_sim_data$\$ $mcs) |
| − | mean( | + | mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE) |
| − | quantile( | + | quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE) |
| − | + | no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs) | |
| − | hist( | + | hist(no_mis_help_sim_data_mcs, main="", freq=FALSE) |
| − | lines(density( | + | lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2) |
| − | xvals <- seq(from=min( | + | xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100) |
| − | lines(xvals, dnorm(xvals, mean( | + | lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2) |
| − | cor_mat <- cor(cbind( | + | cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1)) |
cor_mat | cor_mat | ||
cor_mat[c(2, 3), 2] | cor_mat[c(2, 3), 2] | ||
| − | plot( | + | plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n") |
| − | text( | + | text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"], |
| − | + | help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A") | |
| − | text( | + | text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"], |
| − | + | help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C") | |
| − | text( | + | text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"], |
| − | + | help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H") | |
| − | rug(jitter( | + | rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2) |
| − | rug(jitter( | + | rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3) |
| − | table( | + | table(help_sim_data$\$ $homeless, help_sim_data$\$ $female) |
| − | + | OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)* | |
| − | sum( | + | sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/ |
| − | (sum( | + | (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)* |
| − | sum( | + | sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE)) |
| − | + | OR | |
| − | chisq_val <- chisq.test( | + | chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE) |
chisq_val | chisq_val | ||
| − | fisher.test( | + | fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female) |
| − | ttres <- t.test( | + | ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data) |
print(ttres) | print(ttres) | ||
| − | wilcox.test( | + | wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE) |
| − | ksres <- ks.test( | + | ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data) |
print(ksres) | print(ksres) | ||
| Line 87: | Line 87: | ||
===Sorting and subsetting=== | ===Sorting and subsetting=== | ||
| − | new_cesd = sum( | + | new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE); |
new_cesd | new_cesd | ||
| − | impute_mean_cesd = mean( | + | impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20; |
| − | sort( | + | sort(help_sim_data$\$ $cesd)[1:4] |
| − | sum(is.na( | + | sum(is.na(help_sim_data$\$ $drinkstatus)) |
| − | table( | + | table(help_sim_data$\$ $drinkstat, exclude="NULL") |
| − | gender <- factor( | + | gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female")) |
| − | table( | + | table(help_sim_data$\$ $female) |
===[[SMHS_EDA|Exploratory data analysis]]=== | ===[[SMHS_EDA|Exploratory data analysis]]=== | ||
| − | + | newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,] | |
| − | attach( | + | attach(newhelp_sim_data) |
sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine")) | sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine")) | ||
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4) | plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4) | ||
| Line 123: | Line 123: | ||
names(summary(lm1)) | names(summary(lm1)) | ||
| − | summary(lm1)$sigma | + | summary(lm1)$\$ $sigma |
names(lm1) | names(lm1) | ||
| Line 134: | Line 134: | ||
quantile(resid) | quantile(resid) | ||
| − | + | # explore correlations and clusters in chunks of the data | |
| + | cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs") | ||
| + | h_dist_clust <- hclust(dist(cor_matrix)) | ||
| + | plot(h_dist_clust) | ||
===[[SMHS_SLR|Bivariate relationship]]=== | ===[[SMHS_SLR|Bivariate relationship]]=== | ||
| + | |||
| + | subst <- as.factor(substance) | ||
| + | genfem <- as.factor(ifelse(female, "F", "M")) | ||
| + | interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2) | ||
| + | |||
| + | subs <- character(length(substance)) | ||
| + | subs[substance=="alcohol"] <- "Alco" | ||
| + | subs[substance=="cocaine"] <- "Coca" | ||
| + | subs[substance=="heroin"] <- "Hero" | ||
| + | gend <- character(length(female)) | ||
| + | library("lattice") | ||
| + | bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray") | ||
| + | bwout | ||
| + | boxmeans <- tapply(cesd, list(subs, genfem), mean) | ||
| + | |||
| + | suicidal.thoughts <- as.factor(g1b) | ||
| + | # conditional plots | ||
| + | coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth) | ||
| + | |||
| + | # Generate random data using random effects GLM | ||
| + | library(lme4) | ||
| + | n <- 2000; p <- 3; sigbsq <- 10 | ||
| + | beta <- c(-2, 1.5, 0.5, -1, 1) | ||
| + | ID <- rep(1:n, each=p) # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n | ||
| + | Y1 <- as.numeric(id < (n+1)/2) # 1 1 ... 1 0 0 ... 0 | ||
| + | randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p) | ||
| + | Y2 <- rep(1:p, n) # 1 2 ... p 1 2 ... p ... | ||
| + | Y3 <- runif(p*n) | ||
| + | Y4 <- rexp(p*n, 1) | ||
| + | |||
| + | lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint | ||
| + | exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model)) | ||
| + | Y <- runif(p*n) < exp_model | ||
| + | glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit")) | ||
| + | summary(glmmres) | ||
| + | |||
| + | # inspect visually the relation between low homelessness and low pcs scores | ||
| + | homelow <- homeless[i1<3.5] | ||
| + | pcslow <- pcs[i1<3.5]<=31.94 | ||
| + | table(homelow, pcslow) | ||
===[[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]=== | ===[[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]=== | ||
| + | |||
| + | # you need to first load and attach the dataset: | ||
| + | #### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA")) | ||
| + | #### attach(help_sim_data) | ||
| + | table(homeless, female) | ||
| + | |||
| + | OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)* | ||
| + | sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/ | ||
| + | (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)* | ||
| + | sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE)) | ||
| + | OR | ||
| + | |||
| + | install.packages("epitools") | ||
| + | library(epitools) | ||
| + | OR_HomeFem <- oddsratio.wald(homeless, female) | ||
| + | OR_HomeFem | ||
| + | OR_HomeFem$\$ $measure | ||
| + | OR_HomeFem$\$ $p.value | ||
| + | |||
| + | chi_sq_test <- chisq.test(homeless, female, correct=FALSE) | ||
| + | chi_sq_test | ||
| + | |||
| + | fisher_test <- fisher.test(homeless, female) | ||
| + | fisher_test | ||
===[[SMHS_HypothesisTesting|Two-sample tests]]=== | ===[[SMHS_HypothesisTesting|Two-sample tests]]=== | ||
| + | |||
| + | Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data) | ||
| + | Welch_2sampleT_Age_Fem | ||
| + | |||
| + | fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values | ||
| + | res <- rep(0, length(x)) | ||
| + | for (i in 1:length(x)){ | ||
| + | if(is.na(x[i])) res[i] <- x[i] | ||
| + | else if (x[i]<2) res[i] <- x[i] | ||
| + | else res[i] <- 1} | ||
| + | return(res)} | ||
| + | |||
| + | Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data) | ||
| + | Welch_2sampleT_Home_treat | ||
| + | |||
| + | wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE) | ||
| + | wilcox_test_Age_Fem | ||
| + | |||
| + | ===[[SMHS_PowerSensitivitySpecificity|Power]]=== | ||
| + | # compute the sample size for a two-sample t-test | ||
| + | power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power | ||
| + | |||
| + | # for a two-sample t-test, compute power from sample-size and effect-size | ||
| + | power.t.test(delta=0.5, n=100) | ||
===[[SMHS_SurvivalAnalysis|Survival analysis (Kaplan–Meier plot)]]=== | ===[[SMHS_SurvivalAnalysis|Survival analysis (Kaplan–Meier plot)]]=== | ||
| − | ===[[SOCR_EduMaterials_Activities_ScatterChart|Scatterplot with smooth fit]]=== | + | small_data <- reshape(newhelp_sim_data, idvar="id", |
| + | varying=list(c("cesd1","cesd2","cesd3","cesd4"), | ||
| + | c("mcs1","mcs2","mcs3","mcs4"), | ||
| + | c("i11","i12","i13","i14"), | ||
| + | c("g1b1","g1b2","g1b3","g1b4")), | ||
| + | v.names=c("cesdtv","mcstv","i1tv","g1btv"), | ||
| + | timevar="time", times=1:4, direction="long") | ||
| + | library(lme4) | ||
| + | glmres <- glmer(g1btv ~ treat + time + (1|id), | ||
| + | family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data) | ||
| + | summary(glmres) | ||
| + | |||
| + | library(survival) | ||
| + | # fit a Cox proportional hazards regression model. Time dependent variable (dayslink), | ||
| + | # time dependent strata (linkstatus) | ||
| + | cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data) | ||
| + | print(cox_surv) | ||
| + | |||
| + | # creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted | ||
| + | # Cox model, or a previously fitted accelerated failure time model | ||
| + | surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat) | ||
| + | print(surv_KM) | ||
| + | plot(surv_KM, lty=1:2, lwd=2, col=c(4,2)) | ||
| + | title("Product-Limit Survival Estimates") | ||
| + | legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4) | ||
| + | |||
| + | ===[[SOCR_EduMaterials_Activities_ScatterChart|Scatterplot with model fit]]=== | ||
| + | |||
| + | plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4) | ||
| + | |||
| + | points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca") | ||
| + | # lowess is a locally-weighted polynomial regression smooth fit | ||
| + | lines(lowess(age[substance=="cocaine"], | ||
| + | i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2) | ||
| + | |||
| + | points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco") | ||
| + | lines(lowess(age[substance=="alcohol"], | ||
| + | i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2) | ||
| + | |||
| + | points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero") | ||
| + | lines(lowess(age[substance=="heroin"], | ||
| + | i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2) | ||
| + | |||
| + | legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3, | ||
| + | cex=1.4, lwd=2, pch=c("c", "a", "h")) | ||
===[[SMHS_SLR|Regression with prediction intervals]]=== | ===[[SMHS_SLR|Regression with prediction intervals]]=== | ||
| + | |||
| + | predicted <- fitted(lm1); predicted | ||
| + | residuals <- residuals(lm1) | ||
| + | quantile(residuals) | ||
===Linear regression with interaction=== | ===Linear regression with interaction=== | ||
| + | |||
| + | lm1 <- lm(a15a ~ substance * age) | ||
| + | lm2 <- lm(a15a ~ substance + age) | ||
| + | anova(lm2, lm1) | ||
| + | summary(lm1) | ||
| + | |||
| + | names(summary(lm1)) | ||
| + | summary(lm1)$\$ $sigma | ||
| + | |||
| + | names(lm1) | ||
| + | lm1$\$ $coefficients | ||
| + | vcov(lm1) | ||
===Regression diagnostics=== | ===Regression diagnostics=== | ||
| + | |||
| + | # graphical model diagnostics | ||
| + | par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5) | ||
| + | plot(lm1) | ||
| + | par(par) | ||
| + | |||
| + | std.res <- rstandard(lm1) | ||
| + | hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F) | ||
| + | lines(density(std.res), lwd=2) | ||
| + | xvals <- seq(from=min(std.res), to=max(std.res), length=100) | ||
| + | lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3) | ||
===Fitting stratified regression models=== | ===Fitting stratified regression models=== | ||
| + | Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group. | ||
| + | |||
| + | unique_vals <- unique(substance[!is.na(substance)]); unique_vals | ||
| + | numb_unique <- length(unique_vals) | ||
| + | formula <- as.formula(i1 ~ age) | ||
| + | p <- length(coef(lm(formula))) | ||
| + | res <- matrix(rep(0, numb_unique*p), p, numb_unique) | ||
| + | for (i in 1:length(unique_vals)) { | ||
| + | res[,i] <- coef(lm(formula, subset=substance==unique_vals[i])) | ||
| + | } | ||
| + | rownames(res) <- c("intercept","slope") | ||
| + | colnames(res) <- unique_vals | ||
| + | res | ||
===[[SMHS_ANOVA|Two-way analysis of variance (ANOVA)]]=== | ===[[SMHS_ANOVA|Two-way analysis of variance (ANOVA)]]=== | ||
| + | |||
| + | aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data) | ||
| + | aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data) | ||
| + | resid <- residuals(aov2) | ||
| + | anova(aov2, aov1) | ||
| + | |||
| + | logLik(aov1) # compute the exact Log-Likelihood of each ANOVA model | ||
| + | logLik(aov2) | ||
| + | lldiff <- logLik(aov1)[1] - logLik(aov2)[1] | ||
| + | lldiff | ||
| + | 1 - pchisq(2*lldiff, 2) # test for model differences | ||
| + | summary(aov1) | ||
| + | aov1 | ||
| + | |||
| + | contrasts(sub) <- contr.SAS(4) | ||
| + | contrasts(sub) | ||
| + | aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data) | ||
| + | summary(aov3) | ||
| + | |||
| + | AIC(aov1); BIC(aov1) # [http://en.wikipedia.org/wiki/Akaike_information_criterion Akaike] and [http://en.wikipedia.org/wiki/Bayesian_information_criterion Bayesian] Information criteria (smaller values yield better models | ||
| + | AIC(aov2); BIC(aov2) | ||
| + | AIC(aov3); BIC(aov3) | ||
===[[SMHS_CorrectionMultipleTesting|Multiple comparisons]]=== | ===[[SMHS_CorrectionMultipleTesting|Multiple comparisons]]=== | ||
| + | |||
| + | # generate confidence intervals on the differences between the means of the levels | ||
| + | # of a factor with the specified family-wise probability of coverage. | ||
| + | # The intervals are based on the Studentized range statistic, Tukey's | ||
| + | # ‘Honest Significant Difference’ method | ||
| + | multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub") | ||
| + | multiple | ||
| + | plot(multiple) | ||
| − | + | ===Logistic and Poisson Regression=== | |
| − | |||
| − | ===Logistic | ||
| − | + | # logistic (binomial outcomes) regression | |
| + | logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial) | ||
| + | logistres | ||
| + | summary(logistres) | ||
| + | names(summary(logistres)) | ||
| + | coeff.like.SAS <- summary(logistres)$\$ $coefficients | ||
| + | coeff.like.SAS | ||
| − | + | poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson) | |
| + | summary(poissonres) | ||
===Negative binomial regression=== | ===Negative binomial regression=== | ||
| − | + | library(MASS) | |
| − | + | NB_Res <- glm.nb(a15a ~ female + substance + age) | |
| − | + | summary(NB_Res) | |
===Ordinal logit regression=== | ===Ordinal logit regression=== | ||
| − | == | + | library(MASS) |
| + | sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5)) | ||
| + | ord_logit <- polr(sex_risk_cat ~ cesd + pcs) | ||
| + | summary(ord_logit) | ||
===Generalized additive model=== | ===Generalized additive model=== | ||
| − | === | + | install.packages("gam"); library(gam) |
| + | gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs" | ||
| + | summary(gam_reg) | ||
| + | |||
| + | coefficients(gam_reg) | ||
| + | plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3) | ||
| + | abline(h=0) | ||
| + | |||
| + | plot(gam_reg, terms=c("substance"), se=2, lwd=3) | ||
===General linear model for correlated data=== | ===General linear model for correlated data=== | ||
| + | |||
| + | library(nlme) | ||
| + | fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary, | ||
| + | correlation = corAR1(form = ~ 1 | Mare)) | ||
| + | # variance increases as a power of the absolute fitted values | ||
| + | fm2 <- update(fm1, weights = varPower()) | ||
| + | summary(fm1) | ||
===Random effects model=== | ===Random effects model=== | ||
===[[SMHS_GEE|Generalized estimating equations (GEE) model]]=== | ===[[SMHS_GEE|Generalized estimating equations (GEE) model]]=== | ||
| + | |||
| + | install.packages("gee"); library(gee) | ||
| + | sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),] | ||
| + | # attach(sorted_data) | ||
| + | gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data, | ||
| + | family=binomial, na.action=na.omit, corstr="exchangeable") | ||
| + | coef(gee_res) | ||
| + | sqrt(diag(gee_res$\$ $robust.variance)) | ||
| + | gee_res$\$ $working.correlation | ||
===Generalized linear mixed model=== | ===Generalized linear mixed model=== | ||
| + | |||
| + | library(lme4) | ||
| + | glmm_res <- glmer(g1b ~ treat + cesd + (1|ID), | ||
| + | family=binomial(link="logit"), data=help_sim_data) | ||
| + | summary(glmm_res) | ||
===Proportional hazards regression model=== | ===Proportional hazards regression model=== | ||
| − | == | + | library(survival) |
| + | cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female + | ||
| + | cesd, method="breslow", data=help_sim_data) | ||
| + | print(cox_surv) | ||
| + | # Fit a Cox proportional hazards regression model | ||
| + | coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow") | ||
===[[SMHS_Cronbachs|Cronbach’s $\alpha$]]=== | ===[[SMHS_Cronbachs|Cronbach’s $\alpha$]]=== | ||
| + | |||
| + | library(multilevel) | ||
| + | cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, | ||
| + | f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)) | ||
| + | # assess the consistency of the core 20 measures in the simulated HELP dataset | ||
| + | res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k + | ||
| + | f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3, | ||
| + | rotation="varimax", na.action=na.omit, scores="regression") | ||
| + | print(res, cutoff=0.45, sort=TRUE) | ||
===[[SMHS_PCA_ICA_FA|Factor analysis]]=== | ===[[SMHS_PCA_ICA_FA|Factor analysis]]=== | ||
| + | |||
| + | # Factor analysis explains the (observed) variability of a multivariate dataset in terms of | ||
| + | # various underlying unobservable factors. The observed data elements are expressed as | ||
| + | # linear functions of the factors and some random error. | ||
| + | # This is an example of a maximum likelihood factor analysis of "cesd" | ||
| + | # (Center for Epidemiologic Studies–Depression) scale. | ||
| + | |||
| + | fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + | ||
| + | f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r + | ||
| + | f1s + f1t, factors=3, rotation="varimax", na.action=na.omit, | ||
| + | scores="regression") | ||
| + | print(fact_ana , cutoff=0.45, sort=TRUE) | ||
===Recursive partitioning=== | ===Recursive partitioning=== | ||
| + | |||
| + | # Recursive partitioning creates a decision tree to classify the observations based on | ||
| + | # categorical predictors. Here we classify subjects based on their race, | ||
| + | # using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS. | ||
| + | |||
| + | library(rpart) | ||
| + | subst <- as.factor(substance) | ||
| + | homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data) | ||
| + | |||
| + | printcp(homelessness.rpart) | ||
| + | plot(homelessness.rpart) | ||
| + | text(homelessness.rpart) | ||
| + | |||
| + | race <- racegrp[i1<3.5] | ||
| + | pcslow <- pcs[i1<3.5]<=31.94 | ||
| + | length(pcslow==F) # count the number of subjects with (pcslow==F) | ||
| + | table(race, pcslow) | ||
| + | # in these data, 3 of 18 subjects with low PCS scores are white, | ||
| + | # and 48 of 154) of those with PCS scores above the threshold are white | ||
===Linear discriminant analysis=== | ===Linear discriminant analysis=== | ||
| + | |||
| + | # Fisher's linear discriminant analysis (LDA) identifies linear combinations of | ||
| + | # variables splitting the data into separate classes. LDA can help us | ||
| + | # identify clusters homelessness, using a uniform prior | ||
| + | |||
| + | library(MASS) | ||
| + | ngroups <- length(unique(homeless)) | ||
| + | lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1))) | ||
| + | print(lda_model) | ||
| + | plot(lda_model) | ||
| + | plot(lda_model, dimen=1, type="both") | ||
===[[SMHS_HLM|Hierarchical clustering]]=== | ===[[SMHS_HLM|Hierarchical clustering]]=== | ||
| + | |||
| + | # To group similar variables or cluster observations a hierarchical structure | ||
| + | # may be constructed. hclust() and kmeans() are examples of such clustering | ||
| + | # packages | ||
| + | |||
| + | cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs") | ||
| + | hier_clust <- hclust(dist(cor_mat)) | ||
| + | plot(hier_clust) | ||
| + | |||
| + | km_clust <- kmeans(cor_mat, 5, nstart = 25) | ||
| + | plot(cor_mat, col = km_clust$cluster) | ||
| + | points(km_clust$centers, col = 1:5, pch = 8) | ||
===[[SMHS_ROC|ROC curve]]=== | ===[[SMHS_ROC|ROC curve]]=== | ||
| + | |||
| + | library(ROCR) | ||
| + | pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide) | ||
| + | AUC<- slot(performance(pred, "auc"), "y.values")[[1]] | ||
| + | |||
| + | plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5), | ||
| + | text.adj=c(1, -.5), lwd=2) | ||
| + | lines(c(0, 1), c(0, 1)) | ||
| + | text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4) | ||
| + | title("ROC Curve for Depression scale vs. Suicide Prediction Model") | ||
===[[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]=== | ===[[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]=== | ||
| + | |||
| + | Also see the [[SMHS_MissingData| Missing Values section of the Scientific Methods for Health Sciences EBook]]. | ||
| + | |||
| + | miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs)) | ||
| + | summary(miss_data) | ||
| + | # library(Hmisc) | ||
| + | # na.pattern(miss_data) | ||
| + | |||
| + | install.packages("mi") | ||
| + | library(mi) | ||
| + | inf <- mi.info(miss_data) | ||
| + | # run the imputation without data transformation | ||
| + | IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10)) | ||
| + | |||
| + | # run the imputation with data transformation | ||
| + | miss_data.transformed <- mi.preprocess(miss_data, inf) | ||
| + | IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6)) | ||
| + | |||
| + | # Impute without noise | ||
| + | IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE) | ||
| + | |||
| + | # check convergence, should get FALSE here because only n.iter is small | ||
| + | converged(IMP, check = "data") | ||
| + | # converged(IMP, check = "coefs") | ||
| + | # visually check the imputation | ||
| + | plot(IMP); plot(IMP_T) | ||
===Propensity score modeling=== | ===Propensity score modeling=== | ||
| + | |||
| + | # Propensity score (PS) techniques may be applied for analysis of | ||
| + | # observational and registry data. The PS is the conditional probability | ||
| + | # of a certain treatment given patient’s covariates. PS may be used to | ||
| + | # eliminate imbalances in baseline covariate distributions between | ||
| + | # treatment groups and to estimate marginal effects. | ||
==References== | ==References== | ||
Latest revision as of 13:04, 24 September 2014
Contents
- 1 SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
- 2 SOCR Simulated HELP Data
- 3 R examples
- 3.1 Data I/O, summaries, visualization
- 3.2 Missing Values
- 3.3 Sorting and subsetting
- 3.4 Exploratory data analysis
- 3.5 Bivariate relationship
- 3.6 Contingency tables
- 3.7 Two-sample tests
- 3.8 Power
- 3.9 Survival analysis (Kaplan–Meier plot)
- 3.10 Scatterplot with model fit
- 3.11 Regression with prediction intervals
- 3.12 Linear regression with interaction
- 3.13 Regression diagnostics
- 3.14 Fitting stratified regression models
- 3.15 Two-way analysis of variance (ANOVA)
- 3.16 Multiple comparisons
- 3.17 Logistic and Poisson Regression
- 3.18 Negative binomial regression
- 3.19 Ordinal logit regression
- 3.20 Generalized additive model
- 3.21 General linear model for correlated data
- 3.22 Random effects model
- 3.23 Generalized estimating equations (GEE) model
- 3.24 Generalized linear mixed model
- 3.25 Proportional hazards regression model
- 3.26 Cronbach’s $\alpha$
- 3.27 Factor analysis
- 3.28 Recursive partitioning
- 3.29 Linear discriminant analysis
- 3.30 Hierarchical clustering
- 3.31 ROC curve
- 3.32 Multiple imputation
- 3.33 Propensity score modeling
- 4 References
SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
SOCR Simulated HELP Data
See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".
R examples
These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.
Data I/O, summaries, visualization
options(digits=2) # decimal precision
options(width=80) # narrows output to stay in the grey box
help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
# note that we specify all of these values as indicating missing data ("",".","NA")
attach(help_sim_data)
rownames(help_sim_data) # print row and column names
colnames(help_sim_data)
summary(help_sim_data)
fivenum(help_sim_data$\$ $mcs)
mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE)
quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)
no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs)
hist(no_mis_help_sim_data_mcs, main="", freq=FALSE)
lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2)
xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100)
lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2)
cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1))
cor_mat
cor_mat[c(2, 3), 2]
plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H")
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2)
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3)
table(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
(sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR
chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE)
chisq_val
fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data)
print(ttres)
wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE)
ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data)
print(ksres)
Missing Values
sum(is.na(pcs1)) # count the missing values in the variable pcs1, 208 sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246 sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values) which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete # pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) # pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1
Sorting and subsetting
new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE);
new_cesd
impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20;
sort(help_sim_data$\$ $cesd)[1:4]
sum(is.na(help_sim_data$\$ $drinkstatus))
table(help_sim_data$\$ $drinkstat, exclude="NULL")
gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female"))
table(help_sim_data$\$ $female)
Exploratory data analysis
newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,]
attach(newhelp_sim_data)
sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
lines(lowess(age[substance=="alcohol"],
i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
lines(lowess(age[substance=="cocaine"],
i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
lines(lowess(age[substance=="heroin"],
i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3,
cex=1.4, lwd=2, pch=c("A", "C", "H"))
options(show.signif.stars=FALSE)
lm1 <- lm(i1 ~ sub * age)
lm2 <- lm(i1 ~ sub + age)
anova(lm2, lm1)
summary(lm1)
names(summary(lm1))
summary(lm1)$\$ $sigma
names(lm1)
lm1$\$ $coefficients
coef(lm1)
vcov(lm1)
pred <- fitted(lm1)
resid <- residuals(lm1)
quantile(resid)
# explore correlations and clusters in chunks of the data cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs") h_dist_clust <- hclust(dist(cor_matrix)) plot(h_dist_clust)
Bivariate relationship
subst <- as.factor(substance)
genfem <- as.factor(ifelse(female, "F", "M"))
interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)
subs <- character(length(substance))
subs[substance=="alcohol"] <- "Alco"
subs[substance=="cocaine"] <- "Coca"
subs[substance=="heroin"] <- "Hero"
gend <- character(length(female))
library("lattice")
bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
bwout
boxmeans <- tapply(cesd, list(subs, genfem), mean)
suicidal.thoughts <- as.factor(g1b) # conditional plots coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)
# Generate random data using random effects GLM library(lme4) n <- 2000; p <- 3; sigbsq <- 10 beta <- c(-2, 1.5, 0.5, -1, 1) ID <- rep(1:n, each=p) # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n Y1 <- as.numeric(id < (n+1)/2) # 1 1 ... 1 0 0 ... 0 randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p) Y2 <- rep(1:p, n) # 1 2 ... p 1 2 ... p ... Y3 <- runif(p*n) Y4 <- rexp(p*n, 1) lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model)) Y <- runif(p*n) < exp_model glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit")) summary(glmmres)
# inspect visually the relation between low homelessness and low pcs scores homelow <- homeless[i1<3.5] pcslow <- pcs[i1<3.5]<=31.94 table(homelow, pcslow)
Contingency tables
# you need to first load and attach the dataset:
#### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
#### attach(help_sim_data)
table(homeless, female)
OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
(sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR
install.packages("epitools")
library(epitools)
OR_HomeFem <- oddsratio.wald(homeless, female)
OR_HomeFem
OR_HomeFem$\$ $measure
OR_HomeFem$\$ $p.value
chi_sq_test <- chisq.test(homeless, female, correct=FALSE)
chi_sq_test
fisher_test <- fisher.test(homeless, female) fisher_test
Two-sample tests
Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data)
Welch_2sampleT_Age_Fem
fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values
res <- rep(0, length(x))
for (i in 1:length(x)){
if(is.na(x[i])) res[i] <- x[i]
else if (x[i]<2) res[i] <- x[i]
else res[i] <- 1}
return(res)}
Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data)
Welch_2sampleT_Home_treat
wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE)
wilcox_test_Age_Fem
Power
# compute the sample size for a two-sample t-test power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power
# for a two-sample t-test, compute power from sample-size and effect-size power.t.test(delta=0.5, n=100)
Survival analysis (Kaplan–Meier plot)
small_data <- reshape(newhelp_sim_data, idvar="id",
varying=list(c("cesd1","cesd2","cesd3","cesd4"),
c("mcs1","mcs2","mcs3","mcs4"),
c("i11","i12","i13","i14"),
c("g1b1","g1b2","g1b3","g1b4")),
v.names=c("cesdtv","mcstv","i1tv","g1btv"),
timevar="time", times=1:4, direction="long")
library(lme4)
glmres <- glmer(g1btv ~ treat + time + (1|id),
family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
summary(glmres)
library(survival)
# fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
# time dependent strata (linkstatus)
cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data)
print(cox_surv)
# creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
# Cox model, or a previously fitted accelerated failure time model
surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
print(surv_KM)
plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
title("Product-Limit Survival Estimates")
legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)
Scatterplot with model fit
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca")
# lowess is a locally-weighted polynomial regression smooth fit
lines(lowess(age[substance=="cocaine"],
i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco")
lines(lowess(age[substance=="alcohol"],
i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero")
lines(lowess(age[substance=="heroin"],
i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3,
cex=1.4, lwd=2, pch=c("c", "a", "h"))
Regression with prediction intervals
predicted <- fitted(lm1); predicted residuals <- residuals(lm1) quantile(residuals)
Linear regression with interaction
lm1 <- lm(a15a ~ substance * age) lm2 <- lm(a15a ~ substance + age) anova(lm2, lm1) summary(lm1) names(summary(lm1)) summary(lm1)$\$ $sigma names(lm1) lm1$\$ $coefficients vcov(lm1)
Regression diagnostics
# graphical model diagnostics par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5) plot(lm1) par(par)
std.res <- rstandard(lm1) hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F) lines(density(std.res), lwd=2) xvals <- seq(from=min(std.res), to=max(std.res), length=100) lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)
Fitting stratified regression models
Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.
unique_vals <- unique(substance[!is.na(substance)]); unique_vals
numb_unique <- length(unique_vals)
formula <- as.formula(i1 ~ age)
p <- length(coef(lm(formula)))
res <- matrix(rep(0, numb_unique*p), p, numb_unique)
for (i in 1:length(unique_vals)) {
res[,i] <- coef(lm(formula, subset=substance==unique_vals[i]))
}
rownames(res) <- c("intercept","slope")
colnames(res) <- unique_vals
res
Two-way analysis of variance (ANOVA)
aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data) aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data) resid <- residuals(aov2) anova(aov2, aov1)
logLik(aov1) # compute the exact Log-Likelihood of each ANOVA model logLik(aov2) lldiff <- logLik(aov1)[1] - logLik(aov2)[1] lldiff 1 - pchisq(2*lldiff, 2) # test for model differences summary(aov1) aov1 contrasts(sub) <- contr.SAS(4) contrasts(sub) aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data) summary(aov3) AIC(aov1); BIC(aov1) # Akaike and Bayesian Information criteria (smaller values yield better models AIC(aov2); BIC(aov2) AIC(aov3); BIC(aov3)
Multiple comparisons
# generate confidence intervals on the differences between the means of the levels # of a factor with the specified family-wise probability of coverage. # The intervals are based on the Studentized range statistic, Tukey's # ‘Honest Significant Difference’ method multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub") multiple plot(multiple)
Logistic and Poisson Regression
# logistic (binomial outcomes) regression
logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
logistres
summary(logistres)
names(summary(logistres))
coeff.like.SAS <- summary(logistres)$\$ $coefficients
coeff.like.SAS
poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson) summary(poissonres)
Negative binomial regression
library(MASS) NB_Res <- glm.nb(a15a ~ female + substance + age) summary(NB_Res)
Ordinal logit regression
library(MASS) sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5)) ord_logit <- polr(sex_risk_cat ~ cesd + pcs) summary(ord_logit)
Generalized additive model
install.packages("gam"); library(gam)
gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs"
summary(gam_reg)
coefficients(gam_reg)
plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3)
abline(h=0)
plot(gam_reg, terms=c("substance"), se=2, lwd=3)
library(nlme)
fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary,
correlation = corAR1(form = ~ 1 | Mare))
# variance increases as a power of the absolute fitted values
fm2 <- update(fm1, weights = varPower())
summary(fm1)
Random effects model
Generalized estimating equations (GEE) model
install.packages("gee"); library(gee)
sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),]
# attach(sorted_data)
gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data,
family=binomial, na.action=na.omit, corstr="exchangeable")
coef(gee_res)
sqrt(diag(gee_res$\$ $robust.variance))
gee_res$\$ $working.correlation
Generalized linear mixed model
library(lme4)
glmm_res <- glmer(g1b ~ treat + cesd + (1|ID),
family=binomial(link="logit"), data=help_sim_data)
summary(glmm_res)
Proportional hazards regression model
library(survival)
cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female +
cesd, method="breslow", data=help_sim_data)
print(cox_surv)
# Fit a Cox proportional hazards regression model
coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow")
Cronbach’s $\alpha$
library(multilevel)
cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k,
f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t))
# assess the consistency of the core 20 measures in the simulated HELP dataset
res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k +
f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3,
rotation="varimax", na.action=na.omit, scores="regression")
print(res, cutoff=0.45, sort=TRUE)
Factor analysis
# Factor analysis explains the (observed) variability of a multivariate dataset in terms of
# various underlying unobservable factors. The observed data elements are expressed as
# linear functions of the factors and some random error.
# This is an example of a maximum likelihood factor analysis of "cesd"
# (Center for Epidemiologic Studies–Depression) scale.
fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h +
f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r +
f1s + f1t, factors=3, rotation="varimax", na.action=na.omit,
scores="regression")
print(fact_ana , cutoff=0.45, sort=TRUE)
Recursive partitioning
# Recursive partitioning creates a decision tree to classify the observations based on # categorical predictors. Here we classify subjects based on their race, # using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS. library(rpart) subst <- as.factor(substance) homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data) printcp(homelessness.rpart) plot(homelessness.rpart) text(homelessness.rpart) race <- racegrp[i1<3.5] pcslow <- pcs[i1<3.5]<=31.94 length(pcslow==F) # count the number of subjects with (pcslow==F) table(race, pcslow) # in these data, 3 of 18 subjects with low PCS scores are white, # and 48 of 154) of those with PCS scores above the threshold are white
Linear discriminant analysis
# Fisher's linear discriminant analysis (LDA) identifies linear combinations of # variables splitting the data into separate classes. LDA can help us # identify clusters homelessness, using a uniform prior
library(MASS) ngroups <- length(unique(homeless)) lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1))) print(lda_model) plot(lda_model) plot(lda_model, dimen=1, type="both")
Hierarchical clustering
# To group similar variables or cluster observations a hierarchical structure # may be constructed. hclust() and kmeans() are examples of such clustering # packages cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs") hier_clust <- hclust(dist(cor_mat)) plot(hier_clust)
km_clust <- kmeans(cor_mat, 5, nstart = 25) plot(cor_mat, col = km_clust$cluster) points(km_clust$centers, col = 1:5, pch = 8)
ROC curve
library(ROCR) pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide) AUC<- slot(performance(pred, "auc"), "y.values")1
plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5),
text.adj=c(1, -.5), lwd=2)
lines(c(0, 1), c(0, 1))
text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4)
title("ROC Curve for Depression scale vs. Suicide Prediction Model")
Multiple imputation
Also see the Missing Values section of the Scientific Methods for Health Sciences EBook.
miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs))
summary(miss_data)
# library(Hmisc)
# na.pattern(miss_data)
install.packages("mi")
library(mi)
inf <- mi.info(miss_data)
# run the imputation without data transformation
IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10))
# run the imputation with data transformation
miss_data.transformed <- mi.preprocess(miss_data, inf)
IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6))
# Impute without noise
IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE)
# check convergence, should get FALSE here because only n.iter is small converged(IMP, check = "data") # converged(IMP, check = "coefs") # visually check the imputation plot(IMP); plot(IMP_T)
Propensity score modeling
# Propensity score (PS) techniques may be applied for analysis of # observational and registry data. The PS is the conditional probability # of a certain treatment given patient’s covariates. PS may be used to # eliminate imbalances in baseline covariate distributions between # treatment groups and to estimate marginal effects.
References
- Evaluation and Linkage to Primary (HELP) Care study
- Data formats: help.csv and help.Rdata
- Study and Data specifications
- HELP study documentation
- SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009
- SOCR Home page: http://www.SOCR.ucla.edu
Translate this page: