Difference between revisions of "SOCR Simulated HELP Data Activity"
(→Multiple imputation) |
|||
(70 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
− | == [[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]] | + | == [[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]]: [[SOCR_EduMaterials|SOCR Activity]]: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset == |
==[[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]]== | ==[[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]]== | ||
Line 8: | Line 8: | ||
===Data I/O, summaries, visualization=== | ===Data I/O, summaries, visualization=== | ||
+ | options(digits=2) # decimal precision | ||
+ | options(width=80) # narrows output to stay in the grey box | ||
+ | |||
+ | help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA")) | ||
+ | # note that we specify all of these values as indicating missing data ("",".","NA") | ||
+ | attach(help_sim_data) | ||
+ | |||
+ | rownames(help_sim_data) # print row and column names | ||
+ | colnames(help_sim_data) | ||
+ | |||
+ | summary(help_sim_data) | ||
+ | fivenum(help_sim_data$\$ $mcs) | ||
+ | |||
+ | mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE) | ||
+ | |||
+ | quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE) | ||
+ | |||
+ | no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs) | ||
+ | |||
+ | hist(no_mis_help_sim_data_mcs, main="", freq=FALSE) | ||
+ | lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2) | ||
+ | xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100) | ||
+ | lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2) | ||
+ | |||
+ | cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1)) | ||
+ | cor_mat | ||
+ | cor_mat[c(2, 3), 2] | ||
+ | |||
+ | plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n") | ||
+ | |||
+ | text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"], | ||
+ | help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A") | ||
+ | |||
+ | text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"], | ||
+ | help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C") | ||
+ | |||
+ | text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"], | ||
+ | help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H") | ||
+ | |||
+ | rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2) | ||
+ | rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3) | ||
+ | |||
+ | |||
+ | table(help_sim_data$\$ $homeless, help_sim_data$\$ $female) | ||
+ | |||
+ | |||
+ | OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)* | ||
+ | sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/ | ||
+ | (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)* | ||
+ | sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE)) | ||
+ | OR | ||
+ | |||
+ | |||
+ | chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE) | ||
+ | chisq_val | ||
+ | |||
+ | |||
+ | fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female) | ||
+ | |||
+ | |||
+ | ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data) | ||
+ | print(ttres) | ||
+ | |||
+ | |||
+ | wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE) | ||
+ | |||
+ | ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data) | ||
+ | print(ksres) | ||
− | === | + | ===Missing Values=== |
+ | sum(is.na(pcs1)) # count the missing values in the variable pcs1, 208 | ||
+ | sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246 | ||
+ | |||
+ | sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values) | ||
+ | which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete | ||
+ | # pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) | ||
+ | # pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1 | ||
===Sorting and subsetting=== | ===Sorting and subsetting=== | ||
+ | new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE); | ||
+ | new_cesd | ||
+ | |||
+ | impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20; | ||
+ | sort(help_sim_data$\$ $cesd)[1:4] | ||
+ | sum(is.na(help_sim_data$\$ $drinkstatus)) | ||
+ | table(help_sim_data$\$ $drinkstat, exclude="NULL") | ||
+ | |||
+ | gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female")) | ||
+ | table(help_sim_data$\$ $female) | ||
===[[SMHS_EDA|Exploratory data analysis]]=== | ===[[SMHS_EDA|Exploratory data analysis]]=== | ||
+ | newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,] | ||
+ | attach(newhelp_sim_data) | ||
+ | sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine")) | ||
+ | plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4) | ||
+ | points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A") | ||
+ | lines(lowess(age[substance=="alcohol"], | ||
+ | i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2) | ||
+ | points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C") | ||
+ | lines(lowess(age[substance=="cocaine"], | ||
+ | i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2) | ||
+ | points(age[substance=="heroin"], i1[substance=="heroin"], pch="H") | ||
+ | lines(lowess(age[substance=="heroin"], | ||
+ | i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2) | ||
+ | legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3, | ||
+ | cex=1.4, lwd=2, pch=c("A", "C", "H")) | ||
+ | |||
+ | options(show.signif.stars=FALSE) | ||
+ | lm1 <- lm(i1 ~ sub * age) | ||
+ | lm2 <- lm(i1 ~ sub + age) | ||
+ | anova(lm2, lm1) | ||
+ | |||
+ | summary(lm1) | ||
+ | |||
+ | names(summary(lm1)) | ||
+ | summary(lm1)$\$ $sigma | ||
+ | |||
+ | names(lm1) | ||
+ | |||
+ | lm1$\$ $coefficients | ||
+ | coef(lm1) | ||
+ | vcov(lm1) | ||
+ | pred <- fitted(lm1) | ||
+ | resid <- residuals(lm1) | ||
+ | quantile(resid) | ||
− | + | # explore correlations and clusters in chunks of the data | |
+ | cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs") | ||
+ | h_dist_clust <- hclust(dist(cor_matrix)) | ||
+ | plot(h_dist_clust) | ||
===[[SMHS_SLR|Bivariate relationship]]=== | ===[[SMHS_SLR|Bivariate relationship]]=== | ||
+ | |||
+ | subst <- as.factor(substance) | ||
+ | genfem <- as.factor(ifelse(female, "F", "M")) | ||
+ | interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2) | ||
+ | |||
+ | subs <- character(length(substance)) | ||
+ | subs[substance=="alcohol"] <- "Alco" | ||
+ | subs[substance=="cocaine"] <- "Coca" | ||
+ | subs[substance=="heroin"] <- "Hero" | ||
+ | gend <- character(length(female)) | ||
+ | library("lattice") | ||
+ | bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray") | ||
+ | bwout | ||
+ | boxmeans <- tapply(cesd, list(subs, genfem), mean) | ||
+ | |||
+ | suicidal.thoughts <- as.factor(g1b) | ||
+ | # conditional plots | ||
+ | coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth) | ||
+ | |||
+ | # Generate random data using random effects GLM | ||
+ | library(lme4) | ||
+ | n <- 2000; p <- 3; sigbsq <- 10 | ||
+ | beta <- c(-2, 1.5, 0.5, -1, 1) | ||
+ | ID <- rep(1:n, each=p) # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n | ||
+ | Y1 <- as.numeric(id < (n+1)/2) # 1 1 ... 1 0 0 ... 0 | ||
+ | randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p) | ||
+ | Y2 <- rep(1:p, n) # 1 2 ... p 1 2 ... p ... | ||
+ | Y3 <- runif(p*n) | ||
+ | Y4 <- rexp(p*n, 1) | ||
+ | |||
+ | lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint | ||
+ | exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model)) | ||
+ | Y <- runif(p*n) < exp_model | ||
+ | glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit")) | ||
+ | summary(glmmres) | ||
+ | |||
+ | # inspect visually the relation between low homelessness and low pcs scores | ||
+ | homelow <- homeless[i1<3.5] | ||
+ | pcslow <- pcs[i1<3.5]<=31.94 | ||
+ | table(homelow, pcslow) | ||
===[[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]=== | ===[[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]=== | ||
+ | |||
+ | # you need to first load and attach the dataset: | ||
+ | #### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA")) | ||
+ | #### attach(help_sim_data) | ||
+ | table(homeless, female) | ||
+ | |||
+ | OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)* | ||
+ | sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/ | ||
+ | (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)* | ||
+ | sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE)) | ||
+ | OR | ||
+ | |||
+ | install.packages("epitools") | ||
+ | library(epitools) | ||
+ | OR_HomeFem <- oddsratio.wald(homeless, female) | ||
+ | OR_HomeFem | ||
+ | OR_HomeFem$\$ $measure | ||
+ | OR_HomeFem$\$ $p.value | ||
+ | |||
+ | chi_sq_test <- chisq.test(homeless, female, correct=FALSE) | ||
+ | chi_sq_test | ||
+ | |||
+ | fisher_test <- fisher.test(homeless, female) | ||
+ | fisher_test | ||
===[[SMHS_HypothesisTesting|Two-sample tests]]=== | ===[[SMHS_HypothesisTesting|Two-sample tests]]=== | ||
+ | |||
+ | Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data) | ||
+ | Welch_2sampleT_Age_Fem | ||
+ | |||
+ | fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values | ||
+ | res <- rep(0, length(x)) | ||
+ | for (i in 1:length(x)){ | ||
+ | if(is.na(x[i])) res[i] <- x[i] | ||
+ | else if (x[i]<2) res[i] <- x[i] | ||
+ | else res[i] <- 1} | ||
+ | return(res)} | ||
+ | |||
+ | Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data) | ||
+ | Welch_2sampleT_Home_treat | ||
+ | |||
+ | wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE) | ||
+ | wilcox_test_Age_Fem | ||
+ | |||
+ | ===[[SMHS_PowerSensitivitySpecificity|Power]]=== | ||
+ | # compute the sample size for a two-sample t-test | ||
+ | power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power | ||
+ | |||
+ | # for a two-sample t-test, compute power from sample-size and effect-size | ||
+ | power.t.test(delta=0.5, n=100) | ||
===[[SMHS_SurvivalAnalysis|Survival analysis (Kaplan–Meier plot)]]=== | ===[[SMHS_SurvivalAnalysis|Survival analysis (Kaplan–Meier plot)]]=== | ||
− | ===[[SOCR_EduMaterials_Activities_ScatterChart|Scatterplot with smooth fit]]=== | + | small_data <- reshape(newhelp_sim_data, idvar="id", |
+ | varying=list(c("cesd1","cesd2","cesd3","cesd4"), | ||
+ | c("mcs1","mcs2","mcs3","mcs4"), | ||
+ | c("i11","i12","i13","i14"), | ||
+ | c("g1b1","g1b2","g1b3","g1b4")), | ||
+ | v.names=c("cesdtv","mcstv","i1tv","g1btv"), | ||
+ | timevar="time", times=1:4, direction="long") | ||
+ | library(lme4) | ||
+ | glmres <- glmer(g1btv ~ treat + time + (1|id), | ||
+ | family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data) | ||
+ | summary(glmres) | ||
+ | |||
+ | library(survival) | ||
+ | # fit a Cox proportional hazards regression model. Time dependent variable (dayslink), | ||
+ | # time dependent strata (linkstatus) | ||
+ | cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data) | ||
+ | print(cox_surv) | ||
+ | |||
+ | # creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted | ||
+ | # Cox model, or a previously fitted accelerated failure time model | ||
+ | surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat) | ||
+ | print(surv_KM) | ||
+ | plot(surv_KM, lty=1:2, lwd=2, col=c(4,2)) | ||
+ | title("Product-Limit Survival Estimates") | ||
+ | legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4) | ||
+ | |||
+ | ===[[SOCR_EduMaterials_Activities_ScatterChart|Scatterplot with model fit]]=== | ||
+ | |||
+ | plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4) | ||
+ | |||
+ | points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca") | ||
+ | # lowess is a locally-weighted polynomial regression smooth fit | ||
+ | lines(lowess(age[substance=="cocaine"], | ||
+ | i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2) | ||
+ | |||
+ | points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco") | ||
+ | lines(lowess(age[substance=="alcohol"], | ||
+ | i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2) | ||
+ | |||
+ | points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero") | ||
+ | lines(lowess(age[substance=="heroin"], | ||
+ | i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2) | ||
+ | |||
+ | legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3, | ||
+ | cex=1.4, lwd=2, pch=c("c", "a", "h")) | ||
===[[SMHS_SLR|Regression with prediction intervals]]=== | ===[[SMHS_SLR|Regression with prediction intervals]]=== | ||
+ | |||
+ | predicted <- fitted(lm1); predicted | ||
+ | residuals <- residuals(lm1) | ||
+ | quantile(residuals) | ||
===Linear regression with interaction=== | ===Linear regression with interaction=== | ||
+ | |||
+ | lm1 <- lm(a15a ~ substance * age) | ||
+ | lm2 <- lm(a15a ~ substance + age) | ||
+ | anova(lm2, lm1) | ||
+ | summary(lm1) | ||
+ | |||
+ | names(summary(lm1)) | ||
+ | summary(lm1)$\$ $sigma | ||
+ | |||
+ | names(lm1) | ||
+ | lm1$\$ $coefficients | ||
+ | vcov(lm1) | ||
===Regression diagnostics=== | ===Regression diagnostics=== | ||
+ | |||
+ | # graphical model diagnostics | ||
+ | par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5) | ||
+ | plot(lm1) | ||
+ | par(par) | ||
+ | |||
+ | std.res <- rstandard(lm1) | ||
+ | hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F) | ||
+ | lines(density(std.res), lwd=2) | ||
+ | xvals <- seq(from=min(std.res), to=max(std.res), length=100) | ||
+ | lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3) | ||
===Fitting stratified regression models=== | ===Fitting stratified regression models=== | ||
+ | Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group. | ||
+ | |||
+ | unique_vals <- unique(substance[!is.na(substance)]); unique_vals | ||
+ | numb_unique <- length(unique_vals) | ||
+ | formula <- as.formula(i1 ~ age) | ||
+ | p <- length(coef(lm(formula))) | ||
+ | res <- matrix(rep(0, numb_unique*p), p, numb_unique) | ||
+ | for (i in 1:length(unique_vals)) { | ||
+ | res[,i] <- coef(lm(formula, subset=substance==unique_vals[i])) | ||
+ | } | ||
+ | rownames(res) <- c("intercept","slope") | ||
+ | colnames(res) <- unique_vals | ||
+ | res | ||
===[[SMHS_ANOVA|Two-way analysis of variance (ANOVA)]]=== | ===[[SMHS_ANOVA|Two-way analysis of variance (ANOVA)]]=== | ||
+ | |||
+ | aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data) | ||
+ | aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data) | ||
+ | resid <- residuals(aov2) | ||
+ | anova(aov2, aov1) | ||
+ | |||
+ | logLik(aov1) # compute the exact Log-Likelihood of each ANOVA model | ||
+ | logLik(aov2) | ||
+ | lldiff <- logLik(aov1)[1] - logLik(aov2)[1] | ||
+ | lldiff | ||
+ | 1 - pchisq(2*lldiff, 2) # test for model differences | ||
+ | summary(aov1) | ||
+ | aov1 | ||
+ | |||
+ | contrasts(sub) <- contr.SAS(4) | ||
+ | contrasts(sub) | ||
+ | aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data) | ||
+ | summary(aov3) | ||
+ | |||
+ | AIC(aov1); BIC(aov1) # [http://en.wikipedia.org/wiki/Akaike_information_criterion Akaike] and [http://en.wikipedia.org/wiki/Bayesian_information_criterion Bayesian] Information criteria (smaller values yield better models | ||
+ | AIC(aov2); BIC(aov2) | ||
+ | AIC(aov3); BIC(aov3) | ||
===[[SMHS_CorrectionMultipleTesting|Multiple comparisons]]=== | ===[[SMHS_CorrectionMultipleTesting|Multiple comparisons]]=== | ||
+ | |||
+ | # generate confidence intervals on the differences between the means of the levels | ||
+ | # of a factor with the specified family-wise probability of coverage. | ||
+ | # The intervals are based on the Studentized range statistic, Tukey's | ||
+ | # ‘Honest Significant Difference’ method | ||
+ | multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub") | ||
+ | multiple | ||
+ | plot(multiple) | ||
− | === | + | ===Logistic and Poisson Regression=== |
− | + | # logistic (binomial outcomes) regression | |
+ | logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial) | ||
+ | logistres | ||
+ | summary(logistres) | ||
+ | names(summary(logistres)) | ||
+ | coeff.like.SAS <- summary(logistres)$\$ $coefficients | ||
+ | coeff.like.SAS | ||
− | = | + | poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson) |
− | + | summary(poissonres) | |
− | |||
===Negative binomial regression=== | ===Negative binomial regression=== | ||
− | + | library(MASS) | |
− | + | NB_Res <- glm.nb(a15a ~ female + substance + age) | |
− | + | summary(NB_Res) | |
===Ordinal logit regression=== | ===Ordinal logit regression=== | ||
− | == | + | library(MASS) |
+ | sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5)) | ||
+ | ord_logit <- polr(sex_risk_cat ~ cesd + pcs) | ||
+ | summary(ord_logit) | ||
===Generalized additive model=== | ===Generalized additive model=== | ||
− | === | + | install.packages("gam"); library(gam) |
+ | gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs" | ||
+ | summary(gam_reg) | ||
+ | |||
+ | coefficients(gam_reg) | ||
+ | plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3) | ||
+ | abline(h=0) | ||
+ | |||
+ | plot(gam_reg, terms=c("substance"), se=2, lwd=3) | ||
===General linear model for correlated data=== | ===General linear model for correlated data=== | ||
+ | |||
+ | library(nlme) | ||
+ | fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary, | ||
+ | correlation = corAR1(form = ~ 1 | Mare)) | ||
+ | # variance increases as a power of the absolute fitted values | ||
+ | fm2 <- update(fm1, weights = varPower()) | ||
+ | summary(fm1) | ||
===Random effects model=== | ===Random effects model=== | ||
===[[SMHS_GEE|Generalized estimating equations (GEE) model]]=== | ===[[SMHS_GEE|Generalized estimating equations (GEE) model]]=== | ||
+ | |||
+ | install.packages("gee"); library(gee) | ||
+ | sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),] | ||
+ | # attach(sorted_data) | ||
+ | gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data, | ||
+ | family=binomial, na.action=na.omit, corstr="exchangeable") | ||
+ | coef(gee_res) | ||
+ | sqrt(diag(gee_res$\$ $robust.variance)) | ||
+ | gee_res$\$ $working.correlation | ||
===Generalized linear mixed model=== | ===Generalized linear mixed model=== | ||
+ | |||
+ | library(lme4) | ||
+ | glmm_res <- glmer(g1b ~ treat + cesd + (1|ID), | ||
+ | family=binomial(link="logit"), data=help_sim_data) | ||
+ | summary(glmm_res) | ||
===Proportional hazards regression model=== | ===Proportional hazards regression model=== | ||
− | == | + | library(survival) |
+ | cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female + | ||
+ | cesd, method="breslow", data=help_sim_data) | ||
+ | print(cox_surv) | ||
+ | # Fit a Cox proportional hazards regression model | ||
+ | coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow") | ||
===[[SMHS_Cronbachs|Cronbach’s $\alpha$]]=== | ===[[SMHS_Cronbachs|Cronbach’s $\alpha$]]=== | ||
+ | |||
+ | library(multilevel) | ||
+ | cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, | ||
+ | f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)) | ||
+ | # assess the consistency of the core 20 measures in the simulated HELP dataset | ||
+ | res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k + | ||
+ | f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3, | ||
+ | rotation="varimax", na.action=na.omit, scores="regression") | ||
+ | print(res, cutoff=0.45, sort=TRUE) | ||
===[[SMHS_PCA_ICA_FA|Factor analysis]]=== | ===[[SMHS_PCA_ICA_FA|Factor analysis]]=== | ||
+ | |||
+ | # Factor analysis explains the (observed) variability of a multivariate dataset in terms of | ||
+ | # various underlying unobservable factors. The observed data elements are expressed as | ||
+ | # linear functions of the factors and some random error. | ||
+ | # This is an example of a maximum likelihood factor analysis of "cesd" | ||
+ | # (Center for Epidemiologic Studies–Depression) scale. | ||
+ | |||
+ | fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + | ||
+ | f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r + | ||
+ | f1s + f1t, factors=3, rotation="varimax", na.action=na.omit, | ||
+ | scores="regression") | ||
+ | print(fact_ana , cutoff=0.45, sort=TRUE) | ||
===Recursive partitioning=== | ===Recursive partitioning=== | ||
+ | |||
+ | # Recursive partitioning creates a decision tree to classify the observations based on | ||
+ | # categorical predictors. Here we classify subjects based on their race, | ||
+ | # using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS. | ||
+ | |||
+ | library(rpart) | ||
+ | subst <- as.factor(substance) | ||
+ | homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data) | ||
+ | |||
+ | printcp(homelessness.rpart) | ||
+ | plot(homelessness.rpart) | ||
+ | text(homelessness.rpart) | ||
+ | |||
+ | race <- racegrp[i1<3.5] | ||
+ | pcslow <- pcs[i1<3.5]<=31.94 | ||
+ | length(pcslow==F) # count the number of subjects with (pcslow==F) | ||
+ | table(race, pcslow) | ||
+ | # in these data, 3 of 18 subjects with low PCS scores are white, | ||
+ | # and 48 of 154) of those with PCS scores above the threshold are white | ||
===Linear discriminant analysis=== | ===Linear discriminant analysis=== | ||
+ | |||
+ | # Fisher's linear discriminant analysis (LDA) identifies linear combinations of | ||
+ | # variables splitting the data into separate classes. LDA can help us | ||
+ | # identify clusters homelessness, using a uniform prior | ||
+ | |||
+ | library(MASS) | ||
+ | ngroups <- length(unique(homeless)) | ||
+ | lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1))) | ||
+ | print(lda_model) | ||
+ | plot(lda_model) | ||
+ | plot(lda_model, dimen=1, type="both") | ||
===[[SMHS_HLM|Hierarchical clustering]]=== | ===[[SMHS_HLM|Hierarchical clustering]]=== | ||
+ | |||
+ | # To group similar variables or cluster observations a hierarchical structure | ||
+ | # may be constructed. hclust() and kmeans() are examples of such clustering | ||
+ | # packages | ||
+ | |||
+ | cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs") | ||
+ | hier_clust <- hclust(dist(cor_mat)) | ||
+ | plot(hier_clust) | ||
+ | |||
+ | km_clust <- kmeans(cor_mat, 5, nstart = 25) | ||
+ | plot(cor_mat, col = km_clust$cluster) | ||
+ | points(km_clust$centers, col = 1:5, pch = 8) | ||
===[[SMHS_ROC|ROC curve]]=== | ===[[SMHS_ROC|ROC curve]]=== | ||
+ | |||
+ | library(ROCR) | ||
+ | pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide) | ||
+ | AUC<- slot(performance(pred, "auc"), "y.values")[[1]] | ||
+ | |||
+ | plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5), | ||
+ | text.adj=c(1, -.5), lwd=2) | ||
+ | lines(c(0, 1), c(0, 1)) | ||
+ | text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4) | ||
+ | title("ROC Curve for Depression scale vs. Suicide Prediction Model") | ||
===[[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]=== | ===[[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]=== | ||
+ | |||
+ | Also see the [[SMHS_MissingData| Missing Values section of the Scientific Methods for Health Sciences EBook]]. | ||
+ | |||
+ | miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs)) | ||
+ | summary(miss_data) | ||
+ | # library(Hmisc) | ||
+ | # na.pattern(miss_data) | ||
+ | |||
+ | install.packages("mi") | ||
+ | library(mi) | ||
+ | inf <- mi.info(miss_data) | ||
+ | # run the imputation without data transformation | ||
+ | IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10)) | ||
+ | |||
+ | # run the imputation with data transformation | ||
+ | miss_data.transformed <- mi.preprocess(miss_data, inf) | ||
+ | IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6)) | ||
+ | |||
+ | # Impute without noise | ||
+ | IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE) | ||
+ | |||
+ | # check convergence, should get FALSE here because only n.iter is small | ||
+ | converged(IMP, check = "data") | ||
+ | # converged(IMP, check = "coefs") | ||
+ | # visually check the imputation | ||
+ | plot(IMP); plot(IMP_T) | ||
===Propensity score modeling=== | ===Propensity score modeling=== | ||
+ | # Propensity score (PS) techniques may be applied for analysis of | ||
+ | # observational and registry data. The PS is the conditional probability | ||
+ | # of a certain treatment given patient’s covariates. PS may be used to | ||
+ | # eliminate imbalances in baseline covariate distributions between | ||
+ | # treatment groups and to estimate marginal effects. | ||
==References== | ==References== | ||
Line 94: | Line 564: | ||
** Data formats: [http://www.math.smith.edu/sasr/datasets/help.csv help.csv] and [http://www.math.smith.edu/sasr/datasets/help.Rdata help.Rdata] | ** Data formats: [http://www.math.smith.edu/sasr/datasets/help.csv help.csv] and [http://www.math.smith.edu/sasr/datasets/help.Rdata help.Rdata] | ||
** [http://www3.amherst.edu/~nhorton/help/ Study and Data specifications] | ** [http://www3.amherst.edu/~nhorton/help/ Study and Data specifications] | ||
+ | ** [http://www3.amherst.edu/~nhorton/help/ HELP study documentation] | ||
* [http://www.amazon.com/SAS-Management-Statistical-Analysis-Graphics/dp/1420070576 SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009] | * [http://www.amazon.com/SAS-Management-Statistical-Analysis-Graphics/dp/1420070576 SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009] | ||
Latest revision as of 12:04, 24 September 2014
Contents
- 1 SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
- 2 SOCR Simulated HELP Data
- 3 R examples
- 3.1 Data I/O, summaries, visualization
- 3.2 Missing Values
- 3.3 Sorting and subsetting
- 3.4 Exploratory data analysis
- 3.5 Bivariate relationship
- 3.6 Contingency tables
- 3.7 Two-sample tests
- 3.8 Power
- 3.9 Survival analysis (Kaplan–Meier plot)
- 3.10 Scatterplot with model fit
- 3.11 Regression with prediction intervals
- 3.12 Linear regression with interaction
- 3.13 Regression diagnostics
- 3.14 Fitting stratified regression models
- 3.15 Two-way analysis of variance (ANOVA)
- 3.16 Multiple comparisons
- 3.17 Logistic and Poisson Regression
- 3.18 Negative binomial regression
- 3.19 Ordinal logit regression
- 3.20 Generalized additive model
- 3.21 General linear model for correlated data
- 3.22 Random effects model
- 3.23 Generalized estimating equations (GEE) model
- 3.24 Generalized linear mixed model
- 3.25 Proportional hazards regression model
- 3.26 Cronbach’s $\alpha$
- 3.27 Factor analysis
- 3.28 Recursive partitioning
- 3.29 Linear discriminant analysis
- 3.30 Hierarchical clustering
- 3.31 ROC curve
- 3.32 Multiple imputation
- 3.33 Propensity score modeling
- 4 References
SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
SOCR Simulated HELP Data
See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".
R examples
These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.
Data I/O, summaries, visualization
options(digits=2) # decimal precision options(width=80) # narrows output to stay in the grey box help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA")) # note that we specify all of these values as indicating missing data ("",".","NA") attach(help_sim_data) rownames(help_sim_data) # print row and column names colnames(help_sim_data) summary(help_sim_data) fivenum(help_sim_data$\$ $mcs) mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE) quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE) no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs) hist(no_mis_help_sim_data_mcs, main="", freq=FALSE) lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2) xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100) lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2) cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1)) cor_mat cor_mat[c(2, 3), 2] plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n") text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"], help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A") text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C") text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"], help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H") rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2) rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3) table(help_sim_data$\$ $homeless, help_sim_data$\$ $female) OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)* sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/ (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)* sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE)) OR chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE) chisq_val fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female) ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data) print(ttres) wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE) ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data) print(ksres)
Missing Values
sum(is.na(pcs1)) # count the missing values in the variable pcs1, 208 sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246 sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values) which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete # pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) # pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1
Sorting and subsetting
new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE); new_cesd impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20; sort(help_sim_data$\$ $cesd)[1:4] sum(is.na(help_sim_data$\$ $drinkstatus)) table(help_sim_data$\$ $drinkstat, exclude="NULL") gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female")) table(help_sim_data$\$ $female)
Exploratory data analysis
newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,] attach(newhelp_sim_data) sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine")) plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4) points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A") lines(lowess(age[substance=="alcohol"], i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2) points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C") lines(lowess(age[substance=="cocaine"], i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2) points(age[substance=="heroin"], i1[substance=="heroin"], pch="H") lines(lowess(age[substance=="heroin"], i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2) legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3, cex=1.4, lwd=2, pch=c("A", "C", "H")) options(show.signif.stars=FALSE) lm1 <- lm(i1 ~ sub * age) lm2 <- lm(i1 ~ sub + age) anova(lm2, lm1) summary(lm1) names(summary(lm1)) summary(lm1)$\$ $sigma names(lm1) lm1$\$ $coefficients coef(lm1) vcov(lm1) pred <- fitted(lm1) resid <- residuals(lm1) quantile(resid)
# explore correlations and clusters in chunks of the data cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs") h_dist_clust <- hclust(dist(cor_matrix)) plot(h_dist_clust)
Bivariate relationship
subst <- as.factor(substance) genfem <- as.factor(ifelse(female, "F", "M")) interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2) subs <- character(length(substance)) subs[substance=="alcohol"] <- "Alco" subs[substance=="cocaine"] <- "Coca" subs[substance=="heroin"] <- "Hero" gend <- character(length(female)) library("lattice") bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray") bwout boxmeans <- tapply(cesd, list(subs, genfem), mean)
suicidal.thoughts <- as.factor(g1b) # conditional plots coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)
# Generate random data using random effects GLM library(lme4) n <- 2000; p <- 3; sigbsq <- 10 beta <- c(-2, 1.5, 0.5, -1, 1) ID <- rep(1:n, each=p) # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n Y1 <- as.numeric(id < (n+1)/2) # 1 1 ... 1 0 0 ... 0 randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p) Y2 <- rep(1:p, n) # 1 2 ... p 1 2 ... p ... Y3 <- runif(p*n) Y4 <- rexp(p*n, 1) lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model)) Y <- runif(p*n) < exp_model glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit")) summary(glmmres)
# inspect visually the relation between low homelessness and low pcs scores homelow <- homeless[i1<3.5] pcslow <- pcs[i1<3.5]<=31.94 table(homelow, pcslow)
Contingency tables
# you need to first load and attach the dataset: #### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA")) #### attach(help_sim_data) table(homeless, female) OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)* sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/ (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)* sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE)) OR
install.packages("epitools") library(epitools) OR_HomeFem <- oddsratio.wald(homeless, female) OR_HomeFem OR_HomeFem$\$ $measure OR_HomeFem$\$ $p.value chi_sq_test <- chisq.test(homeless, female, correct=FALSE) chi_sq_test
fisher_test <- fisher.test(homeless, female) fisher_test
Two-sample tests
Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data) Welch_2sampleT_Age_Fem fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values res <- rep(0, length(x)) for (i in 1:length(x)){ if(is.na(x[i])) res[i] <- x[i] else if (x[i]<2) res[i] <- x[i] else res[i] <- 1} return(res)} Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data) Welch_2sampleT_Home_treat wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE) wilcox_test_Age_Fem
Power
# compute the sample size for a two-sample t-test power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power
# for a two-sample t-test, compute power from sample-size and effect-size power.t.test(delta=0.5, n=100)
Survival analysis (Kaplan–Meier plot)
small_data <- reshape(newhelp_sim_data, idvar="id", varying=list(c("cesd1","cesd2","cesd3","cesd4"), c("mcs1","mcs2","mcs3","mcs4"), c("i11","i12","i13","i14"), c("g1b1","g1b2","g1b3","g1b4")), v.names=c("cesdtv","mcstv","i1tv","g1btv"), timevar="time", times=1:4, direction="long") library(lme4) glmres <- glmer(g1btv ~ treat + time + (1|id), family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data) summary(glmres)
library(survival) # fit a Cox proportional hazards regression model. Time dependent variable (dayslink), # time dependent strata (linkstatus) cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data) print(cox_surv) # creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted # Cox model, or a previously fitted accelerated failure time model surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat) print(surv_KM) plot(surv_KM, lty=1:2, lwd=2, col=c(4,2)) title("Product-Limit Survival Estimates") legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)
Scatterplot with model fit
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4) points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca") # lowess is a locally-weighted polynomial regression smooth fit lines(lowess(age[substance=="cocaine"], i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2) points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco") lines(lowess(age[substance=="alcohol"], i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2) points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero") lines(lowess(age[substance=="heroin"], i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2) legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3, cex=1.4, lwd=2, pch=c("c", "a", "h"))
Regression with prediction intervals
predicted <- fitted(lm1); predicted residuals <- residuals(lm1) quantile(residuals)
Linear regression with interaction
lm1 <- lm(a15a ~ substance * age) lm2 <- lm(a15a ~ substance + age) anova(lm2, lm1) summary(lm1) names(summary(lm1)) summary(lm1)$\$ $sigma names(lm1) lm1$\$ $coefficients vcov(lm1)
Regression diagnostics
# graphical model diagnostics par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5) plot(lm1) par(par)
std.res <- rstandard(lm1) hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F) lines(density(std.res), lwd=2) xvals <- seq(from=min(std.res), to=max(std.res), length=100) lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)
Fitting stratified regression models
Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.
unique_vals <- unique(substance[!is.na(substance)]); unique_vals numb_unique <- length(unique_vals) formula <- as.formula(i1 ~ age) p <- length(coef(lm(formula))) res <- matrix(rep(0, numb_unique*p), p, numb_unique) for (i in 1:length(unique_vals)) { res[,i] <- coef(lm(formula, subset=substance==unique_vals[i])) } rownames(res) <- c("intercept","slope") colnames(res) <- unique_vals res
Two-way analysis of variance (ANOVA)
aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data) aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data) resid <- residuals(aov2) anova(aov2, aov1)
logLik(aov1) # compute the exact Log-Likelihood of each ANOVA model logLik(aov2) lldiff <- logLik(aov1)[1] - logLik(aov2)[1] lldiff 1 - pchisq(2*lldiff, 2) # test for model differences summary(aov1) aov1 contrasts(sub) <- contr.SAS(4) contrasts(sub) aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data) summary(aov3) AIC(aov1); BIC(aov1) # Akaike and Bayesian Information criteria (smaller values yield better models AIC(aov2); BIC(aov2) AIC(aov3); BIC(aov3)
Multiple comparisons
# generate confidence intervals on the differences between the means of the levels # of a factor with the specified family-wise probability of coverage. # The intervals are based on the Studentized range statistic, Tukey's # ‘Honest Significant Difference’ method multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub") multiple plot(multiple)
Logistic and Poisson Regression
# logistic (binomial outcomes) regression logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial) logistres summary(logistres) names(summary(logistres)) coeff.like.SAS <- summary(logistres)$\$ $coefficients coeff.like.SAS
poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson) summary(poissonres)
Negative binomial regression
library(MASS) NB_Res <- glm.nb(a15a ~ female + substance + age) summary(NB_Res)
Ordinal logit regression
library(MASS) sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5)) ord_logit <- polr(sex_risk_cat ~ cesd + pcs) summary(ord_logit)
Generalized additive model
install.packages("gam"); library(gam) gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs" summary(gam_reg) coefficients(gam_reg) plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3) abline(h=0) plot(gam_reg, terms=c("substance"), se=2, lwd=3)
library(nlme) fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary, correlation = corAR1(form = ~ 1 | Mare)) # variance increases as a power of the absolute fitted values fm2 <- update(fm1, weights = varPower()) summary(fm1)
Random effects model
Generalized estimating equations (GEE) model
install.packages("gee"); library(gee) sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),] # attach(sorted_data) gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data, family=binomial, na.action=na.omit, corstr="exchangeable") coef(gee_res) sqrt(diag(gee_res$\$ $robust.variance)) gee_res$\$ $working.correlation
Generalized linear mixed model
library(lme4) glmm_res <- glmer(g1b ~ treat + cesd + (1|ID), family=binomial(link="logit"), data=help_sim_data) summary(glmm_res)
Proportional hazards regression model
library(survival) cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female + cesd, method="breslow", data=help_sim_data) print(cox_surv) # Fit a Cox proportional hazards regression model coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow")
Cronbach’s $\alpha$
library(multilevel) cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)) # assess the consistency of the core 20 measures in the simulated HELP dataset res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3, rotation="varimax", na.action=na.omit, scores="regression") print(res, cutoff=0.45, sort=TRUE)
Factor analysis
# Factor analysis explains the (observed) variability of a multivariate dataset in terms of # various underlying unobservable factors. The observed data elements are expressed as # linear functions of the factors and some random error. # This is an example of a maximum likelihood factor analysis of "cesd" # (Center for Epidemiologic Studies–Depression) scale. fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3, rotation="varimax", na.action=na.omit, scores="regression") print(fact_ana , cutoff=0.45, sort=TRUE)
Recursive partitioning
# Recursive partitioning creates a decision tree to classify the observations based on # categorical predictors. Here we classify subjects based on their race, # using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS. library(rpart) subst <- as.factor(substance) homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data) printcp(homelessness.rpart) plot(homelessness.rpart) text(homelessness.rpart) race <- racegrp[i1<3.5] pcslow <- pcs[i1<3.5]<=31.94 length(pcslow==F) # count the number of subjects with (pcslow==F) table(race, pcslow) # in these data, 3 of 18 subjects with low PCS scores are white, # and 48 of 154) of those with PCS scores above the threshold are white
Linear discriminant analysis
# Fisher's linear discriminant analysis (LDA) identifies linear combinations of # variables splitting the data into separate classes. LDA can help us # identify clusters homelessness, using a uniform prior
library(MASS) ngroups <- length(unique(homeless)) lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1))) print(lda_model) plot(lda_model) plot(lda_model, dimen=1, type="both")
Hierarchical clustering
# To group similar variables or cluster observations a hierarchical structure # may be constructed. hclust() and kmeans() are examples of such clustering # packages cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs") hier_clust <- hclust(dist(cor_mat)) plot(hier_clust)
km_clust <- kmeans(cor_mat, 5, nstart = 25) plot(cor_mat, col = km_clust$cluster) points(km_clust$centers, col = 1:5, pch = 8)
ROC curve
library(ROCR) pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide) AUC<- slot(performance(pred, "auc"), "y.values")1
plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5), text.adj=c(1, -.5), lwd=2) lines(c(0, 1), c(0, 1)) text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4) title("ROC Curve for Depression scale vs. Suicide Prediction Model")
Multiple imputation
Also see the Missing Values section of the Scientific Methods for Health Sciences EBook.
miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs)) summary(miss_data) # library(Hmisc) # na.pattern(miss_data) install.packages("mi") library(mi) inf <- mi.info(miss_data) # run the imputation without data transformation IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10)) # run the imputation with data transformation miss_data.transformed <- mi.preprocess(miss_data, inf) IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6)) # Impute without noise IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE)
# check convergence, should get FALSE here because only n.iter is small converged(IMP, check = "data") # converged(IMP, check = "coefs") # visually check the imputation plot(IMP); plot(IMP_T)
Propensity score modeling
# Propensity score (PS) techniques may be applied for analysis of # observational and registry data. The PS is the conditional probability # of a certain treatment given patient’s covariates. PS may be used to # eliminate imbalances in baseline covariate distributions between # treatment groups and to estimate marginal effects.
References
- Evaluation and Linkage to Primary (HELP) Care study
- Data formats: help.csv and help.Rdata
- Study and Data specifications
- HELP study documentation
- SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009
- SOCR Home page: http://www.SOCR.ucla.edu
Translate this page: