SOCR Simulated HELP Data Activity
Contents
- 1 SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
- 2 SOCR Simulated HELP Data
- 3 R examples
- 3.1 Data I/O, summaries, visualization
- 3.2 Missing Values
- 3.3 Sorting and subsetting
- 3.4 Exploratory data analysis
- 3.5 Bivariate relationship
- 3.6 Contingency tables
- 3.7 Two-sample tests
- 3.8 Power
- 3.9 Survival analysis (Kaplan–Meier plot)
- 3.10 Scatterplot with model fit
- 3.11 Regression with prediction intervals
- 3.12 Linear regression with interaction
- 3.13 Regression diagnostics
- 3.14 Fitting stratified regression models
- 3.15 Two-way analysis of variance (ANOVA)
- 3.16 Multiple comparisons
- 3.17 Logistic and Poisson Regression
- 3.18 Negative binomial regression
- 3.19 Lasso model selection
- 3.20 Quantile regression
- 3.21 Ordinal logit regression
- 3.22 Multinomial logit regression
- 3.23 Generalized additive model
- 3.24 Data transformations
- 3.25 General linear model for correlated data
- 3.26 Random effects model
- 3.27 Generalized estimating equations (GEE) model
- 3.28 Generalized linear mixed model
- 3.29 Proportional hazards regression model
- 3.30 Bayesian Poisson regression
- 3.31 Cronbach’s $\alpha$
- 3.32 Factor analysis
- 3.33 Recursive partitioning
- 3.34 Linear discriminant analysis
- 3.35 Hierarchical clustering
- 3.36 ROC curve
- 3.37 Multiple imputation
- 3.38 Propensity score modeling
- 4 References
SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
SOCR Simulated HELP Data
See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".
R examples
These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.
Data I/O, summaries, visualization
options(digits=2) # decimal precision
options(width=80) # narrows output to stay in the grey box
help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
# note that we specify all of these values as indicating missing data ("",".","NA")
attach(help_sim_data)
rownames(help_sim_data) # print row and column names
colnames(help_sim_data)
summary(help_sim_data)
fivenum(help_sim_data$\$ $mcs)
mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE)
quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)
no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs)
hist(no_mis_help_sim_data_mcs, main="", freq=FALSE)
lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2)
xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100)
lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2)
cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1))
cor_mat
cor_mat[c(2, 3), 2]
plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H")
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2)
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3)
table(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
(sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR
chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE)
chisq_val
fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data)
print(ttres)
wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE)
ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data)
print(ksres)
Missing Values
sum(is.na(pcs1)) # count the missing values in the variable pcs1, 208 sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246 sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values) which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete # pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) # pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1
Sorting and subsetting
new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE);
new_cesd
impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20;
sort(help_sim_data$\$ $cesd)[1:4]
sum(is.na(help_sim_data$\$ $drinkstatus))
table(help_sim_data$\$ $drinkstat, exclude="NULL")
gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female"))
table(help_sim_data$\$ $female)
Exploratory data analysis
newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,]
attach(newhelp_sim_data)
sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
lines(lowess(age[substance=="alcohol"],
i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
lines(lowess(age[substance=="cocaine"],
i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
lines(lowess(age[substance=="heroin"],
i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3,
cex=1.4, lwd=2, pch=c("A", "C", "H"))
options(show.signif.stars=FALSE)
lm1 <- lm(i1 ~ sub * age)
lm2 <- lm(i1 ~ sub + age)
anova(lm2, lm1)
summary(lm1)
names(summary(lm1))
summary(lm1)$\$ $sigma
names(lm1)
lm1$\$ $coefficients
coef(lm1)
vcov(lm1)
pred <- fitted(lm1)
resid <- residuals(lm1)
quantile(resid)
# explore correlations and clusters in chunks of the data cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs") h_dist_clust <- hclust(dist(cor_matrix)) plot(h_dist_clust)
Bivariate relationship
subst <- as.factor(substance)
genfem <- as.factor(ifelse(female, "F", "M"))
interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)
subs <- character(length(substance))
subs[substance=="alcohol"] <- "Alco"
subs[substance=="cocaine"] <- "Coca"
subs[substance=="heroin"] <- "Hero"
gend <- character(length(female))
library("lattice")
bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
bwout
boxmeans <- tapply(cesd, list(subs, genfem), mean)
suicidal.thoughts <- as.factor(g1b) # conditional plots coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)
# Generate random data using random effects GLM library(lme4) n <- 2000; p <- 3; sigbsq <- 10 beta <- c(-2, 1.5, 0.5, -1, 1) ID <- rep(1:n, each=p) # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n Y1 <- as.numeric(id < (n+1)/2) # 1 1 ... 1 0 0 ... 0 randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p) Y2 <- rep(1:p, n) # 1 2 ... p 1 2 ... p ... Y3 <- runif(p*n) Y4 <- rexp(p*n, 1) lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model)) Y <- runif(p*n) < exp_model glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit")) summary(glmmres)
# inspect visually the relation between low homelessness and low pcs scores homelow <- homeless[i1<3.5] pcslow <- pcs[i1<3.5]<=31.94 table(homelow, pcslow)
Contingency tables
# you need to first load and attach the dataset:
#### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
#### attach(help_sim_data)
table(homeless, female)
OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
(sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR
install.packages("epitools")
library(epitools)
OR_HomeFem <- oddsratio.wald(homeless, female)
OR_HomeFem
OR_HomeFem$\$ $measure
OR_HomeFem$\$ $p.value
chi_sq_test <- chisq.test(homeless, female, correct=FALSE)
chi_sq_test
fisher_test <- fisher.test(homeless, female) fisher_test
Two-sample tests
Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data)
Welch_2sampleT_Age_Fem
fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values
res <- rep(0, length(x))
for (i in 1:length(x)){
if(is.na(x[i])) res[i] <- x[i]
else if (x[i]<2) res[i] <- x[i]
else res[i] <- 1}
return(res)}
Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data)
Welch_2sampleT_Home_treat
wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE)
wilcox_test_Age_Fem
Power
# compute the sample size for a two-sample t-test power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power
# for a two-sample t-test, compute power from sample-size and effect-size power.t.test(delta=0.5, n=100)
Survival analysis (Kaplan–Meier plot)
small_data <- reshape(newhelp_sim_data, idvar="id",
varying=list(c("cesd1","cesd2","cesd3","cesd4"),
c("mcs1","mcs2","mcs3","mcs4"),
c("i11","i12","i13","i14"),
c("g1b1","g1b2","g1b3","g1b4")),
v.names=c("cesdtv","mcstv","i1tv","g1btv"),
timevar="time", times=1:4, direction="long")
library(lme4)
glmres <- glmer(g1btv ~ treat + time + (1|id),
family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
summary(glmres)
library(survival)
# fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
# time dependent strata (linkstatus)
cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data)
print(cox_surv)
# creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
# Cox model, or a previously fitted accelerated failure time model
surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
print(surv_KM)
plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
title("Product-Limit Survival Estimates")
legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)
Scatterplot with model fit
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca")
# lowess is a locally-weighted polynomial regression smooth fit
lines(lowess(age[substance=="cocaine"],
i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco")
lines(lowess(age[substance=="alcohol"],
i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero")
lines(lowess(age[substance=="heroin"],
i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3,
cex=1.4, lwd=2, pch=c("c", "a", "h"))
Regression with prediction intervals
predicted <- fitted(lm1); predicted residuals <- residuals(lm1) quantile(residuals)
Linear regression with interaction
lm1 <- lm(a15a ~ substance * age) lm2 <- lm(a15a ~ substance + age) anova(lm2, lm1) summary(lm1) names(summary(lm1)) summary(lm1)$\$ $sigma names(lm1) lm1$\$ $coefficients vcov(lm1)
Regression diagnostics
# graphical model diagnostics par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5) plot(lm1) par(par)
std.res <- rstandard(lm1) hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F) lines(density(std.res), lwd=2) xvals <- seq(from=min(std.res), to=max(std.res), length=100) lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)
Fitting stratified regression models
Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.
unique_vals <- unique(substance[!is.na(substance)]); unique_vals
numb_unique <- length(unique_vals)
formula <- as.formula(i1 ~ age)
p <- length(coef(lm(formula)))
res <- matrix(rep(0, numb_unique*p), p, numb_unique)
for (i in 1:length(unique_vals)) {
res[,i] <- coef(lm(formula, subset=substance==unique_vals[i]))
}
rownames(res) <- c("intercept","slope")
colnames(res) <- unique_vals
res
Two-way analysis of variance (ANOVA)
aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data) aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data) resid <- residuals(aov2) anova(aov2, aov1)
logLik(aov1) # compute the exact Log-Likelihood of each ANOVA model logLik(aov2) lldiff <- logLik(aov1)[1] - logLik(aov2)[1] lldiff 1 - pchisq(2*lldiff, 2) # test for model differences summary(aov1) aov1 contrasts(sub) <- contr.SAS(4) contrasts(sub) aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data) summary(aov3) AIC(aov1); BIC(aov1) # Akaike and Bayesian Information criteria (smaller values yield better models AIC(aov2); BIC(aov2) AIC(aov3); BIC(aov3)
Multiple comparisons
# generate confidence intervals on the differences between the means of the levels # of a factor with the specified family-wise probability of coverage. # The intervals are based on the Studentized range statistic, Tukey's # ‘Honest Significant Difference’ method multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub") multiple plot(multiple)
Logistic and Poisson Regression
# logistic (binomial outcomes) regression
logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
logistres
summary(logistres)
names(summary(logistres))
coeff.like.SAS <- summary(logistres)$\$ $coefficients
coeff.like.SAS
poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson) summary(poissonres)
Negative binomial regression
Lasso model selection
Quantile regression
Ordinal logit regression
Multinomial logit regression
Generalized additive model
Data transformations
Random effects model
Generalized estimating equations (GEE) model
Generalized linear mixed model
Proportional hazards regression model
Bayesian Poisson regression
Cronbach’s $\alpha$
library(multilevel)
cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k,
f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t))
# assess the consistency of the core 20 measures in the simulated HELP dataset
res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k +
f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3,
rotation="varimax", na.action=na.omit, scores="regression")
print(res, cutoff=0.45, sort=TRUE)
Factor analysis
Recursive partitioning
Linear discriminant analysis
Hierarchical clustering
ROC curve
library(ROCR) pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide) AUC<- slot(performance(pred, "auc"), "y.values")1
plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5),
text.adj=c(1, -.5), lwd=2)
lines(c(0, 1), c(0, 1))
text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4)
title("ROC Curve for Depression scale vs. Suicide Prediction Model")
Multiple imputation
Propensity score modeling
References
- Evaluation and Linkage to Primary (HELP) Care study
- Data formats: help.csv and help.Rdata
- Study and Data specifications
- HELP study documentation
- SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009
- SOCR Home page: http://www.SOCR.ucla.edu
Translate this page: