SOCR Simulated HELP Data Activity

From SOCR
Revision as of 15:27, 14 September 2014 by Dinov (talk | contribs) (Linear discriminant analysis)
Jump to: navigation, search

SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset

SOCR Simulated HELP Data

See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".

R examples

These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.

Data I/O, summaries, visualization

options(digits=2)  # decimal precision
options(width=80)  # narrows output to stay in the grey box

help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
# note that we specify all of these values as indicating missing data ("",".","NA")
attach(help_sim_data)

rownames(help_sim_data)  # print row and column names
colnames(help_sim_data)

summary(help_sim_data)
fivenum(help_sim_data$\$ $mcs)

mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE)

quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)

no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs)

hist(no_mis_help_sim_data_mcs, main="", freq=FALSE)
lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2)
xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100)
lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2)

cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1))
cor_mat
cor_mat[c(2, 3), 2]

plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H")

rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2)
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3)


table(help_sim_data$\$ $homeless, help_sim_data$\$ $female)


OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
       sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
      (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
       sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR


chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE)
chisq_val


fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female)


ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data)
print(ttres)


wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE)

ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data)
print(ksres)

Missing Values

sum(is.na(pcs1))  # count the missing values in the variable pcs1, 208
sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246

sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values)
which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete
# pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) 
# pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1

Sorting and subsetting

new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE);
new_cesd

impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20;
sort(help_sim_data$\$ $cesd)[1:4]
sum(is.na(help_sim_data$\$ $drinkstatus))
table(help_sim_data$\$ $drinkstat, exclude="NULL")

gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female"))
table(help_sim_data$\$ $female)

Exploratory data analysis

newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,]
attach(newhelp_sim_data)
sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
lines(lowess(age[substance=="alcohol"], 
  i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
lines(lowess(age[substance=="cocaine"], 
  i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
lines(lowess(age[substance=="heroin"], 
  i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3, 
  cex=1.4, lwd=2, pch=c("A", "C", "H"))

options(show.signif.stars=FALSE)
lm1 <- lm(i1 ~ sub * age)
lm2 <- lm(i1 ~ sub + age)
anova(lm2, lm1)

summary(lm1)

names(summary(lm1))
summary(lm1)$\$ $sigma

names(lm1)

lm1$\$ $coefficients
coef(lm1)
vcov(lm1)
pred <- fitted(lm1)
resid <- residuals(lm1)
quantile(resid)
# explore correlations and clusters in chunks of the data
cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs")
h_dist_clust <- hclust(dist(cor_matrix))
plot(h_dist_clust)

Bivariate relationship

subst <- as.factor(substance)
genfem <- as.factor(ifelse(female, "F", "M"))
interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)

subs <- character(length(substance))
subs[substance=="alcohol"] <- "Alco"
subs[substance=="cocaine"] <- "Coca"
subs[substance=="heroin"] <- "Hero"
gend <- character(length(female))
library("lattice")
bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
bwout 
boxmeans <- tapply(cesd, list(subs, genfem), mean)
suicidal.thoughts <- as.factor(g1b)
# conditional plots
coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)
# Generate random data using random effects GLM 
library(lme4)
n <- 2000; p <- 3; sigbsq <- 10
beta <- c(-2, 1.5, 0.5, -1, 1)
ID <- rep(1:n, each=p)   # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n
Y1 <- as.numeric(id < (n+1)/2)  # 1 1 ... 1 0 0 ... 0
randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p)
Y2 <- rep(1:p, n)        # 1 2 ... p 1 2 ... p ...
Y3 <- runif(p*n)
Y4 <- rexp(p*n, 1)

lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint
exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model))
Y <- runif(p*n) < exp_model
glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit"))
summary(glmmres)
# inspect visually the relation between low homelessness and low pcs scores
homelow <- homeless[i1<3.5]
pcslow <- pcs[i1<3.5]<=31.94
table(homelow, pcslow)

Contingency tables

# you need to first load and attach the dataset:
#### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
#### attach(help_sim_data)
table(homeless, female)

OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
            sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
     (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
         sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR
install.packages("epitools")
library(epitools)
OR_HomeFem <- oddsratio.wald(homeless, female)
OR_HomeFem
OR_HomeFem$\$ $measure
OR_HomeFem$\$ $p.value

chi_sq_test <- chisq.test(homeless, female, correct=FALSE)
chi_sq_test
fisher_test <- fisher.test(homeless, female)
fisher_test

Two-sample tests

Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data)
Welch_2sampleT_Age_Fem

fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values
    res <- rep(0, length(x))
    for (i in 1:length(x)){
         if(is.na(x[i])) res[i] <- x[i]
         else if (x[i]<2) res[i] <- x[i]
         else res[i] <- 1}
    return(res)}

Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data)
Welch_2sampleT_Home_treat

wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE)
wilcox_test_Age_Fem

Power

# compute the sample size for a two-sample t-test
power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power
# for a two-sample t-test, compute power from sample-size and effect-size
power.t.test(delta=0.5, n=100)

Survival analysis (Kaplan–Meier plot)

small_data <- reshape(newhelp_sim_data, idvar="id", 
                    varying=list(c("cesd1","cesd2","cesd3","cesd4"),
                        c("mcs1","mcs2","mcs3","mcs4"), 
                        c("i11","i12","i13","i14"),
                        c("g1b1","g1b2","g1b3","g1b4")), 
                    v.names=c("cesdtv","mcstv","i1tv","g1btv"),
                    timevar="time", times=1:4, direction="long")
library(lme4)
glmres <- glmer(g1btv ~ treat + time + (1|id),
  family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
summary(glmres)
library(survival)
# fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
# time dependent strata (linkstatus)
cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data)
print(cox_surv)

# creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
# Cox model, or a previously fitted accelerated failure time model
surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
print(surv_KM)
plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
title("Product-Limit Survival Estimates")
legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)

Scatterplot with model fit

plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)

points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca")
# lowess is a locally-weighted polynomial regression smooth fit
lines(lowess(age[substance=="cocaine"],
      i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)

points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco")
lines(lowess(age[substance=="alcohol"],
      i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)

points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero")
lines(lowess(age[substance=="heroin"],
       i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)

legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3,
        cex=1.4, lwd=2, pch=c("c", "a", "h"))

Regression with prediction intervals

predicted <- fitted(lm1); predicted
residuals <- residuals(lm1)
quantile(residuals)

Linear regression with interaction

lm1 <- lm(a15a ~ substance * age)
lm2 <- lm(a15a ~ substance + age)
anova(lm2, lm1)
summary(lm1)

names(summary(lm1))
summary(lm1)$\$ $sigma

names(lm1)
lm1$\$ $coefficients
vcov(lm1)

Regression diagnostics

# graphical model diagnostics
par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5)
plot(lm1)
par(par)
std.res <- rstandard(lm1)
hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F)
lines(density(std.res), lwd=2)
xvals <- seq(from=min(std.res), to=max(std.res), length=100)
lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)

Fitting stratified regression models

Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.

unique_vals <- unique(substance[!is.na(substance)]); unique_vals
numb_unique <- length(unique_vals)
formula <- as.formula(i1 ~ age)
p <- length(coef(lm(formula)))
res <- matrix(rep(0, numb_unique*p), p, numb_unique)
for (i in 1:length(unique_vals)) {
      res[,i] <- coef(lm(formula, subset=substance==unique_vals[i]))
}
rownames(res) <- c("intercept","slope")
colnames(res) <- unique_vals
res

Two-way analysis of variance (ANOVA)

aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data)
aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data)
resid <- residuals(aov2)
anova(aov2, aov1)
logLik(aov1)  # compute the exact Log-Likelihood of each ANOVA model
logLik(aov2)
lldiff <- logLik(aov1)[1] - logLik(aov2)[1]
lldiff
1 - pchisq(2*lldiff, 2) # test for model differences
summary(aov1)
aov1

contrasts(sub) <- contr.SAS(4)
contrasts(sub) 
aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data)
summary(aov3)

AIC(aov1); BIC(aov1)  # Akaike and Bayesian Information criteria (smaller values yield better models
AIC(aov2); BIC(aov2)
AIC(aov3); BIC(aov3)

Multiple comparisons

# generate confidence intervals on the differences between the means of the levels 
# of a factor with the specified family-wise probability of coverage. 
# The intervals are based on the Studentized range statistic, Tukey's 
# ‘Honest Significant Difference’ method
multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub")
multiple 
plot(multiple)

Logistic and Poisson Regression

# logistic (binomial outcomes) regression
logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
logistres
summary(logistres)
names(summary(logistres))
coeff.like.SAS <- summary(logistres)$\$ $coefficients
coeff.like.SAS
poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson)
summary(poissonres)

Negative binomial regression

library(MASS)
NB_Res <- glm.nb(a15a ~ female + substance + age)
summary(NB_Res)

Ordinal logit regression

library(MASS)
sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5))
ord_logit <- polr(sex_risk_cat ~ cesd + pcs)
summary(ord_logit)

Generalized additive model

install.packages("gam"); library(gam)
gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs"
summary(gam_reg)

coefficients(gam_reg)
plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3)
abline(h=0)

plot(gam_reg, terms=c("substance"), se=2, lwd=3)

General linear model for correlated data

library(nlme)
fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary,
          correlation = corAR1(form = ~ 1 | Mare))
# variance increases as a power of the absolute fitted values
fm2 <- update(fm1, weights = varPower())
summary(fm1)

Random effects model

Generalized estimating equations (GEE) model

install.packages("gee"); library(gee)
sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),]
# attach(sorted_data)
gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data,
      family=binomial, na.action=na.omit, corstr="exchangeable")
coef(gee_res)
sqrt(diag(gee_res$\$ $robust.variance))
gee_res$\$ $working.correlation

Generalized linear mixed model

library(lme4)
glmm_res <- glmer(g1b ~ treat + cesd + (1|ID),
      family=binomial(link="logit"), data=help_sim_data)
summary(glmm_res)

Proportional hazards regression model

library(survival)
cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female +
      cesd, method="breslow", data=help_sim_data)
print(cox_surv) 
# Fit a Cox proportional hazards regression model
coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow")

Cronbach’s $\alpha$

library(multilevel)
cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k,
  f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)) 
# assess the consistency of the core 20 measures in the simulated HELP dataset
res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k + 
      f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3, 
      rotation="varimax", na.action=na.omit, scores="regression")
print(res, cutoff=0.45, sort=TRUE)

Factor analysis

# Factor analysis explains the (observed) variability of a multivariate dataset in terms of 
# various underlying unobservable factors. The observed data elements are expressed as 
# linear functions of the factors and some random error. 
# This is an example of a maximum likelihood factor analysis of "cesd" 
# (Center for Epidemiologic Studies–Depression) scale.

fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h +
      f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r +
      f1s + f1t, factors=3, rotation="varimax", na.action=na.omit,
       scores="regression")
print(fact_ana , cutoff=0.45, sort=TRUE)

Recursive partitioning

# Recursive partitioning creates a decision tree to classify the observations based on
# categorical predictors. Here we classify subjects based on their race, 
# using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS.

library(rpart)
subst <- as.factor(substance)
homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data)

printcp(homelessness.rpart)
plot(homelessness.rpart)
text(homelessness.rpart)

race <- racegrp[i1<3.5]
pcslow <- pcs[i1<3.5]<=31.94
length(pcslow==F)   # count the number of subjects with (pcslow==F)
table(race, pcslow)
# in these data, 3 of 18 subjects with low PCS scores are white, 
# and 48 of 154) of those with PCS scores above the threshold are white

Linear discriminant analysis

# Fisher's linear discriminant analysis (LDA) identifies linear combinations of 
# variables splitting the data into separate classes. LDA can help us
# identify clusters homelessness, using a uniform prior
library(MASS)
ngroups <- length(unique(homeless))
lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1)))
print(lda_model)
plot(lda_model)
plot(lda_model, dimen=1, type="both")

Hierarchical clustering

ROC curve

library(ROCR)
pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide)
AUC<- slot(performance(pred, "auc"), "y.values")1
plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5),
  text.adj=c(1, -.5), lwd=2)
lines(c(0, 1), c(0, 1))
text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4)
title("ROC Curve for Depression scale vs. Suicide Prediction Model")

Multiple imputation

Propensity score modeling

References




Translate this page:

(default)
Uk flag.gif

Deutsch
De flag.gif

Español
Es flag.gif

Français
Fr flag.gif

Italiano
It flag.gif

Português
Pt flag.gif

日本語
Jp flag.gif

България
Bg flag.gif

الامارات العربية المتحدة
Ae flag.gif

Suomi
Fi flag.gif

इस भाषा में
In flag.gif

Norge
No flag.png

한국어
Kr flag.gif

中文
Cn flag.gif

繁体中文
Cn flag.gif

Русский
Ru flag.gif

Nederlands
Nl flag.gif

Ελληνικά
Gr flag.gif

Hrvatska
Hr flag.gif

Česká republika
Cz flag.gif

Danmark
Dk flag.gif

Polska
Pl flag.png

România
Ro flag.png

Sverige
Se flag.gif