Difference between revisions of "SOCR Simulated HELP Data Activity"

Latest revision as of 12:04, 24 September 2014

SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset

SOCR Simulated HELP Data

See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".

R examples

These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.

Data I/O, summaries, visualization

options(digits=2)  # decimal precision
options(width=80)  # narrows output to stay in the grey box

help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
# note that we specify all of these values as indicating missing data ("",".","NA")
attach(help_sim_data)

rownames(help_sim_data)  # print row and column names
colnames(help_sim_data)

summary(help_sim_data)
fivenum(help_sim_data$\$ $mcs)

mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE)

quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)

no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs)

hist(no_mis_help_sim_data_mcs, main="", freq=FALSE)
lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2)
xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100)
lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2)

cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1))
cor_mat
cor_mat[c(2, 3), 2]

plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H")

rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2)
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3)


table(help_sim_data$\$ $homeless, help_sim_data$\$ $female)


OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
       sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
      (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
       sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR


chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE)
chisq_val


fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female)


ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data)
print(ttres)


wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE)

ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data)
print(ksres)

Missing Values

sum(is.na(pcs1))  # count the missing values in the variable pcs1, 208
sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246

sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values)
which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete
# pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) 
# pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1

Sorting and subsetting

new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE);
new_cesd

impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20;
sort(help_sim_data$\$ $cesd)[1:4]
sum(is.na(help_sim_data$\$ $drinkstatus))
table(help_sim_data$\$ $drinkstat, exclude="NULL")

gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female"))
table(help_sim_data$\$ $female)

Exploratory data analysis

newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,]
attach(newhelp_sim_data)
sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
lines(lowess(age[substance=="alcohol"], 
  i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
lines(lowess(age[substance=="cocaine"], 
  i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
lines(lowess(age[substance=="heroin"], 
  i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3, 
  cex=1.4, lwd=2, pch=c("A", "C", "H"))

options(show.signif.stars=FALSE)
lm1 <- lm(i1 ~ sub * age)
lm2 <- lm(i1 ~ sub + age)
anova(lm2, lm1)

summary(lm1)

names(summary(lm1))
summary(lm1)$\$ $sigma

names(lm1)

lm1$\$ $coefficients
coef(lm1)
vcov(lm1)
pred <- fitted(lm1)
resid <- residuals(lm1)
quantile(resid)

# explore correlations and clusters in chunks of the data
cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs")
h_dist_clust <- hclust(dist(cor_matrix))
plot(h_dist_clust)

Bivariate relationship

subst <- as.factor(substance)
genfem <- as.factor(ifelse(female, "F", "M"))
interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)

subs <- character(length(substance))
subs[substance=="alcohol"] <- "Alco"
subs[substance=="cocaine"] <- "Coca"
subs[substance=="heroin"] <- "Hero"
gend <- character(length(female))
library("lattice")
bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
bwout 
boxmeans <- tapply(cesd, list(subs, genfem), mean)

suicidal.thoughts <- as.factor(g1b)
# conditional plots
coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)

# Generate random data using random effects GLM 
library(lme4)
n <- 2000; p <- 3; sigbsq <- 10
beta <- c(-2, 1.5, 0.5, -1, 1)
ID <- rep(1:n, each=p)   # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n
Y1 <- as.numeric(id < (n+1)/2)  # 1 1 ... 1 0 0 ... 0
randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p)
Y2 <- rep(1:p, n)        # 1 2 ... p 1 2 ... p ...
Y3 <- runif(p*n)
Y4 <- rexp(p*n, 1)

lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint
exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model))
Y <- runif(p*n) < exp_model
glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit"))
summary(glmmres)

# inspect visually the relation between low homelessness and low pcs scores
homelow <- homeless[i1<3.5]
pcslow <- pcs[i1<3.5]<=31.94
table(homelow, pcslow)

Contingency tables

# you need to first load and attach the dataset:
#### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
#### attach(help_sim_data)
table(homeless, female)

OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
            sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
     (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
         sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR

install.packages("epitools")
library(epitools)
OR_HomeFem <- oddsratio.wald(homeless, female)
OR_HomeFem
OR_HomeFem$\$ $measure
OR_HomeFem$\$ $p.value

chi_sq_test <- chisq.test(homeless, female, correct=FALSE)
chi_sq_test

fisher_test <- fisher.test(homeless, female)
fisher_test

Two-sample tests

Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data)
Welch_2sampleT_Age_Fem

fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values
    res <- rep(0, length(x))
    for (i in 1:length(x)){
         if(is.na(x[i])) res[i] <- x[i]
         else if (x[i]<2) res[i] <- x[i]
         else res[i] <- 1}
    return(res)}

Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data)
Welch_2sampleT_Home_treat

wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE)
wilcox_test_Age_Fem

Power

# compute the sample size for a two-sample t-test
power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power

# for a two-sample t-test, compute power from sample-size and effect-size
power.t.test(delta=0.5, n=100)

Survival analysis (Kaplan–Meier plot)

small_data <- reshape(newhelp_sim_data, idvar="id", 
                    varying=list(c("cesd1","cesd2","cesd3","cesd4"),
                        c("mcs1","mcs2","mcs3","mcs4"), 
                        c("i11","i12","i13","i14"),
                        c("g1b1","g1b2","g1b3","g1b4")), 
                    v.names=c("cesdtv","mcstv","i1tv","g1btv"),
                    timevar="time", times=1:4, direction="long")
library(lme4)
glmres <- glmer(g1btv ~ treat + time + (1|id),
  family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
summary(glmres)

library(survival)
# fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
# time dependent strata (linkstatus)
cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data)
print(cox_surv)

# creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
# Cox model, or a previously fitted accelerated failure time model
surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
print(surv_KM)
plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
title("Product-Limit Survival Estimates")
legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)

Scatterplot with model fit

plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)

points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca")
# lowess is a locally-weighted polynomial regression smooth fit
lines(lowess(age[substance=="cocaine"],
      i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)

points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco")
lines(lowess(age[substance=="alcohol"],
      i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)

points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero")
lines(lowess(age[substance=="heroin"],
       i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)

legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3,
        cex=1.4, lwd=2, pch=c("c", "a", "h"))

Regression with prediction intervals

predicted <- fitted(lm1); predicted
residuals <- residuals(lm1)
quantile(residuals)

Linear regression with interaction

lm1 <- lm(a15a ~ substance * age)
lm2 <- lm(a15a ~ substance + age)
anova(lm2, lm1)
summary(lm1)

names(summary(lm1))
summary(lm1)$\$ $sigma

names(lm1)
lm1$\$ $coefficients
vcov(lm1)

Regression diagnostics

# graphical model diagnostics
par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5)
plot(lm1)
par(par)

std.res <- rstandard(lm1)
hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F)
lines(density(std.res), lwd=2)
xvals <- seq(from=min(std.res), to=max(std.res), length=100)
lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)

Fitting stratified regression models

Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.

unique_vals <- unique(substance[!is.na(substance)]); unique_vals
numb_unique <- length(unique_vals)
formula <- as.formula(i1 ~ age)
p <- length(coef(lm(formula)))
res <- matrix(rep(0, numb_unique*p), p, numb_unique)
for (i in 1:length(unique_vals)) {
      res[,i] <- coef(lm(formula, subset=substance==unique_vals[i]))
}
rownames(res) <- c("intercept","slope")
colnames(res) <- unique_vals
res

Two-way analysis of variance (ANOVA)

aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data)
aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data)
resid <- residuals(aov2)
anova(aov2, aov1)

logLik(aov1)  # compute the exact Log-Likelihood of each ANOVA model
logLik(aov2)
lldiff <- logLik(aov1)[1] - logLik(aov2)[1]
lldiff
1 - pchisq(2*lldiff, 2) # test for model differences
summary(aov1)
aov1

contrasts(sub) <- contr.SAS(4)
contrasts(sub) 
aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data)
summary(aov3)

AIC(aov1); BIC(aov1)  # Akaike and Bayesian Information criteria (smaller values yield better models
AIC(aov2); BIC(aov2)
AIC(aov3); BIC(aov3)

Multiple comparisons

# generate confidence intervals on the differences between the means of the levels 
# of a factor with the specified family-wise probability of coverage. 
# The intervals are based on the Studentized range statistic, Tukey's 
# ‘Honest Significant Difference’ method
multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub")
multiple 
plot(multiple)

Logistic and Poisson Regression

# logistic (binomial outcomes) regression
logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
logistres
summary(logistres)
names(summary(logistres))
coeff.like.SAS <- summary(logistres)$\$ $coefficients
coeff.like.SAS

poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson)
summary(poissonres)

Negative binomial regression

library(MASS)
NB_Res <- glm.nb(a15a ~ female + substance + age)
summary(NB_Res)

Ordinal logit regression

library(MASS)
sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5))
ord_logit <- polr(sex_risk_cat ~ cesd + pcs)
summary(ord_logit)

Generalized additive model

install.packages("gam"); library(gam)
gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs"
summary(gam_reg)

coefficients(gam_reg)
plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3)
abline(h=0)

plot(gam_reg, terms=c("substance"), se=2, lwd=3)

General linear model for correlated data

library(nlme)
fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary,
          correlation = corAR1(form = ~ 1 | Mare))
# variance increases as a power of the absolute fitted values
fm2 <- update(fm1, weights = varPower())
summary(fm1)

Random effects model

Generalized estimating equations (GEE) model

install.packages("gee"); library(gee)
sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),]
# attach(sorted_data)
gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data,
      family=binomial, na.action=na.omit, corstr="exchangeable")
coef(gee_res)
sqrt(diag(gee_res$\$ $robust.variance))
gee_res$\$ $working.correlation

Generalized linear mixed model

library(lme4)
glmm_res <- glmer(g1b ~ treat + cesd + (1|ID),
      family=binomial(link="logit"), data=help_sim_data)
summary(glmm_res)

Proportional hazards regression model

library(survival)
cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female +
      cesd, method="breslow", data=help_sim_data)
print(cox_surv) 
# Fit a Cox proportional hazards regression model
coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow")

Cronbach’s $\alpha$

library(multilevel)
cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k,
  f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)) 
# assess the consistency of the core 20 measures in the simulated HELP dataset
res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k + 
      f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3, 
      rotation="varimax", na.action=na.omit, scores="regression")
print(res, cutoff=0.45, sort=TRUE)

Factor analysis

# Factor analysis explains the (observed) variability of a multivariate dataset in terms of 
# various underlying unobservable factors. The observed data elements are expressed as 
# linear functions of the factors and some random error. 
# This is an example of a maximum likelihood factor analysis of "cesd" 
# (Center for Epidemiologic Studies–Depression) scale.

fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h +
      f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r +
      f1s + f1t, factors=3, rotation="varimax", na.action=na.omit,
       scores="regression")
print(fact_ana , cutoff=0.45, sort=TRUE)

Recursive partitioning

# Recursive partitioning creates a decision tree to classify the observations based on
# categorical predictors. Here we classify subjects based on their race, 
# using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS.

library(rpart)
subst <- as.factor(substance)
homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data)

printcp(homelessness.rpart)
plot(homelessness.rpart)
text(homelessness.rpart)

race <- racegrp[i1<3.5]
pcslow <- pcs[i1<3.5]<=31.94
length(pcslow==F)   # count the number of subjects with (pcslow==F)
table(race, pcslow)
# in these data, 3 of 18 subjects with low PCS scores are white, 
# and 48 of 154) of those with PCS scores above the threshold are white

Linear discriminant analysis

# Fisher's linear discriminant analysis (LDA) identifies linear combinations of 
# variables splitting the data into separate classes. LDA can help us
# identify clusters homelessness, using a uniform prior

library(MASS)
ngroups <- length(unique(homeless))
lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1)))
print(lda_model)
plot(lda_model)
plot(lda_model, dimen=1, type="both")

Hierarchical clustering

# To group similar variables or cluster observations a hierarchical structure
# may be constructed. hclust() and kmeans() are examples of such clustering
# packages

cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs")
hier_clust <- hclust(dist(cor_mat))
plot(hier_clust)

km_clust <- kmeans(cor_mat, 5, nstart = 25)
plot(cor_mat, col = km_clust$cluster)
points(km_clust$centers, col = 1:5, pch = 8)

ROC curve

library(ROCR)
pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide)
AUC<- slot(performance(pred, "auc"), "y.values")1

plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5),
  text.adj=c(1, -.5), lwd=2)
lines(c(0, 1), c(0, 1))
text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4)
title("ROC Curve for Depression scale vs. Suicide Prediction Model")

Multiple imputation

Also see the Missing Values section of the Scientific Methods for Health Sciences EBook.

miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs))
summary(miss_data)
# library(Hmisc)
# na.pattern(miss_data)

install.packages("mi")
library(mi)
inf <- mi.info(miss_data)
# run the imputation without data transformation
IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10))

# run the imputation with data transformation
miss_data.transformed <- mi.preprocess(miss_data, inf)
IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6))

# Impute without noise
IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE)

# check convergence, should get FALSE here because only n.iter is small 
converged(IMP, check = "data")
# converged(IMP, check = "coefs")
# visually check the imputation
plot(IMP); plot(IMP_T)

Propensity score modeling

# Propensity score (PS) techniques may be applied for analysis of 
# observational and registry data. The PS is the conditional probability
# of a certain treatment given patient’s covariates. PS may be used to 
# eliminate imbalances in baseline covariate distributions between 
# treatment groups and to estimate marginal effects.

References

SOCR Home page: http://www.SOCR.ucla.edu

Translate this page:

(default)	Deutsch	Español	Français	Italiano	Português	日本語	България	الامارات العربية المتحدة	Suomi	इस भाषा में	Norge
한국어	中文	繁体中文	Русский	Nederlands	Ελληνικά	Hrvatska	Česká republika	Danmark	Polska	România	Sverige

@@ Line 1: / Line 1: @@
-== [[SOCR Data]] - SOCR Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset ==
+== [[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]]: [[SOCR_EduMaterials|SOCR Activity]]: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset ==
-==Summary==
+==[[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]]==
-This is a simulated dataset following the study design of the [http://www.ncbi.nlm.nih.gov/pubmed/12653820Health Evaluation and Linkage to Primary (HELP) Care study]. These data are not real but simulated and intended for demonstration of various multivariate statistical, computational, analytical and visualization protocols for data interrogation.
+See the [[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]] first. These data can be copy-pasted using the mouse from the HTML table into a plain text file [http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv "help_data.csv"].
-==Background==
+==R examples==
-The original Health Evaluation and Linkage to Primary (HELP) Care study had recruited adult inpatients in a detoxification unit. Researchers randomized the patients with no primary care physician to receive evaluations and a brief care intervention and investigated their findings relative to prior enrollment in primary medical care. The complete original dataset can be found [http://www.math.smith.edu/sasr/datasets.php here].
+These simulated HELP data can be used to demonstrate (using [[SOCR]] and [http://www.r-project.org/ R])a number of different statistical, modeling, inferential and data analytic techniques.
-==Classroom use of this data set==
+===Data I/O, summaries, visualization===
-The simulated HELP data can be used to demonstrate a number of different statistical, modeling, inferential and data analytic techniques, see list below. The following [[SOCR_Simulated_HELP_Data_Activity|SOCR HELP Activity]] demonstrates exemplary use of these data using R.
+ options(digits=2)  # decimal precision
-* Data I/O, summaries, visualizaiton
+ options(width=80)  # narrows output to stay in the grey box
-* Derived variables and data manipulation
-* Sorting and subsetting
+ help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
-* [[SMHS_EDA|Exploratory data analysis]]
+ # note that we specify all of these values as indicating missing data ("",".","NA")
-* [[SOCR_EduMaterials_ChartsActivities|Graphing and plotting of data (scatterplot, bubble chart, multiple plots, dotplot, etc.)]]
+ attach(help_sim_data)
-* [[SMHS_SLR|Bivariate relationship]]
-* [[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]
+ rownames(help_sim_data)  # print row and column names
-* [[SMHS_HypothesisTesting|Two-sample tests]]
+ colnames(help_sim_data)
-* [[SMHS_SurvivalAnalysis|Survival analysis (Kaplan–Meier plot)]]
-* [[SOCR_EduMaterials_Activities_ScatterChart|Scatterplot with smooth fit]]
+ summary(help_sim_data)
-* [[SMHS_SLR|Regression with prediction intervals]]
+ fivenum(help_sim_data$\$ $mcs)
-* Linear regression with interaction
-* Regression diagnostics
+ mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE)
-* Fitting stratified regression models
-* [[SMHS_ANOVA|Two-way analysis of variance (ANOVA)]]
+ quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)
-* [[SMHS_CorrectionMultipleTesting|Multiple comparisons]]
-* Contrasts
+ no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs)
-* Logistic regression
-* Poisson regression
+ hist(no_mis_help_sim_data_mcs, main="", freq=FALSE)
-* Zero-inflated Poisson regression
+ lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2)
-* Negative binomial regression
+ xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100)
-* Lasso model selection
+ lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2)
-* Quantile regression
-* Ordinal logit regression
+ cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1))
-* Multinomial logit regression
+ cor_mat
-* Generalized additive model
+ cor_mat[c(2, 3), 2]
-* [[SOCR_EduMaterials_Activities_PowerTransformFamily_Graphs|Data transformations]]
-* General linear model for correlated data
+ plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")
-* Random effects model
-* [[SMHS_GEE|Generalized estimating equations (GEE) model]]
+ text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"],
-* Generalized linear mixed model
+    help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A")
-* Proportional hazards regression model
-* Bayesian Poisson regression
+ text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],
-* [[SMHS_Cronbachs|Cronbach’s $\alpha$]]
+    help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C")
-* [[SMHS_PCA_ICA_FA|Factor analysis]]
-* Recursive partitioning
+ text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"],
-* Linear discriminant analysis
+    help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H")
-* [[SMHS_HLM|Hierarchical clustering]]
-* [[SMHS_ROC|ROC curve]]
+ rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2)
-* [[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]
+ rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3)
-* Propensity score modeling
+ table(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
+ OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
+        sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
+       (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
+        sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
+ OR
+ chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE)
+ chisq_val
+ fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
+ ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data)
+ print(ttres)
+ wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE)
+ ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data)
+ print(ksres)
-==Data Description==
+===Missing Values===
-===Variable Definitions===
+ sum(is.na(pcs1))  # count the missing values in the variable pcs1, 208
-* id	random subject identifier (range 1–470)
+ sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246
-*	a15a	number of nights in overnight shelter in past 6 months (range0–180) see also homeless
-*	a15b	number of nights on the street in past 6 months (range 0–180) see also homeless
+ sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values)
-*	age	age at baseline (in years) (range19–60)
+ which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete
-*	anysubstatus	use of any substance postdetox (0=no, 1=yes)see also daysanysub
+ # pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing)
-*	cesd*	Center for Epidemiologic Studies Depression scale (range 0–60)see also f1a – f1t
+ # pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1
-*	d1	how many times hospitalized for medical problems (lifetime)(range 0–100)
-*	daysanysub	time (in days) to first use of any substance postdetox (range 0–268)see also anysubstatus
-*	daysdrink	time (in days) to first alcoholicdrink post-detox (range 0–270)see also drinkstatus
-*	dayslink	time (in days) to linkage to primary care (range 0–456) see also linkstatus
-*	drinkstatus	use of alcohol postdetox (0=no,1=yes)see also daysdrink
-*	drugrisk*	Risk-Assessment Battery(RAB) drug risk score (range0–21) see also sexrisk
-*	e2b*	number of times in past 6 months entered a detox pro-gram (range 1–21)
-*	f1a	I was bothered by things that usually don’t bother me (range0–3 #)
-*	f1b	I did not feel like eating; my appetite was poor (range 0–3 #)
-*	f1c	I felt that I could not shake off the blues even with help from my family or friends (range 0–3 #)
-*	f1d	I felt that I was just as good as other people (range 0–3 #)
-*	f1e	I had trouble keeping my mind on what I was doing (range 0–3 #)
-*	f1f	I felt depressed (range 0–3 #)
-*	f1g	I felt that everything I did was an effort (range 0–3 #)
-*	f1h	I felt hopeful about the future(range 0–3 #)
-*	f1i	I thought my life had been a failure (range 0–3 #)
-*	f1j	I felt fearful (range 0–3 #)
-*	f1k	My sleep was restless (range 0–3 #)
-*	f1l	I was happy (range 0–3 #)
-*	f1m	I talked less than usual (range0–3 #)
-*	f1n	I felt lonely (range 0–3 #)
-*	f1o	People were unfriendly (range0–3 #)
-*	f1p	I enjoyed life (range 0–3 #)
-*	f1q	I had crying spells (range 0–3 #)
-*	f1r	I felt sad (range 0–3 #)
-*	f1s	I felt that people dislike me(range 0–3 #)
-*	f1t	I could not get going (range 0–3 #)
-*	female	gender of respondent (0=male,1=female)
-*	g1b*	experienced serious thoughts of suicide (last 30 days, values0=no, 1=yes)
-*	homeless*	1 or more nights on the street or shelter in past 6 months (0=no,1=yes) see also a15a and a15b
-*	i1*	average number of drinks (standard units) consumed per day (in the past 30 days, range 0–142) see also i2
-*	i2	maximum number of drinks (standard units) consumed per day (in the past 30 days range 0–184) see also i1
-*	indtot*	Inventory of Drug Use Con-sequences (InDUC) total score (range 4–45)
-*	linkstatus	post-detox linkage to primary care (0=no, 1=yes) see also dayslink
-*	mcs*	SF-36 Mental Component Score(range 7-62) see also pcs
-*	pcrec*	number of primary care visits in past 6 months (range 0–2) see also linkstatus, not observed at baseline
-*	pcs*	SF-36 Physical Component Score (range 14-75) see also mcs
-*	pss_fr	perceived social supports (friends, range 0–14) see also dayslink
-*	satreat	any BSAS substance abuset reatment at baseline (0=no,1=yes)
-*	sexrisk*	Risk-Assessment Battery (RAB) drug risk score (range 0–21) see also drugrisk
-*	substance	primary substance of abuse (alcohol, cocaine or heroin)
-*	treat	randomization group (0=usual care, 1=HELP clinic)
-===Notes===
-Data for continuous variables reports observed range at baseline.
-* * marked variables are recorded at baseline and follow up (e.g., mcs is baseline measure, mcs1 is measure at 6 months, and mcs4 is measure at 24 months);
-* for each of the 20 items in HELP section F1 (f1a, …, f1t), respondents were asked to indicate how often they behaved this way during the past week (0=rarely or none of the time, less than one day; 1 = some or a little of the time, 1 or 2 days; 2 = occasionally or a moderate amount of time, 3 or 4 days; or 3=most or all of the time, 5 to 7 days);
-* Reverse coding was used for items f1d, f1h, f1l, and f1p (as these questions are intrinsically positive in nature (e.g., hope), where as the remaining once are negative in nature (e.g., blues)).
-* Some subjects (e.g., case 13) did not have metadata (e.g., e2b) recorded for them.
-* "." (period) denotes missing or incomplete observation.
+===Sorting and subsetting===
+ new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE);
+ new_cesd
+ impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20;
+ sort(help_sim_data$\$ $cesd)[1:4]
+ sum(is.na(help_sim_data$\$ $drinkstatus))
+ table(help_sim_data$\$ $drinkstat, exclude="NULL")
+ gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female"))
+ table(help_sim_data$\$ $female)
+===[[SMHS_EDA|Exploratory data analysis]]===
+ newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,]
+ attach(newhelp_sim_data)
+ sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
+ plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
+ points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
+ lines(lowess(age[substance=="alcohol"],
+   i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
+ points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
+ lines(lowess(age[substance=="cocaine"],
+   i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
+ points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
+ lines(lowess(age[substance=="heroin"],
+   i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
+ legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3,
+   cex=1.4, lwd=2, pch=c("A", "C", "H"))
+ options(show.signif.stars=FALSE)
+ lm1 <- lm(i1 ~ sub * age)
+ lm2 <- lm(i1 ~ sub + age)
+ anova(lm2, lm1)
+ summary(lm1)
+ names(summary(lm1))
+ summary(lm1)$\$ $sigma
+ names(lm1)
+ lm1$\$ $coefficients
+ coef(lm1)
+ vcov(lm1)
+ pred <- fitted(lm1)
+ resid <- residuals(lm1)
+ quantile(resid)
+ # explore correlations and clusters in chunks of the data
+ cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs")
+ h_dist_clust <- hclust(dist(cor_matrix))
+ plot(h_dist_clust)
+===[[SMHS_SLR|Bivariate relationship]]===
+ subst <- as.factor(substance)
+ genfem <- as.factor(ifelse(female, "F", "M"))
+ interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)
+ subs <- character(length(substance))
+ subs[substance=="alcohol"] <- "Alco"
+ subs[substance=="cocaine"] <- "Coca"
+ subs[substance=="heroin"] <- "Hero"
+ gend <- character(length(female))
+ library("lattice")
+ bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
+ bwout
+ boxmeans <- tapply(cesd, list(subs, genfem), mean)
+ suicidal.thoughts <- as.factor(g1b)
+ # conditional plots
+ coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)
+ # Generate random data using random effects GLM
+ library(lme4)
+ n <- 2000; p <- 3; sigbsq <- 10
+ beta <- c(-2, 1.5, 0.5, -1, 1)
+ ID <- rep(1:n, each=p)   # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n
+ Y1 <- as.numeric(id < (n+1)/2)  # 1 1 ... 1 0 0 ... 0
+ randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p)
+ Y2 <- rep(1:p, n)        # 1 2 ... p 1 2 ... p ...
+ Y3 <- runif(p*n)
+ Y4 <- rexp(p*n, 1)
+ lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint
+ exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model))
+ Y <- runif(p*n) < exp_model
+ glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit"))
+ summary(glmmres)
+ # inspect visually the relation between low homelessness and low pcs scores
+ homelow <- homeless[i1<3.5]
+ pcslow <- pcs[i1<3.5]<=31.94
+ table(homelow, pcslow)
+===[[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]===
+ # you need to first load and attach the dataset:
+ #### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
+ #### attach(help_sim_data)
+ table(homeless, female)
+ OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
+             sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
+      (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
+          sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
+ OR
+ install.packages("epitools")
+ library(epitools)
+ OR_HomeFem <- oddsratio.wald(homeless, female)
+ OR_HomeFem
+ OR_HomeFem$\$ $measure
+ OR_HomeFem$\$ $p.value
+ chi_sq_test <- chisq.test(homeless, female, correct=FALSE)
+ chi_sq_test
+ fisher_test <- fisher.test(homeless, female)
+ fisher_test
+===[[SMHS_HypothesisTesting|Two-sample tests]]===
+ Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data)
+ Welch_2sampleT_Age_Fem
+ fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values
+     res <- rep(0, length(x))
+     for (i in 1:length(x)){
+          if(is.na(x[i])) res[i] <- x[i]
+          else if (x[i]<2) res[i] <- x[i]
+          else res[i] <- 1}
+     return(res)}
+ Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data)
+ Welch_2sampleT_Home_treat
+ wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE)
+ wilcox_test_Age_Fem
+===[[SMHS_PowerSensitivitySpecificity|Power]]===
+ # compute the sample size for a two-sample t-test
+ power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power
+ # for a two-sample t-test, compute power from sample-size and effect-size
+ power.t.test(delta=0.5, n=100)
+===[[SMHS_SurvivalAnalysis|Survival analysis (Kaplan–Meier plot)]]===
+ small_data <- reshape(newhelp_sim_data, idvar="id",
+                     varying=list(c("cesd1","cesd2","cesd3","cesd4"),
+                         c("mcs1","mcs2","mcs3","mcs4"),
+                         c("i11","i12","i13","i14"),
+                         c("g1b1","g1b2","g1b3","g1b4")),
+                     v.names=c("cesdtv","mcstv","i1tv","g1btv"),
+                     timevar="time", times=1:4, direction="long")
+ library(lme4)
+ glmres <- glmer(g1btv ~ treat + time + (1|id),
+   family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
+ summary(glmres)
+ library(survival)
+ # fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
+ # time dependent strata (linkstatus)
+ cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data)
+ print(cox_surv)
+ # creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
+ # Cox model, or a previously fitted accelerated failure time model
+ surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
+ print(surv_KM)
+ plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
+ title("Product-Limit Survival Estimates")
+ legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)
+===[[SOCR_EduMaterials_Activities_ScatterChart|Scatterplot with model fit]]===
+ plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
+ points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca")
+ # lowess is a locally-weighted polynomial regression smooth fit
+ lines(lowess(age[substance=="cocaine"],
+       i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
+ points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco")
+ lines(lowess(age[substance=="alcohol"],
+       i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
+ points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero")
+ lines(lowess(age[substance=="heroin"],
+        i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
+ legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3,
+         cex=1.4, lwd=2, pch=c("c", "a", "h"))
+===[[SMHS_SLR|Regression with prediction intervals]]===
+ predicted <- fitted(lm1); predicted
+ residuals <- residuals(lm1)
+ quantile(residuals)
+===Linear regression with interaction===
+ lm1 <- lm(a15a ~ substance * age)
+ lm2 <- lm(a15a ~ substance + age)
+ anova(lm2, lm1)
+ summary(lm1)
+ names(summary(lm1))
+ summary(lm1)$\$ $sigma
+ names(lm1)
+ lm1$\$ $coefficients
+ vcov(lm1)
+===Regression diagnostics===
+ # graphical model diagnostics
+ par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5)
+ plot(lm1)
+ par(par)
+ std.res <- rstandard(lm1)
+ hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F)
+ lines(density(std.res), lwd=2)
+ xvals <- seq(from=min(std.res), to=max(std.res), length=100)
+ lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)
+===Fitting stratified regression models===
+Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.
+ unique_vals <- unique(substance[!is.na(substance)]); unique_vals
+ numb_unique <- length(unique_vals)
+ formula <- as.formula(i1 ~ age)
+ p <- length(coef(lm(formula)))
+ res <- matrix(rep(0, numb_unique*p), p, numb_unique)
+ for (i in 1:length(unique_vals)) {
+       res[,i] <- coef(lm(formula, subset=substance==unique_vals[i]))
+ }
+ rownames(res) <- c("intercept","slope")
+ colnames(res) <- unique_vals
+ res
+===[[SMHS_ANOVA|Two-way analysis of variance (ANOVA)]]===
+ aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data)
+ aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data)
+ resid <- residuals(aov2)
+ anova(aov2, aov1)
+ logLik(aov1)  # compute the exact Log-Likelihood of each ANOVA model
+ logLik(aov2)
+ lldiff <- logLik(aov1)[1] - logLik(aov2)[1]
+ lldiff
+- pchisq(2*lldiff, 2) # test for model differences
+ summary(aov1)
+ aov1
+ contrasts(sub) <- contr.SAS(4)
+ contrasts(sub)
+ aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data)
+ summary(aov3)
+ AIC(aov1); BIC(aov1)  # [http://en.wikipedia.org/wiki/Akaike_information_criterion Akaike] and [http://en.wikipedia.org/wiki/Bayesian_information_criterion Bayesian] Information criteria (smaller values yield better models
+ AIC(aov2); BIC(aov2)
+ AIC(aov3); BIC(aov3)
+===[[SMHS_CorrectionMultipleTesting|Multiple comparisons]]===
+ # generate confidence intervals on the differences between the means of the levels
+ # of a factor with the specified family-wise probability of coverage.
+ # The intervals are based on the Studentized range statistic, Tukey's
+ # ‘Honest Significant Difference’ method
+ multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub")
+ multiple
+ plot(multiple)
+===Logistic and Poisson Regression===
+ # logistic (binomial outcomes) regression
+ logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
+ logistres
+ summary(logistres)
+ names(summary(logistres))
+ coeff.like.SAS <- summary(logistres)$\$ $coefficients
+ coeff.like.SAS
+ poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson)
+ summary(poissonres)
+===Negative binomial regression===
+ library(MASS)
+ NB_Res <- glm.nb(a15a ~ female + substance + age)
+ summary(NB_Res)
+===Ordinal logit regression===
+ library(MASS)
+ sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5))
+ ord_logit <- polr(sex_risk_cat ~ cesd + pcs)
+ summary(ord_logit)
+===Generalized additive model===
+ install.packages("gam"); library(gam)
+ gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs"
+ summary(gam_reg)
+ coefficients(gam_reg)
+ plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3)
+ abline(h=0)
+ plot(gam_reg, terms=c("substance"), se=2, lwd=3)
+===General linear model for correlated data===
+ library(nlme)
+ fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary,
+           correlation = corAR1(form = ~ 1 | Mare))
+ # variance increases as a power of the absolute fitted values
+ fm2 <- update(fm1, weights = varPower())
+ summary(fm1)
+===Random effects model===
+===[[SMHS_GEE|Generalized estimating equations (GEE) model]]===
+ install.packages("gee"); library(gee)
+ sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),]
+ # attach(sorted_data)
+ gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data,
+       family=binomial, na.action=na.omit, corstr="exchangeable")
+ coef(gee_res)
+ sqrt(diag(gee_res$\$ $robust.variance))
+ gee_res$\$ $working.correlation
+===Generalized linear mixed model===
+ library(lme4)
+ glmm_res <- glmer(g1b ~ treat + cesd + (1|ID),
+       family=binomial(link="logit"), data=help_sim_data)
+ summary(glmm_res)
+===Proportional hazards regression model===
+ library(survival)
+ cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female +
+       cesd, method="breslow", data=help_sim_data)
+ print(cox_surv)
+ # Fit a Cox proportional hazards regression model
+ coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow")
+===[[SMHS_Cronbachs|Cronbach’s $\alpha$]]===
+ library(multilevel)
+ cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k,
+   f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t))
+ # assess the consistency of the core 20 measures in the simulated HELP dataset
+ res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k +
+       f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3,
+       rotation="varimax", na.action=na.omit, scores="regression")
+ print(res, cutoff=0.45, sort=TRUE)
+===[[SMHS_PCA_ICA_FA|Factor analysis]]===
+ # Factor analysis explains the (observed) variability of a multivariate dataset in terms of
+ # various underlying unobservable factors. The observed data elements are expressed as
+ # linear functions of the factors and some random error.
+ # This is an example of a maximum likelihood factor analysis of "cesd"
+ # (Center for Epidemiologic Studies–Depression) scale.
+ fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h +
+       f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r +
+       f1s + f1t, factors=3, rotation="varimax", na.action=na.omit,
+        scores="regression")
+ print(fact_ana , cutoff=0.45, sort=TRUE)
+===Recursive partitioning===
+ # Recursive partitioning creates a decision tree to classify the observations based on
+ # categorical predictors. Here we classify subjects based on their race,
+ # using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS.
+ library(rpart)
+ subst <- as.factor(substance)
+ homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data)
+ printcp(homelessness.rpart)
+ plot(homelessness.rpart)
+ text(homelessness.rpart)
+ race <- racegrp[i1<3.5]
+ pcslow <- pcs[i1<3.5]<=31.94
+ length(pcslow==F)   # count the number of subjects with (pcslow==F)
+ table(race, pcslow)
+ # in these data, 3 of 18 subjects with low PCS scores are white,
+ # and 48 of 154) of those with PCS scores above the threshold are white
+===Linear discriminant analysis===
+ # Fisher's linear discriminant analysis (LDA) identifies linear combinations of
+ # variables splitting the data into separate classes. LDA can help us
+ # identify clusters homelessness, using a uniform prior
+ library(MASS)
+ ngroups <- length(unique(homeless))
+ lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1)))
+ print(lda_model)
+ plot(lda_model)
+ plot(lda_model, dimen=1, type="both")
+===[[SMHS_HLM|Hierarchical clustering]]===
+ # To group similar variables or cluster observations a hierarchical structure
+ # may be constructed. hclust() and kmeans() are examples of such clustering
+ # packages
+ cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs")
+ hier_clust <- hclust(dist(cor_mat))
+ plot(hier_clust)
+ km_clust <- kmeans(cor_mat, 5, nstart = 25)
+ plot(cor_mat, col = km_clust$cluster)
+ points(km_clust$centers, col = 1:5, pch = 8)
+===[[SMHS_ROC|ROC curve]]===
+ library(ROCR)
+ pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide)
+ AUC<- slot(performance(pred, "auc"), "y.values")[[1]]
+ plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5),
+   text.adj=c(1, -.5), lwd=2)
+ lines(c(0, 1), c(0, 1))
+ text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4)
+ title("ROC Curve for Depression scale vs. Suicide Prediction Model")
+===[[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]===
+Also see the [[SMHS_MissingData| Missing Values section of the Scientific Methods for Health Sciences EBook]].
+ miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs))
+ summary(miss_data)
+ # library(Hmisc)
+ # na.pattern(miss_data)
+ install.packages("mi")
+ library(mi)
+ inf <- mi.info(miss_data)
+ # run the imputation without data transformation
+ IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10))
+ # run the imputation with data transformation
+ miss_data.transformed <- mi.preprocess(miss_data, inf)
+ IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6))
+ # Impute without noise
+ IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE)
+ # check convergence, should get FALSE here because only n.iter is small
+ converged(IMP, check = "data")
+ # converged(IMP, check = "coefs")
+ # visually check the imputation
+ plot(IMP); plot(IMP_T)
+===Propensity score modeling===
+ # Propensity score (PS) techniques may be applied for analysis of
+ # observational and registry data. The PS is the conditional probability
+ # of a certain treatment given patient’s covariates. PS may be used to
+ # eliminate imbalances in baseline covariate distributions between
+ # treatment groups and to estimate marginal effects.
 ==References==
@@ Line 115: / Line 564: @@
 ** Data formats: [http://www.math.smith.edu/sasr/datasets/help.csv help.csv] and [http://www.math.smith.edu/sasr/datasets/help.Rdata help.Rdata]
 ** [http://www3.amherst.edu/~nhorton/help/ Study and Data specifications]
+** [http://www3.amherst.edu/~nhorton/help/ HELP study documentation]
 * [http://www.amazon.com/SAS-Management-Statistical-Analysis-Graphics/dp/1420070576 SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009]