Difference between revisions of "SOCR Simulated HELP Data Activity"

Latest revision as of 12:04, 24 September 2014

SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset

SOCR Simulated HELP Data

See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".

R examples

These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.

Data I/O, summaries, visualization

options(digits=2)  # decimal precision
options(width=80)  # narrows output to stay in the grey box

help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
# note that we specify all of these values as indicating missing data ("",".","NA")
attach(help_sim_data)

rownames(help_sim_data)  # print row and column names
colnames(help_sim_data)

summary(help_sim_data)
fivenum(help_sim_data$\$ $mcs)

mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE)

quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)

no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs)

hist(no_mis_help_sim_data_mcs, main="", freq=FALSE)
lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2)
xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100)
lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2)

cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1))
cor_mat
cor_mat[c(2, 3), 2]

plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C")

text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"],
   help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H")

rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2)
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3)


table(help_sim_data$\$ $homeless, help_sim_data$\$ $female)


OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
       sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
      (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
       sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR


chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE)
chisq_val


fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female)


ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data)
print(ttres)


wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE)

ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data)
print(ksres)

Missing Values

sum(is.na(pcs1))  # count the missing values in the variable pcs1, 208
sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246

sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values)
which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete
# pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) 
# pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1

Sorting and subsetting

new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE);
new_cesd

impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20;
sort(help_sim_data$\$ $cesd)[1:4]
sum(is.na(help_sim_data$\$ $drinkstatus))
table(help_sim_data$\$ $drinkstat, exclude="NULL")

gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female"))
table(help_sim_data$\$ $female)

Exploratory data analysis

newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,]
attach(newhelp_sim_data)
sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
lines(lowess(age[substance=="alcohol"], 
  i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
lines(lowess(age[substance=="cocaine"], 
  i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
lines(lowess(age[substance=="heroin"], 
  i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3, 
  cex=1.4, lwd=2, pch=c("A", "C", "H"))

options(show.signif.stars=FALSE)
lm1 <- lm(i1 ~ sub * age)
lm2 <- lm(i1 ~ sub + age)
anova(lm2, lm1)

summary(lm1)

names(summary(lm1))
summary(lm1)$\$ $sigma

names(lm1)

lm1$\$ $coefficients
coef(lm1)
vcov(lm1)
pred <- fitted(lm1)
resid <- residuals(lm1)
quantile(resid)

# explore correlations and clusters in chunks of the data
cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs")
h_dist_clust <- hclust(dist(cor_matrix))
plot(h_dist_clust)

Bivariate relationship

subst <- as.factor(substance)
genfem <- as.factor(ifelse(female, "F", "M"))
interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)

subs <- character(length(substance))
subs[substance=="alcohol"] <- "Alco"
subs[substance=="cocaine"] <- "Coca"
subs[substance=="heroin"] <- "Hero"
gend <- character(length(female))
library("lattice")
bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
bwout 
boxmeans <- tapply(cesd, list(subs, genfem), mean)

suicidal.thoughts <- as.factor(g1b)
# conditional plots
coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)

# Generate random data using random effects GLM 
library(lme4)
n <- 2000; p <- 3; sigbsq <- 10
beta <- c(-2, 1.5, 0.5, -1, 1)
ID <- rep(1:n, each=p)   # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n
Y1 <- as.numeric(id < (n+1)/2)  # 1 1 ... 1 0 0 ... 0
randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p)
Y2 <- rep(1:p, n)        # 1 2 ... p 1 2 ... p ...
Y3 <- runif(p*n)
Y4 <- rexp(p*n, 1)

lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint
exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model))
Y <- runif(p*n) < exp_model
glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit"))
summary(glmmres)

# inspect visually the relation between low homelessness and low pcs scores
homelow <- homeless[i1<3.5]
pcslow <- pcs[i1<3.5]<=31.94
table(homelow, pcslow)

Contingency tables

# you need to first load and attach the dataset:
#### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
#### attach(help_sim_data)
table(homeless, female)

OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
            sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
     (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
         sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR

install.packages("epitools")
library(epitools)
OR_HomeFem <- oddsratio.wald(homeless, female)
OR_HomeFem
OR_HomeFem$\$ $measure
OR_HomeFem$\$ $p.value

chi_sq_test <- chisq.test(homeless, female, correct=FALSE)
chi_sq_test

fisher_test <- fisher.test(homeless, female)
fisher_test

Two-sample tests

Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data)
Welch_2sampleT_Age_Fem

fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values
    res <- rep(0, length(x))
    for (i in 1:length(x)){
         if(is.na(x[i])) res[i] <- x[i]
         else if (x[i]<2) res[i] <- x[i]
         else res[i] <- 1}
    return(res)}

Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data)
Welch_2sampleT_Home_treat

wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE)
wilcox_test_Age_Fem

Power

# compute the sample size for a two-sample t-test
power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power

# for a two-sample t-test, compute power from sample-size and effect-size
power.t.test(delta=0.5, n=100)

Survival analysis (Kaplan–Meier plot)

small_data <- reshape(newhelp_sim_data, idvar="id", 
                    varying=list(c("cesd1","cesd2","cesd3","cesd4"),
                        c("mcs1","mcs2","mcs3","mcs4"), 
                        c("i11","i12","i13","i14"),
                        c("g1b1","g1b2","g1b3","g1b4")), 
                    v.names=c("cesdtv","mcstv","i1tv","g1btv"),
                    timevar="time", times=1:4, direction="long")
library(lme4)
glmres <- glmer(g1btv ~ treat + time + (1|id),
  family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
summary(glmres)

library(survival)
# fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
# time dependent strata (linkstatus)
cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data)
print(cox_surv)

# creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
# Cox model, or a previously fitted accelerated failure time model
surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
print(surv_KM)
plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
title("Product-Limit Survival Estimates")
legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)

Scatterplot with model fit

plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)

points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca")
# lowess is a locally-weighted polynomial regression smooth fit
lines(lowess(age[substance=="cocaine"],
      i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)

points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco")
lines(lowess(age[substance=="alcohol"],
      i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)

points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero")
lines(lowess(age[substance=="heroin"],
       i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)

legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3,
        cex=1.4, lwd=2, pch=c("c", "a", "h"))

Regression with prediction intervals

predicted <- fitted(lm1); predicted
residuals <- residuals(lm1)
quantile(residuals)

Linear regression with interaction

lm1 <- lm(a15a ~ substance * age)
lm2 <- lm(a15a ~ substance + age)
anova(lm2, lm1)
summary(lm1)

names(summary(lm1))
summary(lm1)$\$ $sigma

names(lm1)
lm1$\$ $coefficients
vcov(lm1)

Regression diagnostics

# graphical model diagnostics
par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5)
plot(lm1)
par(par)

std.res <- rstandard(lm1)
hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F)
lines(density(std.res), lwd=2)
xvals <- seq(from=min(std.res), to=max(std.res), length=100)
lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)

Fitting stratified regression models

Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.

unique_vals <- unique(substance[!is.na(substance)]); unique_vals
numb_unique <- length(unique_vals)
formula <- as.formula(i1 ~ age)
p <- length(coef(lm(formula)))
res <- matrix(rep(0, numb_unique*p), p, numb_unique)
for (i in 1:length(unique_vals)) {
      res[,i] <- coef(lm(formula, subset=substance==unique_vals[i]))
}
rownames(res) <- c("intercept","slope")
colnames(res) <- unique_vals
res

Two-way analysis of variance (ANOVA)

aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data)
aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data)
resid <- residuals(aov2)
anova(aov2, aov1)

logLik(aov1)  # compute the exact Log-Likelihood of each ANOVA model
logLik(aov2)
lldiff <- logLik(aov1)[1] - logLik(aov2)[1]
lldiff
1 - pchisq(2*lldiff, 2) # test for model differences
summary(aov1)
aov1

contrasts(sub) <- contr.SAS(4)
contrasts(sub) 
aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data)
summary(aov3)

AIC(aov1); BIC(aov1)  # Akaike and Bayesian Information criteria (smaller values yield better models
AIC(aov2); BIC(aov2)
AIC(aov3); BIC(aov3)

Multiple comparisons

# generate confidence intervals on the differences between the means of the levels 
# of a factor with the specified family-wise probability of coverage. 
# The intervals are based on the Studentized range statistic, Tukey's 
# ‘Honest Significant Difference’ method
multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub")
multiple 
plot(multiple)

Logistic and Poisson Regression

# logistic (binomial outcomes) regression
logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
logistres
summary(logistres)
names(summary(logistres))
coeff.like.SAS <- summary(logistres)$\$ $coefficients
coeff.like.SAS

poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson)
summary(poissonres)

Negative binomial regression

library(MASS)
NB_Res <- glm.nb(a15a ~ female + substance + age)
summary(NB_Res)

Ordinal logit regression

library(MASS)
sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5))
ord_logit <- polr(sex_risk_cat ~ cesd + pcs)
summary(ord_logit)

Generalized additive model

install.packages("gam"); library(gam)
gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs"
summary(gam_reg)

coefficients(gam_reg)
plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3)
abline(h=0)

plot(gam_reg, terms=c("substance"), se=2, lwd=3)

General linear model for correlated data

library(nlme)
fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary,
          correlation = corAR1(form = ~ 1 | Mare))
# variance increases as a power of the absolute fitted values
fm2 <- update(fm1, weights = varPower())
summary(fm1)

Random effects model

Generalized estimating equations (GEE) model

install.packages("gee"); library(gee)
sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),]
# attach(sorted_data)
gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data,
      family=binomial, na.action=na.omit, corstr="exchangeable")
coef(gee_res)
sqrt(diag(gee_res$\$ $robust.variance))
gee_res$\$ $working.correlation

Generalized linear mixed model

library(lme4)
glmm_res <- glmer(g1b ~ treat + cesd + (1|ID),
      family=binomial(link="logit"), data=help_sim_data)
summary(glmm_res)

Proportional hazards regression model

library(survival)
cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female +
      cesd, method="breslow", data=help_sim_data)
print(cox_surv) 
# Fit a Cox proportional hazards regression model
coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow")

Cronbach’s $\alpha$

library(multilevel)
cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k,
  f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)) 
# assess the consistency of the core 20 measures in the simulated HELP dataset
res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k + 
      f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3, 
      rotation="varimax", na.action=na.omit, scores="regression")
print(res, cutoff=0.45, sort=TRUE)

Factor analysis

# Factor analysis explains the (observed) variability of a multivariate dataset in terms of 
# various underlying unobservable factors. The observed data elements are expressed as 
# linear functions of the factors and some random error. 
# This is an example of a maximum likelihood factor analysis of "cesd" 
# (Center for Epidemiologic Studies–Depression) scale.

fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h +
      f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r +
      f1s + f1t, factors=3, rotation="varimax", na.action=na.omit,
       scores="regression")
print(fact_ana , cutoff=0.45, sort=TRUE)

Recursive partitioning

# Recursive partitioning creates a decision tree to classify the observations based on
# categorical predictors. Here we classify subjects based on their race, 
# using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS.

library(rpart)
subst <- as.factor(substance)
homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data)

printcp(homelessness.rpart)
plot(homelessness.rpart)
text(homelessness.rpart)

race <- racegrp[i1<3.5]
pcslow <- pcs[i1<3.5]<=31.94
length(pcslow==F)   # count the number of subjects with (pcslow==F)
table(race, pcslow)
# in these data, 3 of 18 subjects with low PCS scores are white, 
# and 48 of 154) of those with PCS scores above the threshold are white

Linear discriminant analysis

# Fisher's linear discriminant analysis (LDA) identifies linear combinations of 
# variables splitting the data into separate classes. LDA can help us
# identify clusters homelessness, using a uniform prior

library(MASS)
ngroups <- length(unique(homeless))
lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1)))
print(lda_model)
plot(lda_model)
plot(lda_model, dimen=1, type="both")

Hierarchical clustering

# To group similar variables or cluster observations a hierarchical structure
# may be constructed. hclust() and kmeans() are examples of such clustering
# packages

cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs")
hier_clust <- hclust(dist(cor_mat))
plot(hier_clust)

km_clust <- kmeans(cor_mat, 5, nstart = 25)
plot(cor_mat, col = km_clust$cluster)
points(km_clust$centers, col = 1:5, pch = 8)

ROC curve

library(ROCR)
pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide)
AUC<- slot(performance(pred, "auc"), "y.values")1

plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5),
  text.adj=c(1, -.5), lwd=2)
lines(c(0, 1), c(0, 1))
text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4)
title("ROC Curve for Depression scale vs. Suicide Prediction Model")

Multiple imputation

Also see the Missing Values section of the Scientific Methods for Health Sciences EBook.

miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs))
summary(miss_data)
# library(Hmisc)
# na.pattern(miss_data)

install.packages("mi")
library(mi)
inf <- mi.info(miss_data)
# run the imputation without data transformation
IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10))

# run the imputation with data transformation
miss_data.transformed <- mi.preprocess(miss_data, inf)
IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6))

# Impute without noise
IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE)

# check convergence, should get FALSE here because only n.iter is small 
converged(IMP, check = "data")
# converged(IMP, check = "coefs")
# visually check the imputation
plot(IMP); plot(IMP_T)

Propensity score modeling

# Propensity score (PS) techniques may be applied for analysis of 
# observational and registry data. The PS is the conditional probability
# of a certain treatment given patient’s covariates. PS may be used to 
# eliminate imbalances in baseline covariate distributions between 
# treatment groups and to estimate marginal effects.

References

SOCR Home page: http://www.SOCR.ucla.edu

Translate this page:

(default)	Deutsch	Español	Français	Italiano	Português	日本語	България	الامارات العربية المتحدة	Suomi	इस भाषा में	Norge
한국어	中文	繁体中文	Русский	Nederlands	Ελληνικά	Hrvatska	Česká republika	Danmark	Polska	România	Sverige

@@ Line 1: / Line 1: @@
-== [[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]] - SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset ==
+== [[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]]: [[SOCR_EduMaterials|SOCR Activity]]: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset ==
 ==[[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]]==
 See the [[SOCR_Simulated_HELP_Data|SOCR Simulated HELP Data]] first. These data can be copy-pasted using the mouse from the HTML table into a plain text file [http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv "help_data.csv"].
-==R examples==
-These simulated HELP data can be used to demonstrate (using [[SOCR]] and [http://www.r-project.org/ R])a number of different statistical, modeling, inferential and data analytic techniques.
 ==R examples==
@@ Line 14: / Line 11: @@
   options(width=80)  # narrows output to stay in the grey box
-  hemp_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv")
+  help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
- attach(hemp_sim_data)
+  # note that we specify all of these values as indicating missing data ("",".","NA")
-  summary(hemp_sim_data)
+  attach(help_sim_data)
-  fivenum(hemp_sim_data$\$ $mcs)
-  mean(hemp_sim_data$\$ $mcs, na.rm=TRUE); median(hemp_sim_data$\$ $mcs, na.rm=TRUE); range(hemp_sim_data$\$ $mcs, na.rm=TRUE); sd(hemp_sim_data$\$ $mcs, na.rm=TRUE); var(hemp_sim_data$\$ $mcs, na.rm=TRUE)
+  rownames(help_sim_data)  # print row and column names
+ colnames(help_sim_data)
-  quantile(hemp_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)
+  summary(help_sim_data)
+ fivenum(help_sim_data$\$ $mcs)
+ mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE)
-  no_mis_hemp_sim_data_mcs <- na.omit(hemp_sim_data$\$ $mcs)
+  quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)
-  hist(no_mis_hemp_sim_data_mcs, main="", freq=FALSE)
+  no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs)
- lines(density(no_mis_hemp_sim_data_mcs), main="MCS", lty=2, lwd=2)
- xvals <- seq(from=min(no_mis_hemp_sim_data_mcs), to=max(no_mis_hemp_sim_data_mcs), length=100)
- lines(xvals, dnorm(xvals, mean(no_mis_hemp_sim_data_mcs), sd(no_mis_hemp_sim_data_mcs)), lwd=2)
+ hist(no_mis_help_sim_data_mcs, main="", freq=FALSE)
+ lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2)
+ xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100)
+ lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2)
-  cor_mat <- cor(cbind(hemp_sim_data$\$ $mcs, hemp_sim_data$\$ $i11, hemp_sim_data$\$ $pcs1))
+  cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1))
   cor_mat
   cor_mat[c(2, 3), 2]
+ plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")
-  plot(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0], hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")
+  text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"],
+    help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A")
-  text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="alcohol"],
+  text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],
-     hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==1& hemp_sim_data$\$ $substance=="alcohol"],"A")
+     help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C")
-  text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="cocaine"],
+  text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"],
-     hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="cocaine"],"C")
+     help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H")
-  text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="heroin"],
+  rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2)
-    hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==1& hemp_sim_data$\$ $substance=="heroin"],"H")
+ rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3)
- rug(jitter(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0]), side=2)
- rug(jitter(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0]), side=3)
+ table(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
- table(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female)
+ OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
+        sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
+       (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
+        sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
+ OR
- or <- (sum(hemp_sim_data$\$ $homeless==0 & hemp_sim_data$\$ $female==0 , na.rm=TRUE)*
-        sum(hemp_sim_data$\$ $homeless==1 & hemp_sim_data$\$ $female==1 , na.rm=TRUE))/
-       (sum(hemp_sim_data$\$ $homeless==0 & hemp_sim_data$\$ $female==1 , na.rm=TRUE)*
-        sum(hemp_sim_data$\$ $homeless==1 & hemp_sim_data$\$ $female==0 , na.rm=TRUE))
- or
+  chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE)
-  chisq_val <- chisq.test(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female, correct=FALSE)
   chisq_val
-  fisher.test(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female)
+  fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
-  ttres <- t.test(hemp_sim_data$\$ $age ~ hemp_sim_data$\$ $female, data=hemp_sim_data)
+  ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data)
   print(ttres)
-  wilcox.test(hemp_sim_data$\$ $age ~ as.factor(hemp_sim_data$\$ $female), correct=FALSE)
+  wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE)
-  ksres <- ks.test(hemp_sim_data$\$ $age[hemp_sim_data$\$ $female==0], hemp_sim_data$\$ $age[hemp_sim_data$\$ $female==1], data=hemp_sim_data)
+  ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data)
   print(ksres)
+===Missing Values===
+ sum(is.na(pcs1))  # count the missing values in the variable pcs1, 208
+ sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246
+ sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values)
+ which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete
+ # pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing)
+ # pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1
 ===Sorting and subsetting===
-  new_cesd = sum(hemp_sim_data$\$ $f1a-hemp_sim_data$\$ $f1t, na.rm=TRUE);
+  new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE);
   new_cesd
-  impute_mean_cesd = mean(hemp_sim_data$\$ $f1a - hemp_sim_data$\$ $f1t, na.rm=TRUE) * 20;
+  impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20;
-  sort(hemp_sim_data$\$ $cesd)[1:4]
+  sort(help_sim_data$\$ $cesd)[1:4]
-  sum(is.na(hemp_sim_data$\$ $drinkstat))
+  sum(is.na(help_sim_data$\$ $drinkstatus))
-table(hemp_sim_data$\$ $drinkstat, exclude="NULL")
+ table(help_sim_data$\$ $drinkstat, exclude="NULL")
-  gender <- factor(hemp_sim_data$\$ $female, c(0,1), c("male","Female"))
+  gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female"))
-  table(hemp_sim_data$\$ $female)
+  table(help_sim_data$\$ $female)
 ===[[SMHS_EDA|Exploratory data analysis]]===
+ newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,]
+ attach(newhelp_sim_data)
+ sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
+ plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
+ points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
+ lines(lowess(age[substance=="alcohol"],
+   i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
+ points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
+ lines(lowess(age[substance=="cocaine"],
+   i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
+ points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
+ lines(lowess(age[substance=="heroin"],
+   i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
+ legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3,
+   cex=1.4, lwd=2, pch=c("A", "C", "H"))
+ options(show.signif.stars=FALSE)
+ lm1 <- lm(i1 ~ sub * age)
+ lm2 <- lm(i1 ~ sub + age)
+ anova(lm2, lm1)
+ summary(lm1)
+ names(summary(lm1))
+ summary(lm1)$\$ $sigma
+ names(lm1)
+ lm1$\$ $coefficients
+ coef(lm1)
+ vcov(lm1)
+ pred <- fitted(lm1)
+ resid <- residuals(lm1)
+ quantile(resid)
-===[[SOCR_EduMaterials_ChartsActivities|Graphing and plotting of data (scatterplot, bubble chart, multiple plots, dotplot, etc.)]]===
+ # explore correlations and clusters in chunks of the data
+ cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs")
+ h_dist_clust <- hclust(dist(cor_matrix))
+ plot(h_dist_clust)
 ===[[SMHS_SLR|Bivariate relationship]]===
+ subst <- as.factor(substance)
+ genfem <- as.factor(ifelse(female, "F", "M"))
+ interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)
+ subs <- character(length(substance))
+ subs[substance=="alcohol"] <- "Alco"
+ subs[substance=="cocaine"] <- "Coca"
+ subs[substance=="heroin"] <- "Hero"
+ gend <- character(length(female))
+ library("lattice")
+ bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
+ bwout
+ boxmeans <- tapply(cesd, list(subs, genfem), mean)
+ suicidal.thoughts <- as.factor(g1b)
+ # conditional plots
+ coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)
+ # Generate random data using random effects GLM
+ library(lme4)
+ n <- 2000; p <- 3; sigbsq <- 10
+ beta <- c(-2, 1.5, 0.5, -1, 1)
+ ID <- rep(1:n, each=p)   # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n
+ Y1 <- as.numeric(id < (n+1)/2)  # 1 1 ... 1 0 0 ... 0
+ randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p)
+ Y2 <- rep(1:p, n)        # 1 2 ... p 1 2 ... p ...
+ Y3 <- runif(p*n)
+ Y4 <- rexp(p*n, 1)
+ lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint
+ exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model))
+ Y <- runif(p*n) < exp_model
+ glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit"))
+ summary(glmmres)
+ # inspect visually the relation between low homelessness and low pcs scores
+ homelow <- homeless[i1<3.5]
+ pcslow <- pcs[i1<3.5]<=31.94
+ table(homelow, pcslow)
 ===[[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]===
+ # you need to first load and attach the dataset:
+ #### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
+ #### attach(help_sim_data)
+ table(homeless, female)
+ OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
+             sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
+      (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
+          sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
+ OR
+ install.packages("epitools")
+ library(epitools)
+ OR_HomeFem <- oddsratio.wald(homeless, female)
+ OR_HomeFem
+ OR_HomeFem$\$ $measure
+ OR_HomeFem$\$ $p.value
+ chi_sq_test <- chisq.test(homeless, female, correct=FALSE)
+ chi_sq_test
+ fisher_test <- fisher.test(homeless, female)
+ fisher_test
 ===[[SMHS_HypothesisTesting|Two-sample tests]]===
+ Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data)
+ Welch_2sampleT_Age_Fem
+ fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values
+     res <- rep(0, length(x))
+     for (i in 1:length(x)){
+          if(is.na(x[i])) res[i] <- x[i]
+          else if (x[i]<2) res[i] <- x[i]
+          else res[i] <- 1}
+     return(res)}
+ Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data)
+ Welch_2sampleT_Home_treat
+ wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE)
+ wilcox_test_Age_Fem
+===[[SMHS_PowerSensitivitySpecificity|Power]]===
+ # compute the sample size for a two-sample t-test
+ power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power
+ # for a two-sample t-test, compute power from sample-size and effect-size
+ power.t.test(delta=0.5, n=100)
 ===[[SMHS_SurvivalAnalysis|Survival analysis (Kaplan–Meier plot)]]===
-===[[SOCR_EduMaterials_Activities_ScatterChart|Scatterplot with smooth fit]]===
+ small_data <- reshape(newhelp_sim_data, idvar="id",
+                     varying=list(c("cesd1","cesd2","cesd3","cesd4"),
+                         c("mcs1","mcs2","mcs3","mcs4"),
+                         c("i11","i12","i13","i14"),
+                         c("g1b1","g1b2","g1b3","g1b4")),
+                     v.names=c("cesdtv","mcstv","i1tv","g1btv"),
+                     timevar="time", times=1:4, direction="long")
+ library(lme4)
+ glmres <- glmer(g1btv ~ treat + time + (1|id),
+   family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
+ summary(glmres)
+ library(survival)
+ # fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
+ # time dependent strata (linkstatus)
+ cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data)
+ print(cox_surv)
+ # creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
+ # Cox model, or a previously fitted accelerated failure time model
+ surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
+ print(surv_KM)
+ plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
+ title("Product-Limit Survival Estimates")
+ legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)
+===[[SOCR_EduMaterials_Activities_ScatterChart|Scatterplot with model fit]]===
+ plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
+ points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca")
+ # lowess is a locally-weighted polynomial regression smooth fit
+ lines(lowess(age[substance=="cocaine"],
+       i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
+ points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco")
+ lines(lowess(age[substance=="alcohol"],
+       i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
+ points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero")
+ lines(lowess(age[substance=="heroin"],
+        i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
+ legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3,
+         cex=1.4, lwd=2, pch=c("c", "a", "h"))
 ===[[SMHS_SLR|Regression with prediction intervals]]===
+ predicted <- fitted(lm1); predicted
+ residuals <- residuals(lm1)
+ quantile(residuals)
 ===Linear regression with interaction===
+ lm1 <- lm(a15a ~ substance * age)
+ lm2 <- lm(a15a ~ substance + age)
+ anova(lm2, lm1)
+ summary(lm1)
+ names(summary(lm1))
+ summary(lm1)$\$ $sigma
+ names(lm1)
+ lm1$\$ $coefficients
+ vcov(lm1)
 ===Regression diagnostics===
+ # graphical model diagnostics
+ par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5)
+ plot(lm1)
+ par(par)
+ std.res <- rstandard(lm1)
+ hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F)
+ lines(density(std.res), lwd=2)
+ xvals <- seq(from=min(std.res), to=max(std.res), length=100)
+ lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)
 ===Fitting stratified regression models===
+Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.
+ unique_vals <- unique(substance[!is.na(substance)]); unique_vals
+ numb_unique <- length(unique_vals)
+ formula <- as.formula(i1 ~ age)
+ p <- length(coef(lm(formula)))
+ res <- matrix(rep(0, numb_unique*p), p, numb_unique)
+ for (i in 1:length(unique_vals)) {
+       res[,i] <- coef(lm(formula, subset=substance==unique_vals[i]))
+ }
+ rownames(res) <- c("intercept","slope")
+ colnames(res) <- unique_vals
+ res
 ===[[SMHS_ANOVA|Two-way analysis of variance (ANOVA)]]===
+ aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data)
+ aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data)
+ resid <- residuals(aov2)
+ anova(aov2, aov1)
+ logLik(aov1)  # compute the exact Log-Likelihood of each ANOVA model
+ logLik(aov2)
+ lldiff <- logLik(aov1)[1] - logLik(aov2)[1]
+ lldiff
+- pchisq(2*lldiff, 2) # test for model differences
+ summary(aov1)
+ aov1
+ contrasts(sub) <- contr.SAS(4)
+ contrasts(sub)
+ aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data)
+ summary(aov3)
+ AIC(aov1); BIC(aov1)  # [http://en.wikipedia.org/wiki/Akaike_information_criterion Akaike] and [http://en.wikipedia.org/wiki/Bayesian_information_criterion Bayesian] Information criteria (smaller values yield better models
+ AIC(aov2); BIC(aov2)
+ AIC(aov3); BIC(aov3)
 ===[[SMHS_CorrectionMultipleTesting|Multiple comparisons]]===
+ # generate confidence intervals on the differences between the means of the levels
+ # of a factor with the specified family-wise probability of coverage.
+ # The intervals are based on the Studentized range statistic, Tukey's
+ # ‘Honest Significant Difference’ method
+ multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub")
+ multiple
+ plot(multiple)
-===Contrasts===
+===Logistic and Poisson Regression===
-===Logistic regression===
+ # logistic (binomial outcomes) regression
+ logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
+ logistres
+ summary(logistres)
+ names(summary(logistres))
+ coeff.like.SAS <- summary(logistres)$\$ $coefficients
+ coeff.like.SAS
-===Poisson regression===
+ poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson)
+ summary(poissonres)
-===Zero-inflated Poisson regression===
 ===Negative binomial regression===
-===Lasso model selection===
+ library(MASS)
+ NB_Res <- glm.nb(a15a ~ female + substance + age)
-===Quantile regression===
+ summary(NB_Res)
 ===Ordinal logit regression===
-===Multinomial logit regression===
+ library(MASS)
+ sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5))
+ ord_logit <- polr(sex_risk_cat ~ cesd + pcs)
+ summary(ord_logit)
 ===Generalized additive model===
-===[[SOCR_EduMaterials_Activities_PowerTransformFamily_Graphs|Data transformations]]===
+ install.packages("gam"); library(gam)
+ gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs"
+ summary(gam_reg)
+ coefficients(gam_reg)
+ plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3)
+ abline(h=0)
+ plot(gam_reg, terms=c("substance"), se=2, lwd=3)
 ===General linear model for correlated data===
+ library(nlme)
+ fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary,
+           correlation = corAR1(form = ~ 1 | Mare))
+ # variance increases as a power of the absolute fitted values
+ fm2 <- update(fm1, weights = varPower())
+ summary(fm1)
 ===Random effects model===
 ===[[SMHS_GEE|Generalized estimating equations (GEE) model]]===
+ install.packages("gee"); library(gee)
+ sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),]
+ # attach(sorted_data)
+ gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data,
+       family=binomial, na.action=na.omit, corstr="exchangeable")
+ coef(gee_res)
+ sqrt(diag(gee_res$\$ $robust.variance))
+ gee_res$\$ $working.correlation
 ===Generalized linear mixed model===
+ library(lme4)
+ glmm_res <- glmer(g1b ~ treat + cesd + (1|ID),
+       family=binomial(link="logit"), data=help_sim_data)
+ summary(glmm_res)
 ===Proportional hazards regression model===
-===Bayesian Poisson regression===
+ library(survival)
+ cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female +
+       cesd, method="breslow", data=help_sim_data)
+ print(cox_surv)
+ # Fit a Cox proportional hazards regression model
+ coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow")
 ===[[SMHS_Cronbachs|Cronbach’s $\alpha$]]===
+ library(multilevel)
+ cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k,
+   f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t))
+ # assess the consistency of the core 20 measures in the simulated HELP dataset
+ res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k +
+       f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3,
+       rotation="varimax", na.action=na.omit, scores="regression")
+ print(res, cutoff=0.45, sort=TRUE)
 ===[[SMHS_PCA_ICA_FA|Factor analysis]]===
+ # Factor analysis explains the (observed) variability of a multivariate dataset in terms of
+ # various underlying unobservable factors. The observed data elements are expressed as
+ # linear functions of the factors and some random error.
+ # This is an example of a maximum likelihood factor analysis of "cesd"
+ # (Center for Epidemiologic Studies–Depression) scale.
+ fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h +
+       f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r +
+       f1s + f1t, factors=3, rotation="varimax", na.action=na.omit,
+        scores="regression")
+ print(fact_ana , cutoff=0.45, sort=TRUE)
 ===Recursive partitioning===
+ # Recursive partitioning creates a decision tree to classify the observations based on
+ # categorical predictors. Here we classify subjects based on their race,
+ # using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS.
+ library(rpart)
+ subst <- as.factor(substance)
+ homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data)
+ printcp(homelessness.rpart)
+ plot(homelessness.rpart)
+ text(homelessness.rpart)
+ race <- racegrp[i1<3.5]
+ pcslow <- pcs[i1<3.5]<=31.94
+ length(pcslow==F)   # count the number of subjects with (pcslow==F)
+ table(race, pcslow)
+ # in these data, 3 of 18 subjects with low PCS scores are white,
+ # and 48 of 154) of those with PCS scores above the threshold are white
 ===Linear discriminant analysis===
+ # Fisher's linear discriminant analysis (LDA) identifies linear combinations of
+ # variables splitting the data into separate classes. LDA can help us
+ # identify clusters homelessness, using a uniform prior
+ library(MASS)
+ ngroups <- length(unique(homeless))
+ lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1)))
+ print(lda_model)
+ plot(lda_model)
+ plot(lda_model, dimen=1, type="both")
 ===[[SMHS_HLM|Hierarchical clustering]]===
+ # To group similar variables or cluster observations a hierarchical structure
+ # may be constructed. hclust() and kmeans() are examples of such clustering
+ # packages
+ cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs")
+ hier_clust <- hclust(dist(cor_mat))
+ plot(hier_clust)
+ km_clust <- kmeans(cor_mat, 5, nstart = 25)
+ plot(cor_mat, col = km_clust$cluster)
+ points(km_clust$centers, col = 1:5, pch = 8)
 ===[[SMHS_ROC|ROC curve]]===
+ library(ROCR)
+ pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide)
+ AUC<- slot(performance(pred, "auc"), "y.values")[[1]]
+ plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5),
+   text.adj=c(1, -.5), lwd=2)
+ lines(c(0, 1), c(0, 1))
+ text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4)
+ title("ROC Curve for Depression scale vs. Suicide Prediction Model")
 ===[[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]===
+Also see the [[SMHS_MissingData| Missing Values section of the Scientific Methods for Health Sciences EBook]].
+ miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs))
+ summary(miss_data)
+ # library(Hmisc)
+ # na.pattern(miss_data)
+ install.packages("mi")
+ library(mi)
+ inf <- mi.info(miss_data)
+ # run the imputation without data transformation
+ IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10))
+ # run the imputation with data transformation
+ miss_data.transformed <- mi.preprocess(miss_data, inf)
+ IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6))
+ # Impute without noise
+ IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE)
+ # check convergence, should get FALSE here because only n.iter is small
+ converged(IMP, check = "data")
+ # converged(IMP, check = "coefs")
+ # visually check the imputation
+ plot(IMP); plot(IMP_T)
 ===Propensity score modeling===
+ # Propensity score (PS) techniques may be applied for analysis of
+ # observational and registry data. The PS is the conditional probability
+ # of a certain treatment given patient’s covariates. PS may be used to
+ # eliminate imbalances in baseline covariate distributions between
+ # treatment groups and to estimate marginal effects.
 ==References==
@@ Line 170: / Line 564: @@
 ** Data formats: [http://www.math.smith.edu/sasr/datasets/help.csv help.csv] and [http://www.math.smith.edu/sasr/datasets/help.Rdata help.Rdata]
 ** [http://www3.amherst.edu/~nhorton/help/ Study and Data specifications]
+** [http://www3.amherst.edu/~nhorton/help/ HELP study documentation]
 * [http://www.amazon.com/SAS-Management-Statistical-Analysis-Graphics/dp/1420070576 SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009]