SOCR Simulated HELP Data Activity

SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset

SOCR Simulated HELP Data

See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".

R examples

These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.

Data I/O, summaries, visualization

options(digits=2)  # decimal precision
options(width=80)  # narrows output to stay in the grey box

hemp_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
# note that we specify all of these values as indicating missing data ("",".","NA")
attach(hemp_sim_data)

rownames(hemp_sim_data)  # print row and column names
colnames(hemp_sim_data)

summary(hemp_sim_data)
fivenum(hemp_sim_data$\$ $mcs)

mean(hemp_sim_data$\$ $mcs, na.rm=TRUE); median(hemp_sim_data$\$ $mcs, na.rm=TRUE); range(hemp_sim_data$\$ $mcs, na.rm=TRUE); sd(hemp_sim_data$\$ $mcs, na.rm=TRUE); var(hemp_sim_data$\$ $mcs, na.rm=TRUE)

quantile(hemp_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)

no_mis_hemp_sim_data_mcs <- na.omit(hemp_sim_data$\$ $mcs)

hist(no_mis_hemp_sim_data_mcs, main="", freq=FALSE)
lines(density(no_mis_hemp_sim_data_mcs), main="MCS", lty=2, lwd=2)
xvals <- seq(from=min(no_mis_hemp_sim_data_mcs), to=max(no_mis_hemp_sim_data_mcs), length=100)
lines(xvals, dnorm(xvals, mean(no_mis_hemp_sim_data_mcs), sd(no_mis_hemp_sim_data_mcs)), lwd=2)

cor_mat <- cor(cbind(hemp_sim_data$\$ $mcs, hemp_sim_data$\$ $i11, hemp_sim_data$\$ $pcs1))
cor_mat
cor_mat[c(2, 3), 2]

plot(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0], hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")

text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="alcohol"],
   hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==1& hemp_sim_data$\$ $substance=="alcohol"],"A")

text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="cocaine"],
   hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="cocaine"],"C")

text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="heroin"],
   hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==1& hemp_sim_data$\$ $substance=="heroin"],"H")

rug(jitter(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0]), side=2)
rug(jitter(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0]), side=3)


table(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female)


OR <- (sum(hemp_sim_data$\$ $homeless==0 & hemp_sim_data$\$ $female==0 , na.rm=TRUE)*
       sum(hemp_sim_data$\$ $homeless==1 & hemp_sim_data$\$ $female==1 , na.rm=TRUE))/
      (sum(hemp_sim_data$\$ $homeless==0 & hemp_sim_data$\$ $female==1 , na.rm=TRUE)*
       sum(hemp_sim_data$\$ $homeless==1 & hemp_sim_data$\$ $female==0 , na.rm=TRUE))
OR


chisq_val <- chisq.test(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female, correct=FALSE)
chisq_val


fisher.test(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female)


ttres <- t.test(hemp_sim_data$\$ $age ~ hemp_sim_data$\$ $female, data=hemp_sim_data)
print(ttres)


wilcox.test(hemp_sim_data$\$ $age ~ as.factor(hemp_sim_data$\$ $female), correct=FALSE)

ksres <- ks.test(hemp_sim_data$\$ $age[hemp_sim_data$\$ $female==0], hemp_sim_data$\$ $age[hemp_sim_data$\$ $female==1], data=hemp_sim_data)
print(ksres)

Missing Values

sum(is.na(pcs1))  # count the missing values in the variable pcs1, 208
sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246

sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values)
which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete
# pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) 
# pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1

Sorting and subsetting

new_cesd = sum(hemp_sim_data$\$ $f1a-hemp_sim_data$\$ $f1t, na.rm=TRUE);
new_cesd

impute_mean_cesd = mean(hemp_sim_data$\$ $f1a - hemp_sim_data$\$ $f1t, na.rm=TRUE) * 20;
sort(hemp_sim_data$\$ $cesd)[1:4]
sum(is.na(hemp_sim_data$\$ $drinkstatus))
table(hemp_sim_data$\$ $drinkstat, exclude="NULL")

gender <- factor(hemp_sim_data$\$ $female, c(0,1), c("male","Female"))
table(hemp_sim_data$\$ $female)

Exploratory data analysis

newhemp_sim_data <- hemp_sim_data[hemp_sim_data$\$ $female==1,]
attach(newhemp_sim_data)
sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
lines(lowess(age[substance=="alcohol"], 
  i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
lines(lowess(age[substance=="cocaine"], 
  i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
lines(lowess(age[substance=="heroin"], 
  i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3, 
  cex=1.4, lwd=2, pch=c("A", "C", "H"))

options(show.signif.stars=FALSE)
lm1 <- lm(i1 ~ sub * age)
lm2 <- lm(i1 ~ sub + age)
anova(lm2, lm1)

summary(lm1)

names(summary(lm1))
summary(lm1)$\$ $sigma

names(lm1)

lm1$\$ $coefficients
coef(lm1)
vcov(lm1)
pred <- fitted(lm1)
resid <- residuals(lm1)
quantile(resid)

Bivariate relationship

subst <- as.factor(substance)
genfem <- as.factor(ifelse(female, "F", "M"))
interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)

subs <- character(length(substance))
subs[substance=="alcohol"] <- "Alco"
subs[substance=="cocaine"] <- "Coca"
subs[substance=="heroin"] <- "Hero"
gend <- character(length(female))
library("lattice")
bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
bwout 
boxmeans <- tapply(cesd, list(subs, genfem), mean)

suicidal.thoughts <- as.factor(g1b)
# conditional plots
coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)

Contingency tables

Two-sample tests

Survival analysis (Kaplan–Meier plot)

small_data <- reshape(newhemp_sim_data, idvar="id", 
                    varying=list(c("cesd1","cesd2","cesd3","cesd4"),
                        c("mcs1","mcs2","mcs3","mcs4"), 
                        c("i11","i12","i13","i14"),
                        c("g1b1","g1b2","g1b3","g1b4")), 
                    v.names=c("cesdtv","mcstv","i1tv","g1btv"),
                    timevar="time", times=1:4, direction="long")
library(lme4)
glmres <- glmer(g1btv ~ treat + time + (1|id),
  family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
summary(glmres)

library(survival)
# fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
# time dependent strata (linkstatus)
cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhemp_sim_data)
print(cox_surv)

# creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
# Cox model, or a previously fitted accelerated failure time model
surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
print(surv_KM)
plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
title("Product-Limit Survival Estimates")
legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)

Scatterplot with smooth fit

Regression with prediction intervals

Linear regression with interaction

Regression diagnostics

Fitting stratified regression models

Two-way analysis of variance (ANOVA)

aov1 <- aov(cesd ~ sub * genfem, data=hemp_sim_data)
aov2 <- aov(cesd ~ sub + genfem, data=hemp_sim_data)
resid <- residuals(aov2)
anova(aov2, aov1)

logLik(aov1)  # compute the exact Log-Likelihood of each ANOVA model
logLik(aov2)
lldiff <- logLik(aov1)[1] - logLik(aov2)[1]
lldiff
1 - pchisq(2*lldiff, 2) # test for model differences
summary(aov1)
aov1

contrasts(sub) <- contr.SAS(4)
contrasts(sub) 
aov3 <- lm(cesd ~ sub + genfem, data=hemp_sim_data)
summary(aov3)

AIC(aov1); BIC(aov1)  # Akaike and Bayesian Information criteria (smaller values yield better models
AIC(aov2); BIC(aov2)
AIC(aov3); BIC(aov3)

# generate confidence intervals on the differences between the means of the levels 
# of a factor with the specified family-wise probability of coverage. 
# The intervals are based on the Studentized range statistic, Tukey's 
# ‘Honest Significant Difference’ method
multiple <- TukeyHSD(aov(cesd ~ sub, data=hemp_sim_data), "sub")
multiple 
plot(multiple)

Multiple comparisons

Contrasts

Logistic and Poisson Regression

# logistic (binomial outcomes) regression
logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
logistres
summary(logistres)
names(summary(logistres))
coeff.like.SAS <- summary(logistres)$\$ $coefficients
coeff.like.SAS

poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson)
summary(poissonres)

Poisson regression

Zero-inflated Poisson regression

Negative binomial regression

Lasso model selection

Quantile regression

Ordinal logit regression

Multinomial logit regression

Generalized additive model

Data transformations

General linear model for correlated data

Random effects model

Generalized estimating equations (GEE) model

Generalized linear mixed model

Proportional hazards regression model

Bayesian Poisson regression

Cronbach’s $\alpha$

Factor analysis

Recursive partitioning

Linear discriminant analysis

Hierarchical clustering

ROC curve

Multiple imputation

Propensity score modeling

References

SOCR Home page: http://www.SOCR.ucla.edu

Translate this page:

(default)	Deutsch	Español	Français	Italiano	Português	日本語	България	الامارات العربية المتحدة	Suomi	इस भाषा में	Norge
한국어	中文	繁体中文	Русский	Nederlands	Ελληνικά	Hrvatska	Česká republika	Danmark	Polska	România	Sverige