Difference between revisions of "SOCR Simulated HELP Data Activity"
(→Survival analysis (Kaplan–Meier plot)) |
(→Bivariate relationship) |
||
Line 149: | Line 149: | ||
bwout | bwout | ||
boxmeans <- tapply(cesd, list(subs, genfem), mean) | boxmeans <- tapply(cesd, list(subs, genfem), mean) | ||
+ | |||
+ | suicidal.thoughts <- as.factor(g1b) | ||
+ | # conditional plots | ||
+ | coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth) | ||
===[[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]=== | ===[[AP_Statistics_Curriculum_2007_Contingency_Indep|Contingency tables]]=== |
Revision as of 12:41, 13 September 2014
Contents
- 1 SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
- 2 SOCR Simulated HELP Data
- 3 R examples
- 3.1 Data I/O, summaries, visualization
- 3.2 Missing Values
- 3.3 Sorting and subsetting
- 3.4 Exploratory data analysis
- 3.5 Bivariate relationship
- 3.6 Contingency tables
- 3.7 Two-sample tests
- 3.8 Survival analysis (Kaplan–Meier plot)
- 3.9 Scatterplot with smooth fit
- 3.10 Regression with prediction intervals
- 3.11 Linear regression with interaction
- 3.12 Regression diagnostics
- 3.13 Fitting stratified regression models
- 3.14 Two-way analysis of variance (ANOVA)
- 3.15 Multiple comparisons
- 3.16 Contrasts
- 3.17 Logistic and Poisson Regression
- 3.18 Poisson regression
- 3.19 Zero-inflated Poisson regression
- 3.20 Negative binomial regression
- 3.21 Lasso model selection
- 3.22 Quantile regression
- 3.23 Ordinal logit regression
- 3.24 Multinomial logit regression
- 3.25 Generalized additive model
- 3.26 Data transformations
- 3.27 General linear model for correlated data
- 3.28 Random effects model
- 3.29 Generalized estimating equations (GEE) model
- 3.30 Generalized linear mixed model
- 3.31 Proportional hazards regression model
- 3.32 Bayesian Poisson regression
- 3.33 Cronbach’s $\alpha$
- 3.34 Factor analysis
- 3.35 Recursive partitioning
- 3.36 Linear discriminant analysis
- 3.37 Hierarchical clustering
- 3.38 ROC curve
- 3.39 Multiple imputation
- 3.40 Propensity score modeling
- 4 References
SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
SOCR Simulated HELP Data
See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".
R examples
These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.
Data I/O, summaries, visualization
options(digits=2) # decimal precision options(width=80) # narrows output to stay in the grey box hemp_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA")) # note that we specify all of these values as indicating missing data ("",".","NA") attach(hemp_sim_data) rownames(hemp_sim_data) # print row and column names colnames(hemp_sim_data) summary(hemp_sim_data) fivenum(hemp_sim_data$\$ $mcs) mean(hemp_sim_data$\$ $mcs, na.rm=TRUE); median(hemp_sim_data$\$ $mcs, na.rm=TRUE); range(hemp_sim_data$\$ $mcs, na.rm=TRUE); sd(hemp_sim_data$\$ $mcs, na.rm=TRUE); var(hemp_sim_data$\$ $mcs, na.rm=TRUE) quantile(hemp_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE) no_mis_hemp_sim_data_mcs <- na.omit(hemp_sim_data$\$ $mcs) hist(no_mis_hemp_sim_data_mcs, main="", freq=FALSE) lines(density(no_mis_hemp_sim_data_mcs), main="MCS", lty=2, lwd=2) xvals <- seq(from=min(no_mis_hemp_sim_data_mcs), to=max(no_mis_hemp_sim_data_mcs), length=100) lines(xvals, dnorm(xvals, mean(no_mis_hemp_sim_data_mcs), sd(no_mis_hemp_sim_data_mcs)), lwd=2) cor_mat <- cor(cbind(hemp_sim_data$\$ $mcs, hemp_sim_data$\$ $i11, hemp_sim_data$\$ $pcs1)) cor_mat cor_mat[c(2, 3), 2] plot(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0], hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n") text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="alcohol"], hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==1& hemp_sim_data$\$ $substance=="alcohol"],"A") text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="cocaine"], hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="cocaine"],"C") text(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0& hemp_sim_data$\$ $substance=="heroin"], hemp_sim_data$\$ $cesd[hemp_sim_data$\$ $female==1& hemp_sim_data$\$ $substance=="heroin"],"H") rug(jitter(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0]), side=2) rug(jitter(hemp_sim_data$\$ $mcs[hemp_sim_data$\$ $female==0]), side=3) table(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female) OR <- (sum(hemp_sim_data$\$ $homeless==0 & hemp_sim_data$\$ $female==0 , na.rm=TRUE)* sum(hemp_sim_data$\$ $homeless==1 & hemp_sim_data$\$ $female==1 , na.rm=TRUE))/ (sum(hemp_sim_data$\$ $homeless==0 & hemp_sim_data$\$ $female==1 , na.rm=TRUE)* sum(hemp_sim_data$\$ $homeless==1 & hemp_sim_data$\$ $female==0 , na.rm=TRUE)) OR chisq_val <- chisq.test(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female, correct=FALSE) chisq_val fisher.test(hemp_sim_data$\$ $homeless, hemp_sim_data$\$ $female) ttres <- t.test(hemp_sim_data$\$ $age ~ hemp_sim_data$\$ $female, data=hemp_sim_data) print(ttres) wilcox.test(hemp_sim_data$\$ $age ~ as.factor(hemp_sim_data$\$ $female), correct=FALSE) ksres <- ks.test(hemp_sim_data$\$ $age[hemp_sim_data$\$ $female==0], hemp_sim_data$\$ $age[hemp_sim_data$\$ $female==1], data=hemp_sim_data) print(ksres)
Missing Values
sum(is.na(pcs1)) # count the missing values in the variable pcs1, 208 sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246 sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values) which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete # pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) # pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1
Sorting and subsetting
new_cesd = sum(hemp_sim_data$\$ $f1a-hemp_sim_data$\$ $f1t, na.rm=TRUE); new_cesd impute_mean_cesd = mean(hemp_sim_data$\$ $f1a - hemp_sim_data$\$ $f1t, na.rm=TRUE) * 20; sort(hemp_sim_data$\$ $cesd)[1:4] sum(is.na(hemp_sim_data$\$ $drinkstatus)) table(hemp_sim_data$\$ $drinkstat, exclude="NULL") gender <- factor(hemp_sim_data$\$ $female, c(0,1), c("male","Female")) table(hemp_sim_data$\$ $female)
Exploratory data analysis
newhemp_sim_data <- hemp_sim_data[hemp_sim_data$\$ $female==1,] attach(newhemp_sim_data) sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine")) plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4) points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A") lines(lowess(age[substance=="alcohol"], i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2) points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C") lines(lowess(age[substance=="cocaine"], i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2) points(age[substance=="heroin"], i1[substance=="heroin"], pch="H") lines(lowess(age[substance=="heroin"], i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2) legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3, cex=1.4, lwd=2, pch=c("A", "C", "H")) options(show.signif.stars=FALSE) lm1 <- lm(i1 ~ sub * age) lm2 <- lm(i1 ~ sub + age) anova(lm2, lm1) summary(lm1) names(summary(lm1)) summary(lm1)$\$ $sigma names(lm1) lm1$\$ $coefficients coef(lm1) vcov(lm1) pred <- fitted(lm1) resid <- residuals(lm1) quantile(resid)
Bivariate relationship
subst <- as.factor(substance) genfem <- as.factor(ifelse(female, "F", "M")) interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2) subs <- character(length(substance)) subs[substance=="alcohol"] <- "Alco" subs[substance=="cocaine"] <- "Coca" subs[substance=="heroin"] <- "Hero" gend <- character(length(female)) library("lattice") bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray") bwout boxmeans <- tapply(cesd, list(subs, genfem), mean)
suicidal.thoughts <- as.factor(g1b) # conditional plots coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)
Contingency tables
Two-sample tests
Survival analysis (Kaplan–Meier plot)
small_data <- reshape(newhemp_sim_data, idvar="id", varying=list(c("cesd1","cesd2","cesd3","cesd4"), c("mcs1","mcs2","mcs3","mcs4"), c("i11","i12","i13","i14"), c("g1b1","g1b2","g1b3","g1b4")), v.names=c("cesdtv","mcstv","i1tv","g1btv"), timevar="time", times=1:4, direction="long") library(lme4) glmres <- glmer(g1btv ~ treat + time + (1|id), family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data) summary(glmres)
library(survival) # fit a Cox proportional hazards regression model. Time dependent variable (dayslink), # time dependent strata (linkstatus) cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhemp_sim_data) print(cox_surv)
Scatterplot with smooth fit
Regression with prediction intervals
Linear regression with interaction
Regression diagnostics
Fitting stratified regression models
Two-way analysis of variance (ANOVA)
aov1 <- aov(cesd ~ sub * genfem, data=hemp_sim_data) aov2 <- aov(cesd ~ sub + genfem, data=hemp_sim_data) resid <- residuals(aov2) anova(aov2, aov1)
logLik(aov1) # compute the exact Log-Likelihood of each ANOVA model logLik(aov2) lldiff <- logLik(aov1)[1] - logLik(aov2)[1] lldiff 1 - pchisq(2*lldiff, 2) # test for model differences summary(aov1) aov1 contrasts(sub) <- contr.SAS(4) contrasts(sub) aov3 <- lm(cesd ~ sub + genfem, data=hemp_sim_data) summary(aov3) AIC(aov1); BIC(aov1) # Akaike and Bayesian Information criteria (smaller values yield better models AIC(aov2); BIC(aov2) AIC(aov3); BIC(aov3) # generate confidence intervals on the differences between the means of the levels # of a factor with the specified family-wise probability of coverage. # The intervals are based on the Studentized range statistic, Tukey's # ‘Honest Significant Difference’ method multiple <- TukeyHSD(aov(cesd ~ sub, data=hemp_sim_data), "sub") multiple plot(multiple)
Multiple comparisons
Contrasts
Logistic and Poisson Regression
# logistic (binomial outcomes) regression logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial) logistres summary(logistres) names(summary(logistres)) coeff.like.SAS <- summary(logistres)$\$ $coefficients coeff.like.SAS
poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson) summary(poissonres)
Poisson regression
Zero-inflated Poisson regression
Negative binomial regression
Lasso model selection
Quantile regression
Ordinal logit regression
Multinomial logit regression
Generalized additive model
Data transformations
Random effects model
Generalized estimating equations (GEE) model
Generalized linear mixed model
Proportional hazards regression model
Bayesian Poisson regression
Cronbach’s $\alpha$
Factor analysis
Recursive partitioning
Linear discriminant analysis
Hierarchical clustering
ROC curve
Multiple imputation
Propensity score modeling
References
- Evaluation and Linkage to Primary (HELP) Care study
- Data formats: help.csv and help.Rdata
- Study and Data specifications
- HELP study documentation
- SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009
- SOCR Home page: http://www.SOCR.ucla.edu
Translate this page: