Difference between revisions of "SOCR Simulated HELP Data Activity"
(→Hierarchical clustering) |
(→Multiple imputation) |
||
| Line 525: | Line 525: | ||
===[[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]=== | ===[[SMHS#Chapter_IV:_Special_Topics|Multiple imputation]]=== | ||
| + | |||
| + | miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs)) | ||
| + | summary(miss_data) | ||
| + | # library(Hmisc) | ||
| + | # na.pattern(miss_data) | ||
| + | |||
| + | install.packages("mi") | ||
| + | library(mi) | ||
| + | inf <- mi.info(miss_data) | ||
| + | # run the imputation without data transformation | ||
| + | IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10)) | ||
| + | |||
| + | # run the imputation with data transformation | ||
| + | miss_data.transformed <- mi.preprocess(miss_data, inf) | ||
| + | IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6)) | ||
| + | |||
| + | # Impute without noise | ||
| + | IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE) | ||
| + | |||
| + | # check convergence, should get FALSE here because only n.iter is small | ||
| + | converged(IMP, check = "data") | ||
| + | # converged(IMP, check = "coefs") | ||
| + | # visually check the imputation | ||
| + | plot(IMP); plot(IMP_T) | ||
===Propensity score modeling=== | ===Propensity score modeling=== | ||
Revision as of 17:06, 14 September 2014
Contents
- 1 SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
- 2 SOCR Simulated HELP Data
- 3 R examples
- 3.1 Data I/O, summaries, visualization
- 3.2 Missing Values
- 3.3 Sorting and subsetting
- 3.4 Exploratory data analysis
- 3.5 Bivariate relationship
- 3.6 Contingency tables
- 3.7 Two-sample tests
- 3.8 Power
- 3.9 Survival analysis (Kaplan–Meier plot)
- 3.10 Scatterplot with model fit
- 3.11 Regression with prediction intervals
- 3.12 Linear regression with interaction
- 3.13 Regression diagnostics
- 3.14 Fitting stratified regression models
- 3.15 Two-way analysis of variance (ANOVA)
- 3.16 Multiple comparisons
- 3.17 Logistic and Poisson Regression
- 3.18 Negative binomial regression
- 3.19 Ordinal logit regression
- 3.20 Generalized additive model
- 3.21 General linear model for correlated data
- 3.22 Random effects model
- 3.23 Generalized estimating equations (GEE) model
- 3.24 Generalized linear mixed model
- 3.25 Proportional hazards regression model
- 3.26 Cronbach’s $\alpha$
- 3.27 Factor analysis
- 3.28 Recursive partitioning
- 3.29 Linear discriminant analysis
- 3.30 Hierarchical clustering
- 3.31 ROC curve
- 3.32 Multiple imputation
- 3.33 Propensity score modeling
- 4 References
SOCR Simulated HELP Data: SOCR Activity: Simulated Health Evaluation and Linkage to Primary (HELP) Care Dataset
SOCR Simulated HELP Data
See the SOCR Simulated HELP Data first. These data can be copy-pasted using the mouse from the HTML table into a plain text file "help_data.csv".
R examples
These simulated HELP data can be used to demonstrate (using SOCR and R)a number of different statistical, modeling, inferential and data analytic techniques.
Data I/O, summaries, visualization
options(digits=2) # decimal precision
options(width=80) # narrows output to stay in the grey box
help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
# note that we specify all of these values as indicating missing data ("",".","NA")
attach(help_sim_data)
rownames(help_sim_data) # print row and column names
colnames(help_sim_data)
summary(help_sim_data)
fivenum(help_sim_data$\$ $mcs)
mean(help_sim_data$\$ $mcs, na.rm=TRUE); median(help_sim_data$\$ $mcs, na.rm=TRUE); range(help_sim_data$\$ $mcs, na.rm=TRUE); sd(help_sim_data$\$ $mcs, na.rm=TRUE); var(help_sim_data$\$ $mcs, na.rm=TRUE)
quantile(help_sim_data$\$ $mcs, seq(from=0, to=1, length=11), na.rm=TRUE)
no_mis_help_sim_data_mcs <- na.omit(help_sim_data$\$ $mcs)
hist(no_mis_help_sim_data_mcs, main="", freq=FALSE)
lines(density(no_mis_help_sim_data_mcs), main="MCS", lty=2, lwd=2)
xvals <- seq(from=min(no_mis_help_sim_data_mcs), to=max(no_mis_help_sim_data_mcs), length=100)
lines(xvals, dnorm(xvals, mean(no_mis_help_sim_data_mcs), sd(no_mis_help_sim_data_mcs)), lwd=2)
cor_mat <- cor(cbind(help_sim_data$\$ $mcs, help_sim_data$\$ $i11, help_sim_data$\$ $pcs1))
cor_mat
cor_mat[c(2, 3), 2]
plot(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0], help_sim_data$\$ $cesd[help_sim_data$\$ $female==0], xlab="MCS", ylab="cesd", type="n", bty="n")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="alcohol"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="alcohol"],"A")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="cocaine"],"C")
text(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0& help_sim_data$\$ $substance=="heroin"],
help_sim_data$\$ $cesd[help_sim_data$\$ $female==1& help_sim_data$\$ $substance=="heroin"],"H")
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=2)
rug(jitter(help_sim_data$\$ $mcs[help_sim_data$\$ $female==0]), side=3)
table(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
(sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR
chisq_val <- chisq.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female, correct=FALSE)
chisq_val
fisher.test(help_sim_data$\$ $homeless, help_sim_data$\$ $female)
ttres <- t.test(help_sim_data$\$ $age ~ help_sim_data$\$ $female, data=help_sim_data)
print(ttres)
wilcox.test(help_sim_data$\$ $age ~ as.factor(help_sim_data$\$ $female), correct=FALSE)
ksres <- ks.test(help_sim_data$\$ $age[help_sim_data$\$ $female==0], help_sim_data$\$ $age[help_sim_data$\$ $female==1], data=help_sim_data)
print(ksres)
Missing Values
sum(is.na(pcs1)) # count the missing values in the variable pcs1, 208 sum(!is.na(pcs1)) # count the non missing values in the variable pcs1, 246 sum(pcs1==49, na.rm=T) # Count the occurrence of 49 in pcs1, (omitting any missing values) which(!complete.cases(pcs1)) # Find cases (row numbers) that are incomplete # pcs1[pcs1==99] = NA You can re-map all 49 values in pcs1 as NA (missing) # pcs1 = pcs1[!is.na(pcs1)] # you can remove all NA (missing) data from pcs1
Sorting and subsetting
new_cesd = sum(help_sim_data$\$ $f1a-help_sim_data$\$ $f1t, na.rm=TRUE);
new_cesd
impute_mean_cesd = mean(help_sim_data$\$ $f1a - help_sim_data$\$ $f1t, na.rm=TRUE) * 20;
sort(help_sim_data$\$ $cesd)[1:4]
sum(is.na(help_sim_data$\$ $drinkstatus))
table(help_sim_data$\$ $drinkstat, exclude="NULL")
gender <- factor(help_sim_data$\$ $female, c(0,1), c("male","Female"))
table(help_sim_data$\$ $female)
Exploratory data analysis
newhelp_sim_data <- help_sim_data[help_sim_data$\$ $female==1,]
attach(newhelp_sim_data)
sub <- factor(substance, levels=c("heroin", "alcohol", "cocaine"))
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="A")
lines(lowess(age[substance=="alcohol"],
i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="C")
lines(lowess(age[substance=="cocaine"],
i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="H")
lines(lowess(age[substance=="heroin"],
i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(44, 38, legend=c("alcohol", "cocaine", "heroin"), lty=1:3,
cex=1.4, lwd=2, pch=c("A", "C", "H"))
options(show.signif.stars=FALSE)
lm1 <- lm(i1 ~ sub * age)
lm2 <- lm(i1 ~ sub + age)
anova(lm2, lm1)
summary(lm1)
names(summary(lm1))
summary(lm1)$\$ $sigma
names(lm1)
lm1$\$ $coefficients
coef(lm1)
vcov(lm1)
pred <- fitted(lm1)
resid <- residuals(lm1)
quantile(resid)
# explore correlations and clusters in chunks of the data cor_matrix <- cor(cbind(mcs, pcs, cesd, i1, sexrisk),use="pairwise.complete.obs") h_dist_clust <- hclust(dist(cor_matrix)) plot(h_dist_clust)
Bivariate relationship
subst <- as.factor(substance)
genfem <- as.factor(ifelse(female, "F", "M"))
interaction.plot(subst, genfem, cesd, xlab="substance", las=1, lwd=2)
subs <- character(length(substance))
subs[substance=="alcohol"] <- "Alco"
subs[substance=="cocaine"] <- "Coca"
subs[substance=="heroin"] <- "Hero"
gend <- character(length(female))
library("lattice")
bwout <- bwplot(cesd ~ subs + genfem, notch=TRUE, varwidth=TRUE, col="gray")
bwout
boxmeans <- tapply(cesd, list(subs, genfem), mean)
suicidal.thoughts <- as.factor(g1b) # conditional plots coplot(mcs ~ cesd | suicidal.thoughts*substance, panel=panel.smooth)
# Generate random data using random effects GLM library(lme4) n <- 2000; p <- 3; sigbsq <- 10 beta <- c(-2, 1.5, 0.5, -1, 1) ID <- rep(1:n, each=p) # 1 1 ... 1 2 2 ... 2 3 3 ....3 ... n... n Y1 <- as.numeric(id < (n+1)/2) # 1 1 ... 1 0 0 ... 0 randint <- rep(rnorm(n, 0, sqrt(sigbsq)), each=p) Y2 <- rep(1:p, n) # 1 2 ... p 1 2 ... p ... Y3 <- runif(p*n) Y4 <- rexp(p*n, 1) lin_pred_model <- beta[1] + beta[2]*Y1 + beta[3]*Y2 + beta[4]*Y3 + beta[5]*Y4 + randint exp_model <- exp(lin_pred_model)/(1 + exp(lin_pred_model)) Y <- runif(p*n) < exp_model glmmres <- lmer(Y ~ Y1 + Y2 + Y3 + +Y4 + (1|id), family=binomial(link="logit")) summary(glmmres)
# inspect visually the relation between low homelessness and low pcs scores homelow <- homeless[i1<3.5] pcslow <- pcs[i1<3.5]<=31.94 table(homelow, pcslow)
Contingency tables
# you need to first load and attach the dataset:
#### help_sim_data <- read.csv("http://socr.umich.edu/data/SOCR_HELP_SIm_Data_2014.csv", na.strings=c("",".","NA"))
#### attach(help_sim_data)
table(homeless, female)
OR <- (sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==0 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==1 , na.rm=TRUE))/
(sum(help_sim_data$\$ $homeless==0 & help_sim_data$\$ $female==1 , na.rm=TRUE)*
sum(help_sim_data$\$ $homeless==1 & help_sim_data$\$ $female==0 , na.rm=TRUE))
OR
install.packages("epitools")
library(epitools)
OR_HomeFem <- oddsratio.wald(homeless, female)
OR_HomeFem
OR_HomeFem$\$ $measure
OR_HomeFem$\$ $p.value
chi_sq_test <- chisq.test(homeless, female, correct=FALSE)
chi_sq_test
fisher_test <- fisher.test(homeless, female) fisher_test
Two-sample tests
Welch_2sampleT_Age_Fem <- t.test(age ~ female, data=help_sim_data)
Welch_2sampleT_Age_Fem
fixBinary<- function(x) { # this function is needed to pre-filter the "treat" array into binary (0,1) values
res <- rep(0, length(x))
for (i in 1:length(x)){
if(is.na(x[i])) res[i] <- x[i]
else if (x[i]<2) res[i] <- x[i]
else res[i] <- 1}
return(res)}
Welch_2sampleT_Home_treat <- t.test(homeless ~ fixBinary(treat), data=help_sim_data)
Welch_2sampleT_Home_treat
wilcox_test_Age_Fem <- wilcox.test(age ~ as.factor(female), correct=FALSE)
wilcox_test_Age_Fem
Power
# compute the sample size for a two-sample t-test power.t.test(delta=0.5, power=0.9) # given effect-size (delta) and desired power
# for a two-sample t-test, compute power from sample-size and effect-size power.t.test(delta=0.5, n=100)
Survival analysis (Kaplan–Meier plot)
small_data <- reshape(newhelp_sim_data, idvar="id",
varying=list(c("cesd1","cesd2","cesd3","cesd4"),
c("mcs1","mcs2","mcs3","mcs4"),
c("i11","i12","i13","i14"),
c("g1b1","g1b2","g1b3","g1b4")),
v.names=c("cesdtv","mcstv","i1tv","g1btv"),
timevar="time", times=1:4, direction="long")
library(lme4)
glmres <- glmer(g1btv ~ treat + time + (1|id),
family=binomial(link="logit"), control=glmerControl(tolPwrss=1e-6), na.action = na.omit, data=small_data)
summary(glmres)
library(survival)
# fit a Cox proportional hazards regression model. Time dependent variable (dayslink),
# time dependent strata (linkstatus)
cox_surv <- coxph(Surv(as.numeric(dayslink), as.numeric(!is.na(linkstatus))) ~ treat + age + female + cesd, method="breslow", data=newhelp_sim_data)
print(cox_surv)
# creates a survival curve from a formula (e.g. the Kaplan-Meier), a previously fitted
# Cox model, or a previously fitted accelerated failure time model
surv_KM <- survfit(Surv(as.numeric(dayslink), linkstatus) ~ treat)
print(surv_KM)
plot(surv_KM, lty=1:2, lwd=2, col=c(4,2))
title("Product-Limit Survival Estimates")
legend(200, .8, legend=c("Control", "Treatment"), lty=c(1,2), lwd=2, col=c(4,2), cex=1.4)
Scatterplot with model fit
plot(age, i1, ylim=c(0,40), type="n", cex.lab=1.4, cex.axis=1.4)
points(age[substance=="cocaine"], i1[substance=="cocaine"], pch="coca")
# lowess is a locally-weighted polynomial regression smooth fit
lines(lowess(age[substance=="cocaine"],
i1[substance=="cocaine"], delta = 0.01), lty=2, lwd=2)
points(age[substance=="alcohol"], i1[substance=="alcohol"], pch="alco")
lines(lowess(age[substance=="alcohol"],
i1[substance=="alcohol"], delta = 0.01), lty=1, lwd=2)
points(age[substance=="heroin"], i1[substance=="heroin"], pch="hero")
lines(lowess(age[substance=="heroin"],
i1[substance=="heroin"], delta = 0.01), lty=3, lwd=2)
legend(50, 42, legend=c("cocaine", "alcohol", "heroin"), lty=1:3,
cex=1.4, lwd=2, pch=c("c", "a", "h"))
Regression with prediction intervals
predicted <- fitted(lm1); predicted residuals <- residuals(lm1) quantile(residuals)
Linear regression with interaction
lm1 <- lm(a15a ~ substance * age) lm2 <- lm(a15a ~ substance + age) anova(lm2, lm1) summary(lm1) names(summary(lm1)) summary(lm1)$\$ $sigma names(lm1) lm1$\$ $coefficients vcov(lm1)
Regression diagnostics
# graphical model diagnostics par <- par(mfrow=c(2, 2), mar=c(4, 4, 2, 2)+.5) plot(lm1) par(par)
std.res <- rstandard(lm1) hist(std.res, main="", xlab="standardized residuals", col="gray", freq=F) lines(density(std.res), lwd=2) xvals <- seq(from=min(std.res), to=max(std.res), length=100) lines(xvals, dnorm(xvals, mean(std.res), sd(std.res)), lty=3)
Fitting stratified regression models
Frequently, one needs to perform similar analyses in several groups, i.e., fitting separate linear regression models for each substance abuse group.
unique_vals <- unique(substance[!is.na(substance)]); unique_vals
numb_unique <- length(unique_vals)
formula <- as.formula(i1 ~ age)
p <- length(coef(lm(formula)))
res <- matrix(rep(0, numb_unique*p), p, numb_unique)
for (i in 1:length(unique_vals)) {
res[,i] <- coef(lm(formula, subset=substance==unique_vals[i]))
}
rownames(res) <- c("intercept","slope")
colnames(res) <- unique_vals
res
Two-way analysis of variance (ANOVA)
aov1 <- aov(cesd ~ sub * genfem, data=help_sim_data) aov2 <- aov(cesd ~ sub + genfem, data=help_sim_data) resid <- residuals(aov2) anova(aov2, aov1)
logLik(aov1) # compute the exact Log-Likelihood of each ANOVA model logLik(aov2) lldiff <- logLik(aov1)[1] - logLik(aov2)[1] lldiff 1 - pchisq(2*lldiff, 2) # test for model differences summary(aov1) aov1 contrasts(sub) <- contr.SAS(4) contrasts(sub) aov3 <- lm(cesd ~ sub + genfem, data=help_sim_data) summary(aov3) AIC(aov1); BIC(aov1) # Akaike and Bayesian Information criteria (smaller values yield better models AIC(aov2); BIC(aov2) AIC(aov3); BIC(aov3)
Multiple comparisons
# generate confidence intervals on the differences between the means of the levels # of a factor with the specified family-wise probability of coverage. # The intervals are based on the Studentized range statistic, Tukey's # ‘Honest Significant Difference’ method multiple <- TukeyHSD(aov(cesd ~ sub, data=help_sim_data), "sub") multiple plot(multiple)
Logistic and Poisson Regression
# logistic (binomial outcomes) regression
logistres <- glm(factor(homeless, levels=c('0','1')) ~ female + i1 + substance + sexrisk + indtot, na.action = na.omit, binomial)
logistres
summary(logistres)
names(summary(logistres))
coeff.like.SAS <- summary(logistres)$\$ $coefficients
coeff.like.SAS
poissonres <- glm(i2 ~ female + substance + age, na.action = na.omit, poisson) summary(poissonres)
Negative binomial regression
library(MASS) NB_Res <- glm.nb(a15a ~ female + substance + age) summary(NB_Res)
Ordinal logit regression
library(MASS) sex_risk_cat <- as.factor(as.numeric(sexrisk>=1) + as.numeric(sexrisk>=5)) ord_logit <- polr(sex_risk_cat ~ cesd + pcs) summary(ord_logit)
Generalized additive model
install.packages("gam"); library(gam)
gam_reg <- gam(cesd ~ female + lo(pcs) + substance) # fitting a GAM using a smooth version of "pcs"
summary(gam_reg)
coefficients(gam_reg)
plot(gam_reg, terms=c("lo(pcs)"), se=2, lwd=3)
abline(h=0)
plot(gam_reg, terms=c("substance"), se=2, lwd=3)
library(nlme)
fm1 <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary,
correlation = corAR1(form = ~ 1 | Mare))
# variance increases as a power of the absolute fitted values
fm2 <- update(fm1, weights = varPower())
summary(fm1)
Random effects model
Generalized estimating equations (GEE) model
install.packages("gee"); library(gee)
sorted_data <- help_sim_data [order(help_sim_data$\$ $ID),]
# attach(sorted_data)
gee_res <- gee(formula = g1b ~ treat + substance, id=ID, data=sorted_data,
family=binomial, na.action=na.omit, corstr="exchangeable")
coef(gee_res)
sqrt(diag(gee_res$\$ $robust.variance))
gee_res$\$ $working.correlation
Generalized linear mixed model
library(lme4)
glmm_res <- glmer(g1b ~ treat + cesd + (1|ID),
family=binomial(link="logit"), data=help_sim_data)
summary(glmm_res)
Proportional hazards regression model
library(survival)
cox_surv <- coxph(Surv(dayslink, linkstatus) ~ treat + age + female +
cesd, method="breslow", data=help_sim_data)
print(cox_surv)
# Fit a Cox proportional hazards regression model
coxph(formula = Surv(dayslink, linkstatus) ~ treat + age + female + cesd, data = help_sim_data, method = "breslow")
Cronbach’s $\alpha$
library(multilevel)
cronbach(cbind(f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k,
f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t))
# assess the consistency of the core 20 measures in the simulated HELP dataset
res <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h + f1i + f1j + f1k +
f1l + f1m + f1n + f1o + f1p + f1q + f1r + f1s + f1t, factors=3,
rotation="varimax", na.action=na.omit, scores="regression")
print(res, cutoff=0.45, sort=TRUE)
Factor analysis
# Factor analysis explains the (observed) variability of a multivariate dataset in terms of
# various underlying unobservable factors. The observed data elements are expressed as
# linear functions of the factors and some random error.
# This is an example of a maximum likelihood factor analysis of "cesd"
# (Center for Epidemiologic Studies–Depression) scale.
fact_ana <- factanal(~ f1a + f1b + f1c + f1d + f1e + f1f + f1g + f1h +
f1i + f1j + f1k + f1l + f1m + f1n + f1o + f1p + f1q + f1r +
f1s + f1t, factors=3, rotation="varimax", na.action=na.omit,
scores="regression")
print(fact_ana , cutoff=0.45, sort=TRUE)
Recursive partitioning
# Recursive partitioning creates a decision tree to classify the observations based on # categorical predictors. Here we classify subjects based on their race, # using the following predictors: gender, drinking, primary substance, RAB sexrisk, MCS, and PCS. library(rpart) subst <- as.factor(substance) homelessness.rpart <- rpart(racegrp ~ female + i1 + sub + sexrisk + mcs + pcs, method="class", data=help_sim_data) printcp(homelessness.rpart) plot(homelessness.rpart) text(homelessness.rpart) race <- racegrp[i1<3.5] pcslow <- pcs[i1<3.5]<=31.94 length(pcslow==F) # count the number of subjects with (pcslow==F) table(race, pcslow) # in these data, 3 of 18 subjects with low PCS scores are white, # and 48 of 154) of those with PCS scores above the threshold are white
Linear discriminant analysis
# Fisher's linear discriminant analysis (LDA) identifies linear combinations of # variables splitting the data into separate classes. LDA can help us # identify clusters homelessness, using a uniform prior
library(MASS) ngroups <- length(unique(homeless)) lda_model <- lda(homeless ~ age + cesd + mcs + pcs, prior=rep(1/(ngroups-1), (ngroups-1))) print(lda_model) plot(lda_model) plot(lda_model, dimen=1, type="both")
Hierarchical clustering
# To group similar variables or cluster observations a hierarchical structure # may be constructed. hclust() and kmeans() are examples of such clustering # packages cor_mat <- cor(cbind(mcs, pcs, cesd, i1, a15a, homeless, sexrisk), use="pairwise.complete.obs") hier_clust <- hclust(dist(cor_mat)) plot(hier_clust)
km_clust <- kmeans(cor_mat, 5, nstart = 25) plot(cor_mat, col = km_clust$cluster) points(km_clust$centers, col = 1:5, pch = 8)
ROC curve
library(ROCR) pred <- prediction(cesd, g1b) # standardize the input prediction data (Depression scale vs. suicide) AUC<- slot(performance(pred, "auc"), "y.values")1
plot(performance(pred, "tpr", "fpr"), print.cutoffs.at=seq(from=20, to=50, by=5),
text.adj=c(1, -.5), lwd=2)
lines(c(0, 1), c(0, 1))
text(.6, .2, paste("AUC=", round(AUC,3), sep=""), cex=1.4)
title("ROC Curve for Depression scale vs. Suicide Prediction Model")
Multiple imputation
miss_data <- with(help_sim_data, data.frame(homeless, female, i1, sexrisk, indtot, mcs, pcs))
summary(miss_data)
# library(Hmisc)
# na.pattern(miss_data)
install.packages("mi")
library(mi)
inf <- mi.info(miss_data)
# run the imputation without data transformation
IMP <- mi(miss_data, info=inf, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=10))
# run the imputation with data transformation
miss_data.transformed <- mi.preprocess(miss_data, inf)
IMP_T <- mi(miss_data.transformed, n.iter=6, check.coef.convergence=TRUE, add.noise=noise.control(post.run.iter=6))
# Impute without noise
IMP_no_noise <- mi(miss_data.transformed, n.iter=6, add.noise=FALSE)
# check convergence, should get FALSE here because only n.iter is small converged(IMP, check = "data") # converged(IMP, check = "coefs") # visually check the imputation plot(IMP); plot(IMP_T)
Propensity score modeling
References
- Evaluation and Linkage to Primary (HELP) Care study
- Data formats: help.csv and help.Rdata
- Study and Data specifications
- HELP study documentation
- SAS and R Data Management, Statistical Analysis, and Graphics, Kleinman / Horton, 2009
- SOCR Home page: http://www.SOCR.ucla.edu
Translate this page: