SMHS Usage Rfundamentals
Contents
SMHS EBook Usage - Some R Fundamentals
Command-line Help
> apropos("nova") # within R keyword search/help
Saving R commands/scripts and HTML outputs
It's easy to save R scripts and commands that can be reused, shared and modified later or by others, as well as to create HTML or PDF files with R, Knitr, MiKTeX.
Needed packages
You may need to first install Knitr and Markdown (and/or MiKTeX/Pandoc). Open R or RStudio and install the packages:
# Install knitr install.packages("knitr") install.packages("markdown")
Create a .Rmd File Containing Your Complete Protocol
Open RStudio, click File --> New --> R Markdown, and enter meta-data (e.g., Author). Save as example.Rmd -- this generates an R Markdown document. A new markdown (.Rmd) file will be created that is populated with an example and contains the both R code and knitr code. Code between the sets of three apostrophes ```{r}
and ```
is pure R code. Code outside of these segments is knitr code (necessary for the export of entire protocol to HTML/PDF/etc). The R code tells R what to do and the Knitr code creates the output doc (e.g., HTML file). Below is an example of a fresh new Rmd file that is automatically generated as an example:
--- title: "SOCR Demo" author: "SOCR Team" date: "April 12, 2016" output: html_document --- This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>. When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: ```{r} summary(cars) ``` You can also embed plots, for example: ```{r, echo=FALSE} plot(cars) ``` Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
Example: Generate an R HTML Export using the SOCR PD Data
This example illustrates a real data analysis of Parkinson's Disease using the SOCR PD Dataset. You can save this script into a text file (e.g., SOCR_PD_Example.Rmd) and load it into RStudio. Highlight the entire script in the shell and click Run. This will download the data, run the entire analytics protocol and generate an HTML output like this.
--- title: "R HTML Export Demo using the SOCR PD Data" author: "SOCR Team" date: "April 12, 2016" output: html_document --- ## Note: The opening "```{r}" and closing "```" wrapping strings indicate knitr code ## Data Import from SOCR Data server ```{r} library(rvest) wiki <- read_html("http://wiki.socr.umich.edu/index.php/SOCR_Data_PD_BiomedBigMetadata") html_nodes(wiki, "#content") pd <- html_table(html_nodes(wiki,"table")$1$) ``` ## Data Preparation (factorization of patient diagnosis) ```{r} ## Convert Dx to a dichotomous variable ## pd$\$$Dx <- gsub("PD", 0, pd$\$$Dx) ## pd$\$$Dx <- gsub("HC", 1, pd$\$$Dx) ## pd$\$$Dx <- gsub("SWEDD", 2, pd$\$$Dx) pd$\$$Dx <- as.factor(pd$\$$Dx) ``` ## Multivariate Linear Regression Analysis (predict binary DX) ```{r} ## Full Model m1 <- glm(Dx ~ L_caudate_Volume + R_caudate_Volume + L_putamen_Volume + R_putamen_Volume + L_hippocampus_Volume + R_hippocampus_Volume + cerebellum_Volume + L_lingual_gyrus_Volume + R_lingual_gyrus_Volume + L_fusiform_gyrus_Volume + R_fusiform_gyrus_Volume + Sex + Weight + Age + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=pd, family="binomial") summary(m1) par(mfrow=c(2,2)) # default layout() call may not be sufficiently large to contain the entire plot, so we extent it plot(m1) ``` ## Variable Selection ```{r} library(leaps) library(MASS) reg1 <- regsubsets(Dx ~ L_caudate_Volume + R_caudate_Volume + L_putamen_Volume + R_putamen_Volume + L_hippocampus_Volume + R_hippocampus_Volume + cerebellum_Volume + L_lingual_gyrus_Volume + R_lingual_gyrus_Volume + L_fusiform_gyrus_Volume + R_fusiform_gyrus_Volume + Sex + Weight + Age + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=pd) summary(reg1) ``` ## Forward Selection ```{r} reg2 <- regsubsets(Dx ~ L_caudate_Volume + R_caudate_Volume + L_putamen_Volume + R_putamen_Volume + L_hippocampus_Volume + R_hippocampus_Volume + cerebellum_Volume + L_lingual_gyrus_Volume + R_lingual_gyrus_Volume + L_fusiform_gyrus_Volume + R_fusiform_gyrus_Volume + Sex + Weight + Age + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=pd, method="forward") summary(reg2) ``` ## Backward Selection ```{r} reg3 <- regsubsets(Dx ~ L_caudate_Volume + R_caudate_Volume + L_putamen_Volume + R_putamen_Volume + L_hippocampus_Volume + R_hippocampus_Volume + cerebellum_Volume + L_lingual_gyrus_Volume + R_lingual_gyrus_Volume + L_fusiform_gyrus_Volume + R_fusiform_gyrus_Volume + Sex + Weight + Age + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=pd, method="backward") summary(reg3) ``` ## Reduced Model ```{r} m2 <- glm(Dx ~ R_caudate_Volume + R_fusiform_gyrus_Volume + Weight + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=pd, family="binomial") summary(m2) par(mfrow=c(2,2)) plot(m2) ``` ## Removing Bad Leverage Points ```{r} n <- dim(pd)[1] case <- c(1:n) cooks <- cooks.distance(m2) cooks_outlier <- case[cooks>4/(length(pd$\$$Dx)-2)] cooks_outlier m2 <- glm(Dx ~ R_caudate_Volume + R_fusiform_gyrus_Volume + Weight + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=pd[-cooks_outlier,], family="binomial") summary(m2) par(mfrow=c(2,2)) plot(m2) ``` ## Log Transformation ```{r} m3 <- glm(Dx ~ log(R_caudate_Volume) + log(R_fusiform_gyrus_Volume) + log(Weight) + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=pd[-cooks_outlier,], family="binomial") summary(m3) par(mfrow=c(2,2)) plot(m3) ``` ## Multicollinearity Check ```{r} library(car) vif(m2) ## Here, each of the variance inflaction factors is less than 5, so the multicollinearity between these predictors does not affect the result significantly. ``` ## Likelihood Ratio Test ```{r} with(m2, null.deviance - deviance) with(m2, df.null - df.residual) with(m2, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE)) ## The chi-square of 403 with 8 degrees of freedom and an associated p-value of less than 0.001 indicates that the model as a whole fits significantly better than an empty model. ``` ## Linear Discriminant Analysis ```{r} ## Consider time = 0 only pdzero <- pd[which(pd$\$$Time==0),] ## Split dataset into 70% training and 30% testing set set.seed(666) train_index <- sample(dim(pdzero)[1], trunc(dim(pdzero)[1]*0.7)) train <- pdzero[train_index,] test <- pdzero[-train_index,] library(MASS) lda.fit <- lda(Dx~L_caudate_Volume + R_caudate_Volume + L_putamen_Volume + R_putamen_Volume + L_hippocampus_Volume + R_hippocampus_Volume + cerebellum_Volume + L_lingual_gyrus_Volume + R_lingual_gyrus_Volume + L_fusiform_gyrus_Volume + R_fusiform_gyrus_Volume + Sex + Weight + Age + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=train) lda.predict <- predict(lda.fit, test) table(lda.predict$\$$class, test$\$$Dx) ``` ## Logistic Regression Analysis ```{r} ## Break up the three-level dependent variable into two binary variables pdzero$\$$PD_YN <- ifelse(pdzero$\$$Dx=="PD", 1, 0) pdzero$\$$SWEDD_YN <- ifelse(pdzero$\$$Dx=="SWEDD", 1, 0) ## Split the dataset into 70% training and 30% testing set set.seed(666) train_index <- sample(dim(pdzero)[1], trunc(dim(pdzero)[1]*0.7)) train <- pdzero[train_index,] test <- pdzero[-train_index,] ## PD_YN as outcome log1 <- glm(PD_YN ~ L_caudate_Volume + R_caudate_Volume + L_putamen_Volume + R_putamen_Volume + L_hippocampus_Volume + R_hippocampus_Volume + cerebellum_Volume + L_lingual_gyrus_Volume + R_lingual_gyrus_Volume + L_fusiform_gyrus_Volume + R_fusiform_gyrus_Volume + Sex + Weight + Age + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=train, family="binomial") probPD <- predict(log1, test, type="response") median(probPD) # 0.30 resultPD <- ifelse(probPD>0.3, 1, 0) table(resultPD, test$\$$PD_YN) ## SWEDD as outcome log2 <- glm(SWEDD_YN ~ L_caudate_Volume + R_caudate_Volume + L_putamen_Volume + R_putamen_Volume + L_hippocampus_Volume + R_hippocampus_Volume + cerebellum_Volume + L_lingual_gyrus_Volume + R_lingual_gyrus_Volume + L_fusiform_gyrus_Volume + R_fusiform_gyrus_Volume + Sex + Weight + Age + UPDRS_part_I + UPDRS_part_II + UPDRS_part_III + chr12_rs34637584_GT + chr17_rs11868035_GT, data=train, family="binomial") probSW <- predict(log2, test, type="response") median(probSW) # 0.301 resultSW <- ifelse(probSW>0.2, 1, 0) table(resultSW, test$\$$SWEDD_YN) ## Combine the results of two logistic model together finalpred <- NULL for (i in 1:dim(test)[1]) { if (resultPD[i]==1 & resultSW[i]==0) { finalpred[i] <- "PD" } if (resultPD[i]==1 & resultSW[i]==1) { finalpred[i] <- ifelse(probPD[i]>probSW[i], "PD", "SW") } if (resultPD[i]==0 & resultSW[i]==1) { finalpred[i] <- "SW" } if (resultPD[i]==0 & resultSW[i]==0) { finalpred[i] <- "HC" } } table(finalpred, test$\$$Dx) ```
References
- SOCR Home page: http://www.socr.umich.edu
Translate this page: