Difference between revisions of "SMHS DataSimulation"
(→Testing section) |
(→Scientific Methods for Health Sciences - Data Simulation) |
||
Line 50: | Line 50: | ||
mean(data_1$age) | mean(data_1$age) | ||
sd(data_1$age) | sd(data_1$age) | ||
+ | |||
+ | |||
+ | Simulate new data to match the properties/characteristics of observed data | ||
+ | |||
+ | i2 [0: 184] | ||
+ | age m=34,sd=12 | ||
+ | treat {0,1} | ||
+ | homeless {0,1} | ||
+ | pcs 14-75 | ||
+ | mcs 7-62 | ||
+ | cesd 0–60 | ||
+ | indtot 4-45 | ||
+ | pss_fr 0-14 | ||
+ | drugrisk 0-21 | ||
+ | sexrisk | ||
+ | satreat (0=no,1=yes) | ||
+ | female (0=no,1=yes) | ||
+ | racegrp (black, white, other) | ||
+ | |||
+ | # Demographics variables | ||
+ | |||
+ | Sex <- ifelse(runif(NumSubj)<.5,0,1) | ||
+ | |||
+ | Weight <- as.integer(rnorm(NumSubj, 80,10)) | ||
+ | |||
+ | Age <- as.integer(rnorm(NumSubj, 62,10)) | ||
+ | |||
+ | |||
+ | |||
+ | # Diagnosis: | ||
+ | |||
+ | Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82)) | ||
+ | |||
+ | |||
+ | |||
+ | # Genetics | ||
+ | |||
+ | chr12_rs34637584_GT <- c(ifelse(runif(100)<.3,0,1), ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1)) # NumSubj Bernoulli trials | ||
+ | |||
+ | chr17_rs11868035_GT <- c(ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.4,0,1), ifelse(runif(82)<.5,0,1)) # NumSubj Bernoulli trials | ||
+ | |||
+ | |||
+ | |||
+ | # Clinical # rpois(NumSubj, 15) + rpois(NumSubj, 6) | ||
+ | |||
+ | UPDRS_part_I <- c( ifelse(runif(100)<.7,0,1)+ifelse(runif(100)<.7,0,1), | ||
+ | |||
+ | ifelse(runif(100)<.6,0,1)+ ifelse(runif(100)<.6,0,1), | ||
+ | |||
+ | ifelse(runif(82)<.4,0,1)+ ifelse(runif(82)<.4,0,1) ) | ||
+ | |||
+ | UPDRS_part_II <- c(sample.int(20, 100, replace=T), sample.int(14, 100, replace=T), | ||
+ | |||
+ | sample.int(18, 82, replace=T) ) | ||
+ | |||
+ | UPDRS_part_III <- c(sample.int(30, 100, replace=T), sample.int(20, 100, replace=T), | ||
+ | |||
+ | sample.int(25, 82, replace=T) ) | ||
+ | |||
+ | |||
+ | # Time: VisitTime – done automatically below in aggregator | ||
+ | |||
+ | # Data (putting all components together) | ||
+ | |||
+ | sim_PD_Data <- cbind( | ||
+ | |||
+ | rep(Cases, each= NumTime), # Cases | ||
+ | |||
+ | rep(L_caudate_ComputeArea, each= NumTime), # Imaging | ||
+ | |||
+ | rep(Sex, each= NumTime), # Demographics | ||
+ | |||
+ | rep(Weight, each= NumTime), | ||
+ | |||
+ | rep(Age, each= NumTime), | ||
+ | |||
+ | rep(Dx, each= NumTime), # Dx | ||
+ | |||
+ | rep(chr12_rs34637584_GT, each= NumTime), # Genetics | ||
+ | |||
+ | rep(chr17_rs11868035_GT, each= NumTime), | ||
+ | |||
+ | rep(UPDRS_part_I, each= NumTime), # Clinical | ||
+ | |||
+ | rep(UPDRS_part_II, each= NumTime), | ||
+ | |||
+ | rep(UPDRS_part_III, each= NumTime), | ||
+ | |||
+ | rep(c(0,6,12,18), NumSubj) # Time | ||
+ | |||
+ | ) | ||
+ | |||
+ | |||
+ | # Assign the column names | ||
+ | |||
+ | colnames(sim_PD_Data) <- c( | ||
+ | "Cases", | ||
+ | "L_caudate_ComputeArea", | ||
+ | "Sex", "Weight", "Age", | ||
+ | "Dx", "chr12_rs34637584_GT", "chr17_rs11868035_GT", | ||
+ | "UPDRS_part_I", "UPDRS_part_II", "UPDRS_part_III", | ||
+ | "Time" | ||
+ | ) | ||
+ | |||
+ | |||
+ | |||
+ | # some QC | ||
+ | |||
+ | summary(sim_PD_Data) | ||
+ | |||
+ | dim(sim_PD_Data) | ||
+ | |||
+ | head(sim_PD_Data) | ||
+ | |||
+ | |||
Revision as of 12:16, 20 January 2016
Contents
Scientific Methods for Health Sciences - Data Simulation
Importing observed data for exploratory analytics
Using the SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset, we can extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv).
# data_1 <- read.csv('C:\\Users\\Dinov\\Desktop\\00_Tiny_SOCR_HELP_Data_Simmulation.csv',as.is=T, header=T) # data_1 = read.csv(file.choose( )) # data_1 <- read.table('C:\\Users\\Dinov\\Desktop\\00_Tiny_SOCR_HELP_Data_Simmulation.csv', header=TRUE, sep=",", row.names="ID") attach(data_1) # to ensure all variables are accessible within R, e.g., using “age” instead of data_1$age # i2 maximum number of drinks (standard units) consumed per day (in the past 30 days; range 0–184) see also i1 # treat randomization group (0=usual care, 1=HELP clinic) # pcs SF-36 Physical Component Score (range 14–75) # mcs SF-36 Mental Component Score (range 7–62) # cesd Center for Epidemiologic Studies Depression scale (range 0–60) # indtot Inventory of Drug Use Consequences (InDUC) total score (range 4–45) # pss_fr perceived social supports (friends, range 0–14) see also dayslink # drugrisk Risk-Assessment Battery (RAB) drug risk score (range 0–21) # satreat any BSAS substance abuse treatment at baseline (0=no,1=yes)
ID | i2 | age | treat | homeless | pcs | mcs | cesd | indtot | pss_fr | drugrisk | sexrisk | satreat | female | substance | racegrp |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0 | 25 | 0 | 0 | 49 | 7 | 46 | 37 | 0 | 1 | 6 | 0 | 0 | cocaine | black |
2 | 18 | 31 | 0 | 0 | 48 | 34 | 17 | 48 | 0 | 0 | 11 | 0 | 0 | alcohol | white |
3 | 39 | 36 | 0 | 0 | 76 | 9 | 33 | 41 | 12 | 19 | 4 | 0 | 0 | heroin | black |
… | |||||||||||||||
100 | 81 | 22 | 0 | 0 | 37 | 17 | 19 | 30 | 3 | 0 | 10 | 0 | 0 | alcohol | other |
Testing section
Testing section
# Per-column summary of the imported data set.
summary(data_1)

# Draw 200 values from a Normal distribution (mean 10, sd 20) and plot them.
# The mean argument is spelled out in full rather than relying on partial
# argument matching.
x.norm <- rnorm(n = 200, mean = 10, sd = 20)
hist(x.norm, main = "N(10,20) Histogram")

# Sample mean and standard deviation of the age column.
mean(data_1$age)
sd(data_1$age)
Simulate new data to match the properties/characteristics of observed data
# Notes on the observed variables (name, then range or levels):
#   i2        [0: 184]
#   age       m=34, sd=12
#   treat     {0,1}
#   homeless  {0,1}
#   pcs       14-75
#   mcs       7-62
#   cesd      0-60
#   indtot    4-45
#   pss_fr    0-14
#   drugrisk  0-21
#   sexrisk
#   satreat   (0=no, 1=yes)
#   female    (0=no, 1=yes)
#   racegrp   (black, white, other)

# NOTE(review): NumSubj, NumTime, Cases and L_caudate_ComputeArea are used
# below but are not defined in this section; they must exist before this runs.

# Demographics variables ----
# ifelse(runif(n) < p, 0, 1) yields 0 with probability p and 1 otherwise.
Sex <- ifelse(runif(NumSubj) < .5, 0, 1)
Weight <- as.integer(rnorm(NumSubj, 80, 10))
Age <- as.integer(rnorm(NumSubj, 62, 10))

# Diagnosis ----
# Three groups with fixed sizes (100 + 100 + 82 = 282 subjects).
Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82))

# Every per-group vector below uses the same 100/100/82 split, so the
# subject count has to agree with it; fail early instead of recycling.
stopifnot(length(Dx) == NumSubj)

# Genetics ----
# One 0/1 draw per subject; the probability of a 0 differs by diagnosis group.
chr12_rs34637584_GT <- c(
  ifelse(runif(100) < .3, 0, 1),
  ifelse(runif(100) < .6, 0, 1),
  ifelse(runif(82) < .4, 0, 1)
)
chr17_rs11868035_GT <- c(
  ifelse(runif(100) < .7, 0, 1),
  ifelse(runif(100) < .4, 0, 1),
  ifelse(runif(82) < .5, 0, 1)
)

# Clinical ----
# Alternative considered: rpois(NumSubj, 15) + rpois(NumSubj, 6)
# Part I is the sum of two independent 0/1 draws, so it takes values 0, 1, 2.
UPDRS_part_I <- c(
  ifelse(runif(100) < .7, 0, 1) + ifelse(runif(100) < .7, 0, 1),
  ifelse(runif(100) < .6, 0, 1) + ifelse(runif(100) < .6, 0, 1),
  ifelse(runif(82) < .4, 0, 1) + ifelse(runif(82) < .4, 0, 1)
)
# Parts II and III are uniform integers on 1..max, with a per-group max.
UPDRS_part_II <- c(
  sample.int(20, 100, replace = TRUE),
  sample.int(14, 100, replace = TRUE),
  sample.int(18, 82, replace = TRUE)
)
UPDRS_part_III <- c(
  sample.int(30, 100, replace = TRUE),
  sample.int(20, 100, replace = TRUE),
  sample.int(25, 82, replace = TRUE)
)

# Data (putting all components together) ----
# Each subject-level value is repeated NumTime times (one row per visit).
# A data frame is used instead of cbind(): binding the character Dx vector
# with numeric vectors would coerce every column to character and make the
# summary() check below uninformative. Column names are set here directly.
sim_PD_Data <- data.frame(
  Cases = rep(Cases, each = NumTime),
  L_caudate_ComputeArea = rep(L_caudate_ComputeArea, each = NumTime), # Imaging
  Sex = rep(Sex, each = NumTime), # Demographics
  Weight = rep(Weight, each = NumTime),
  Age = rep(Age, each = NumTime),
  Dx = rep(Dx, each = NumTime),
  chr12_rs34637584_GT = rep(chr12_rs34637584_GT, each = NumTime), # Genetics
  chr17_rs11868035_GT = rep(chr17_rs11868035_GT, each = NumTime),
  UPDRS_part_I = rep(UPDRS_part_I, each = NumTime), # Clinical
  UPDRS_part_II = rep(UPDRS_part_II, each = NumTime),
  UPDRS_part_III = rep(UPDRS_part_III, each = NumTime),
  # Visit times start at 0 in steps of 6: 0, 6, 12, 18 when NumTime is 4.
  Time = rep(seq(0, by = 6, length.out = NumTime), NumSubj),
  stringsAsFactors = FALSE
)

# Some QC ----
summary(sim_PD_Data)
dim(sim_PD_Data)
head(sim_PD_Data)
- SOCR Home page: http://www.socr.umich.edu
Translate this page: