Difference between revisions of "SMHS DataSimulation"
(→Testing section) |
(→Simulate New Data to Match the Properties/Characteristics of Observed Data) |
||
(9 intermediate revisions by 2 users not shown) | |||
Line 2: | Line 2: | ||
===Importing observed data for exploratory analytics=== | ===Importing observed data for exploratory analytics=== | ||
+ | |||
Using the [[SOCR_Simulated_HELP_Data|SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset]] we can [https://umich.instructure.com/files/354289/download?download_frd=1 extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv)]. | Using the [[SOCR_Simulated_HELP_Data|SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset]] we can [https://umich.instructure.com/files/354289/download?download_frd=1 extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv)]. | ||
Line 9: | Line 10: | ||
attach(data_1) | attach(data_1) | ||
− | # to ensure all variables are accessible within R, e.g., using age instead of data_1$$age | + | # to ensure all variables are accessible within R, e.g., using age instead of data_1$\$$age |
# i2 maximum number of drinks (standard units) consumed per day (in the past 30 days range 0–184) see also i1 | # i2 maximum number of drinks (standard units) consumed per day (in the past 30 days range 0–184) see also i1 | ||
# treat randomization group (0=usual care, 1=HELP clinic) | # treat randomization group (0=usual care, 1=HELP clinic) | ||
Line 21: | Line 22: | ||
===Fragment of the data=== | ===Fragment of the data=== | ||
+ | |||
<center> | <center> | ||
{| class="wikitable" style="text-align:center; " border="1" | {| class="wikitable" style="text-align:center; " border="1" | ||
Line 45: | Line 47: | ||
hist(x.norm, main="N(10,20) Histogram") | hist(x.norm, main="N(10,20) Histogram") | ||
hist(x.norm, main="N(10,20) Histogram") | hist(x.norm, main="N(10,20) Histogram") | ||
− | mean(data_1$$$age) | + | mean(data_1$\$$age) |
− | sd(data_1$$$age) | + | sd(data_1$\$$age) |
− | Simulate | + | ==Simulate New Data to Match the Properties/Characteristics of Observed Data== |
− | + | *i2 [0: 184] | |
− | + | *age m=34,sd=12 | |
− | + | *treat {0,1} | |
− | + | *homeless {0,1} | |
− | + | *pcs 14-75 | |
− | + | *mcs 7-62 | |
− | + | *cesd 0–60 | |
− | + | *indtot 4-45 | |
− | + | *pss_fr 0-14 | |
− | + | *drugrisk 0-21 | |
− | + | *sexrisk | |
− | + | *satreat (0=no,1=yes) | |
− | + | *female (0=no,1=yes) | |
− | + | *racegrp (black, white, other) | |
+ | # Define number of subjects | ||
+ | NumSubj <- 282 | ||
+ | NumTime <- 4 | ||
+ | |||
+ | # Define data elements | ||
+ | # Cases | ||
+ | Cases <- c(2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 17, 18, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 37, 41, 42, 43, 44, 45, 53, 55, 58, 60, 62, 67, 69, 71, 72, 74, 79, 80, 85, 87, 90, 95, 97, 99, 100, 101, 106, 107, 109, 112, 120, 123, 125, 128, 129, 132, 134, 136, 139, 142, 147, 149, 153, 158, 160, 162, 163, 167, 172, 174, 178, 179, 180, 182, 192, 195, 201, 208, 211, 215, 217, 223, 227, 228, 233, 235, 236, 240, 245, 248, 250, 251, 254, 257, 259, 261, 264, 268, 269, 272, 273, 275, 279, 288, 289, 291, 296, 298, 303, 305, 309, 314, 318, 324, 325, 326, 328, 331, 332, 333, 334, 336, 338, 339, 341, 344, 346, 347, 350, 353, 354, 359, 361, 363, 364, 366, 367, 368, 369,370, 371, 372, 374, 375, 376, 377, 378, 381, 382, 384, 385, 386, 387, 389, 390, 393, 395, 398, 400, 410, 421, 423, 428, 433, 435, 443, 447, 449, 450, 451, 453, 454, 455, 456, 457, 458, 459, 460, 461, 465, 466, 467, 470, 471, 472, 476, 477, 478, 479, 480, 481, 483, 484, 485, 486, 487, 488, 489, 492, 493, 494, 496, 498, 501, 504, 507, 510, 513, 515, 528, 530, 533, 537, 538, 542, 545, 546, 549, 555, 557, 559, 560, 566, 572, 573, 576, 582, 586, 590, 592, 597, 603, 604, 611, 619, 621, 623, 624, 625, 631, 633, 634, 635, 637, 640, 641, 643, 644, 645, 646, 647, 648, 649, 650, 652, 654, 656, 658, 660, 664, 665, 670, 673, 677, 678, 679, 680, 682, 683, 686, 687, 688, 689, 690, 692) | ||
+ | |||
+ | # Imaging Biomarkers | ||
+ | L_caudate_ComputeArea <- rpois(NumSubj, 600) | ||
+ | L_caudate_Volume <- rpois(NumSubj, 800) | ||
+ | R_caudate_ComputeArea <- rpois(NumSubj, 893) | ||
+ | R_caudate_Volume <- rpois(NumSubj, 1000) | ||
+ | L_putamen_ComputeArea <- rpois(NumSubj, 900) | ||
+ | L_putamen_Volume <- rpois(NumSubj, 1400) | ||
+ | R_putamen_ComputeArea <- rpois(NumSubj, 1300) | ||
+ | R_putamen_Volume <- rpois(NumSubj, 3000) | ||
+ | L_hippocampus_ComputeArea <- rpois(NumSubj, 1300) | ||
+ | L_hippocampus_Volume <- rpois(NumSubj, 3200) | ||
+ | R_hippocampus_ComputeArea <- rpois(NumSubj, 1500) | ||
+ | R_hippocampus_Volume <- rpois(NumSubj, 3800) | ||
+ | cerebellum_ComputeArea <- rpois(NumSubj, 16700) | ||
+ | cerebellum_Volume <- rpois(NumSubj, 14000) | ||
+ | L_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300) | ||
+ | L_lingual_gyrus_Volume <- rpois(NumSubj, 11000) | ||
+ | R_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300) | ||
+ | R_lingual_gyrus_Volume <- rpois(NumSubj, 12000) | ||
+ | L_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3600) | ||
+ | L_fusiform_gyrus_Volume <- rpois(NumSubj, 11000) | ||
+ | R_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3300) | ||
+ | R_fusiform_gyrus_Volume <- rpois(NumSubj, 10000) | ||
+ | |||
# Demographics variables | # Demographics variables | ||
− | |||
Sex <- ifelse(runif(NumSubj)<.5,0,1) | Sex <- ifelse(runif(NumSubj)<.5,0,1) | ||
− | |||
Weight <- as.integer(rnorm(NumSubj, 80,10)) | Weight <- as.integer(rnorm(NumSubj, 80,10)) | ||
− | |||
Age <- as.integer(rnorm(NumSubj, 62,10)) | Age <- as.integer(rnorm(NumSubj, 62,10)) | ||
− | + | ||
− | |||
− | |||
# Diagnosis: | # Diagnosis: | ||
− | |||
Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82)) | Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82)) | ||
− | + | ||
− | |||
− | |||
# Genetics | # Genetics | ||
− | |||
chr12_rs34637584_GT <- c(ifelse(runif(100)<.3,0,1), ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1)) # NumSubj Bernoulli trials | chr12_rs34637584_GT <- c(ifelse(runif(100)<.3,0,1), ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1)) # NumSubj Bernoulli trials | ||
− | |||
chr17_rs11868035_GT <- c(ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.4,0,1), ifelse(runif(82)<.5,0,1)) # NumSubj Bernoulli trials | chr17_rs11868035_GT <- c(ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.4,0,1), ifelse(runif(82)<.5,0,1)) # NumSubj Bernoulli trials | ||
− | + | ||
− | |||
− | |||
# Clinical # rpois(NumSubj, 15) + rpois(NumSubj, 6) | # Clinical # rpois(NumSubj, 15) + rpois(NumSubj, 6) | ||
− | |||
UPDRS_part_I <- c( ifelse(runif(100)<.7,0,1)+ifelse(runif(100)<.7,0,1), | UPDRS_part_I <- c( ifelse(runif(100)<.7,0,1)+ifelse(runif(100)<.7,0,1), | ||
− | |||
ifelse(runif(100)<.6,0,1)+ ifelse(runif(100)<.6,0,1), | ifelse(runif(100)<.6,0,1)+ ifelse(runif(100)<.6,0,1), | ||
− | |||
ifelse(runif(82)<.4,0,1)+ ifelse(runif(82)<.4,0,1) ) | ifelse(runif(82)<.4,0,1)+ ifelse(runif(82)<.4,0,1) ) | ||
− | |||
UPDRS_part_II <- c(sample.int(20, 100, replace=T), sample.int(14, 100, replace=T), | UPDRS_part_II <- c(sample.int(20, 100, replace=T), sample.int(14, 100, replace=T), | ||
− | |||
sample.int(18, 82, replace=T) ) | sample.int(18, 82, replace=T) ) | ||
− | |||
UPDRS_part_III <- c(sample.int(30, 100, replace=T), sample.int(20, 100, replace=T), | UPDRS_part_III <- c(sample.int(30, 100, replace=T), sample.int(20, 100, replace=T), | ||
− | + | sample.int(25, 82, replace=T) ) | |
− | + | ||
− | |||
− | |||
# Time: VisitTime – done automatically below in aggregator | # Time: VisitTime – done automatically below in aggregator | ||
− | + | ||
# Data (putting all components together) | # Data (putting all components together) | ||
− | |||
sim_PD_Data <- cbind( | sim_PD_Data <- cbind( | ||
− | + | rep(Cases, each= NumTime), # Cases | |
− | + | rep(L_caudate_ComputeArea, each= NumTime), # Imaging | |
− | + | rep(Sex, each= NumTime), # Demographics | |
− | + | rep(Weight, each= NumTime), | |
− | + | rep(Age, each= NumTime), | |
− | + | rep(Dx, each= NumTime), # Dx | |
− | + | rep(chr12_rs34637584_GT, each= NumTime), # Genetics | |
− | + | rep(chr17_rs11868035_GT, each= NumTime), | |
− | + | rep(UPDRS_part_I, each= NumTime), # Clinical | |
− | + | rep(UPDRS_part_II, each= NumTime), | |
− | + | rep(UPDRS_part_III, each= NumTime), | |
− | + | rep(c(0,6,12,18), NumSubj) # Time | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
) | ) | ||
− | + | ||
− | |||
# Assign the column names | # Assign the column names | ||
− | |||
colnames(sim_PD_Data) <- c( | colnames(sim_PD_Data) <- c( | ||
"Cases", | "Cases", | ||
Line 150: | Line 148: | ||
"Time" | "Time" | ||
) | ) | ||
− | + | ||
− | |||
− | |||
# some QC | # some QC | ||
− | |||
summary(sim_PD_Data) | summary(sim_PD_Data) | ||
− | |||
dim(sim_PD_Data) | dim(sim_PD_Data) | ||
− | |||
head(sim_PD_Data) | head(sim_PD_Data) | ||
Latest revision as of 15:50, 7 August 2016
Scientific Methods for Health Sciences - Data Simulation
Importing observed data for exploratory analytics
Using the SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset we can extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv).
# data_1 <- read.csv('00_Tiny_SOCR_HELP_Data_Simmulation.csv',as.is=T, header=T) # data_1 = read.csv(file.choose( )) # data_1 <- read.table('00_Tiny_SOCR_HELP_Data_Simmulation.csv', header=TRUE, sep=",", row.names="ID")
attach(data_1) # to ensure all variables are accessible within R, e.g., using age instead of data_1$\$$age # i2 maximum number of drinks (standard units) consumed per day (in the past 30 days range 0–184) see also i1 # treat randomization group (0=usual care, 1=HELP clinic) # pcs SF-36 Physical Component Score (range 14-75) # mcs SF-36 Mental Component Score(range 7-62) # cesd Center for Epidemiologic Studies Depression scale (range 0–60) # indtot Inventory of Drug Use Con-sequences (InDUC) total score (range 4–45) # pss_fr perceived social supports (friends, range 0–14) see also dayslink # drugrisk Risk-Assessment Battery(RAB) drug risk score (range0–21) # satreat any BSAS substance abuse treatment at baseline (0=no,1=yes) ==='"`UNIQ--h-2--QINU`"'Fragment of the data=== <center> {| class="wikitable" style="text-align:center; " border="1" |- ! ID ||i2 ||age ||treat ||homeless ||pcs ||mcs ||cesd ||indtot ||pss_fr ||drugrisk ||sexrisk ||satreat ||female ||substance ||racegrp |- | 1 ||0 ||25 ||0 ||0 ||49 ||7 ||46 ||37 ||0 ||1 ||6 ||0 ||0 ||cocaine ||black |- | 2 ||18 ||31 ||0 ||0 ||48 ||34 ||17 ||48 ||0 ||0 ||11 ||0 ||0 ||alcohol ||white |- | 3 ||39 ||36 ||0 ||0 ||76 ||9 ||33 ||41 ||12 ||19 ||4 ||0 ||0 ||heroin ||black |- | … || || || || || || || || || || || || || || || |- | 100 ||81 ||22 ||0 ||0 ||37 ||17 ||19 ||30 ||3 ||0 ||10 ||0 ||0 ||alcohol ||other |} </center> ==='"`UNIQ--h-3--QINU`"'Testing section=== summary(data_1) x.norm <- rnorm(n=200, m=10, sd=20) hist(x.norm, main="N(10,20) Histogram") hist(x.norm, main="N(10,20) Histogram") mean(data_1$\$$age) sd(data_1$\$$age)
Simulate New Data to Match the Properties/Characteristics of Observed Data
- i2 [0: 184]
- age m=34,sd=12
- treat {0,1}
- homeless {0,1}
- pcs 14-75
- mcs 7-62
- cesd 0–60
- indtot 4-45
- pss_fr 0-14
- drugrisk 0-21
- sexrisk
- satreat (0=no,1=yes)
- female (0=no,1=yes)
- racegrp (black, white, other)
# Define number of subjects NumSubj <- 282 NumTime <- 4
# Define data elements # Cases Cases <- c(2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 17, 18, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 37, 41, 42, 43, 44, 45, 53, 55, 58, 60, 62, 67, 69, 71, 72, 74, 79, 80, 85, 87, 90, 95, 97, 99, 100, 101, 106, 107, 109, 112, 120, 123, 125, 128, 129, 132, 134, 136, 139, 142, 147, 149, 153, 158, 160, 162, 163, 167, 172, 174, 178, 179, 180, 182, 192, 195, 201, 208, 211, 215, 217, 223, 227, 228, 233, 235, 236, 240, 245, 248, 250, 251, 254, 257, 259, 261, 264, 268, 269, 272, 273, 275, 279, 288, 289, 291, 296, 298, 303, 305, 309, 314, 318, 324, 325, 326, 328, 331, 332, 333, 334, 336, 338, 339, 341, 344, 346, 347, 350, 353, 354, 359, 361, 363, 364, 366, 367, 368, 369,370, 371, 372, 374, 375, 376, 377, 378, 381, 382, 384, 385, 386, 387, 389, 390, 393, 395, 398, 400, 410, 421, 423, 428, 433, 435, 443, 447, 449, 450, 451, 453, 454, 455, 456, 457, 458, 459, 460, 461, 465, 466, 467, 470, 471, 472, 476, 477, 478, 479, 480, 481, 483, 484, 485, 486, 487, 488, 489, 492, 493, 494, 496, 498, 501, 504, 507, 510, 513, 515, 528, 530, 533, 537, 538, 542, 545, 546, 549, 555, 557, 559, 560, 566, 572, 573, 576, 582, 586, 590, 592, 597, 603, 604, 611, 619, 621, 623, 624, 625, 631, 633, 634, 635, 637, 640, 641, 643, 644, 645, 646, 647, 648, 649, 650, 652, 654, 656, 658, 660, 664, 665, 670, 673, 677, 678, 679, 680, 682, 683, 686, 687, 688, 689, 690, 692)
# Imaging Biomarkers L_caudate_ComputeArea <- rpois(NumSubj, 600) L_caudate_Volume <- rpois(NumSubj, 800) R_caudate_ComputeArea <- rpois(NumSubj, 893) R_caudate_Volume <- rpois(NumSubj, 1000) L_putamen_ComputeArea <- rpois(NumSubj, 900) L_putamen_Volume <- rpois(NumSubj, 1400) R_putamen_ComputeArea <- rpois(NumSubj, 1300) R_putamen_Volume <- rpois(NumSubj, 3000) L_hippocampus_ComputeArea <- rpois(NumSubj, 1300) L_hippocampus_Volume <- rpois(NumSubj, 3200) R_hippocampus_ComputeArea <- rpois(NumSubj, 1500) R_hippocampus_Volume <- rpois(NumSubj, 3800) cerebellum_ComputeArea <- rpois(NumSubj, 16700) cerebellum_Volume <- rpois(NumSubj, 14000) L_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300) L_lingual_gyrus_Volume <- rpois(NumSubj, 11000) R_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300) R_lingual_gyrus_Volume <- rpois(NumSubj, 12000) L_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3600) L_fusiform_gyrus_Volume <- rpois(NumSubj, 11000) R_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3300) R_fusiform_gyrus_Volume <- rpois(NumSubj, 10000)
# Demographics variables Sex <- ifelse(runif(NumSubj)<.5,0,1) Weight <- as.integer(rnorm(NumSubj, 80,10)) Age <- as.integer(rnorm(NumSubj, 62,10))
# Diagnosis: Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82))
# Genetics chr12_rs34637584_GT <- c(ifelse(runif(100)<.3,0,1), ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1)) # NumSubj Bernoulli trials chr17_rs11868035_GT <- c(ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.4,0,1), ifelse(runif(82)<.5,0,1)) # NumSubj Bernoulli trials
# Clinical # rpois(NumSubj, 15) + rpois(NumSubj, 6) UPDRS_part_I <- c( ifelse(runif(100)<.7,0,1)+ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.6,0,1)+ ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1)+ ifelse(runif(82)<.4,0,1) ) UPDRS_part_II <- c(sample.int(20, 100, replace=T), sample.int(14, 100, replace=T), sample.int(18, 82, replace=T) ) UPDRS_part_III <- c(sample.int(30, 100, replace=T), sample.int(20, 100, replace=T), sample.int(25, 82, replace=T) )
# Time: VisitTime – done automatically below in aggregator
# Data (putting all components together) sim_PD_Data <- cbind( rep(Cases, each= NumTime), # Cases rep(L_caudate_ComputeArea, each= NumTime), # Imaging rep(Sex, each= NumTime), # Demographics rep(Weight, each= NumTime), rep(Age, each= NumTime), rep(Dx, each= NumTime), # Dx rep(chr12_rs34637584_GT, each= NumTime), # Genetics rep(chr17_rs11868035_GT, each= NumTime), rep(UPDRS_part_I, each= NumTime), # Clinical rep(UPDRS_part_II, each= NumTime), rep(UPDRS_part_III, each= NumTime), rep(c(0,6,12,18), NumSubj) # Time )
# Assign the column names colnames(sim_PD_Data) <- c( "Cases", "L_caudate_ComputeArea", "Sex", "Weight", "Age", "Dx", "chr12_rs34637584_GT", "chr17_rs11868035_GT", "UPDRS_part_I", "UPDRS_part_II", "UPDRS_part_III", "Time" )
# some QC summary(sim_PD_Data) dim(sim_PD_Data) head(sim_PD_Data)
.....
....
- SOCR Home page: http://www.socr.umich.edu
Translate this page: