Difference between revisions of "SMHS DataSimulation"
(→Testing section) |
(→Simulate New Data to Match the Properties/Characteristics of Observed Data) |
||
(17 intermediate revisions by 2 users not shown) | |||
Line 2: | Line 2: | ||
===Importing observed data for exploratory analytics=== | ===Importing observed data for exploratory analytics=== | ||
+ | |||
Using the [[SOCR_Simulated_HELP_Data|SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset]] we can [https://umich.instructure.com/files/354289/download?download_frd=1 extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv)]. | Using the [[SOCR_Simulated_HELP_Data|SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset]] we can [https://umich.instructure.com/files/354289/download?download_frd=1 extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv)]. | ||
− | # data_1 <- read.csv(' | + | # data_1 <- read.csv('00_Tiny_SOCR_HELP_Data_Simmulation.csv',as.is=T, header=T) |
# data_1 = read.csv(file.choose( )) | # data_1 = read.csv(file.choose( )) | ||
− | # data_1 <- read.table(' | + | # data_1 <- read.table('00_Tiny_SOCR_HELP_Data_Simmulation.csv', header=TRUE, sep=",", row.names="ID") |
− | + | ||
attach(data_1) | attach(data_1) | ||
− | # to ensure all variables are accessible within R, e.g., using | + | # to ensure all variables are accessible within R, e.g., using age instead of data_1$\$$age |
− | + | # i2 maximum number of drinks (standard units) consumed per day (in the past 30 days range 0–184) see also i1 | |
− | # | ||
# treat randomization group (0=usual care, 1=HELP clinic) | # treat randomization group (0=usual care, 1=HELP clinic) | ||
# pcs SF-36 Physical Component Score (range 14-75) | # pcs SF-36 Physical Component Score (range 14-75) | ||
# mcs SF-36 Mental Component Score(range 7-62) | # mcs SF-36 Mental Component Score(range 7-62) | ||
# cesd Center for Epidemiologic Studies Depression scale (range 0–60) | # cesd Center for Epidemiologic Studies Depression scale (range 0–60) | ||
− | # | + | # indtot Inventory of Drug Use Con-sequences (InDUC) total score (range 4–45) |
# pss_fr perceived social supports (friends, range 0–14) see also dayslink | # pss_fr perceived social supports (friends, range 0–14) see also dayslink | ||
# drugrisk Risk-Assessment Battery(RAB) drug risk score (range0–21) | # drugrisk Risk-Assessment Battery(RAB) drug risk score (range0–21) | ||
# satreat any BSAS substance abuse treatment at baseline (0=no,1=yes) | # satreat any BSAS substance abuse treatment at baseline (0=no,1=yes) | ||
+ | ===Fragment of the data=== | ||
<center> | <center> | ||
Line 38: | Line 39: | ||
|} | |} | ||
</center> | </center> | ||
− | |||
− | |||
===Testing section=== | ===Testing section=== | ||
Line 48: | Line 47: | ||
hist(x.norm, main="N(10,20) Histogram") | hist(x.norm, main="N(10,20) Histogram") | ||
hist(x.norm, main="N(10,20) Histogram") | hist(x.norm, main="N(10,20) Histogram") | ||
− | mean(data_1$age) | + | mean(data_1$\$$age) |
− | sd(data_1$age) | + | sd(data_1$\$$age) |
+ | |||
+ | |||
+ | ==Simulate New Data to Match the Properties/Characteristics of Observed Data== | ||
+ | |||
+ | *i2 [0: 184] | ||
+ | *age m=34,sd=12 | ||
+ | *treat {0,1} | ||
+ | *homeless {0,1} | ||
+ | *pcs 14-75 | ||
+ | *mcs 7-62 | ||
+ | *cesd 0–60 | ||
+ | *indtot 4-45 | ||
+ | *pss_fr 0-14 | ||
+ | *drugrisk 0-21 | ||
+ | *sexrisk | ||
+ | *satreat (0=no,1=yes) | ||
+ | *female (0=no,1=yes) | ||
+ | *racegrp (black, white, other) | ||
+ | |||
+ | # Define number of subjects | ||
+ | NumSubj <- 282 | ||
+ | NumTime <- 4 | ||
+ | |||
+ | # Define data elements | ||
+ | # Cases | ||
+ | Cases <- c(2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 17, 18, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 37, 41, 42, 43, 44, 45, 53, 55, 58, 60, 62, 67, 69, 71, 72, 74, 79, 80, 85, 87, 90, 95, 97, 99, 100, 101, 106, 107, 109, 112, 120, 123, 125, 128, 129, 132, 134, 136, 139, 142, 147, 149, 153, 158, 160, 162, 163, 167, 172, 174, 178, 179, 180, 182, 192, 195, 201, 208, 211, 215, 217, 223, 227, 228, 233, 235, 236, 240, 245, 248, 250, 251, 254, 257, 259, 261, 264, 268, 269, 272, 273, 275, 279, 288, 289, 291, 296, 298, 303, 305, 309, 314, 318, 324, 325, 326, 328, 331, 332, 333, 334, 336, 338, 339, 341, 344, 346, 347, 350, 353, 354, 359, 361, 363, 364, 366, 367, 368, 369,370, 371, 372, 374, 375, 376, 377, 378, 381, 382, 384, 385, 386, 387, 389, 390, 393, 395, 398, 400, 410, 421, 423, 428, 433, 435, 443, 447, 449, 450, 451, 453, 454, 455, 456, 457, 458, 459, 460, 461, 465, 466, 467, 470, 471, 472, 476, 477, 478, 479, 480, 481, 483, 484, 485, 486, 487, 488, 489, 492, 493, 494, 496, 498, 501, 504, 507, 510, 513, 515, 528, 530, 533, 537, 538, 542, 545, 546, 549, 555, 557, 559, 560, 566, 572, 573, 576, 582, 586, 590, 592, 597, 603, 604, 611, 619, 621, 623, 624, 625, 631, 633, 634, 635, 637, 640, 641, 643, 644, 645, 646, 647, 648, 649, 650, 652, 654, 656, 658, 660, 664, 665, 670, 673, 677, 678, 679, 680, 682, 683, 686, 687, 688, 689, 690, 692) | ||
+ | |||
+ | # Imaging Biomarkers | ||
+ | L_caudate_ComputeArea <- rpois(NumSubj, 600) | ||
+ | L_caudate_Volume <- rpois(NumSubj, 800) | ||
+ | R_caudate_ComputeArea <- rpois(NumSubj, 893) | ||
+ | R_caudate_Volume <- rpois(NumSubj, 1000) | ||
+ | L_putamen_ComputeArea <- rpois(NumSubj, 900) | ||
+ | L_putamen_Volume <- rpois(NumSubj, 1400) | ||
+ | R_putamen_ComputeArea <- rpois(NumSubj, 1300) | ||
+ | R_putamen_Volume <- rpois(NumSubj, 3000) | ||
+ | L_hippocampus_ComputeArea <- rpois(NumSubj, 1300) | ||
+ | L_hippocampus_Volume <- rpois(NumSubj, 3200) | ||
+ | R_hippocampus_ComputeArea <- rpois(NumSubj, 1500) | ||
+ | R_hippocampus_Volume <- rpois(NumSubj, 3800) | ||
+ | cerebellum_ComputeArea <- rpois(NumSubj, 16700) | ||
+ | cerebellum_Volume <- rpois(NumSubj, 14000) | ||
+ | L_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300) | ||
+ | L_lingual_gyrus_Volume <- rpois(NumSubj, 11000) | ||
+ | R_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300) | ||
+ | R_lingual_gyrus_Volume <- rpois(NumSubj, 12000) | ||
+ | L_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3600) | ||
+ | L_fusiform_gyrus_Volume <- rpois(NumSubj, 11000) | ||
+ | R_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3300) | ||
+ | R_fusiform_gyrus_Volume <- rpois(NumSubj, 10000) | ||
+ | |||
+ | # Demographics variables | ||
+ | Sex <- ifelse(runif(NumSubj)<.5,0,1) | ||
+ | Weight <- as.integer(rnorm(NumSubj, 80,10)) | ||
+ | Age <- as.integer(rnorm(NumSubj, 62,10)) | ||
+ | |||
+ | # Diagnosis: | ||
+ | Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82)) | ||
+ | |||
+ | # Genetics | ||
+ | chr12_rs34637584_GT <- c(ifelse(runif(100)<.3,0,1), ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1)) # NumSubj Bernoulli trials | ||
+ | chr17_rs11868035_GT <- c(ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.4,0,1), ifelse(runif(82)<.5,0,1)) # NumSubj Bernoulli trials | ||
+ | |||
+ | # Clinical # rpois(NumSubj, 15) + rpois(NumSubj, 6) | ||
+ | UPDRS_part_I <- c( ifelse(runif(100)<.7,0,1)+ifelse(runif(100)<.7,0,1), | ||
+ | ifelse(runif(100)<.6,0,1)+ ifelse(runif(100)<.6,0,1), | ||
+ | ifelse(runif(82)<.4,0,1)+ ifelse(runif(82)<.4,0,1) ) | ||
+ | UPDRS_part_II <- c(sample.int(20, 100, replace=T), sample.int(14, 100, replace=T), | ||
+ | sample.int(18, 82, replace=T) ) | ||
+ | UPDRS_part_III <- c(sample.int(30, 100, replace=T), sample.int(20, 100, replace=T), | ||
+ | sample.int(25, 82, replace=T) ) | ||
+ | |||
+ | # Time: VisitTime – done automatically below in aggregator | ||
+ | |||
+ | # Data (putting all components together) | ||
+ | sim_PD_Data <- cbind( | ||
+ | rep(Cases, each= NumTime), # Cases | ||
+ | rep(L_caudate_ComputeArea, each= NumTime), # Imaging | ||
+ | rep(Sex, each= NumTime), # Demographics | ||
+ | rep(Weight, each= NumTime), | ||
+ | rep(Age, each= NumTime), | ||
+ | rep(Dx, each= NumTime), # Dx | ||
+ | rep(chr12_rs34637584_GT, each= NumTime), # Genetics | ||
+ | rep(chr17_rs11868035_GT, each= NumTime), | ||
+ | rep(UPDRS_part_I, each= NumTime), # Clinical | ||
+ | rep(UPDRS_part_II, each= NumTime), | ||
+ | rep(UPDRS_part_III, each= NumTime), | ||
+ | rep(c(0,6,12,18), NumSubj) # Time | ||
+ | ) | ||
+ | |||
+ | # Assign the column names | ||
+ | colnames(sim_PD_Data) <- c( | ||
+ | "Cases", | ||
+ | "L_caudate_ComputeArea", | ||
+ | "Sex", "Weight", "Age", | ||
+ | "Dx", "chr12_rs34637584_GT", "chr17_rs11868035_GT", | ||
+ | "UPDRS_part_I", "UPDRS_part_II", "UPDRS_part_III", | ||
+ | "Time" | ||
+ | ) | ||
+ | |||
+ | # some QC | ||
+ | summary(sim_PD_Data) | ||
+ | dim(sim_PD_Data) | ||
+ | head(sim_PD_Data) | ||
+ | |||
+ | |||
+ | ..... | ||
+ | |||
+ | <center>[[Image:SMHS_DataSimulation_Fig1.png|500px]] </center> | ||
+ | |||
+ | .... | ||
Latest revision as of 15:50, 7 August 2016
Scientific Methods for Health Sciences - Data Simulation
Importing observed data for exploratory analytics
Using the SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset we can extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv).
# data_1 <- read.csv('00_Tiny_SOCR_HELP_Data_Simmulation.csv',as.is=T, header=T) # data_1 = read.csv(file.choose( )) # data_1 <- read.table('00_Tiny_SOCR_HELP_Data_Simmulation.csv', header=TRUE, sep=",", row.names="ID")
attach(data_1) # to ensure all variables are accessible within R, e.g., using age instead of data_1$\$$age # i2 maximum number of drinks (standard units) consumed per day (in the past 30 days range 0–184) see also i1 # treat randomization group (0=usual care, 1=HELP clinic) # pcs SF-36 Physical Component Score (range 14-75) # mcs SF-36 Mental Component Score(range 7-62) # cesd Center for Epidemiologic Studies Depression scale (range 0–60) # indtot Inventory of Drug Use Con-sequences (InDUC) total score (range 4–45) # pss_fr perceived social supports (friends, range 0–14) see also dayslink # drugrisk Risk-Assessment Battery(RAB) drug risk score (range0–21) # satreat any BSAS substance abuse treatment at baseline (0=no,1=yes) ==='"`UNIQ--h-2--QINU`"'Fragment of the data=== <center> {| class="wikitable" style="text-align:center; " border="1" |- ! ID ||i2 ||age ||treat ||homeless ||pcs ||mcs ||cesd ||indtot ||pss_fr ||drugrisk ||sexrisk ||satreat ||female ||substance ||racegrp |- | 1 ||0 ||25 ||0 ||0 ||49 ||7 ||46 ||37 ||0 ||1 ||6 ||0 ||0 ||cocaine ||black |- | 2 ||18 ||31 ||0 ||0 ||48 ||34 ||17 ||48 ||0 ||0 ||11 ||0 ||0 ||alcohol ||white |- | 3 ||39 ||36 ||0 ||0 ||76 ||9 ||33 ||41 ||12 ||19 ||4 ||0 ||0 ||heroin ||black |- | … || || || || || || || || || || || || || || || |- | 100 ||81 ||22 ||0 ||0 ||37 ||17 ||19 ||30 ||3 ||0 ||10 ||0 ||0 ||alcohol ||other |} </center> ==='"`UNIQ--h-3--QINU`"'Testing section=== summary(data_1) x.norm <- rnorm(n=200, m=10, sd=20) hist(x.norm, main="N(10,20) Histogram") hist(x.norm, main="N(10,20) Histogram") mean(data_1$\$$age) sd(data_1$\$$age)
Simulate New Data to Match the Properties/Characteristics of Observed Data
- i2 [0: 184]
- age m=34,sd=12
- treat {0,1}
- homeless {0,1}
- pcs 14-75
- mcs 7-62
- cesd 0–60
- indtot 4-45
- pss_fr 0-14
- drugrisk 0-21
- sexrisk
- satreat (0=no,1=yes)
- female (0=no,1=yes)
- racegrp (black, white, other)
# Define number of subjects NumSubj <- 282 NumTime <- 4
# Define data elements # Cases Cases <- c(2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 17, 18, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 37, 41, 42, 43, 44, 45, 53, 55, 58, 60, 62, 67, 69, 71, 72, 74, 79, 80, 85, 87, 90, 95, 97, 99, 100, 101, 106, 107, 109, 112, 120, 123, 125, 128, 129, 132, 134, 136, 139, 142, 147, 149, 153, 158, 160, 162, 163, 167, 172, 174, 178, 179, 180, 182, 192, 195, 201, 208, 211, 215, 217, 223, 227, 228, 233, 235, 236, 240, 245, 248, 250, 251, 254, 257, 259, 261, 264, 268, 269, 272, 273, 275, 279, 288, 289, 291, 296, 298, 303, 305, 309, 314, 318, 324, 325, 326, 328, 331, 332, 333, 334, 336, 338, 339, 341, 344, 346, 347, 350, 353, 354, 359, 361, 363, 364, 366, 367, 368, 369,370, 371, 372, 374, 375, 376, 377, 378, 381, 382, 384, 385, 386, 387, 389, 390, 393, 395, 398, 400, 410, 421, 423, 428, 433, 435, 443, 447, 449, 450, 451, 453, 454, 455, 456, 457, 458, 459, 460, 461, 465, 466, 467, 470, 471, 472, 476, 477, 478, 479, 480, 481, 483, 484, 485, 486, 487, 488, 489, 492, 493, 494, 496, 498, 501, 504, 507, 510, 513, 515, 528, 530, 533, 537, 538, 542, 545, 546, 549, 555, 557, 559, 560, 566, 572, 573, 576, 582, 586, 590, 592, 597, 603, 604, 611, 619, 621, 623, 624, 625, 631, 633, 634, 635, 637, 640, 641, 643, 644, 645, 646, 647, 648, 649, 650, 652, 654, 656, 658, 660, 664, 665, 670, 673, 677, 678, 679, 680, 682, 683, 686, 687, 688, 689, 690, 692)
# Imaging Biomarkers L_caudate_ComputeArea <- rpois(NumSubj, 600) L_caudate_Volume <- rpois(NumSubj, 800) R_caudate_ComputeArea <- rpois(NumSubj, 893) R_caudate_Volume <- rpois(NumSubj, 1000) L_putamen_ComputeArea <- rpois(NumSubj, 900) L_putamen_Volume <- rpois(NumSubj, 1400) R_putamen_ComputeArea <- rpois(NumSubj, 1300) R_putamen_Volume <- rpois(NumSubj, 3000) L_hippocampus_ComputeArea <- rpois(NumSubj, 1300) L_hippocampus_Volume <- rpois(NumSubj, 3200) R_hippocampus_ComputeArea <- rpois(NumSubj, 1500) R_hippocampus_Volume <- rpois(NumSubj, 3800) cerebellum_ComputeArea <- rpois(NumSubj, 16700) cerebellum_Volume <- rpois(NumSubj, 14000) L_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300) L_lingual_gyrus_Volume <- rpois(NumSubj, 11000) R_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300) R_lingual_gyrus_Volume <- rpois(NumSubj, 12000) L_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3600) L_fusiform_gyrus_Volume <- rpois(NumSubj, 11000) R_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3300) R_fusiform_gyrus_Volume <- rpois(NumSubj, 10000)
# Demographics variables Sex <- ifelse(runif(NumSubj)<.5,0,1) Weight <- as.integer(rnorm(NumSubj, 80,10)) Age <- as.integer(rnorm(NumSubj, 62,10))
# Diagnosis: Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82))
# Genetics chr12_rs34637584_GT <- c(ifelse(runif(100)<.3,0,1), ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1)) # NumSubj Bernoulli trials chr17_rs11868035_GT <- c(ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.4,0,1), ifelse(runif(82)<.5,0,1)) # NumSubj Bernoulli trials
# Clinical # rpois(NumSubj, 15) + rpois(NumSubj, 6) UPDRS_part_I <- c( ifelse(runif(100)<.7,0,1)+ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.6,0,1)+ ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1)+ ifelse(runif(82)<.4,0,1) ) UPDRS_part_II <- c(sample.int(20, 100, replace=T), sample.int(14, 100, replace=T), sample.int(18, 82, replace=T) ) UPDRS_part_III <- c(sample.int(30, 100, replace=T), sample.int(20, 100, replace=T), sample.int(25, 82, replace=T) )
# Time: VisitTime – done automatically below in aggregator
# Data (putting all components together) sim_PD_Data <- cbind( rep(Cases, each= NumTime), # Cases rep(L_caudate_ComputeArea, each= NumTime), # Imaging rep(Sex, each= NumTime), # Demographics rep(Weight, each= NumTime), rep(Age, each= NumTime), rep(Dx, each= NumTime), # Dx rep(chr12_rs34637584_GT, each= NumTime), # Genetics rep(chr17_rs11868035_GT, each= NumTime), rep(UPDRS_part_I, each= NumTime), # Clinical rep(UPDRS_part_II, each= NumTime), rep(UPDRS_part_III, each= NumTime), rep(c(0,6,12,18), NumSubj) # Time )
# Assign the column names colnames(sim_PD_Data) <- c( "Cases", "L_caudate_ComputeArea", "Sex", "Weight", "Age", "Dx", "chr12_rs34637584_GT", "chr17_rs11868035_GT", "UPDRS_part_I", "UPDRS_part_II", "UPDRS_part_III", "Time" )
# some QC summary(sim_PD_Data) dim(sim_PD_Data) head(sim_PD_Data)
.....
....
- SOCR Home page: http://www.socr.umich.edu
Translate this page: