Difference between revisions of "SMHS DataSimulation"

Latest revision as of 15:50, 7 August 2016

Scientific Methods for Health Sciences - Data Simulation

Importing observed data for exploratory analytics

Using the SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset we can extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv).

# data_1 <- read.csv('00_Tiny_SOCR_HELP_Data_Simmulation.csv',as.is=T, header=T)
# data_1 = read.csv(file.choose( ))
# data_1 <- read.table('00_Tiny_SOCR_HELP_Data_Simmulation.csv', header=TRUE,   sep=",", row.names="ID")

attach(data_1)  
# to ensure all variables are accessible within R, e.g., using age instead of data_1$\$$age
 # i2 maximum number of drinks (standard units) consumed per day (in the past 30 days range 0–184) see also i1
 # treat randomization group (0=usual care, 1=HELP clinic)
 # pcs SF-36 Physical Component Score (range 14-75)
 # mcs SF-36 Mental Component Score(range 7-62)
 # cesd Center for Epidemiologic Studies Depression scale (range 0–60)
 # indtot Inventory of Drug Use Con-sequences (InDUC) total score (range 4–45)
 # pss_fr perceived social supports (friends, range 0–14) see also dayslink
 # drugrisk Risk-Assessment Battery(RAB) drug risk score (range0–21)
 # satreat any BSAS substance abuse treatment at baseline (0=no,1=yes)

==='"`UNIQ--h-2--QINU`"'Fragment of the data===

<center>
{| class="wikitable" style="text-align:center; " border="1"
|-
! ID ||i2 ||age ||treat ||homeless ||pcs ||mcs ||cesd ||indtot ||pss_fr ||drugrisk ||sexrisk ||satreat ||female ||substance ||racegrp
|-
| 1 ||0 ||25 ||0 ||0 ||49 ||7 ||46 ||37 ||0 ||1 ||6 ||0 ||0 ||cocaine ||black
|-
| 2 ||18 ||31 ||0 ||0 ||48 ||34 ||17 ||48 ||0 ||0 ||11 ||0 ||0 ||alcohol ||white
|-
| 3 ||39 ||36 ||0 ||0 ||76 ||9 ||33 ||41 ||12 ||19 ||4 ||0 ||0 ||heroin ||black
|-
| … || || || || || || || || || || || || || || ||
|-
| 100 ||81 ||22 ||0 ||0 ||37 ||17 ||19 ||30 ||3 ||0 ||10 ||0 ||0 ||alcohol ||other
|}
</center>

==='"`UNIQ--h-3--QINU`"'Testing section===

 summary(data_1)
 
 x.norm <- rnorm(n=200, m=10, sd=20)
 hist(x.norm, main="N(10,20) Histogram")
 hist(x.norm, main="N(10,20) Histogram")
 mean(data_1$\$$age)
sd(data_1$\$$age)

Simulate New Data to Match the Properties/Characteristics of Observed Data

i2 [0: 184]
age m=34,sd=12
treat {0,1}
homeless {0,1}
pcs 14-75
mcs 7-62
cesd 0–60
indtot 4-45
pss_fr 0-14
drugrisk 0-21
sexrisk
satreat (0=no,1=yes)
female (0=no,1=yes)
racegrp (black, white, other)

# Define number of subjects
NumSubj <- 282
NumTime <- 4

# Define data elements
# Cases
Cases <- c(2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 17, 18, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 37, 41, 42, 43, 44, 45, 53, 55, 58, 60, 62, 67, 69, 71, 72, 74, 79, 80, 85, 87, 90, 95, 97, 99, 100, 101, 106, 107, 109, 112, 120, 123, 125, 128, 129, 132, 134, 136, 139, 142, 147, 149, 153, 158, 160, 162, 163, 167, 172, 174, 178, 179, 180, 182, 192, 195, 201, 208, 211, 215, 217, 223, 227, 228, 233, 235, 236, 240, 245, 248, 250, 251, 254, 257, 259, 261, 264, 268, 269, 272, 273, 275, 279, 288, 289, 291, 296, 298, 303, 305, 309, 314, 318, 324, 325, 326, 328, 331, 332, 333, 334, 336, 338, 339, 341, 344, 346, 347, 350, 353, 354, 359, 361, 363, 364, 366, 367, 368, 369,370, 371, 372, 374, 375, 376, 377, 378, 381, 382, 384, 385, 386, 387, 389, 390, 393, 395, 398, 400, 410, 421, 423, 428, 433, 435, 443, 447, 449, 450, 451, 453, 454, 455, 456, 457, 458, 459, 460, 461, 465, 466, 467, 470, 471, 472, 476, 477, 478, 479, 480, 481, 483, 484, 485, 486, 487, 488, 489, 492, 493, 494, 496, 498, 501, 504, 507, 510, 513, 515, 528, 530, 533, 537, 538, 542, 545, 546, 549, 555, 557, 559, 560, 566, 572, 573, 576, 582, 586, 590, 592, 597, 603, 604, 611, 619, 621, 623, 624, 625, 631, 633, 634, 635, 637, 640, 641, 643, 644, 645, 646, 647, 648, 649, 650, 652, 654, 656, 658, 660, 664, 665, 670, 673, 677, 678, 679, 680, 682, 683, 686, 687, 688, 689, 690, 692)

# Imaging Biomarkers
L_caudate_ComputeArea <- rpois(NumSubj, 600)
L_caudate_Volume <- rpois(NumSubj, 800)
R_caudate_ComputeArea <- rpois(NumSubj, 893)
R_caudate_Volume <- rpois(NumSubj, 1000)
L_putamen_ComputeArea <- rpois(NumSubj, 900)
L_putamen_Volume <- rpois(NumSubj, 1400)
R_putamen_ComputeArea <- rpois(NumSubj, 1300)
R_putamen_Volume <- rpois(NumSubj, 3000)
L_hippocampus_ComputeArea <- rpois(NumSubj, 1300)
L_hippocampus_Volume <- rpois(NumSubj, 3200)
R_hippocampus_ComputeArea <- rpois(NumSubj, 1500)
R_hippocampus_Volume <- rpois(NumSubj, 3800)
cerebellum_ComputeArea <- rpois(NumSubj, 16700)
cerebellum_Volume <- rpois(NumSubj, 14000)
L_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300)
L_lingual_gyrus_Volume <- rpois(NumSubj, 11000)
R_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300)
R_lingual_gyrus_Volume <- rpois(NumSubj, 12000)
L_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3600)
L_fusiform_gyrus_Volume <- rpois(NumSubj, 11000)
R_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3300)
R_fusiform_gyrus_Volume <- rpois(NumSubj, 10000)

# Demographics variables
Sex <- ifelse(runif(NumSubj)<.5,0,1)
Weight <- as.integer(rnorm(NumSubj, 80,10))
Age <- as.integer(rnorm(NumSubj, 62,10))

# Diagnosis:
Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82))

# Genetics
chr12_rs34637584_GT <- c(ifelse(runif(100)<.3,0,1), ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1))                              # NumSubj Bernoulli trials
chr17_rs11868035_GT <- c(ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.4,0,1), ifelse(runif(82)<.5,0,1))                              # NumSubj Bernoulli trials

# Clinical          # rpois(NumSubj, 15) + rpois(NumSubj, 6)
UPDRS_part_I <- c( ifelse(runif(100)<.7,0,1)+ifelse(runif(100)<.7,0,1),
ifelse(runif(100)<.6,0,1)+ ifelse(runif(100)<.6,0,1),
ifelse(runif(82)<.4,0,1)+ ifelse(runif(82)<.4,0,1) )
UPDRS_part_II <- c(sample.int(20, 100, replace=T), sample.int(14, 100, replace=T),
sample.int(18, 82, replace=T) )
UPDRS_part_III <- c(sample.int(30, 100, replace=T), sample.int(20, 100, replace=T),
   sample.int(25, 82, replace=T) )

# Time: VisitTime – done automatically below in aggregator

# Data (putting all components together)
sim_PD_Data <- cbind(
          rep(Cases, each= NumTime),                          # Cases
          rep(L_caudate_ComputeArea, each= NumTime), # Imaging
          rep(Sex, each= NumTime),                            # Demographics
          rep(Weight, each= NumTime),
          rep(Age, each= NumTime),
          rep(Dx, each= NumTime),                             # Dx
          rep(chr12_rs34637584_GT, each= NumTime),            # Genetics
          rep(chr17_rs11868035_GT, each= NumTime),
          rep(UPDRS_part_I, each= NumTime),                   # Clinical
          rep(UPDRS_part_II, each= NumTime),
          rep(UPDRS_part_III, each= NumTime),
          rep(c(0,6,12,18), NumSubj)                          # Time
)

# Assign the column names
colnames(sim_PD_Data) <- c(
"Cases",
"L_caudate_ComputeArea",
"Sex", "Weight", "Age",
"Dx", "chr12_rs34637584_GT", "chr17_rs11868035_GT",
"UPDRS_part_I", "UPDRS_part_II", "UPDRS_part_III",
"Time"
)

# some QC
summary(sim_PD_Data)
dim(sim_PD_Data)
head(sim_PD_Data)

.....

....

SOCR Home page: http://www.socr.umich.edu

Translate this page:

(default)	Deutsch	Español	Français	Italiano	Português	日本語	България	الامارات العربية المتحدة	Suomi	इस भाषा में	Norge
한국어	中文	繁体中文	Русский	Nederlands	Ελληνικά	Hrvatska	Česká republika	Danmark	Polska	România	Sverige

Difference between revisions of "SMHS DataSimulation"

Latest revision as of 15:50, 7 August 2016

Scientific Methods for Health Sciences - Data Simulation

Importing observed data for exploratory analytics

Simulate New Data to Match the Properties/Characteristics of Observed Data

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

Tools

@@ Line 2: / Line 2: @@
 ===Importing observed data for exploratory analytics===
 Using the [[SOCR_Simulated_HELP_Data|SOCR Health Evaluation and Linkage to Primary (HELP) Care Dataset]] we can [https://umich.instructure.com/files/354289/download?download_frd=1 extract some sample data (00_Tiny_SOCR_HELP_Data_Simmulation.csv)].
@@ Line 9: / Line 10: @@
   attach(data_1)
+ # to ensure all variables are accessible within R, e.g., using age instead of data_1$\$$age
   # i2 maximum number of drinks (standard units) consumed per day (in the past 30 days range 0–184) see also i1
   # treat randomization group (0=usual care, 1=HELP clinic)
@@ Line 20: / Line 22: @@
 ===Fragment of the data===
 <center>
 {| class="wikitable" style="text-align:center; " border="1"
@@ Line 44: / Line 47: @@
   hist(x.norm, main="N(10,20) Histogram")
   hist(x.norm, main="N(10,20) Histogram")
-  mean(data_1$age)
+  mean(data_1$\$$age)
-  sd(data_1$age)
+  sd(data_1$\$$age)
-Simulate new data to match the properties/characteristics of observed data
+==Simulate New Data to Match the Properties/Characteristics of Observed Data==
- i2 [0: 184]
+*i2 [0: 184]
- age m=34,sd=12
+*age m=34,sd=12
- treat {0,1}
+*treat {0,1}
- homeless {0,1}
+*homeless {0,1}
- pcs 14-75
+*pcs 14-75
- mcs 7-62
+*mcs 7-62
- cesd 0–60
+*cesd 0–60
- indtot 4-45
+*indtot 4-45
- pss_fr 0-14
+*pss_fr 0-14
- drugrisk 0-21
+*drugrisk 0-21
- sexrisk
+*sexrisk
- satreat (0=no,1=yes)
+*satreat (0=no,1=yes)
- female (0=no,1=yes)
+*female (0=no,1=yes)
- racegrp (black, white, other)
+*racegrp (black, white, other)
+ # Define number of subjects
+ NumSubj <- 282
+ NumTime <- 4
+ # Define data elements
+ # Cases
+ Cases <- c(2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 17, 18, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 37, 41, 42, 43, 44, 45, 53, 55, 58, 60, 62, 67, 69, 71, 72, 74, 79, 80, 85, 87, 90, 95, 97, 99, 100, 101, 106, 107, 109, 112, 120, 123, 125, 128, 129, 132, 134, 136, 139, 142, 147, 149, 153, 158, 160, 162, 163, 167, 172, 174, 178, 179, 180, 182, 192, 195, 201, 208, 211, 215, 217, 223, 227, 228, 233, 235, 236, 240, 245, 248, 250, 251, 254, 257, 259, 261, 264, 268, 269, 272, 273, 275, 279, 288, 289, 291, 296, 298, 303, 305, 309, 314, 318, 324, 325, 326, 328, 331, 332, 333, 334, 336, 338, 339, 341, 344, 346, 347, 350, 353, 354, 359, 361, 363, 364, 366, 367, 368, 369,370, 371, 372, 374, 375, 376, 377, 378, 381, 382, 384, 385, 386, 387, 389, 390, 393, 395, 398, 400, 410, 421, 423, 428, 433, 435, 443, 447, 449, 450, 451, 453, 454, 455, 456, 457, 458, 459, 460, 461, 465, 466, 467, 470, 471, 472, 476, 477, 478, 479, 480, 481, 483, 484, 485, 486, 487, 488, 489, 492, 493, 494, 496, 498, 501, 504, 507, 510, 513, 515, 528, 530, 533, 537, 538, 542, 545, 546, 549, 555, 557, 559, 560, 566, 572, 573, 576, 582, 586, 590, 592, 597, 603, 604, 611, 619, 621, 623, 624, 625, 631, 633, 634, 635, 637, 640, 641, 643, 644, 645, 646, 647, 648, 649, 650, 652, 654, 656, 658, 660, 664, 665, 670, 673, 677, 678, 679, 680, 682, 683, 686, 687, 688, 689, 690, 692)
+ # Imaging Biomarkers
+ L_caudate_ComputeArea <- rpois(NumSubj, 600)
+ L_caudate_Volume <- rpois(NumSubj, 800)
+ R_caudate_ComputeArea <- rpois(NumSubj, 893)
+ R_caudate_Volume <- rpois(NumSubj, 1000)
+ L_putamen_ComputeArea <- rpois(NumSubj, 900)
+ L_putamen_Volume <- rpois(NumSubj, 1400)
+ R_putamen_ComputeArea <- rpois(NumSubj, 1300)
+ R_putamen_Volume <- rpois(NumSubj, 3000)
+ L_hippocampus_ComputeArea <- rpois(NumSubj, 1300)
+ L_hippocampus_Volume <- rpois(NumSubj, 3200)
+ R_hippocampus_ComputeArea <- rpois(NumSubj, 1500)
+ R_hippocampus_Volume <- rpois(NumSubj, 3800)
+ cerebellum_ComputeArea <- rpois(NumSubj, 16700)
+ cerebellum_Volume <- rpois(NumSubj, 14000)
+ L_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300)
+ L_lingual_gyrus_Volume <- rpois(NumSubj, 11000)
+ R_lingual_gyrus_ComputeArea <- rpois(NumSubj, 3300)
+ R_lingual_gyrus_Volume <- rpois(NumSubj, 12000)
+ L_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3600)
+ L_fusiform_gyrus_Volume <- rpois(NumSubj, 11000)
+ R_fusiform_gyrus_ComputeArea <- rpois(NumSubj, 3300)
+ R_fusiform_gyrus_Volume <- rpois(NumSubj, 10000)
   # Demographics variables
   Sex <- ifelse(runif(NumSubj)<.5,0,1)
   Weight <- as.integer(rnorm(NumSubj, 80,10))
   Age <- as.integer(rnorm(NumSubj, 62,10))
   # Diagnosis:
   Dx <- c(rep("PD", 100), rep("HC", 100), rep("SWEDD", 82))
   # Genetics
   chr12_rs34637584_GT <- c(ifelse(runif(100)<.3,0,1), ifelse(runif(100)<.6,0,1), ifelse(runif(82)<.4,0,1))                              # NumSubj Bernoulli trials
   chr17_rs11868035_GT <- c(ifelse(runif(100)<.7,0,1), ifelse(runif(100)<.4,0,1), ifelse(runif(82)<.5,0,1))                              # NumSubj Bernoulli trials
   # Clinical          # rpois(NumSubj, 15) + rpois(NumSubj, 6)
   UPDRS_part_I <- c( ifelse(runif(100)<.7,0,1)+ifelse(runif(100)<.7,0,1),
   ifelse(runif(100)<.6,0,1)+ ifelse(runif(100)<.6,0,1),
   ifelse(runif(82)<.4,0,1)+ ifelse(runif(82)<.4,0,1) )
   UPDRS_part_II <- c(sample.int(20, 100, replace=T), sample.int(14, 100, replace=T),
   sample.int(18, 82, replace=T) )
   UPDRS_part_III <- c(sample.int(30, 100, replace=T), sample.int(20, 100, replace=T),
+    sample.int(25, 82, replace=T) )
-            sample.int(25, 82, replace=T) )
   # Time: VisitTime – done automatically below in aggregator
   # Data (putting all components together)
   sim_PD_Data <- cbind(
+           rep(Cases, each= NumTime),                          # Cases
-          rep(Cases, each= NumTime),                                      # Cases
+           rep(L_caudate_ComputeArea, each= NumTime), # Imaging
+           rep(Sex, each= NumTime),                            # Demographics
-                                          rep(L_caudate_ComputeArea, each= NumTime), # Imaging
+           rep(Weight, each= NumTime),
+           rep(Age, each= NumTime),
-                                          rep(Sex, each= NumTime),                                          # Demographics
+           rep(Dx, each= NumTime),                             # Dx
+           rep(chr12_rs34637584_GT, each= NumTime),            # Genetics
- rep(Weight, each= NumTime),
+           rep(chr17_rs11868035_GT, each= NumTime),
+           rep(UPDRS_part_I, each= NumTime),                   # Clinical
- rep(Age, each= NumTime),
+           rep(UPDRS_part_II, each= NumTime),
+           rep(UPDRS_part_III, each= NumTime),
-                                          rep(Dx, each= NumTime),                                                           # Dx
+           rep(c(0,6,12,18), NumSubj)                          # Time
-                                          rep(chr12_rs34637584_GT, each= NumTime),                      # Genetics
-                                          rep(chr17_rs11868035_GT, each= NumTime),
-                                          rep(UPDRS_part_I, each= NumTime),                       # Clinical
- rep(UPDRS_part_II, each= NumTime),
- rep(UPDRS_part_III, each= NumTime),
-  rep(c(0,6,12,18), NumSubj)                                         # Time
   )
   # Assign the column names
   colnames(sim_PD_Data) <- c(
   "Cases",
@@ Line 149: / Line 148: @@
   "Time"
   )
   # some QC
   summary(sim_PD_Data)
   dim(sim_PD_Data)
   head(sim_PD_Data)