# MATH 6643 Summer 2012 Applications of Mixed Models/Power by simulation: Normal versus t with 5 dfs-parallel

```
# June 29, 2012
# This is a simulation to address the question of power and size (the empirical
# rate of Type I error) when the underlying error distributions are not normal
# but t with 5 df.
#
#
# Generate a sample
# N number of classes
# M students per class
# Exercise: extend to 3 levels
# install.packages('parallel')   # not needed: 'parallel' ships with R >= 2.14
library(nlme)     # for lme()
library(p3ddev)   # course package; provides wald()
#
#  Step 1:  Write a function that generates a single data set based on
#  sample size, distribution assumptions and effect size(s)
#  This is the step that makes you really think the model through
#
gen <- function(N,M,ICC,weff=0,beff=0){
# generates a single data frame
# N is number of classes (clusters)
# M is number of students per class
# ICC is the intraclass correlation for the response variable (without treatment)
# weff is the effect size (relative to sigma) of a within-class treatment
# beff is the effect size (relative to sigma) of a between-class treatment
# force N and M to even values: round() rounds halves to the even
# integer, so round(N + .5) is the even integer adjacent to N
N <- round((2*N+1)/2)
M <- round((2*M+1)/2)

sigma2 <- 1                    # within cluster variance
tau2 <- sigma2 * ICC/(1-ICC)   # between-cluster variance
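# (note) follows from the definition ICC = tau2/(tau2 + sigma2), solved for tau2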

id <- rep(1:N,each = M)        # cluster id
# t distribution with 5 df, rescaled to unit variance:
# Var(t_df) = df/(df-2), so the SD of t with 5 df is sqrt(5/3) ~ 1.291
sd.t5 <- sqrt(5/3)
random.int.t <- (sqrt(tau2)/sd.t5) * rt(N,df=5)[id] # random intercept, rescaled
eps.t <- sqrt(sigma2)*rt(N*M,df=5)/sd.t5            # epsilon, rescaled

random.int <- sqrt(tau2) * rnorm(N)[id] # random intercept
eps <- sqrt(sigma2)*rnorm(N*M)  # epsilon
# Exercise: introduce correlation but beware
# of order relative to treatment assignment
dd <- data.frame( id=id)
dd$X <- 0:1   # within-class treatment -- note how easy this is because of recycling
dd$W <- rep(0:1, each = N*M/2)   # between-class treatment: second half of the classes
sig <- sqrt(sigma2)
dd$Yn <- with(dd, X*weff*sig + W*beff*sig + random.int + eps)
dd$Yt <- with(dd, X*weff*sig + W*beff*sig + random.int.t + eps.t)

dd
}

gen(4,3,.5,1,1)
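# (added) A quick sanity check, not in the original: with no treatment effects and
# a large sample, the estimated ICC from a null random-intercept fit should be
# close to the ICC argument (the seed is an arbitrary choice for reproducibility).
set.seed(101)
chk <- gen(200, 20, .5)
vc <- VarCorr(lme(Yn ~ 1, chk, random = ~ 1 | id))
tau2.hat <- as.numeric(vc['(Intercept)', 'Variance'])
sig2.hat <- as.numeric(vc['Residual', 'Variance'])
tau2.hat / (tau2.hat + sig2.hat)   # should be near 0.5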

#
# Step 2: Write a function that calls the previous function,
# analyzes the data set and returns the observed p-values -- or
# other output.
#

gen.an <- function(N,M,ICC,weff=0,beff=0) {
# generate and analyze; return p-values for X and W under each error distribution
ret <- c(X.n=NA, W.n=NA, X.t=NA, W.t=NA)   # slots 1:2: normal fit; slots 3:4: t fit
fit.n <- try( lme( Yn ~ X+W, gen(N,M,ICC,weff=weff,beff=beff), random = ~ 1 |id) )
if( !is(fit.n,"try-error")) {
ret[c(1,2)] <- as.matrix(wald(fit.n)[[1]]$estimate)[2:3,'p-value']
}
fit.t <- try( lme( Yt ~ X+W, gen(N,M,ICC,weff=weff,beff=beff), random = ~ 1 |id) )
if( !is(fit.t,"try-error")) {
ret[c(3,4)] <- as.matrix(wald(fit.t)[[1]]$estimate)[2:3,'p-value']
}
ret

}

gen.an(100,5,.2,1,1)
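# (added) An alternative if wald() is unavailable: nlme's own summary table carries
# Wald-type p-values for the fixed effects, e.g.
#   fit <- lme(Yn ~ X + W, gen(100, 5, .2), random = ~ 1 | id)
#   summary(fit)$tTable[c('X', 'W'), 'p-value']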
#
# Step 3: Write a function that does the above 'nsim' times
# and returns the proportion of p-values that are less than alpha (i.e. power)
# The reason for the '...' will become obvious in Step 4.
#
gen.an.power <- function(N,M,ICC,weff=0,beff=0,alpha=.05,nsim=1,...){
# proportion of rejections at level alpha
ret <- replicate(nsim, gen.an(N,M,ICC,weff,beff), simplify = FALSE)
# Proportion of p-values less than alpha
# count replications in which each model failed (its p-values are NA)
nmiss.n <- sum( sapply(ret, function(x) any(is.na(x[c('X.n','W.n')]))) )
nmiss.t <- sum( sapply(ret, function(x) any(is.na(x[c('X.t','W.t')]))) )
nrep.n <- nsim - nmiss.n
nrep.t <- nsim - nmiss.t
if (nrep.n==0) warning('no normal model converged')
if (nrep.t==0) warning('no t model converged')
ret <- do.call(cbind, ret)
ret <- apply(ret<alpha, 1, mean, na.rm = TRUE)
c(ret,nrep.n=nrep.n, nmiss.n=nmiss.n, nrep.t=nrep.t, nmiss.t=nmiss.t)
}

gen.an(4,3,.5,1,1)
gen.an.power(4,3,.5,1,1,nsim=10)
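# (added) Monte Carlo error: a power estimate p.hat based on nsim replications has
# standard error sqrt(p.hat * (1 - p.hat) / nsim), so the empirical size of a
# nominal 5% test estimated with nsim = 100 is only good to about +/- 0.02:
sqrt(.05 * .95 / 100)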

#
# Step 4: Write a function that takes its input from the rows of a
# data frame and appends power results as additional columns
#

pow <- function(dd) {
ret <- sapply(1:nrow(dd), function(i) {
# cat('.')   # progress dot; keep it *before* do.call so it is not the return value
do.call(gen.an.power, dd[i,])
})
cbind(dd,t(ret))
}
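# (note) Why do.call() works here: dd[i, ] is a one-row data frame, i.e. a named
# list, so its columns are matched to gen.an.power()'s arguments by name, and the
# '...' in gen.an.power() silently absorbs any extra columns of the design.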

zd <- expand.grid( N = seq(10,30,10), M = 3:6,  weff = .2, beff = c(0, .2, .6))
zd$nsim <- 10
zd$alpha <- .05
zd$ICC <- .2

pow(zd)
#
# Step 5: Create a data frame to explore power with a modest number of
#         simulations
#
zd <- expand.grid( N = seq(10,100,10), M = 3:6,  beff = 0, weff = c(0, .2, .6))
zd$nsim <- 100
zd$alpha <- .05
zd$ICC <- .2
dim(zd)
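# (added) Scale of the main job: 120 design rows x 100 replications x 2 models per
# replication = 24,000 lme fits, hence the pilot timing run below.
nrow(zd) * zd$nsim[1] * 2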

#
# Step 6: Explore with a modest number of simulations at first
#
#

# Estimated hours for a run with 10 times as many simulations:
system.time(
dout <- pow(zd)
) * 10 / 3600
dout
#
# Speed this up using the 'parallel' package
#

ncores <- 4
nrow(zd)/ncores                               # 30 rows per chunk -- divides evenly
g <- rep(1:ncores, each = nrow(zd)/ncores)    # chunk id for each row of zd
zdlist <- split(zd, g)                        # one chunk of the design per core

library(parallel)
cl <- makeCluster(ncores)     # start 'ncores' worker processes

system.time(
doutlapply <- lapply(zdlist, pow)   # serial baseline over the same chunks
) * 10 / 3600

system.time(
doutpar <- parLapply(cl, zdlist, pow)   # fails: the workers cannot see pow() or its helpers
) * 10 / 3600
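# (added) On Mac/Linux an alternative is parallel::mclapply(), which forks the
# master process so the workers inherit pow(), its helpers and the attached
# packages automatically (forking is not available on Windows):
#   doutmc <- mclapply(zdlist, pow, mc.cores = ncores)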

# Fix: make a function that is self-contained -- it loads the packages it needs
# and defines its helper functions locally, so nothing has to be exported.
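# (added) An alternative sketch, assuming nlme and p3ddev are installed for the
# workers: populate their environments directly instead of rewriting the function --
#   clusterEvalQ(cl, { library(nlme); library(p3ddev) })
#   clusterExport(cl, c('pow', 'gen.an.power', 'gen.an', 'gen'))
# after which parLapply(cl, zdlist, pow) would find everything it needs.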
powpar <- function(dd) {
require(nlme)     # for lme()
require(p3ddev)   # for wald(), used in gen.an() below
gen <- function(N,M,ICC,weff=0,beff=0){
# generates a single data frame
# N is number of classes (clusters)
# M is number of students per class
# ICC is the intraclass correlation for the response variable (without treatment)
# weff is the effect size (relative to sigma) of a within-class treatment
# beff is the effect size (relative to sigma) of a between-class treatment
# force N and M to even values (round() rounds halves to the even integer)
N <- round((2*N+1)/2)
M <- round((2*M+1)/2)

sigma2 <- 1                    # within cluster variance
tau2 <- sigma2 * ICC/(1-ICC)   # between-cluster variance

id <- rep(1:N,each = M)        # cluster id
# t distribution with 5 df, rescaled to unit variance (SD of t with 5 df is sqrt(5/3)):
sd.t5 <- sqrt(5/3)
random.int.t <- (sqrt(tau2)/sd.t5) * rt(N,df=5)[id] # random intercept, rescaled
eps.t <- sqrt(sigma2)*rt(N*M,df=5)/sd.t5            # epsilon, rescaled

random.int <- sqrt(tau2) * rnorm(N)[id] # random intercept
eps <- sqrt(sigma2)*rnorm(N*M)  # epsilon
# Exercise: introduce correlation but beware
# of order relative to treatment assignment
dd <- data.frame( id=id)
dd$X <- 0:1   # within-class treatment -- recycling alternates X within each class
dd$W <- rep(0:1, each = N*M/2)   # between-class treatment: second half of the classes
sig <- sqrt(sigma2)
dd$Yn <- with(dd, X*weff*sig + W*beff*sig + random.int + eps)
dd$Yt <- with(dd, X*weff*sig + W*beff*sig + random.int.t + eps.t)

dd
}

#
# Step 2 (local copy): a function that calls the previous function,
# analyzes the data set and returns the observed p-values -- or
# other output.
#

gen.an <- function(N,M,ICC,weff=0,beff=0) {
# generate and analyze; return p-values for X and W under each error distribution
ret <- c(X.n=NA, W.n=NA, X.t=NA, W.t=NA)   # slots 1:2: normal fit; slots 3:4: t fit
fit.n <- try( lme( Yn ~ X+W, gen(N,M,ICC,weff=weff,beff=beff), random = ~ 1 |id) )
if( !is(fit.n,"try-error")) {
ret[c(1,2)] <- as.matrix(wald(fit.n)[[1]]$estimate)[2:3,'p-value']
}
fit.t <- try( lme( Yt ~ X+W, gen(N,M,ICC,weff=weff,beff=beff), random = ~ 1 |id) )
if( !is(fit.t,"try-error")) {
ret[c(3,4)] <- as.matrix(wald(fit.t)[[1]]$estimate)[2:3,'p-value']
}
ret

}
#
# Step 3 (local copy): a function that does the above 'nsim' times
# and returns the proportion of p-values that are less than alpha (i.e. power)
#
gen.an.power <- function(N,M,ICC,weff=0,beff=0,alpha=.05,nsim=1,...){
# proportion of rejections at level alpha
ret <- replicate(nsim, gen.an(N,M,ICC,weff,beff), simplify = FALSE)
# Proportion of p-values less than alpha
# count replications in which each model failed (its p-values are NA)
nmiss.n <- sum( sapply(ret, function(x) any(is.na(x[c('X.n','W.n')]))) )
nmiss.t <- sum( sapply(ret, function(x) any(is.na(x[c('X.t','W.t')]))) )
nrep.n <- nsim - nmiss.n
nrep.t <- nsim - nmiss.t
if (nrep.n==0) warning('no normal model converged')
if (nrep.t==0) warning('no t model converged')
ret <- do.call(cbind, ret)
ret <- apply(ret<alpha, 1, mean, na.rm = TRUE)
c(ret,nrep.n=nrep.n, nmiss.n=nmiss.n, nrep.t=nrep.t, nmiss.t=nmiss.t)
}

#
# Step 4 (body of pow() above): take input from the rows of a design data
# frame and append the power results as additional columns
#

ret <- sapply(1:nrow(dd), function(i) {
# cat('.')   # progress dot; keep it *before* do.call so it is not the return value
do.call(gen.an.power, dd[i,])
})
cbind(dd,t(ret))
}

# Run a quick test first to prime the workers: the first call makes each worker
# load its packages, so the main run's timing is not inflated by startup costs.
test <- list(zd[1,], zd[2,], zd[3,], zd[4,])
doutpar <- parLapply(cl, test, powpar)

# Main run
system.time(
doutpar <- parLapply(cl, zdlist, powpar) #parallel version
) *10 /3600
doutpar
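# (added) Release the worker processes when finished:
stopCluster(cl)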

#####
```