MATH 6643 Summer 2012 Applications of Mixed Models/Power by simulation: Normal versus t with 5 dfs-parallel

# June 29, 2012
# This is a simulation to address the question of power and size (empirical rate of Type I error)
# when the underlying error distributions are not normal but are t with 5 df
#
#  
# Generate a sample
# N number of classes
# M students per class
# Exercise: extend to 3 levels
# 'parallel' ships with base R (>= 2.14), so it does not need installing
library(nlme)      # lme()
library(lattice)   # xyplot()
library(spidadev)  # wald()
library(p3ddev)
#
#  Step 1:  Write a function that generates a single data set based on
#  sample size, distribution assumptions and effect size(s)
#  This is the step that makes you really think the model through
#
gen <- function(N,M,ICC,weff=0,beff=0){
  # generates a single data frame
  # N is number of classes (clusters)
  # M is number of students per class
  # ICC is the intraclass correlation for the response variable (without treatment)
  # weff is the effect size (relative to sigma) of a within-class treatment
  # beff is the effect size (relative to sigma) of a between-class treatment
  # make N and M even: round() rounds half to even, so
  # round(N + 0.5) is N for even N and N + 1 for odd N
  N <- round((2*N+1)/2)
  M <- round((2*M+1)/2)
  
  sigma2 <- 1                    # within cluster variance
  tau2 <- sigma2 * ICC/(1-ICC)   # between-cluster variance
  
  id <- rep(1:N,each = M)        # cluster id
  # t distribution with 5 df, rescaled to unit SD:
  # the SD of a t with 5 df is sqrt(5/3), approximately 1.291
  random.int.t <- (sqrt(tau2)/sqrt(5/3)) * rt(N,df=5)[id] # random intercept
  eps.t <- sqrt(sigma2)*rt(N*M,df=5)/sqrt(5/3)  # epsilon
  
  random.int <- sqrt(tau2) * rnorm(N)[id] # random intercept
  eps <- sqrt(sigma2)*rnorm(N*M)  # epsilon
                   # Exercise: introduce correlation but beware 
                   # of order relative to treatment assignment
  dd <- data.frame( id=id)
  dd$X <- 0:1   # note how easy this is because of recycling
  dd$W <- rep(0:1, each = N*M/2)
  sig <- sqrt(sigma2)
  dd$Yn <- with(dd, X*weff*sig + W*beff*sig + random.int + eps  )
  dd$Yt <- with(dd, X*weff*sig + W*beff*sig + random.int.t + eps.t  )
  
  dd 
}

gen(4,3,.5,1,1)
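#
# Optional sanity checks (a quick sketch): the rescaled t_5 draws should have
# SD close to 1, and a variance-components fit on a large generated data set
# should roughly recover the target ICC
sd(rt(1e5, df = 5) / sqrt(5/3))   # should be near 1
fit.chk <- lme(Yn ~ 1, gen(200, 20, .5), random = ~ 1 | id)
VarCorr(fit.chk)  # between- and within-class variances both near 1 when ICC = .5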

#
# Step 2: Write a function that calls the previous function,
# analyzes the data set and returns the observed p-values -- or
# other output.
# 

gen.an <- function(N,M,ICC,weff=0,beff=0) {
  # generate and analyze, return p-values
  # positions 1:2 hold the normal-error fit, 3:4 the t-error fit
  ret <- c(X.n=NA,W.n=NA,X.t=NA,W.t=NA)
  fit.n <- try( lme( Yn ~ X+W, gen(N,M,ICC,weff=weff,beff=beff), random = ~ 1 |id) )
  if( !is(fit.n,"try-error")) {
    ret[c(1,2)] <- as.matrix(wald(fit.n)[[1]]$estimate)[2:3,'p-value']
  }
  fit.t <- try( lme( Yt ~ X+W, gen(N,M,ICC,weff=weff,beff=beff), random = ~ 1 |id) )
  if( !is(fit.t,"try-error")) {
    ret[c(3,4)] <- as.matrix(wald(fit.t)[[1]]$estimate)[2:3,'p-value']
  }
  ret
}

gen.an(100,5,.2,1,1)
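#
# Alternative sketch using only nlme: the same fixed-effect p-values can be
# read off the t-table of the lme summary, without spidadev's wald()
fit.chk <- lme(Yn ~ X + W, gen(100, 5, .2, 1, 1), random = ~ 1 | id)
summary(fit.chk)$tTable[2:3, 'p-value']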
#
# Step 3: Write a function that does the above 'nsim' times 
# and returns the proportion of p-values that are less than alpha (i.e. power)
# The reason for the '...' will become clear later.
#
gen.an.power <- function(N,M,ICC,weff=0,beff=0,alpha=.05,nsim=1,...){
  # proportion of rejections at level alpha
  ret <- replicate(nsim, gen.an(N,M,ICC,weff,beff), simplify = FALSE)
  # Proportion of p-values less than alpha
  nmiss.n <- sum( sapply(ret, function(x)is.na(x[1])))
  nmiss.t <- sum( sapply(ret, function(x)is.na(x[3])))
  nrep.n <- nsim - nmiss.n
  nrep.t <- nsim - nmiss.t
  if (nrep.n==0) warning('no normal model converged')
  if (nrep.t==0) warning('no t model converged')
  ret <- do.call(cbind, ret)
  ret <- apply(ret<alpha, 1, mean, na.rm = TRUE)
  c(ret,nrep.n=nrep.n, nmiss.n=nmiss.n, nrep.t=nrep.t, nmiss.t=nmiss.t)
}

gen.an(4,3,.5,1,1)
gen.an.power(4,3,.5,1,1,nsim=10)
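#
# How many simulations are enough? The usual binomial standard error for an
# estimated proportion gives a rough guide: SE = sqrt(p*(1-p)/nsim)
# (mc.se is a throwaway helper defined just for this check)
mc.se <- function(p, nsim) sqrt(p * (1 - p) / nsim)
mc.se(.5, 100)   # ~ .05: power near 50% is pinned down to about +/- 5 points
mc.se(.05, 100)  # ~ .022: Type I error rates need a larger nsim for precision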

#
# Step 4: Write a function that takes its input from the rows of a
# data frame and appends power results as additional columns 
#

pow <- function(dd) {
  ret <- sapply(1:nrow(dd), function( i) {
    do.call(gen.an.power, dd[i,])
    #cat(".")
  })
  cbind(dd,t(ret))
}
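#
# Why do.call works here (and the reason for the '...'): a one-row data frame
# is a list whose names match gen.an.power's arguments, and any extra columns
# in the design grid are absorbed harmlessly by '...'. A minimal illustration:
do.call(gen.an.power, data.frame(N = 10, M = 4, ICC = .2, nsim = 2, extra = 1))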

zd <- expand.grid( N = seq(10,30,10), M = 3:6,  weff = .2, beff = c(0, .2, .6))
zd$nsim <- 10
zd$alpha <- .05
zd$ICC <- .2

pow(zd)
#
# Step 5: Create a data frame to explore power with a modest number of
#         simulations
#
zd <- expand.grid( N = seq(10,100,10), M = 3:6,  beff = 0, weff = c(0, .2, .6))
zd$nsim <- 100
zd$alpha <- .05
zd$ICC <- .2
dim(zd)

# 
# Step 6: Explore with a modest number of simulations at first
#       
#

# The *10/3600 converts the timing to estimated hours for a run
# with 10 times as many simulations
system.time(
  dout <- pow(zd)
  ) * 10 / 3600
dout
#
# improve using parallel package: 'parallel'
#

ncores <- 4 
nrow(zd)/ncores #datasets of size 30, nice and even
g <- rep(1:ncores, each = nrow(zd)/ncores)        # group list id
zdlist <- split(zd, g)
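#
# Two hedged variations: the core count can be read from the machine rather
# than hard-coded, and if nrow(zd) were not a multiple of ncores, recycling
# with length.out would still give balanced (interleaved) chunks:
# ncores <- parallel::detectCores()
# g <- rep(1:ncores, length.out = nrow(zd))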
	 
library(parallel)	 
cl <- makeCluster(ncores)     #cores used

system.time(
  doutlapply <- lapply(zdlist, pow)  # serial version over the same chunks, for comparison
  ) *10 /3600 
	

system.time(
  doutpar <- parLapply(cl, zdlist, pow) # fails: the workers cannot see pow's helper functions
) *10 /3600
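#
# Alternative sketch: instead of rewriting pow as a self-contained function
# (powpar below), export the helpers to the workers and load the packages
# there with parallel's clusterEvalQ() and clusterExport():
# clusterEvalQ(cl, { library(nlme); library(spidadev) })
# clusterExport(cl, c('gen', 'gen.an', 'gen.an.power'))
# doutpar <- parLapply(cl, zdlist, pow)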


# Make the worker function self-contained: load its package dependencies inside
# the function (or qualify calls, e.g. spidadev::wald() instead of wald())
powpar <- function(dd) { 
	require(nlme)
	require(spidadev)	
	gen <- function(N,M,ICC,weff=0,beff=0){
	  # generates a single data frame
	  # N is number of classes (clusters)
	  # M is number of students per class
	  # ICC is the intraclass correlation for the response variable (without treatment)
	  # weff is the effect size (relative to sigma) of a within-class treatment
	  # beff is the effect size (relative to sigma) of a between-class treatment
	  # make N and M even: round() rounds half to even, so
	  # round(N + 0.5) is N for even N and N + 1 for odd N
	  N <- round((2*N+1)/2)
	  M <- round((2*M+1)/2)
	  
	  sigma2 <- 1                    # within cluster variance
	  tau2 <- sigma2 * ICC/(1-ICC)   # between-cluster variance
	  
	  id <- rep(1:N,each = M)        # cluster id
	  # t distribution with 5 df, rescaled to unit SD:
	  # the SD of a t with 5 df is sqrt(5/3), approximately 1.291
	  random.int.t <- (sqrt(tau2)/sqrt(5/3)) * rt(N,df=5)[id] # random intercept
	  eps.t <- sqrt(sigma2)*rt(N*M,df=5)/sqrt(5/3)  # epsilon
	  
	  random.int <- sqrt(tau2) * rnorm(N)[id] # random intercept
	  eps <- sqrt(sigma2)*rnorm(N*M)  # epsilon
					   # Exercise: introduce correlation but beware 
					   # of order relative to treatment assignment
	  dd <- data.frame( id=id)
	  dd$X <- 0:1   # note how easy this is because of recycling
	  dd$W <- rep(0:1, each = N*M/2)
	  sig <- sqrt(sigma2)
	  dd$Yn <- with(dd, X*weff*sig + W*beff*sig + random.int + eps  )
	  dd$Yt <- with(dd, X*weff*sig + W*beff*sig + random.int.t + eps.t  )
	  
	  dd 
	}

	#
	# Step 2: Write a function that calls the previous function,
	# analyzes the data set and returns the observed p-values -- or
	# other output.
	# 

	gen.an <- function(N,M,ICC,weff=0,beff=0) {
	  # generate and analyze, return p-values
	  # positions 1:2 hold the normal-error fit, 3:4 the t-error fit
	  ret <- c(X.n=NA,W.n=NA,X.t=NA,W.t=NA)
	  fit.n <- try( lme( Yn ~ X+W, gen(N,M,ICC,weff=weff,beff=beff), random = ~ 1 |id) )
	  if( !is(fit.n,"try-error")) {
		ret[c(1,2)] <- as.matrix(wald(fit.n)[[1]]$estimate)[2:3,'p-value']
	  }
	  fit.t <- try( lme( Yt ~ X+W, gen(N,M,ICC,weff=weff,beff=beff), random = ~ 1 |id) )
	  if( !is(fit.t,"try-error")) {
		ret[c(3,4)] <- as.matrix(wald(fit.t)[[1]]$estimate)[2:3,'p-value']
	  }
	  ret
	}
	#
	# Step 3: Write a function that does the above 'nsim' times 
	# and returns the proportion of p-values that are less than alpha (i.e. power)
	# The reason for the '...' will become clear later.
	#
	gen.an.power <- function(N,M,ICC,weff=0,beff=0,alpha=.05,nsim=1,...){
	  # proportion of rejections at level alpha
	  ret <- replicate(nsim, gen.an(N,M,ICC,weff,beff), simplify = FALSE)
	  # Proportion of p-values less than alpha
	  nmiss.n <- sum( sapply(ret, function(x)is.na(x[1])))
	  nmiss.t <- sum( sapply(ret, function(x)is.na(x[3])))
	  nrep.n <- nsim - nmiss.n
	  nrep.t <- nsim - nmiss.t
	  if (nrep.n==0) warning('no normal model converged')
	  if (nrep.t==0) warning('no t model converged')
	  ret <- do.call(cbind, ret)
	  ret <- apply(ret<alpha, 1, mean, na.rm = TRUE)
	  c(ret,nrep.n=nrep.n, nmiss.n=nmiss.n, nrep.t=nrep.t, nmiss.t=nmiss.t)
	}

	#
	# Step 4: Write a function that takes its input from the rows of a
	# data frame and appends power results as additional columns 
	#

  ret <- sapply(1:nrow(dd), function( i) {
    do.call(gen.an.power, dd[i,])
    #cat(".")
  })
  cbind(dd,t(ret))
}


# Run a quick test to prime the process for main run:
test <- list(zd[1,], zd[2,], zd[3,], zd[4,])
doutpar <- parLapply(cl, test, powpar) 

# Main run
system.time(
  doutpar <- parLapply(cl, zdlist, powpar) #parallel version
) *10 /3600
doutpar
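#
# The parallel run returns one data frame per chunk; bind them back together
# for plotting, and release the workers when done
dout <- do.call(rbind, doutpar)
stopCluster(cl)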

#####

head(dout)
summary(dout)
xyplot( I(X.n-X.t) ~ N | weff, dout, groups = M, type = 'b', lwd = 2, auto.key = TRUE)
xyplot( X.n ~ I(M*N) | weff, dout, groups = M, type = 'b', lwd = 2, xlim = c(0,100))