SCS Reads 2011/Visualizing Multiple Regression R Script

From Wiki1

Jump to: navigation, search
##
##  Ellipses in multivariate regression
##  September 28, 2011
##  G. Monette
##  georges@yorku.ca
## Version on scs.math.yorku.ca wiki

# If you have not installed spida.beta and p3d.beta
# install.packages('rgl','car','Hmisc')
# download.file("http://www.math.yorku.ca/people/georges/Files/R/spida.beta.zip", 'spida.beta.zip')
# install.packages('spida.beta.zip', repos = NULL)
# 
# download.file("http://www.math.yorku.ca/people/georges/Files/R/p3d.beta.zip", 'p3d.beta.zip')
# install.packages('p3d.beta.zip', repos = NULL)

library(spida.beta)
library(p3d.beta)
data(coffee)

# head(coffee)
# rownames(coffee) <- coffee$Occupation

Init3d( family='serif',cex = 1.2)
Plot3d( Heart ~ Stress + Coffee, coffee,
      ylim = c(0,150),  xlim = c(0,180), zlim = c(0,180),
      col = 'blue', theta=-90, phi=0, fov= 0, xlab ='')
k <-function() rgl.bringtotop(stay=T) # makes rgl window stay on top
b <-function() rgl.bringtotop(stay=F)
k()
Fit3d( fitc <- lm ( Heart ~ Coffee, coffee ), lwd = 2, residuals = TRUE)
summary(fitc)
Ell3d()
Pop3d(4)

Id3d( labels=coffee$Occupation, pad = 2 ,col = 'blue')
# Axes3d()
k()
Text3d( z=-5,y=2,x=180, text = "Stress " ,adj = 1, col='black')
Fit3d( lm(Heart ~ 0, coffee), residuals =T, col = 'gray')
# view all pair wise
# Strong pairwise relationship between all pairs of variables

# What does Heart vs Coffee look like in 3d
Fit3d( fitc <- lm ( Heart ~ Coffee, coffee ), lwd =2, residuals = TRUE)

# What if we take stress into account
Fit3d( fitm <- lm( Heart ~ Coffee + Stress, coffee), col = 'yellow', col.grid = 'gray' )
# look down diagonal
summary(fitm)

# Note predictors are correlated

Lines3d( zx = (elld <- dell(coffee$Coffee, coffee$Stress, radius = 1:2)),y='min', color = 'red',lwd=2)
Lines3d( zx = ellptc( elld, c(0,1), radius=10*c(-1,1)), y = 'min', color = 'red', lwd = 2)
# Looking down the intersection of the blue and yellow planes: what do you see? AVP for Heart vs Stress adjusted for Coffee


par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( 0,0, xlim = c(-2,2),ylim= c(-1,3), type = 'n',
    xlab = list( ~beta[Coffee], cex =2), ylab=list(~beta[Stress], cex = 2))
abline( h = 0 )
abline( v = 0 )
points( rbind(coef(fitm)[c('Coffee','Stress')]), pch = 16 , col ='red')
lines( cell(fitm), col = 'black', lwd=2)
lines( cem <- cell(fitm, df=1 ), col = 'red', lwd=2)
summary(fitm)
wald(fitm)
confint(fitm)
points( x = coef(fitm)["Coffee"], y=0, pch = 16, col = 'red')
lines( x = rep( coef(fitm)["Coffee"],2), y = c(0,  coef(fitm)["Stress"]), col = 'red')
lines( x = confint(fitm)["Coffee",], y = c(0,0), lwd =3, col = 'red')
# show formulas for ellipses

points( x = 0, y=coef(fitm)["Stress"], pch = 16, col = 'red')
lines( x = c(0,0) , y = confint(fitm)["Stress",], lwd =3, col = 'red')

# but where does the confidence interval for the effect of coffee not controlling for streess come from
points( x =  coef(fitc)["Coffee"], y =0, pch = 16, col = 'blue')
lines( x = confint(fitc)["Coffee",], y = c(0,0), lwd =3, col = 'blue')

points( ellptc( cem , dir= c(1,0), radius = -1), col = 'black', pch = 16)
lines( elltan( cem, dir = c(1,0), radius = -1, len = 2))
lines( elltanc( cem, dir = c(1,0), radius = 0, len = 20))
lines( elltanc( cem, dir = c(1,0), radius = 1, len = 20))
lines( elltanc( cem, dir = c(1,0), radius = -1, len = 20))

# The 'shape' of the CE is deternined by Sigma(X)
# It is scaled by  d x se(y) / sqrt(n)
# where d is a quantile from an appropriate distribution
# Its center is at Beta.hat

##
##
##  Illustrating confidence regions
##
##  SKIP TO LINE 181  OR 234
##

Init3d( family = 'serif', cex = 1.2)

Plot3d( rbind(-3*c(10,1,1),3*c(10,1,1)) , axes = FALSE,   sphere.size = 0, point.col='white',
    xlim = 10*c(-1,1), ylim =2*c(-1,1), zlim =2*c(-1,1))
Lines3d( x = c(-20,20), y = 0, z = 0, color='black')
Lines3d( y = c(-2,2), x = 0, z = 0, color='black')
Lines3d( z = c(-2,2), y = 0, x = 0, color='black')
Text3d( x = c(20,0,0), y = c(0,2,0), z = c(0,0,2), text = c("b0","b.Stress","b.Coffee"), color='black')

# par3d('scale')
# par3d(scale=c(1/10,1,1))

vv <- vcov(fitm) [c(1,3,2),c(1,3,2)]
coef(fitm)
plot3d( ellipse3d( vv, centre= coef(fitm)[c(1,3,2)], t = sqrt( 3* qf(.95,3,18))), add=T, col = 'blue', alpha = 0.5)     # note the spelling of 'centre'
view3d(-90,0,0)


###
###  Measurement error
###

# Add error to stress

rgl.set(1)

sd( coffee$Stress)
zz <- coffee
zz$Stress <- zz$Stress + 10* rnorm( nrow(zz))
Fit3d( fit.me <- lm( Heart ~ Coffee + Stress, zz), col = 'pink')
lines( cell( fit.me), col = 'pink', lwd = 4)

zz$Stress <- zz$Stress + 10* rnorm( nrow(zz))
Fit3d( fit.me <- lm( Heart ~ Coffee + Stress, zz), col = 'pink')
lines( cell( fit.me), col = 'pink', lwd = 4)



###
###  Effect of outliers on regression
###

Init3d( family = 'serif', cex = 1.2 )
hw <- subset(hwoutliers, Outlier == 'none')
Plot3d( Health ~ Height + Weight, hw, col = 'blue', theta = -90, phi = 0, fov = 0)
#Axes3d()
#Pop3d(6)
k()
Fit3d( fitw <- lm( Health ~ Weight,  hw),  col = 'blue', residuals = T, lwd = 2)

# Not significant! No evidence of a problem

# Do classical diagnostics: what do they show?
# Residual vs X -- same as residuals versus yhat
# Residuals vs omitted variable ?
# Keep turning until AVP

Lines3d( xz = elld <- dell( hw$Height, hw$Weight, radius = c(1,2)), y = 'min', color = 'red')
Lines3d( xz = ellptc( elld, dir= c(1,0), radius = c(-10,10) ), y = 'min', color = 'red') # Regression of height on weight

##
## If regression line for the regression of height on weight is vertical then
##    horizontal displacement is proportional to residual of height on weight
##    vertical blue residual is proportion to residual of health on weight
##    SO we have the AVP for health on height adjusting for weight
##

Fit3d( fithw <- lm( Health ~ Height + Weight , hw), col = 'yellow', col.grid = 'gray')

# The AVP is a 2-dimensional view of the effect of a variable in a high-dimensional regression

summary( fithw )
par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( 0,0, xlim = c(-2,2),ylim= c(-1,3), type = 'n',
    xlab = list( ~beta[Weight], cex =2), ylab=list(~beta[Height], cex = 2))
abline( h = 0 )
abline( v = 0 )
points( rbind(coef(fithw)[c('Weight','Height')]), pch = 16 , col ='red')
lines( cell(fithw), col = 'black', lwd=2)
lines( cem <- cell(fithw, df=1 ), col = 'red', lwd=2)
summary(fithw)
wald(fithw)
confint(fithw)
points( x = coef(fithw)["Weight"], y=0, pch = 16, col = 'red')
lines( x = rep( coef(fithw)["Weight"],2), y = c(0,  coef(fithw)["Height"]), col = 'red')
lines( x = confint(fithw)["Weight",], y = c(0,0), lwd =3, col = 'red')

points( x =  coef(fitw)["Weight"], y =0, pch = 16, col = 'blue')
lines( x = confint(fitw)["Weight",], y = c(0,0), lwd =3, col = 'blue')


##
##  Effect of different types of outliers
##
#
#               Predictors         Fit
#    Type 1:    typical value      bad fit
#    Type 2:    atypical value     good fit
#    Type 3:    atypical value     bad fit
#
hwoutliers
Init3d( family = 'serif', cex = 1.2 )
Plot3d( Health ~ Height + Weight | Outlier , hwoutliers)
Id3d(labels = hwoutliers$Outlier,pad = 2)

hw0 <- subset( hwoutliers, Outlier == "none")
hw1 <- subset( hwoutliers, Outlier %in% c("none","Type 1"))
hw2 <- subset( hwoutliers, Outlier %in% c("none","Type 2"))
hw3 <- subset( hwoutliers, Outlier %in% c("none","Type 3"))

with ( hw0 , Lines3d( xz = dell( Height, Weight), y = 'min', color = 'blue', lwd = 2))
with ( hw1 , Lines3d( xz = dell( Height, Weight), y = 'min', color = 'green', lwd = 2))
with ( hw2 , Lines3d( xz = dell( Height, Weight), y = 'min', color = 'orange', lwd = 2))
with ( hw3 , Lines3d( xz = dell( Height, Weight), y = 'min', color = 'magenta', lwd = 3))

Fit3d( fit0 <- lm( Health ~ Weight + Height, hw0 ) , col = 'blue', alpha=.2)
Fit3d( fit1 <- lm( Health ~ Weight + Height, hw1 ) , col = 'green', alpha=.2)
Fit3d( fit2 <- lm( Health ~ Weight + Height, hw2 ) , col = 'orange', alpha=.2)
Fit3d( fit3 <- lm( Health ~ Weight + Height, hw3 ) , col = 'magenta', alpha=.4)

par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( 0,0, xlim = c(-2,2),ylim= c(-2,2), type = 'n',
    xlab = list( ~beta[Weight], cex =2), ylab=list(~beta[Height], cex = 2))
abline( h = 0 )
abline( v = 0 )

points( rbind(coef(fit0)[c('Weight','Height')]), pch = 16 , col ='blue')
lines( cell(fit0, df=1 ), col = 'blue', lwd=2)

points( rbind(coef(fit1)[c('Weight','Height')]), pch = 16 , col ='green')
lines( cell(fit1, df=1 ), col = 'green', lwd=2)

points( rbind(coef(fit2)[c('Weight','Height')]), pch = 16 , col ='orange')
lines( cell(fit2, df=1 ), col = 'orange', lwd=2)

points( rbind(coef(fit3)[c('Weight','Height')]), pch = 16 , col ='magenta')
lines( cell(fit3, df=1 ), col = 'magenta', lwd=2)

# Although only Type 3 outliers have a large influence on beta.hat
# Type 1 and Type 2 outliers have influence on inference
Personal tools