MATH 6627 2012-13 Practicum in Statistical Consulting/Visualizing Multiple Regression.R

From Wiki1

Jump to: navigation, search
##
##  Ellipses in multiple regression
##  MATH 6627
##  October 12, 2012
##  G. Monette
##  georges@yorku.ca
## master copy in
# http://scs.math.yorku.ca/index.php?title=MATH_6627_2012-13_Practicum_in_Statistical_Consulting/Visualizing_Multiple_Regression.R



# You only need to install packages if you haven't before:
# To install the spidadev and p3ddev packages 
# see http://scs.math.yorku.ca/index.php/SCS_2011:_Statistical_Analysis_and_Programming_with_R#Installing_packages_on_a_PC
# or http://scs.math.yorku.ca/index.php/SCS_2011:_Statistical_Analysis_and_Programming_with_R#Installing_packages_on_a_Mac


library(spidadev)
library(p3ddev)

data(coffee)

head(coffee)

Init3d( family='serif',cex = 1.2)
Plot3d( Heart ~ Stress + Coffee, coffee,
        ylim = c(0,150),  xlim = c(0,180), zlim = c(0,180),
        col = 'blue', theta=-90, phi=0, fov= 0, xlab ='')
k <-function() rgl.bringtotop(stay=T) # makes rgl window stay on top
b <-function() rgl.bringtotop(stay=F)
k()
Fit3d( fitc <- lm ( Heart ~ Coffee, coffee ), lwd = 3, residuals = TRUE)
Ell3d( col='aliceblue' )       # highly significant ... at a glance
summary(fitc)
Pop3d(4)
# What could be responsible?
Id3d( labels=coffee$Occupation, pad = 2 ,col = 'blue')

# Axes3d()
k()
# Another variable was measured:
Text3d( z=-1,y=2,x=180, text = "Stress " ,adj = 1, col='black')
# Look at data in 3d
Fit3d( lm(Heart ~ 0, coffee), residuals =T, col = 'gray90', col.grid = 'gray', col.res='gray')
Ell3d( col= 'aliceblue')

# view all pair wise
# Strong pairwise relationship between all pairs of variables

# What does Heart vs Coffee look like in 3d
view3d(-90,0,0)
Fit3d( fitc <- lm ( Heart ~ Coffee, coffee ), lwd =1, col='lightblue', col.grid = 'blue1', residuals = TRUE)



# What if we take bot coffee and stress into account
Fit3d( fitm <- lm( Heart ~ Coffee + Stress, coffee), col = 'yellow', col.grid = 'gray' )
# look down diagonal
# Note: goes through vertical girdle of ellipse
summary(fitm)


# Note predictors are correlated

Lines3d( zx = (elld <- dell(coffee$Coffee, coffee$Stress, radius = 1:2)),y='min', color = 'red',lwd=2)
Lines3d( zx = ellptc( elld, c(0,1), radius=10*c(-1,1)), y = 'min', color = 'red', lwd = 2)

# Looking down the intersection of the blue and yellow planes: what do you see? AVP for Heart vs Stress adjusted for Coffee

par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( 0,0, xlim = c(-2,2),ylim= c(-1,3), type = 'n',
          xlab = list( ~beta[Coffee], cex =2), ylab=list(~beta[Stress], cex = 2))
abline( h = 0 )
abline( v = 0 )
points( rbind(coef(fitm)[c('Coffee','Stress')]), pch = 16 , col ='red')
lines( cell(fitm), col = 'black', lwd=2)
lines( cem <- cell(fitm, df=1 ), col = 'red', lwd=2)
summary(fitm)
wald(fitm)
confint(fitm)
points( x = coef(fitm)["Coffee"], y=0, pch = 16, col = 'red')
lines( x = rep( coef(fitm)["Coffee"],2), y = c(0,  coef(fitm)["Stress"]), col = 'red')
lines( x = confint(fitm)["Coffee",], y = c(0,0), lwd =3, col = 'red')
# show formulas for ellipses

points( x = 0, y=coef(fitm)["Stress"], pch = 16, col = 'red')
lines( x = c(0,0) , y = confint(fitm)["Stress",], lwd =3, col = 'red')

# but where does the confidence interval for the effect of coffee not controlling for streess come from
points( x =  coef(fitc)["Coffee"], y =0, pch = 16, col = 'blue')
lines( x = confint(fitc)["Coffee",], y = c(0,0), lwd =3, col = 'blue')

points( ellptc( cem , dir= c(1,0), radius = -1), col = 'black', pch = 16)   # conjugate point
lines( elltan( cem, dir = c(1,0), radius = -1, len = 2))      # tangent parallel to a direction
lines( elltanc( cem, dir = c(1,0), radius = 0, len = 20))     # conjugate direction at center
lines( elltanc( cem, dir = c(1,0), radius = 1, len = 20))     #   ... at radius 1
lines( elltanc( cem, dir = c(1,0), radius = -1, len = 20))    #   .... at radius 2

#
#
#  OTHER ROUTES TO betahat.Coffee
#
#

#
#  THE ADDED VARIABLE PLOT:
#   - reduces multiple regression to simple regression
#   - good plot to assess influence of points on betahat.X
#

# Regress both Heart and Coffee  on  all other variables (i.e. Stress -- just one in this case)
# Perform a simple regression using resulting residuals


fit.avp <- lm( cbind( Heart, Coffee) ~ Stress, coffee)
fit.avp
resid(fit.avp)
lm( Heart ~ Coffee, as.data.frame(resid(fit.avp)))
lm( Heart ~ Coffee + Stress, coffee)      # same betahat.Coffeedifferences only due to df
par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( 0,0, xlim = c(-22,22),ylim= c(-22,22), type = 'n',
          ylab = list( as.expression(~H - hat(H)*"|"*Stress), cex =1.5),
          xlab=list(as.expression(~C - hat(C)*"|"*Stress), cex = 1.5),
          main = "Added Variable Plot")
abline( h = 0 )
abline( v = 0 )
points( resid(fit.avp)[,2:1], pch = 16, cex =1.5)
lines( dell( resid(fit.avp)[,2:1]), col = 'red', lwd = 3)


# Easier:
# SLICING THE DATA ELLIPSOID

vmat.avp <- var( resid(fit.avp) )
vmat.avp
# rgl.set( ? )
ell.avp <- with( coffee, ell( c(mean(Heart),mean(Coffee)), vmat.avp))

Init3d( family='serif',cex = 1.2)
Plot3d( Heart ~ Stress + Coffee, coffee,
        ylim = c(0,150),  xlim = c(0,180), zlim = c(0,180),
        col = 'blue', theta=-90, phi=0, fov= 0, xlab ='')

k()
Ell3d( col='aliceblue' )       # highly significant ... at a glance
# regression through shadow
Fit3d( fitc <- lm ( Heart ~ Coffee, coffee ), lwd =1, col='lightblue', col.grid = 'blue1', residuals = TRUE)
# regression through slice
with( coffee, Lines3d( x = mean(Stress), yz = ell.avp, col = 'red', lwd = 3))
Fit3d( fitm <- lm( Heart ~ Coffee + Stress, coffee), col = 'yellow', col.grid = 'gray' )

#
# The marginal regression of Heart on Coffee is 
# determined by the shadow of the 3D data ellipsoid
# onto the Heart x Coffee plane
# The multiple regression of Heart on Coffee is 
# determined by the **slice** of the 3D data ellipsoid
# by a frontal plane through the centre of the ellipsoid
#
# The 'shape' of the CE is deternined by Sigma(X)
# It is scaled by  d x se(y) / sqrt(n)
# where d is a quantile from an appropriate distribution
# Its center is at Beta.hat

##
##
##  Illustrating confidence regions
##
##  SKIP TO LINE 181  OR 234
##

Init3d( family = 'serif', cex = 1.2)

Plot3d( rbind(-3*c(10,1,1),3*c(10,1,1)) , axes = FALSE,   sphere.size = 0, point.col='white',
        xlim = 10*c(-1,1), ylim =2*c(-1,1), zlim =2*c(-1,1))
k()
Lines3d( x = c(-20,20), y = 0, z = 0, color='black')
Lines3d( y = c(-2,2), x = 0, z = 0, color='black')
Lines3d( z = c(-2,2), y = 0, x = 0, color='black')
Text3d( x = c(20,0,0), y = c(0,2,0), z = c(0,0,2), text = c("b0","b.Stress","b.Coffee"))

# par3d('scale')
# par3d(scale=c(1/10,1,1))

vv <- vcov(fitm) [c(1,3,2),c(1,3,2)]
coef(fitm)
plot3d( ellipse3d( vv, centre= coef(fitm)[c(1,3,2)], t = sqrt( 3* qf(.95,3,18))), add=T, col = 'blue', alpha = 0.5)     # note the spelling of 'centre'
view3d(-90,0,0)


###
###  Measurement error
###

# Add error to stress
Init3d( family = 'serif', cex = 1.2)

Plot3d( Heart ~ Stress + Coffee, coffee,
        ylim = c(0,150),  xlim = c(0,180), zlim = c(0,180),
        col = 'blue', theta=-90, phi=0, fov= 0, xlab ='')


k()
Fit3d( fitc <- lm ( Heart ~ Coffee, coffee ), lwd =1, col='lightblue', col.grid = 'blue1', residuals = TRUE)

Fit3d( fitm <- lm( Heart ~ Coffee + Stress, coffee), col = 'yellow', col.grid = 'gray' )


par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( 0,0, xlim = c(-2,2),ylim= c(-1,3), type = 'n',
          xlab = list( ~beta[Coffee], cex =2), ylab=list(~beta[Stress], cex = 2))
abline( h = 0 )
abline( v = 0 )
points( rbind(coef(fitm)[c('Coffee','Stress')]), pch = 16 , col ='red')
lines( cell(fitm), col = 'black', lwd=2)
lines( cem <- cell(fitm, df=1 ), col = 'red', lwd=2)
summary(fitm)
wald(fitm)
confint(fitm)
points( x = coef(fitm)["Coffee"], y=0, pch = 16, col = 'red')
lines( x = rep( coef(fitm)["Coffee"],2), y = c(0,  coef(fitm)["Stress"]), col = 'red')
lines( x = confint(fitm)["Coffee",], y = c(0,0), lwd =3, col = 'red')
# show formulas for ellipses
points( x =  coef(fitc)["Coffee"], y =0, pch = 16, col = 'blue')
lines( x = confint(fitc)["Coffee",], y = c(0,0), lwd =3, col = 'blue')

points( ellptc( cem , dir= c(1,0), radius = -1), col = 'black', pch = 16)   # conjugate point
lines( elltan( cem, dir = c(1,0), radius = -1, len = 2))      # tangent parallel to a direction
lines( elltanc( cem, dir = c(1,0), radius = 0, len = 20))     # conjugate direction at center
lines( elltanc( cem, dir = c(1,0), radius = 1, len = 20))     #   ... at radius 1
lines( elltanc( cem, dir = c(1,0), radius = -1, len = 20))    #   .... at radius 2

#
#  Add error to stress
#
sd( coffee$Stress)
zz <- coffee
zz$Stress <- zz$Stress + 10* rnorm( nrow(zz))
sd(zz$Stress)

# Fitted plane to points with added error in stress

Fit3d( fit.me <- lm( Heart ~ Coffee + Stress, zz), col = 'pink')

# Confidence ellipse:

lines( cell( fit.me), col = 'pink', lwd = 4)

# Keep adding more and more error by repeating the following: 
zz$Stress <- zz$Stress + 10* rnorm( nrow(zz))
Fit3d( fit.me <- lm( Heart ~ Coffee + Stress, zz), col = 'tomato')
lines( cell( fit.me), col = 'tomato', lwd = 4)

# what happened to the estimated effects of coffee and stress



###  Health on Weight and Height
###  Effect of outliers on regression
###


Init3d( family = 'serif', cex = 1.2 )
hw <- subset(hwoutliers, Outlier == 'none')

hw$Health <- c(1 + .5 *scale( hw$Health ))
hw$Weight <- c(1 + .5 *scale( hw$Weight ))
hw$Height <- c(1 + .5 *scale( hw$Height ))


Plot3d( Health ~ Height + Weight, hw, col = 'blue',
        xlim = c(0,2), zlim =c(0,2), ylim = c(0,2),
        theta = -90, phi = 0, fov = 0)
#Axes3d()
#Pop3d(6)
k()
Fit3d( fitw <- lm( Health ~ Weight,  hw),  col = 'blue', residuals = T, lwd = 2)
#Fit3d( fitw <- lm( Health ~ Weight,  hw),  col = 'blue', lwd = 2)
Ell3d(alpha = .3 )
#Ell3d()
# Not significant! No evidence of a problem

## Do classical diagnostics: what do they show?
## Residual vs X -- same as residuals versus yhat
## Residuals vs omitted variable ?
## Keep turning until AVP

## We get the strongest pattern in residuals precisely when we are looking at the AVP
## Looking down the regression of Height on Weight
##

Lines3d( xz = elld <- dell( hw$Height, hw$Weight, radius = c(1,2)), y = 0, color = 'red')
Lines3d( xz = ellptc( elld, dir= c(1,0), radius = c(-10,10) ), y = 0, color = 'red') # Regression of height on weight

##
## If regression line for the regression of height on weight is vertical then
##    horizontal displacement is proportional to residual of height on weight
##    vertical blue residual is proportion to residual of health on weight
##    SO we have the AVP for health on height adjusting for weight
##

fithw <- lm( Health ~  Weight + Height , hw)
summary(fithw)
Fit3d( fithw, col = 'yellow', col.grid = 'gray')

# See view of AVP is a 2-dimensional view of the effect of a variable in a high-dimensional regression
# up to shear transformation

summary(fitw)                  # weight not significant
summary( fithw )               # weight is significant
par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( 0,0, xlim = c(-3,1),ylim= c(-1,3), type = 'n',
          xlab = list( ~beta[Weight], cex =2), ylab=list(~beta[Height], cex = 2))
abline( h = 0 )
abline( v = 0 )
points( rbind(coef(fithw)[c('Weight','Height')]), pch = 16 , col ='red')
lines( cell(fithw), col = 'black', lwd=2)
lines( cem <- cell(fithw, df=1 ), col = 'red', lwd=2)
summary(fithw)
wald(fithw)
confint(fithw)
points( x = coef(fithw)["Weight"], y=0, pch = 16, col = 'red')
lines( x = rep( coef(fithw)["Weight"],2), y = c(0,  coef(fithw)["Height"]), col = 'red')
lines( x = confint(fithw)["Weight",], y = c(0,0), lwd =3, col = 'red')

points( x =  coef(fitw)["Weight"], y =0, pch = 16, col = 'blue')
lines( x = confint(fitw)["Weight",], y = c(0,0), lwd =3, col = 'blue')


####
####   AVP
####



fit.avp <- lm( cbind( Health, Weight) ~ Height, hw)
fit.avp
resid(fit.avp)
lm( Health ~ Weight, as.data.frame(resid(fit.avp)))
lm( Health ~ Weight + Height, hw)      # same effect of Weight except for df's
par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( resid(fit.avp), #xlim = c(-22,22),ylim= c(-22,22),
          type = 'n',
          ylab = list( as.expression(~H - hat(H)*"|"*Height), cex =1.5),
          xlab=list(as.expression(~W - hat(W)*"|"*Height), cex = 1.5),
          main = "Added Variable Plot")
abline( h = 0 )
abline( v = 0 )
points( resid(fit.avp)[,2:1], pch = 16, cex =1.5)
lines( dell( resid(fit.avp)[,2:1]), col = 'red', lwd = 3)

##
##  AVP in 3d
##


# Plot3d( Health ~ Height + Weight, hw, col = 'blue',theta = -90, phi = 0, fov = 0)
# Ell3d(alpha=.3)
# Fit3d( lm (Health ~ Weight, hw), col = 'blue')

vmat.avp <- var( resid(fit.avp) )
vmat.avp
# rgl.set( ? )
ell.avp <- with( hw, ell( c(mean(Health),mean(Weight)), vmat.avp))
with( hw, Lines3d( x = mean(Height), yz = ell.avp, col = 'red', lwd = 3))

with(hw,
     Lines3d ( x = mean(Height), yz = rbind(
       ellptc(ell.avp,c(1,0),radius = -5),
       center(ell.avp),
       ellptc(ell.avp,c(1,0),radius = 5)),
               col = 'red', lwd=2)
)


##
##  Effect of different types of outliers
##
#
#               Predictors         Fit
#    Type 1:    typical value      bad fit
#    Type 2:    atypical value     good fit
#    Type 3:    atypical value     bad fit
#
hwoutliers
Init3d( family = 'serif', cex = 1.2 )
Plot3d( Health ~ Height + Weight | Outlier , hwoutliers)
Id3d(labels = hwoutliers$Outlier,pad = 2)
k()
hw0 <- subset( hwoutliers, Outlier == "none")
Ell3d( hw1 <- subset( hwoutliers, Outlier %in% c("none","Type 1")), col = 'green', alpha = .3)
Ell3d( hw2 <- subset( hwoutliers, Outlier %in% c("none","Type 2")), col = 'orange')
Ell3d( hw3 <- subset( hwoutliers, Outlier %in% c("none","Type 3")), col = 'magenta')

with ( hw0 , Lines3d( xz = dell( Height, Weight), y = 'min', color = 'blue', lwd = 2))
with ( hw1 , Lines3d( xz = dell( Height, Weight), y = 'min', color = 'green', lwd = 2))
with ( hw2 , Lines3d( xz = dell( Height, Weight), y = 'min', color = 'orange', lwd = 2))
with ( hw3 , Lines3d( xz = dell( Height, Weight), y = 'min', color = 'magenta', lwd = 3))

Fit3d( fit0 <- lm( Health ~ Weight + Height, hw0 ) , col = 'blue', alpha=.2)
Fit3d( fit1 <- lm( Health ~ Weight + Height, hw1 ) , col = 'green', alpha=.2)
Fit3d( fit2 <- lm( Health ~ Weight + Height, hw2 ) , col = 'orange', alpha=.2)
Fit3d( fit3 <- lm( Health ~ Weight + Height, hw3 ) , col = 'magenta', alpha=.4)


###################   ADD
################### data ellipses + predictor ellipses


par( mar =  c(5, 5, 4, 2) + 0.1)
eqscplot( 0,0, xlim = c(-2,2),ylim= c(-2,2), type = 'n',
          xlab = list( ~beta[Weight], cex =2), ylab=list(~beta[Height], cex = 2))
abline( h = 0 )
abline( v = 0 )

points( rbind(coef(fit0)[c('Weight','Height')]), pch = 16 , col ='blue')
lines( cell(fit0, df=1 ), col = 'blue', lwd=2)

points( rbind(coef(fit1)[c('Weight','Height')]), pch = 16 , col ='green')
lines( cell(fit1, df=1 ), col = 'green', lwd=2)

points( rbind(coef(fit2)[c('Weight','Height')]), pch = 16 , col ='orange')
lines( cell(fit2, df=1 ), col = 'orange', lwd=2)

points( rbind(coef(fit3)[c('Weight','Height')]), pch = 16 , col ='magenta')
lines( cell(fit3, df=1 ), col = 'magenta', lwd=2)

# Although only Type 3 outliers have a large influence on beta.hat
# Type 1 and Type 2 outliers have influence on inference
#
#   Generally, what makes an outlier is Mahalanobis distance (relative to data ellipse)
#       not euclidean distance
#   Or for a particular beta, size of residual for that varialble
#
#
Personal tools