A Tour of Linear Models in R
From Wiki1
" SCS 2011: Statistical Analysis and Programming with R
- September-October 2011
- A Guided Tour of Linear Models in R
- much of the code is borrowed from John Fox:
- Introduction to the R Statistical Computing Environment
- ICPSR Summer Program, 2010-2011
Install packages (if needed)
- See [installing on a Mac]
"
install.packages(c('car','Hmisc','rgl')) download.file("http://www.math.yorku.ca/people/georges/Files/R/spida.beta.zip", "spida.beta.zip") download.file("http://www.math.yorku.ca/people/georges/Files/R/p3d.beta.zip", "p3d.beta.zip") install.packages("spida.beta.zip", repos = NULL) install.packages("p3d.beta.zip", repos = NULL)
"
Load packages
"
library(car) library(spidadev) library(p3ddev)
"
Fitting a linear regression model with numerical reponse
- R approach to data analysis:
- use many small tools, not one big package: more flexible and adaptive, requires more knowledge
- integrate graphical and numerical exploration of data
- numerical and categorical predictors
- interaction
- asking questions: linear hypotheses
"
"
Looking at your data
"
# The data: data(Prestige) # optional: bring data from 'car' package to workspace # this is a way of 'reading' a data set in a package # later we will talk about reading other data sets xqplot( Prestige ) # quick look (from spida.beta) ?Prestige # 'help' on Prestige names(Prestige) head(Prestige) # first 6 lines tail(Prestige, 2) # last 2 lines some(Prestige) # 10 randomly sampled observations # Selecting subsets and indexing Prestige[ c(1,4), ] # Selecting rows Prestige[ c(1,4), c('type','women')] # ... rows and columns
Prestige[ Prestige$women > 50, ] # selects on rows Prestige[ Prestige$type == 'prof', c('type','income') ] # selects on rows and columns Prestige[ order(Prestige$women), ] # ordering order(Prestige$women) # why it works: permutation vector # Tables: with( Prestige, table( type ) ) # using 'with' to refer to variable in a date frame # note that 'table' drops NAs with( Prestige, table( type , cut( women, 3)) ) # creating intervals from a numeric variable
tab(Prestige, ~ type + cut(women,3)) # 'tab' in spida.beta refers to variables with a formula # similarly to fitting functions: lm, etc. tab(Prestige, ~ type + cut(women,3), pct = 2) tab(Prestige, ~ type + cut(women,3), pct = 2, useNA = 'no') # more plots scatterplotMatrix( Prestige ) # fancy from 'car' pairs( Prestige ) # old splom( Prestige ) # newer Init3d() Plot3d( income ~ education + women | type, Prestige) Axes3d() Id3d() # why a linear (in x and y) model won't work fit.lin <- lm( income ~ education + women, Prestige) str(fit.lin) fit.lin Fit3d( fit.lin ) Pop3d(2) colors() Fit3d( fit.lin , col = 'hotpink', alpha = .6) pal(colors()) pal(grep("pink",colors(), value = T)) # global regular expression print (from old unix)
"
Regression on numerical variables and interpretation
"
# Regression with an additive numerical model (no interactions, all vars continuous ) # Note: you can transform a variable on the fly income.mod <- lm( log(income) ~ education + women, data = Prestige) # summary(income.mod) # Interpretation of coefficients: Proportion change in y per unit change in x keeping other x constant # multiply coef by 100 to get percentage change in y -- or use 100*log(y) as dependent variable Plot3d( log(income) ~ education + women | type, Prestige) # note: some curvature and negative outliers Fit3d( income.mod ) Axes3d() Id3d() confint(income.mod) wald(income.mod) wald(income.mod, -1) # removing first variable
"
Diagnostics
"
plot( income.mod ) # residuals vs fitted, normal quantiles for resid, # scale location, residual vs leverage avPlots( income.mod ) # added variable plots
"
Anova
"
anova( income.mod ) # sequential tests -- type I SS # Q: does each term 'add' to previous terms in list Anova( income.mod ) # type II SS: does each term add to others # excluding terms with that interact with the term Anova( income.mod, type = "III") # does each term have an effect when added last
"
Getting information from a regression
- :
"
coef( income.mod )
" : "
vcov( income.mod )
"
Factors in regression
"
Prestige$incomes <- sqrt( Prestige$income ) prestige.add <- lm( prestige ~ incomes + type, Prestige) summary(prestige.add)
"
Note on factors
- R's way of representing categorical data for analysis
- reading a data frame automatically turns a character variable into a factor
- example: 'type' in data frame Prestige
"
Prestige$type str( Prestige$type ) unclass( Prestige$type ) # raw internal representation # internally it's integers # but it prints as character as.character( Prestige$type ) unclass( as.character( Prestige$type )) # this is really a character var.
"
The multiple personalities of factors
- Sometimes they work like an integer variable (the 'codes' for the factor), sometimes like a character variable
- Generally, this works the way you would expect
A small example: "
f.ex <- factor( c('One','Two','Three','Four','One','Two')) f.ex # note that levels are in lexicographical (alpha) order by default
" The order of levels matters in regression because the first level is the reference level "
unclass(f.ex) # 'internally' f.ex is a vector of integers with an added 'attribute': 'levels' tab( f.ex) letters # a useful vector that comes with R letters[f.ex] # when indexing, f.ex acts as a number and uses its **codes** f.ex == "Three" # in logical operations, as a character f.ex[1:2] # when subsetting, it remembers its original **levels** f.ex[1:2, drop = T] # unless you ask it to forget # reordering a factor f.ex.ro <- factor( f.ex, levels = c('One','Two','Three','Four')) f.ex.ro letters[f.ex.ro] outer( f.ex, f.ex.ro, "==") # applies function '==' to all pairs z <- outer(f.ex, f.ex.ro, "==") # dimnames(z) <- list(f.ex, f.ex.ro) z
"
Basic plotting
- Using 'traditional' graphics in R
- Can now use a formula interface like 'lm'
- Illustrates convenience of using a factor as an indexing variable
"
plot( prestige ~ incomes, Prestige, col = type) # uses colors 1, 2 and 3 plot( prestige ~ incomes, Prestige, col = type, pch = 16) plot( prestige ~ incomes, Prestige, col = type, cex = 1.5, lwd = 2) plot( prestige ~ incomes, Prestige, col = c('red','blue','magenta')[type], cex = 1.5, lwd = 2) plot( prestige ~ incomes, Prestige, col = type, cex = 1.5, lwd = 2, axes = FALSE) axis(1, at = seq(20,160,by=20), labels = seq(20,160,by=20)^2) axis(2, at = seq(20,80,by=20)) box() abline( h = seq(20,80,20), lty = 2, col ='gray') abline( v = seq(20,160,20), lty = 2, col ='gray') # shows that == is applied to levels, not codes # an easy sampler to see what value of pch generates what symbol: dd <- data.frame( n = 0:256 ) dd$x <- dd$n %% 10 # remainder when dividing by 10 dd$y <- 10 * (dd$n %/% 10) # %/% is 'integer' divide plot( y ~ x, dd, pch = n)
"
Quick programs in R
- It's easy to turn a good idea into a function
"
out # make sure it is not already used out <- function( x, y, FUN ){ ret <- outer( x, y, FUN) dimnames( ret ) <- list( x, y) ret # value returned by a function is 'exiting' line } out out( f.ex, f.ex.ro, `==`) # uses levels, not codes out( f.ex, f.ex.ro, `<`) # < not meaningful for factors out( as.character(f.ex), as.character(f.ex.ro), `<`) # BUT it IS meaningful for characters!! # Useful for lots of stuff out( c(TRUE,FALSE,NA),c(TRUE,FALSE,NA), "|") # 3-valued logic in R out( c(TRUE,FALSE,NA),c(TRUE,FALSE,NA), "&") # 3-valued logic in R out( c(-Inf, -1, 0, 1, Inf, NA, NaN, 1i),c(-Inf, -1, 0, 1, Inf, NA, NaN,1i), "+" ) # extended arithmetic out( c(-Inf, -1, 0, 1, Inf, NA, NaN, 1i),c(-Inf, -1, 0, 1, Inf, NA, NaN,1i), "*" ) # extended arithmetic
"
Factors in regression
- A factor with k levels generates k-1 columns in the X matrix
"
model.matrix( prestige ~ incomes + type, Prestige) # creates the X matrix z <- model.matrix( prestige ~ incomes + type, Prestige) some(z) z$incomes # ERROR because a matrix is not a data frame z <- as.data.frame(z) # turns matrix into a data frame [Example of coercion] some( z ) z$incomes # a data frame contains variables
"
Merging data frames
"
# merging two data frames: z$id <- rownames(z) # create an id variable for indexing in z Prestige$id <- rownames(Prestige) # corresponding id for Prestige zm <- merge( z, Prestige[,c('id','type')], by = 'id') # merges on common var 'id' # note that the name of the variable here must be quoted!!! # Recall: Functions may required a variable to be referenced: # by name in quotes # by name without quotes # using a formula # sometimes more than one will work, often only one # Sorry!!! but good to know so it's easier to get out of dead ends some( zm ) # Note dummy (indicator) variable for typeprof and typewc
"
Prediction data frames
"
# prediction data frame # values for which we want to predict prestige pred <- expand.grid( type = levels(Prestige$type), incomes = seq(15,185,10)) some( pred ) # all combination, good to use 'levels' to make sure in correct order pred$y <- predict( prestige.add, newdata = pred) some( pred )
"
For loop
"
for ( nn in levels(pred$type)) { lines( y ~ incomes, pred, subset = type == nn, col = type) }
"
lapply (better)
- lapply( list, FUN) applies function FUN to each element of list or vector
"
lapply( levels(pred$type), function(x){ lines( y ~ incomes, pred, subset = type == x, col = type, lwd =2) })
"
Linking numbers with pictures and answers with questions
- Most statistical output answers questions you don't want to ask and doesn't answer the questions you should ask
- Linking the numbers with the graphs is a ideal test of understanding
- Interpreting coefficients for factor indicators: comparisons with the reference level -- the level that doesn't appear
"
summary( prestige.add )
"
Call: lm(formula = prestige ~ incomes + type, data = Prestige) Residuals: Min 1Q Median 3Q Max -17.5267 -5.0199 -0.5705 5.1203 24.5971 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 13.33319 3.44129 3.874 0.000198 *** incomes 0.30836 0.04494 6.862 7.17e-10 *** typeprof 23.68715 2.21896 10.675 < 2e-16 *** typewc 7.37610 2.00784 3.674 0.000397 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 7.794 on 94 degrees of freedom (4 observations deleted due to missingness) Multiple R-squared: 0.7985, Adjusted R-squared: 0.7921 F-statistic: 124.2 on 3 and 94 DF, p-value: < 2.2e-16
A colorful[sic] digression
"
# Easier in 3d # BUT: need at most two numerical predictors and one factor -- here only one numerical so add any other Plot3d( prestige ~ incomes + education | type, Prestige) # note categorical predictor after "|" # changing colors: colors() pals() # palettes of all colors pal(c('blue','skyblue','yellow4')) pal(grepv('blue',colors())) blues <- grepv( 'blue', colors()) length(blues) Blues <- list(blues[1:33], blues[-(1:33)]) Blues lapply( Blues, pal) # example lapply # choose some colors: Plot3d( prestige ~ incomes + education | type, Prestige, # note categorical predictor after "|" col = c('gray90','red','blue')) prestige.add <- lm( prestige ~ incomes + type, Prestige) summary(prestige.add) Fit3d(prestige.add) Id3d() # right-click to drag rectangles around points, end by drawing an empty rectangle # Exercise: explore other models
"
Interaction with numeric variables
- compare additive model vs model with interaction
"
data(Ginzberg) head(Ginzberg) Plot3d( depression ~ fatalism + simplicity, Ginzberg, xlim= c(0,3), zlim=c(0,3)) fit.int <- lm( depression ~ fatalism * simplicity, Ginzberg) summary(fit.int) # Additive model: Fit3d( lm(depression ~ fatalism + simplicity, Ginzberg)) # additive some( model.matrix(depression ~ fatalism + simplicity, Ginzberg) ) # X matrix # Interaction model:
Fit3d( fit.int <- lm(depression ~ fatalism * simplicity, Ginzberg), col = 'red') # interaction some( model.matrix(depression ~ fatalism * simplicity, Ginzberg) ) # X matrix summary(fit.int) # What do the coefficients mean? Axes3d()
"
Exploring a curved regression function
- Getting real answers to real questions
"
# Forming a linear hypothesis matrix: L <- rbind( "Eff. of fatalism | simp = 0" = c( 0, 1, 0, 0), #take derivative wrt fatalism, ":" means product "Eff. of fatalism | simp = 1" = c( 0, 1, 0, 1), "Eff. of fatalism | simp = 3" = c( 0, 1, 0, 3), "Eff. of simplicity | fatal = 0" = c( 0, 0, 1, 0), "Eff. of simplicity | fatal = 3" = c( 0, 0, 1, 3) ) L wald( fit.int, L ) # compare with : summary( fit.int )
"
- Note how, when there is an interaction term (SIG. OR NOT):
- main effects only estimate a conditional effect (sometimes called a specific or special effect)
- NOT a general effect of the variable
- when there is interaction, the conditional effect depends on values of other variables and should be explored and described
- Note that graphs often reveal important structure not at all visible through numerical output -- google(TM) Anscombe examples
'Additive' curvature
"
# Additive model with curvature Fit3d( lm(depression ~ fatalism + simplicity + I(simplicity^2), Ginzberg), col = 'green') # interaction some( model.matrix(depression ~ fatalism + simplicity + I(simplicity^2), Ginzberg) ) # X matrix # Notes: # distinguish among: # 1. IV (independent variables) -- NOT Statitistical Independent but 'functionally' independent # 2. regressor = column of X matrix # 3. term = expression that generates columns of X matrix
"
Interaction with factors and numeric variables
- additive model vs model with interaction
"
Prestige$type # a factor class(Prestige$type) str(Prestige$type) # structure sapply(Prestige, class) # sapply applies the 'class' function to each variable in Prestige Prestige.2 <- na.omit(Prestige) # filter out missing data nrow(Prestige) nrow(Prestige.2) Prestige$type Prestige.2$type levels(Prestige.2$type) Prestige.2$type <- with(Prestige.2, factor(type, levels=c("bc", "wc", "prof"))) # reorder levels Prestige.2$type # generating contrasts from factors getOption("contrasts") contrasts(Prestige.2$type) # what fills in the X matrix model.matrix(~ type, data=Prestige.2) contrasts(Prestige.2$type) <- contr.treatment(levels(Prestige.2$type), base=2) # changing baseline category contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- "contr.helmert" # Helmert contrasts contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- "contr.sum" # "deviation" contrasts contrasts(Prestige.2$type) contrasts(Prestige.2$type) <- NULL # back to default contrasts(Prestige.2$type)
Prestige.2$type.ord <- ordered(Prestige.2$type, levels=c("bc", "wc", "prof")) # ordered factor Prestige.2$type.ord Prestige.2$type.ord > 'wc' Prestige.2$type.ord[ Prestige.2$type.ord > 'wc' ] round(contrasts(Prestige.2$type.ord), 3) # orthogonal polynomial contrasts # Interaction prestige.add <- lm( prestige ~ incomes + type, data = Prestige) prestige.int <- lm( prestige ~ incomes * type, data = Prestige) summary(prestige.add) summary(prestige.int)
"
Interpreting main effects in presence of interactions
Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -5.29638 5.78530 -0.915 0.362328 incomes 0.56720 0.07892 7.187 1.70e-10 *** typeprof 54.14877 8.09419 6.690 1.71e-09 *** typewc 28.36126 9.99804 2.837 0.005607 ** incomes:typeprof -0.37719 0.09624 -3.919 0.000171 *** incomes:typewc -0.29257 0.13924 -2.101 0.038361 *
- What do the coefficients say?
- Effect of unit increase in sqrt(income) = 0.567 ??
- prof - bc[reference level] = 54.14 ??
- wc - bc = 28.36 ??
- NO!
- Effect of unit increase in sqrt(income) = 0.567 WHEN type '=' 0, i.e. among 'bc'
- prof - bc[reference level] = 54.14 WHEN sqrt(income) = 0
- wc - bc = 28.36 ?? WHEN sqrt(income) = 0
Asking a question: How strong is evidence of interaction?
- Consider fitting the model
- with 'type' using 'bc' as the reference level
- with 'type' using 'wc' as the reference level
We should get the same answer! "
Prestige$type2 <- factor( Prestige$type, levels = c('wc','bc','prof')) prestige.int2 <- lm( prestige ~ incomes * type2, Prestige) summary( prestige.int2) summary( prestige.int)
"
- Why does one model look so much more significant than other?
- although they are fitting exactly the same thing except for a change of coding?
- Answer: The output for each model is only showing part of the information
- Standard regression output rarely answers relevant questions
- and what it does answer is often only partly relevant
- We need to ask: what happens if we drop interaction entirely?
- BUT there are two regressors for interaction
- Each alone might not be significant but together they might be highly significant
- Two ways to test:
- Likelihood Ratio Test: Fit the model twice: once with interaction and once without and compare the two models
- Wald Test: Use a linear hypothesis to test whether all relevant coefficients could be simultaneiously equal to 0
Likehood Ratio Test
prestige.add2 <- lm( prestige ~ incomes + type2, Prestige) summary( prestige.add2 ) anova(prestige.int2, prestige.add2) # what if we had used original coding for type? anova(prestige.int, prestige.add)
"
- we get exactly the same answer
- The LRT here is invariant wrt to coding of 'type
- It obeys a principle of invariance:
- What should not matter, does not matter
Wald test
- does not require refitting
- test whether all terms with interaction (last two terms) can be set to 0
"
summary(prestige.int) L <- rbind( c( 0,0,0,0,1,0), c(0,0,0,0,0,1)) L wald( prestige.int, L) wald( prestige.int2, L) # same L works
# Easy wald tests using regular expressions wald( prestige.int, ":") # selects every regressor whose name contains ':' # other ways: 'anova' uses terms with multiple regressors # Type I sequential: anova( prestige.int) # sequential gives right answer BECAUSE interaction is last anova( prestige.int2) # Type II (by default) Anova( prestige.int) # why are all the same ? but 'income' not the same as in Type I Anova( prestige.int2) Anova( prestige.int, type = "III") # meaningless? except for interaction because it's last Anova( prestige.int2, type = "III")
" RECAP:
- If a factor has more than two levels, overall tests must use all relevant terms simultaneously, not each one individually
- Beware of type III
Testing for 'type' in a model with interaction
Consider using either:
- Anova with type II
- OVerall: refitting without 'type' and using LRT
- Overall: using wald
"
# Anova II Anova( prestige.int, type = 'II') # refit: prestige.inc <- lm( prestige ~ incomes , Prestige) anova( prestige.int, prestige.inc) # ERROR: what happened?
"
Caution with LRT
- Need nested models: One X matrix is in a subspace of other X matrix
- Exactly the same cases
- Exactly the same y, e.g. can't compare y with log(y)
- If we drop a variable with NAs, then the new model might have more cases
"
# Refitting with the same cases: model.frame( prestige.int) prestige.inc <- lm( prestige ~ incomes , model.frame(prestige.int)) # all 'type' terms deleted anova( prestige.int, prestige.inc) # ERROR: what happened? # Wald (I find fastest, especially in consulting) wald( prestige.int, 'type')
"
Visualizing interaction
"
prestige.int <- lm( prestige ~ incomes*type, Prestige) summary( prestige.int ) # choosing colors levels(Prestige$type) cols <- c('blue','green','black')
plot( prestige ~ incomes, Prestige, col = cols[type], cex = 1.5, lwd = 2, xlim = c(0,170), ylim = c(-10,90)) # type used as index pred <- expand.grid( type = levels(Prestige$type), incomes = seq(-10,180)) pred$prestige <- predict( prestige.int, newdata = pred) some(pred) lapply( levels(pred$type), function(x){ lines( prestige ~ incomes, pred, subset = type == x, col = cols[type], lwd =2) # get added to plot })
summary(prestige.int)
"
Call: lm(formula = prestige ~ incomes * type, data = Prestige) Residuals: Min 1Q Median 3Q Max -12.5747 -5.3053 0.2172 3.9447 24.6692 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -5.29638 5.78530 -0.915 0.362328 incomes 0.56720 0.07892 7.187 1.70e-10 *** typeprof 54.14877 8.09419 6.690 1.71e-09 *** typewc 28.36126 9.99804 2.837 0.005607 ** incomes:typeprof -0.37719 0.09624 -3.919 0.000171 *** incomes:typewc -0.29257 0.13924 -2.101 0.038361 * --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 7.29 on 92 degrees of freedom (4 observations deleted due to missingness) Multiple R-squared: 0.8275, Adjusted R-squared: 0.8181 F-statistic: 88.28 on 5 and 92 DF, p-value: < 2.2e-16
Where do the numbers go on the graph? "
Estimating gaps
"
summary(prestige.int) abline( v = 0) L.gaps <- rbind( "wc - bc | incomes = 100" = c( 0, 0, 0, 1, 0, 100), "prof = bc | incomes = 100" = c( 0, 0, 1, 0, 100, 0), "prof - wc | incomes = 100" = c( 0, 0, 1,-1, 100,-100), "wc - bc | incomes = 100" = c( 0, 0, 0, 1, 0, 50), "prof = bc | incomes = 100" = c( 0, 0, 1, 0, 50, 0), "prof - wc | incomes = 100" = c( 0, 0, 1,-1, 50,- 50) ) wald( prestige.int, L.gaps ) abline( v = c(50,100)) # a puzzler: wald( prestige.int, 'type')
"
Visualizing a gap and its SE
"
# graph the wc - bc gap from 0 to 180 # 1. create a data frame with values that for which the gap varies and form the L matrix summary( prestige.int) gap.df <- expand.grid( incomes = seq(0,180)) L.gap <- with( gap.df, cbind( 0, 0, 0, 1, 0, incomes )) L.gap # 2. Use wald to generate estimated gaps, etc. ww <- wald( prestige.int, L.gap) ww ?wald # 3. Transform to a data frame and specify number of SEs for line above and below estimated gap ww.df <- as.data.frame( ww, se = 2) # number of standard errors ww.df # 4. Combine with data frame under 1 comb.df <- cbind( ww.df, gap.df) some(comb.df) # 5. Plot the result plot( coef ~ incomes, comb.df, type = 'l', lwd = 2, main = 'Estimated prestige gap: white collar - blue collar', xlab = list(expression(sqrt(income)), cex = 1.2), ylab = list(expression( hat(gap) %+-% 2 %*% SE ),cex=1.2)) lines( U2 ~ incomes, comb.df, lty = 2, lwd = 2, col = 'red') lines( L2 ~ incomes, comb.df, lty = 2, lwd = 2, col = 'red') abline ( h = 0, col = 'blue') ?plotmath # help on math symbols in plots