A Tour of Graphics in R

From Wiki1

Jump to: navigation, search

" SCS 2011: Statistical Analysis and Programming with R

September-October 2011
A Guided Tour of Graphics in R

Contents

Install packages (if needed)

"

 install.packages(c('car','Hmisc','rgl')) 
 download.file("http://blackwell.math.yorku.ca/R/spida.zip", "spida.zip")
 download.file("http://blackwell.math.yorku.ca/R/p3d.zip", "p3d.zip")
 install.packages("spida.zip", repos = NULL)
 install.packages("p3d.zip", repos = NULL)

"

Load packages

"

   library(car)
   library(spida)
   library(p3d)

"

Overview

R (previously S and S-Plus) have evolved through a number of stages. Each stage sees new layers. This is particularly true in graphics.

  1. Classic (traditional) graphics
    • Intuitive, easy to use, easy to build a a graph in separate steps
    • Recently improved to incorporate fairly consistent formula interface
  2. Lattice (=Trellis in S) graphics
    • Developed in 2nd decade to handle 'conditional' plotting
    • Excellent exploratory tools
    • Harder to control and to add to graph
    • Includes static 3d
  3. rgl (r graphics language built on 'Open GL')
    • 3d rotation and limited interaction
    • rgl is used with p3d interface to visualize models
  4. ggplot2 -- based on Grammar of Graphics by Leland Wilkinson

Strength of R graphics:

  • Very easy to use for exploration and diagnostics while working
  • Can turn into presentation/publication quality graphics

In this tour, we focus on the first two: traditional and lattice and see a bit of p3d using rgl. "


 library(lattice)     # to use lattice graphics (note that spida loads it automatically)
 library(p3d)
 library(spida)

"

Basic (classic) plotting

"

 data( Smoking )   # from p3d
 dl <- Smoking
 head(dl)
 tail(dl)
 
 rownames(dl) = dl$Country     # must be unique
 
 tab(dl, ~ is.na(LE)+is.na(CigCon))
 
 fit = lm( LE ~ CigCon , dl , na.action = na.omit)
 summary(fit)    
     
 ## Should always look at data
 plot( Smoking)
 plot( fit )  
 ## 'plot' is a 'generic' function in R
 ## what it does depends on the type of 'object' it is applied to
 plot
 print    # like print
 ## The 'type' of object is the 'class' of the object
 class( Smoking )
 class( fit )
 
 ## When you apply 'plot' to a 'data.frame' the 
 ## generic function 'plot' 
 ## 'dispatches'
 ## the 'method' 
 ## plot.data.frame
 plot.data.frame
 ## Applied to a `lm` object, the method 'plot.lm'
 plot.lm
 plot(fit)  #  uses plot.lm and produces regression diagnostics
      
 names(dl)
 plot(dl[c('LE','CigCon','HealthExpPC')])   # scatterplot matrix using plot.data.frame
 
 xqplot( dl )    # from spida
 
 # ordinary plot for a scatterplot
 
 plot( dl$CigCon, dl$LE )   # plot( x, y)   uses default: plot.default  
 
 plot( LE ~ CigCon, dl )    # plot( y ~ x , data.frame)  uses plot.formula
 
 attach( dl )
 plot( CigCon, LE, main = 'plot(x,y) with attach' )          # plot( x, y)
 plot( LE ~ CigCon  , sub = 'plot(y ~ x) with attach')        # plot( y ~ x )
 detach( dl ) 
 
 # Using a formula interface:
 #     formula evaluated in a data frame: (like 'lm')
 # 'mature' functions tend to use formula interfaces when feasible
 # new -- and early -- functions tend not to 
 
 plot( LE ~ CigCon, dl, pch = 16, col = 'green3') 
 
 abline( fit )                      # adding elements to a fit
 
 # note abline is fake generic
 
 abline
 abline( fit, lwd = 2, col = 'red')    
 abline( h=50 )
 identify( dl$CigCon, dl$LE, dl$Country)  # RStudio bug: labels only show at end. Likely to be corrected
 
 plot( LE ~ CigCon, dl , pch = Continent, 
         col = Continent)      # note: some functions with formula
                               # interfaces only evaluate the formula
                               # in the data frame
                               # An important skill in R is being good
                               # at guessing how/why things don't always
                               # work as they should
   # OOPS! something didn't work! Can we guess why?
   
   plot( LE ~ CigCon, dl , pch = as.numeric(Continent), 
         col = Continent)
   # why do we need to use 'as.numeric' for pch and NOT for col?
   # Because the argument for col is used as an index to a vector
   # of colors. When a factor is used as an index, the **codes**
   # are automatically used.  For 'pch', plot is using the number
   # directly, NOT as an index. How would you know all this?? 
   # You wouldn't! You can alway play it safe and coerce your factors
   # to numeric explicitly:
   
   plot( LE ~ CigCon, dl , 
         pch = as.numeric(Continent), 
         col = as.numeric(Continent))
   
   ?legend
   legend( locator(1), levels(dl$Continent), 
       pch =  1:length(levels(dl$Continent)), 
       col = 1:length(levels(dl$Continent)))   
    
 # plotting a fitted line for a general model
 plot( LE ~ CigCon, dl , 
         pch = as.numeric(Continent), 
         col = as.numeric(Continent), lwd =2)
 
 legend( locator(1), levels(dl$Continent), ncol = 2, lwd = 2,
         pch =  1:length(levels(dl$Continent)),
         lty = 1:length(levels(dl$Continent)),
         col = 1:length(levels(dl$Continent)))
         
         
 fit2 <- lm( LE ~ Continent*(CigCon + I(CigCon^2)) , dl)
 
 summary(fit2)
 
 # create a 'predictor data frame with predictor values for which you want a plot
 
 pred <- expand.grid(  CigCon =  seq(0,5000,100), Continent = levels(dl$Continent)) 
 pred$LE.quad = predict( fit2, newdata = pred)
 head(pred)
 lapply( levels(dl$Continent), function(x) {
       lines( LE.quad ~ CigCon, subset(pred, Continent == x), 
       col = Continent,
       lty = as.numeric(Continent),
       lwd = 2)
    }
 )   
  1. This example is where we cash in on the greater complexity of factors over character variables
  2. R remembers that the numeric code for 'Europe' is 4 even though it's the only level


  1. Plot3d automates multiple fits but doesn't have the flexibility of basic graphics
 Init3d( cex = 1, font = 2)
 Plot3d( LE ~ CigCon + HealthExpPC | Continent, dl)
 Fit3d( fit2)
 Id3d()
 
 Plot3d( LE ~ CigCon + HealthExpPC | Continent, subset(dl, Continent != "Australia"))
 Ell3d()       
 
 # Scatterplots in car
 library(car)
      scatterplot( LE ~ CigCon, dl)   
      scatterplot( LE ~ CigCon, dl, group = Continent)
      scatterplot( LE ~ CigCon, dl, group = dl$Continent)
         
      scatterplot( LE ~ CigCon | Continent, dl)   
      scatterplot( LE ~ CigCon | Continent, dl, cex = 1.5, lwd = 1.5)   
      ?scatterplot

"

Basic graphics for association

Categorical by categorical

"

      tt = table( dl$Continent )
      tt
      barplot( tt , legend = names(tt), col = rainbow(6))
      
      ttab <- tt
      names(ttab) <- abbreviate(names(tt))
      barplot( ttab )
      barplot( ttab , legend = names(tt), col = heat.colors(6))
      
   # Categorical association
   
      dl$LE.q <- cut( dl$LE, 4)
      tt <- table( "Life Expectancy"=dl$LE.q, Continent=dl$Continent)
      tt   
      assocplot(tt)
      colnames(tt)
      colnames(tt) <- abbreviate( colnames(tt))    
      assocplot(tt) 
   
      mosaicplot(tt, color = TRUE)    # shows 2nd var as proportion | first var
      mosaicplot( t(tt) , color = TRUE )
      mosaicplot( t(tt[rev(1:nrow(tt)),]) , color = TRUE , main = "National life expectancies")
   
   # Exercise: Improve by using better cutpoints and labels for Life Expectancy      

"

Categorical by continuous numerical

Working with factors

"

   # Conditional density plot:  how factor depends on continuous variable
   
   # Dealing with factors and characters is one of the few vexations in R
   # 'factors' are coded categorical variables created by default when
   # a variable in a data frame has non-numeric data
   #
   # Factors have two personalities: numeric code and a character value
   # Sometimes the numeric code is used to recognize an ordering
   #
   # Some functions use the factors numeric code, some the character
   # Most of the time all this works extremely well - as we shall see
   # Sometimes, it doesn't and it's hard to figure out why.
   #
   # Many statistical functions like 'factors'
   # Many text manipulation functions like 'characters'
   # You can switch from one to the other with 'as.factor' and 'as.character'
   # If levels of a factor are dropped by, e.g. subsetting, some routines will
   #    get confused,
   # Sometimes, you have to 'strip' a factor of dropped levels
   # Here's an example: we want to abbreviate Continent
   
   
   # Showing a categorical variable vs a continuous variables
   
   
     dl$Cont = factor( abbreviate( as.character( dl$Continent) ))   
         # this is the 'safe' way, explicitly converting the possible factor to character
         # and back to a factor
     
     
     # Using car's recode function. It strays from R syntax but is intuitive and useful
     
     dl$Cont2 = recode( dl$Continent, 
         " 'North America'='N.America';'South America'='S.America'")  
           # recode is smart about factors 
           # Note that the whole 'recode' string is in double quotes, 
           #    you must use single quotes inside the string. 
   
     with( dl, table(Cont, Cont2)) 
   
   # If a categorical variable depends on a continuous variable:
   #
   #           categorical ~ continuous
   #
   
     cdplot( Cont2 ~ LE, dl, main="pretty but not useful here") # This plot would be useful is Cont depended on LE
   
   # If continuous depends on category:
   #
   #           continuous ~ categorical
   #
     
     boxplot( LE ~ Cont2 , dl, main= "better since LE depends on Continent",
           sub="but boxplots can miss structure in distributions, e.g. bimodality") 
         # warning: I've been told that boxplot often engender violent opposition
         
     points ( LE ~ Cont2, dl)   # 'points' and 'lines' add to existing plot
    
     identify( dl$Cont2, dl$LE, dl$Country) # here we are happy that 'points' and 'identify' uses the factor's numeric code

" There are many other 'basic' plotting functions including many other that come with various packages.

Lattice graphics

  • Developed as 'Trellis' graphics by Bill Cleveland in early 80s
  • Extension of 'coplot' (conditional plot)
  • Allows more than two dimensions through
    • 1) panels
    • 2) groups
  • More challenging to add things than for 'basic' graphics
    e.g. 'identify' more awkward but recently made easier

"

 library(lattice)  # not loaded by default although library(spida) loads it  

"

  • Major lattice functions:
    • Univariate:
      • barchart
      • bwplot
      • densityplot
      • dotplot
      • histogram
      • qqmath # qq plot to compare data with hypothetical quantiles
      • stripplot
    • Bivariate:
      • xyplot scatterplot and more
      • qq qq plot to compare 2 distributions
    • Trivariate:
      • xyplot with groups OR '|'
      • levelplot
      • contourplot
      • cloud
      • wireframe
    • Hypervariate:
      • xyplot with groups AND '|'
      • splot
      • parallel
    • Other:
      • rfs residual and fitted value plot( also see oneway )
      • tmd Tukey mean difference plot
      • many others

"

   xyplot(LE ~ CigCon , dl, pch = 16, col = 'blue3')   # generally use formula interface
   
   
   # Using panels for each continent
   
   xyplot(LE ~ CigCon | Continent, dl, pch = 16, col = 'blue3')
        # note that each panel uses same scales so they are comparable
  
   # Creating a categorical variable from a continuous variable
        
   dl$HE = cut( dl$HealthExpPC, 4)    # uses equal width intervals NOT equal frequency
   table( dl$HE)
   dl$HEef <- cut(  dl$HealthExpPC, 
           c(-Inf, quantile(dl$HealthExpPC, c(25,50,75,100)/100, na.rm =T) ))     
   tab( dl$HEef)
   tab( dl, ~ HE + HEef)
   xyplot( HealthExpPC ~ HE, dl)    # plotting continuous ~ factor
   xyplot( HE ~ HealthExpPC, dl)    # plotting factor ~ continuous 
       # 'cut' creates equal intervals but we might prefer equal groups
   
  
   # Creating 4 equal groups
   quantile(dl$HealthExpPC, c(0,25,50,75,100)/100, na.rm=T)
   dl$HE = cut( dl$HealthExpPC, 
       quantile(dl$HealthExpPC, c(0,25,50,75,100)/100, na.rm=T), 
       include.lowest = TRUE)
   xyplot( HE ~ HealthExpPC, dl) 
   xyplot( HE ~ log(HealthExpPC), dl)      

   # Are there natural breaks :
   densityplot( ~ log(HealthExpPC), dl)
   densityplot( ~ log(HealthExpPC) | Continent, dl)
        # perhaps four 'natural' groups but we won't pursue here
   
   
   # Adding another dimension via 'groups'
   xyplot(LE ~ CigCon | Continent, dl, groups = HE)
        # not very informative
   
   # including a 'key':
   xyplot(LE ~ CigCon | Continent, dl, groups = HE, auto.key = list(columns = 2))
   
   # switching panels and groups:
   xyplot(LE ~ CigCon | HE, dl, groups = Continent, auto.key = list(columns = 2))
   xyplot(LE ~ CigCon | HE, dl, groups = Continent, auto.key = list(columns = 2),
       pch = c(16,1,2,3))
       
   td( pch = c(15,1,2,3))    # in spida -- sets new defaults
   xyplot(LE ~ CigCon | HE, dl, groups = Continent, auto.key = list(columns = 2))
   
   
   # Doing more than plotting in each panel:
   
   xyplot(LE ~ CigCon | Continent, dl, groups = HE, auto.key = list(columns = 2),
       panel = function(x,y,...) {
           panel.xyplot(x,y,...)
           panel.loess(x,y,...)
       })
       # but this fits a loess line in each panel, can we do it in each group
       
    xyplot(LE ~ CigCon | Continent, dl, groups = HE, auto.key = list(columns = 2),
       panel = panel.superpose,
       panel.groups = function(x,y,...) {
           panel.xyplot(x,y,...)
           panel.loess(x,y,...)
       })
       # doesn't work: too little good data in some groups
       
    # Let's try lmline:
        
    xyplot(LE ~ CigCon | Continent, dl, groups = HE, auto.key = list(columns = 2),
       panel = panel.superpose,
       panel.groups = function(x,y,...) {
           panel.xyplot(x,y,...)
           if( nrow(na.omit(cbind(x,y))) > 3) panel.lmline(x,y,...)
       })
   
   # Using identify with groups: problem getting panel.identify to use right labels
    
   xyplot( LE ~ CigCon | Continent, dl, groups = HE, 
      panel = function( x, y, subscripts, ...) {
         panel.superpose( x,y,subscripts,...)
         panel.identify( x,y, subscripts = subscripts, labels = dl$Country[subscripts], ...)
   })
         # see http://wiki.math.yorku.ca/index.php/R:_panel.identify
         
           
 ##
 ##
 ##  Other lattice functions
 ##  Use examples at bottom of help files
 ##     
   
   ?barchart
   
   data(barley)
   
   
   barchart(yield ~ variety | site, data = barley,
        groups = year, layout = c(1,6),
        ylab = "Barley Yield (bushels/acre)",
        scales = list(x = list(abbreviate = TRUE,
                      minlength = 5)),
        auto.key=T)
   
   barchart(yield ~ variety | site, data = barley,
        groups = year, layout = c(1,6), 
        stack = TRUE,     # stack groups 
        auto.key = list(points = FALSE, rectangles = TRUE, space = "right"),
        ylab = "Barley Yield (bushels/acre)",
        scales = list(x = list(rot = 45)))
        
   
   bwplot( LE ~ HE | Continent, dl )        
   bwplot( LE ~ HE | Continent, dl , scales = list( x = list(rot = 45)))    
   bwplot( LE ~ HE , dl , groups = Continent, scales = list( x = list(rot = 45)))
         # groups does not work!
         
     
 ###
 ###  Plotting fitted models
 ###    
   fit = lm( LE ~ (CigCon +I(CigCon^2) ) * log(HealthExpPC), dl) 
   summary(fit)
   
   summary(dl)
   
   pred = expand.grid( CigCon = seq(77,4300, 10), HealthExpPC = c(20,50,100,200,1000, 3000, 6000))
   pred$LE = predict( fit, newdata = pred)
   
   td( lwd = 2)
   xyplot( LE ~ CigCon , pred, groups = HealthExpPC,  type = 'l',
       auto.key = list( columns = 3, lines = T, points = F) )
   
     
     
 ##
 ##
 ##   Explore wireframe, dotplot and stripplot using help file and 'example'
   
   ?wireframe
   example(wireframe)   # to step through examples
                        
 ## Exercises: use Prestige:     
 ##    create categories for percentage of women
 ##    explore gender gap in income
 ##    What role does prestige play?
      


"

See also

"

Personal tools