# A Tour of Graphics in R

### From Wiki1

" SCS 2011: Statistical Analysis and Programming with R

- September-October 2011

- A Guided Tour of Graphics in R

## Contents |

## Install packages (if needed)

- See [installing on a Mac]

"

install.packages(c('car','Hmisc','rgl')) download.file("http://blackwell.math.yorku.ca/R/spida.zip", "spida.zip") download.file("http://blackwell.math.yorku.ca/R/p3d.zip", "p3d.zip") install.packages("spida.zip", repos = NULL) install.packages("p3d.zip", repos = NULL)

"

## Load packages

"

library(car) library(spida) library(p3d)

"

## Overview

R (previously S and S-Plus) have evolved through a number of stages. Each stage sees new layers. This is particularly true in graphics.

- Classic (traditional) graphics
- Intuitive, easy to use, easy to build a a graph in separate steps
- Recently improved to incorporate fairly consistent formula interface

- Lattice (=Trellis in S) graphics
- Developed in 2nd decade to handle 'conditional' plotting
- Excellent exploratory tools
- Harder to control and to add to graph
- Includes static 3d

- rgl (r graphics language built on 'Open GL')
- 3d rotation and limited interaction
- rgl is used with p3d interface to visualize models

- ggplot2 -- based on Grammar of Graphics by Leland Wilkinson

Strength of R graphics:

- Very easy to use for exploration and diagnostics while working
- Can turn into presentation/publication quality graphics

In this tour, we focus on the first two: traditional and lattice and see a bit of p3d using rgl. "

library(lattice) # to use lattice graphics (note that spida loads it automatically) library(p3d) library(spida)

"

## Basic (classic) plotting

"

data( Smoking ) # from p3d dl <- Smoking head(dl) tail(dl) rownames(dl) = dl$Country # must be unique tab(dl, ~ is.na(LE)+is.na(CigCon)) fit = lm( LE ~ CigCon , dl , na.action = na.omit) summary(fit) ## Should always look at data

plot( Smoking) plot( fit )

## 'plot' is a 'generic' function in R ## what it does depends on the type of 'object' it is applied to

plot print # like print

## The 'type' of object is the 'class' of the object class( Smoking ) class( fit ) ## When you apply 'plot' to a 'data.frame' the ## generic function 'plot' ## 'dispatches' ## the 'method' ## plot.data.frame plot.data.frame ## Applied to a `lm` object, the method 'plot.lm' plot.lm

plot(fit) # uses plot.lm and produces regression diagnostics names(dl) plot(dl[c('LE','CigCon','HealthExpPC')]) # scatterplot matrix using plot.data.frame xqplot( dl ) # from spida # ordinary plot for a scatterplot plot( dl$CigCon, dl$LE ) # plot( x, y) uses default: plot.default plot( LE ~ CigCon, dl ) # plot( y ~ x , data.frame) uses plot.formula attach( dl ) plot( CigCon, LE, main = 'plot(x,y) with attach' ) # plot( x, y) plot( LE ~ CigCon , sub = 'plot(y ~ x) with attach') # plot( y ~ x ) detach( dl ) # Using a formula interface: # formula evaluated in a data frame: (like 'lm') # 'mature' functions tend to use formula interfaces when feasible # new -- and early -- functions tend not to plot( LE ~ CigCon, dl, pch = 16, col = 'green3') abline( fit ) # adding elements to a fit # note abline is fake generic abline abline( fit, lwd = 2, col = 'red') abline( h=50 ) identify( dl$CigCon, dl$LE, dl$Country) # RStudio bug: labels only show at end. Likely to be corrected

plot( LE ~ CigCon, dl , pch = Continent, col = Continent) # note: some functions with formula # interfaces only evaluate the formula # in the data frame # An important skill in R is being good # at guessing how/why things don't always # work as they should

# OOPS! something didn't work! Can we guess why? plot( LE ~ CigCon, dl , pch = as.numeric(Continent), col = Continent) # why do we need to use 'as.numeric' for pch and NOT for col? # Because the argument for col is used as an index to a vector # of colors. When a factor is used as an index, the **codes** # are automatically used. For 'pch', plot is using the number # directly, NOT as an index. How would you know all this?? # You wouldn't! You can alway play it safe and coerce your factors # to numeric explicitly: plot( LE ~ CigCon, dl , pch = as.numeric(Continent), col = as.numeric(Continent)) ?legend legend( locator(1), levels(dl$Continent), pch = 1:length(levels(dl$Continent)), col = 1:length(levels(dl$Continent))) # plotting a fitted line for a general model

plot( LE ~ CigCon, dl , pch = as.numeric(Continent), col = as.numeric(Continent), lwd =2) legend( locator(1), levels(dl$Continent), ncol = 2, lwd = 2, pch = 1:length(levels(dl$Continent)), lty = 1:length(levels(dl$Continent)), col = 1:length(levels(dl$Continent))) fit2 <- lm( LE ~ Continent*(CigCon + I(CigCon^2)) , dl) summary(fit2) # create a 'predictor data frame with predictor values for which you want a plot pred <- expand.grid( CigCon = seq(0,5000,100), Continent = levels(dl$Continent)) pred$LE.quad = predict( fit2, newdata = pred) head(pred) lapply( levels(dl$Continent), function(x) { lines( LE.quad ~ CigCon, subset(pred, Continent == x), col = Continent, lty = as.numeric(Continent), lwd = 2) } )

- This example is where we cash in on the greater complexity of factors over character variables
- R remembers that the numeric code for 'Europe' is 4 even though it's the only level

- Plot3d automates multiple fits but doesn't have the flexibility of basic graphics

Init3d( cex = 1, font = 2) Plot3d( LE ~ CigCon + HealthExpPC | Continent, dl) Fit3d( fit2) Id3d() Plot3d( LE ~ CigCon + HealthExpPC | Continent, subset(dl, Continent != "Australia")) Ell3d() # Scatterplots in car

library(car) scatterplot( LE ~ CigCon, dl) scatterplot( LE ~ CigCon, dl, group = Continent) scatterplot( LE ~ CigCon, dl, group = dl$Continent) scatterplot( LE ~ CigCon | Continent, dl) scatterplot( LE ~ CigCon | Continent, dl, cex = 1.5, lwd = 1.5) ?scatterplot

"

## Basic graphics for association

### Categorical by categorical

"

tt = table( dl$Continent ) tt

barplot( tt , legend = names(tt), col = rainbow(6)) ttab <- tt names(ttab) <- abbreviate(names(tt)) barplot( ttab ) barplot( ttab , legend = names(tt), col = heat.colors(6)) # Categorical association dl$LE.q <- cut( dl$LE, 4) tt <- table( "Life Expectancy"=dl$LE.q, Continent=dl$Continent) tt assocplot(tt) colnames(tt) colnames(tt) <- abbreviate( colnames(tt)) assocplot(tt) mosaicplot(tt, color = TRUE) # shows 2nd var as proportion | first var mosaicplot( t(tt) , color = TRUE ) mosaicplot( t(tt[rev(1:nrow(tt)),]) , color = TRUE , main = "National life expectancies") # Exercise: Improve by using better cutpoints and labels for Life Expectancy

"

### Categorical by continuous numerical

#### Working with factors

"

# Conditional density plot: how factor depends on continuous variable # Dealing with factors and characters is one of the few vexations in R # 'factors' are coded categorical variables created by default when # a variable in a data frame has non-numeric data # # Factors have two personalities: numeric code and a character value # Sometimes the numeric code is used to recognize an ordering # # Some functions use the factors numeric code, some the character # Most of the time all this works extremely well - as we shall see # Sometimes, it doesn't and it's hard to figure out why. # # Many statistical functions like 'factors' # Many text manipulation functions like 'characters' # You can switch from one to the other with 'as.factor' and 'as.character' # If levels of a factor are dropped by, e.g. subsetting, some routines will # get confused, # Sometimes, you have to 'strip' a factor of dropped levels # Here's an example: we want to abbreviate Continent # Showing a categorical variable vs a continuous variables dl$Cont = factor( abbreviate( as.character( dl$Continent) )) # this is the 'safe' way, explicitly converting the possible factor to character # and back to a factor # Using car's recode function. It strays from R syntax but is intuitive and useful dl$Cont2 = recode( dl$Continent, " 'North America'='N.America';'South America'='S.America'")

# recode is smart about factors # Note that the whole 'recode' string is in double quotes, # you must use single quotes inside the string. with( dl, table(Cont, Cont2)) # If a categorical variable depends on a continuous variable: # # categorical ~ continuous # cdplot( Cont2 ~ LE, dl, main="pretty but not useful here") # This plot would be useful is Cont depended on LE # If continuous depends on category: # # continuous ~ categorical # boxplot( LE ~ Cont2 , dl, main= "better since LE depends on Continent", sub="but boxplots can miss structure in distributions, e.g. bimodality") # warning: I've been told that boxplot often engender violent opposition points ( LE ~ Cont2, dl) # 'points' and 'lines' add to existing plot identify( dl$Cont2, dl$LE, dl$Country) # here we are happy that 'points' and 'identify' uses the factor's numeric code

" There are many other 'basic' plotting functions including many other that come with various packages.

## Lattice graphics

- Developed as 'Trellis' graphics by Bill Cleveland in early 80s
- Extension of 'coplot' (conditional plot)
- Allows more than two dimensions through
- 1) panels
- 2) groups

- More challenging to add things than for 'basic' graphics
- e.g. 'identify' more awkward but recently made easier

"

library(lattice) # not loaded by default although library(spida) loads it

"

- Major lattice functions:
- Univariate:
- barchart
- bwplot
- densityplot
- dotplot
- histogram
- qqmath # qq plot to compare data with hypothetical quantiles
- stripplot

- Bivariate:
- xyplot scatterplot and more
- qq qq plot to compare 2 distributions

- Trivariate:
- xyplot with groups OR '|'
- levelplot
- contourplot
- cloud
- wireframe

- Hypervariate:
- xyplot with groups AND '|'
- splot
- parallel

- Other:
- rfs residual and fitted value plot( also see oneway )
- tmd Tukey mean difference plot
- many others

- Univariate:

"

xyplot(LE ~ CigCon , dl, pch = 16, col = 'blue3') # generally use formula interface # Using panels for each continent xyplot(LE ~ CigCon | Continent, dl, pch = 16, col = 'blue3') # note that each panel uses same scales so they are comparable # Creating a categorical variable from a continuous variable dl$HE = cut( dl$HealthExpPC, 4) # uses equal width intervals NOT equal frequency table( dl$HE) dl$HEef <- cut( dl$HealthExpPC, c(-Inf, quantile(dl$HealthExpPC, c(25,50,75,100)/100, na.rm =T) )) tab( dl$HEef) tab( dl, ~ HE + HEef) xyplot( HealthExpPC ~ HE, dl) # plotting continuous ~ factor xyplot( HE ~ HealthExpPC, dl) # plotting factor ~ continuous # 'cut' creates equal intervals but we might prefer equal groups # Creating 4 equal groups quantile(dl$HealthExpPC, c(0,25,50,75,100)/100, na.rm=T)

dl$HE = cut( dl$HealthExpPC, quantile(dl$HealthExpPC, c(0,25,50,75,100)/100, na.rm=T), include.lowest = TRUE) xyplot( HE ~ HealthExpPC, dl) xyplot( HE ~ log(HealthExpPC), dl) # Are there natural breaks : densityplot( ~ log(HealthExpPC), dl) densityplot( ~ log(HealthExpPC) | Continent, dl) # perhaps four 'natural' groups but we won't pursue here # Adding another dimension via 'groups'

xyplot(LE ~ CigCon | Continent, dl, groups = HE) # not very informative # including a 'key': xyplot(LE ~ CigCon | Continent, dl, groups = HE, auto.key = list(columns = 2)) # switching panels and groups:

xyplot(LE ~ CigCon | HE, dl, groups = Continent, auto.key = list(columns = 2))

xyplot(LE ~ CigCon | HE, dl, groups = Continent, auto.key = list(columns = 2), pch = c(16,1,2,3)) td( pch = c(15,1,2,3)) # in spida -- sets new defaults xyplot(LE ~ CigCon | HE, dl, groups = Continent, auto.key = list(columns = 2)) # Doing more than plotting in each panel: xyplot(LE ~ CigCon | Continent, dl, groups = HE, auto.key = list(columns = 2), panel = function(x,y,...) { panel.xyplot(x,y,...) panel.loess(x,y,...) }) # but this fits a loess line in each panel, can we do it in each group xyplot(LE ~ CigCon | Continent, dl, groups = HE, auto.key = list(columns = 2), panel = panel.superpose, panel.groups = function(x,y,...) { panel.xyplot(x,y,...) panel.loess(x,y,...) }) # doesn't work: too little good data in some groups # Let's try lmline: xyplot(LE ~ CigCon | Continent, dl, groups = HE, auto.key = list(columns = 2), panel = panel.superpose, panel.groups = function(x,y,...) { panel.xyplot(x,y,...) if( nrow(na.omit(cbind(x,y))) > 3) panel.lmline(x,y,...) }) # Using identify with groups: problem getting panel.identify to use right labels xyplot( LE ~ CigCon | Continent, dl, groups = HE, panel = function( x, y, subscripts, ...) { panel.superpose( x,y,subscripts,...) panel.identify( x,y, subscripts = subscripts, labels = dl$Country[subscripts], ...) }) # see http://wiki.math.yorku.ca/index.php/R:_panel.identify ## ## ## Other lattice functions ## Use examples at bottom of help files ## ?barchart data(barley) barchart(yield ~ variety | site, data = barley, groups = year, layout = c(1,6), ylab = "Barley Yield (bushels/acre)", scales = list(x = list(abbreviate = TRUE, minlength = 5)), auto.key=T) barchart(yield ~ variety | site, data = barley, groups = year, layout = c(1,6), stack = TRUE, # stack groups auto.key = list(points = FALSE, rectangles = TRUE, space = "right"), ylab = "Barley Yield (bushels/acre)", scales = list(x = list(rot = 45))) bwplot( LE ~ HE | Continent, dl ) bwplot( LE ~ HE | Continent, dl , scales = list( x = list(rot = 45))) bwplot( LE ~ HE , dl , groups = Continent, scales = list( x = list(rot = 45))) # groups does not work! ### ### Plotting fitted models ###

fit = lm( LE ~ (CigCon +I(CigCon^2) ) * log(HealthExpPC), dl) summary(fit) summary(dl) pred = expand.grid( CigCon = seq(77,4300, 10), HealthExpPC = c(20,50,100,200,1000, 3000, 6000)) pred$LE = predict( fit, newdata = pred) td( lwd = 2) xyplot( LE ~ CigCon , pred, groups = HealthExpPC, type = 'l', auto.key = list( columns = 3, lines = T, points = F) ) ## ## ## Explore wireframe, dotplot and stripplot using help file and 'example' ?wireframe example(wireframe) # to step through examples ## Exercises: use Prestige: ## create categories for percentage of women ## explore gender gap in income ## What role does prestige play?

"

## See also

- Let Graphics Tell the Story
- Producing simple graphs with R
- R Graphics by Paul Murrell
- Quick R - Basic Graphs
- Graphics with R by Development Core Group
- Graph Gallery
- ggplot2

"