# MATH 6627 2010-11 Practicum in Statistical Consulting/R/Graphics in R.R

Jump to: navigation, search
```#
# Please feel free to improve this file!
#

##
##  Basic (classic) plotting
##

dl = read.csv("http://www.math.yorku.ca/~georges/Data/CigLE.csv")

head(dl)
tail(dl)

rownames(dl) = dl\$Country
dl\$X = NULL

table( LE = !is.na( dl\$LE) , CigCon = !is.na(dl\$CigCon))

fit = lm( LE ~ CigCon , dl , na.action = na.omit)
summary(fit)

## Should always look at data
## 'plot' is a 'generic' function in R

plot(fit)  # diagnostics

plot(dl)   # scatterplot matrix

source("http://www.math.yorku.ca/~georges/R/fun.R")
xqplot( dl )

# ordinary plot for a scatterplot

plot( dl\$CigCon, dl\$LE )   # plot( x, y)

plot( LE ~ CigCon, dl )    # plot( y ~ x , data.frame)

attach( dl )
plot( CigCon, LE)          # plot( x, y)
plot( LE ~ CigCon)        # plot( y ~ x )
detach( dl )

plot( LE ~ CigCon, dl, pch = 16, col = 'green3')

abline( fit )
abline( fit, lwd = 2, col = 'red')

identify( dl\$CigCon, dl\$LE, dl\$Country)

plot( LE ~ CigCon, dl , pch = 16, col = as.numeric(dl\$Continent))
legend( locator(1), levels(dl\$Continent), pch = 16, col = 1:length(levels(dl\$Continent)))

# don't despair: this will be much easier later with lattice graphics
# 'car' version by John Fox

library(car)
scatterplot( LE ~ CigCon, dl)
scatterplot( LE ~ CigCon, dl, group = dl\$Continent)
scatterplot( LE ~ CigCon | Continent, dl)
scatterplot( LE ~ CigCon | Continent, dl, cex = 1.5, lwd = 1.5)

##
## Other 'basic' graphics functions
##

tt = table( dl\$Continent )
tt
barplot( tt , legend = names(tt), col = rainbow(6))
names(tt) = abbreviate(names(tt))
barplot( tt )
barplot( tt , legend = names(tt), col = heat.colors(6))

# Categorical association

dl\$LE.q = cut( dl\$LE, 4)
tt = table( "Life Expectancy"=dl\$LE.q, Continent=dl\$Continent)
tt
assocplot(tt)
colnames(tt)
colnames(tt) = abbreviate( colnames(tt))
assocplot(tt)

mosaicplot(tt, color = TRUE)
mosaicplot( t(tt) , color = TRUE )

# Conditional density plot:  how factor depends on continuous variable

# Dealing with factors and characters is one of the few vexations in R
# 'factors' are coded categorical variables created by default when
# a variable in a data frame has non-numeric data
#
# Factors have two personalities: numeric code and a character value
# Sometimes the numeric code is used to recognize an ordering
#
# Some functions use the factors' numeric code, some the character
# Most of the time all this works extremely well - as we shall see
# Sometimes, it doesn't and it's hard to figure out why.
#
# Many statistical functions like 'factors'
# Many text manipulation functions like 'characters'
# You can switch from one to the other with 'as.factor' and 'as.character'
# If levels of a factor are dropped by, e.g. subsetting, some routines will
#    get confused,
# Sometimes, you have to 'strip' a factor of dropped levels
# Here's an example: we want to abbreviate Continent

# Showing a categorical variable vs a continuous variables

dl\$Cont = factor( abbreviate( as.character( dl\$Continent) ))
# this is the 'safe' way, explicitly converting the possible factor to character
# and back to a factor

# Using car's recode function. It strays from R syntax but is intuitive and useful

dl\$Cont2 = recode( dl\$Continent,
" 'North America'='N.America';'South America'='S.America'")
# recode is smart about factors
# Note that the whole 'recode' string is in double quotes,
#    you must use single quotes inside the string.

with( dl, table(Cont, Cont2))

# If a categorical variable depends on a continuous variable:
#
#           categorical ~ continuous
#

cdplot( Cont2 ~ LE, dl, main="pretty but not useful here") # This plot would be useful is Cont depended on LE

# If continuous depends on category:
#
#           continuous ~ categorical
#

boxplot( LE ~ Cont2 , dl, main= "better since LE depends on Continent",
sub="but boxplots can miss structure in distributions, e.g. bimodality")
points ( LE ~ Cont2, dl)

identify( dl\$Cont2, dl\$LE, dl\$Country) # here we are happy that 'points' and 'identify' uses the factor's numeric code

# Let's use Continent as a moderator

fit.i = lm( LE ~ CigCon * Continent, dl)
as.numeric(dl\$Continent)

"
There are many other 'basic' plotting functions
including many other that come with various packages.

"

# Nicer scatterplot matrix:

library(car)    # by John Fox

scatterplot.matrix(dl)

#   d1 = read.csv("http://www.math.yorku.ca/~georges/Data/CigaretteConsumption.csv")
#   d2 = read.csv("http://www.math.yorku.ca/~georges/Data/CigaretteConsumptionSample.csv")
#     d3 = read.csv("http://www.math.yorku.ca/~georges/Data/CigaretteConsumptionSmall.csv")

#  dim(d1)
#  dim(d2)
#  dim(d3)
#  head( dl )

# Things you can plot
source("/R/fun.R")

xqplot(dl)

head(dl)
rownames(dl) = dl\$Country
head(dl)

plot( dl\$CigCon, dl\$LE)        # x, y
plot( LE ~ CigCon, dl)         # y ~ x, data.frame
plot( LE ~ CigCon, dl, pch = 16, col = 'blue3')
plot( LE ~ CigCon, dl, pch = 16, col = 'blue3', ylab = "Life Expectancy in 1996",
xlab = "Cigarette Consumption in 1996")

plot( LE ~ CigCon, dl, pch = 16, col = as.numeric(dl\$Cont2) , ylab = "Life Expectancy in 1996",
xlab = "Cigarette Consumption in 1996")     # here we make sure we are using the code
legend( locator(1), levels(dl\$Cont2), col = 1:length(levels(dl\$Cont2)), pch = 16)
# Could show colours for Continents
# fit a regression and plot the line

fit = lm( LE ~ CigCon, dl)
abline(fit)
abline(fit, col = 'red', lwd = 2, lty = 2)   # see ?par for information on options

# plotting a fitted line for a more general model

fit2 = lm( LE ~ CigCon + I(CigCon^2) , dl)

# create a 'predictor data frame with predictor values for which you want a plot

pred = expand.grid(  CigCon =  seq(0,4000,10))   # we use expand.grid in anticipation of bigger things
pred\$LE.quad = predict( fit2, newdata = pred)
head(pred)

lines( LE.quad ~ CigCon, pred , lwd = 2, lty = 3)

library(car)
scatterplot.matrix( dl)

##
##
##

dc = read.csv("http://www.math.yorku.ca/~georges/Data/HealthExpPerCap.csv")

dc\$Country %less% dl\$Country

dl\$Country %less% dc\$Country

?match
dl\$Country = tran(
c(
"Bahamas, The",
#"Cook Islands",
"Gambia, The",
"Macedonia, Republic of",
"Micronesia, Federated States of",
# "Nauru",
# "Niue",
"Sao Tom and Prncipe",
"Serbia and Montenegro"), #"Tuvalu"
c(
"Bahamas", # "Brunei"                         "Congo, Republic of the"         "East Timor"
"The Gambia",
"Macedonia" ,
"Federated States of Micronesia",
"Sao Tome and Principe",
"Serbia"),
dl\$Country)

dm = merge( dl, dc, by = 'Country', all = T)
dim(dm)
dim(dl)
dim(dc)

head(dm)
rownames(dm) = dm\$Country

library( p3d )
Plot3d( LE ~ CigCon + HealthExpPC | Continent, dm)

I3d()
Axes3d()

fit = lm( LE ~ CigCon + HealthExpPC +I(CigCon^2) + I(HealthExpPC^2) + I(CigCon*HealthExpPC), dm)
Fit3d( fit )

fit2 = lm( LE ~ Continent*(CigCon + HealthExpPC +I(CigCon^2) + I(HealthExpPC^2) + I(CigCon*HealthExpPC)), dm)
Fit3d( fit2 )  # who says that art has to have a meaning

###
###  Lattice graphics
###  Developed as 'Trellis' graphics by Bill Cleveland in early 80s
###  Extension of 'coplot'
###  Allows more than two dimensions
###  Through
###    1) panels
###    2) groups
###  More challenging to add things than for 'basic' graphics
###  e.g. "identify" more awkward but recently made easier

library(lattice)

"

Major lattice functions:
Univariate:
barchart
bwplot
densityplot
dotplot
histogram
qqmath   qq plot to compare data with hypothetical quantiles
stripplot

Bivariate:
qq  qq plot to compare 2 distributions
xyplot  scatterplot and more

Trivariate:
levelplot
contourplot
cloud
wireframe

Hypervariate:
splot
parallel

Other:
rfs   residual and fitted value plot( also see oneway )
tmd   Tukey mean difference plot

"

xyplot(LE ~ CigCon , dm, pch = 16, col = 'blue3')

# Using panels for each continent

xyplot(LE ~ CigCon | Continent, dm, pch = 16, col = 'blue3')
# note that each panel uses same scales so they are comparable

# Creating a categorical variable from a continuous variable

dm\$HE = cut( dm\$HealthExpPC, 4)
table( dm\$HE)
xyplot( HealthExpPC ~ HE, dm)    # plotting continuous ~ factor
xyplot( HE ~ HealthExpPC, dm)    # plotting factor ~ continuous
# 'cut' creates equal intervals but we might prefer equal groups

# Creating 4 equal groups
quantile(dm\$HealthExpPC, c(0,25,50,75,100)/100, na.rm=T)

dm\$HE = cut( dm\$HealthExpPC,
quantile(dm\$HealthExpPC, c(0,25,50,75,100)/100, na.rm=T),
include.lowest = TRUE)
xyplot( HE ~ HealthExpPC, dm)
xyplot( HE ~ log(HealthExpPC), dm)

# Are there natural breaks :
densityplot( ~ log(HealthExpPC), dm)
densityplot( ~ log(HealthExpPC) | Continent, dm)
# perhaps four 'natural' groups but we won't pursue here

# Adding another dimension via 'groups'

xyplot(LE ~ CigCon | Continent, dm, groups = HE)
# not very informative

# including a 'key':
xyplot(LE ~ CigCon | Continent, dm, groups = HE, auto.key = list(columns = 2))

# switching panels and groups:

xyplot(LE ~ CigCon | HE, dm, groups = Continent, auto.key = list(columns = 2))

xyplot(LE ~ CigCon | HE, dm, groups = Continent, auto.key = list(columns = 2),
pch = c(16,1,2,3))

td( pch = c(15,1,2,3))    # in fun.R -- sets new defaults
xyplot(LE ~ CigCon | HE, dm, groups = Continent, auto.key = list(columns = 2))

# Doing more than plotting in each panel:

xyplot(LE ~ CigCon | Continent, dm, groups = HE, auto.key = list(columns = 2),
panel = function(x,y,...) {
panel.xyplot(x,y,...)
panel.loess(x,y,...)
})
# but this fits a loess line in each panel, can we do it in each group

xyplot(LE ~ CigCon | Continent, dm, groups = HE, auto.key = list(columns = 2),
panel = panel.superpose,
panel.groups = function(x,y,...) {
panel.xyplot(x,y,...)
panel.loess(x,y,...)
})
# doesn't work: too little good data in some groups

# Let's try lmline:

xyplot(LE ~ CigCon | Continent, dm, groups = HE, auto.key = list(columns = 2),
panel = panel.superpose,
panel.groups = function(x,y,...) {
panel.xyplot(x,y,...)
if( nrow(na.omit(cbind(x,y))) > 3) panel.lmline(x,y,...)
})

# Using identify with groups: problem getting panel.identify to use right labels

xyplot( LE ~ CigCon | Continent, dm, groups = HE,
panel = function( x, y, subscripts, ...) {
panel.superpose( x,y,subscripts,...)
panel.identify( x,y, subscripts = subscripts, labels = dm\$Country[subscripts], ...)
})
# see http://wiki.math.yorku.ca/index.php/R:_panel.identify

##
##
##  Other lattice functions
##  Use examples at bottom of help files
##

?barchart

data(barley)

barchart(yield ~ variety | site, data = barley,
groups = year, layout = c(1,6),
ylab = "Barley Yield (bushels/acre)",
scales = list(x = list(abbreviate = TRUE,
minlength = 5)),
auto.key=T)

barchart(yield ~ variety | site, data = barley,
groups = year, layout = c(1,6),
stack = TRUE,     # stack groups
auto.key = list(points = FALSE, rectangles = TRUE, space = "right"),
ylab = "Barley Yield (bushels/acre)",
scales = list(x = list(rot = 45)))

bwplot( LE ~ HE | Continent, dm )
bwplot( LE ~ HE | Continent, dm , scales = list( x = list(rot = 45)))
bwplot( LE ~ HE , dm , groups = Continent, scales = list( x = list(rot = 45)))
# groups does not work!

###
###  Plotting fitted models
###

fit = lm( LE ~ (CigCon +I(CigCon^2) ) * log(HealthExpPC), dm)
summary(fit)

summary(dm)

pred = expand.grid( CigCon = seq(77,4300, 10), HealthExpPC = c(20,50,100,200,1000, 3000, 6000))
pred\$LE = predict( fit, newdata = pred)

td( lwd = 2)
xyplot( LE ~ CigCon , pred, groups = HealthExpPC,  type = 'l',
auto.key = list( columns = 3, lines = T, points = F) )

##
##
##   Explore dotplot and stripplot using help file
##   Explore Prestige data set

library(car)
data(Prestige)

xqplot( Prestige )
?Prestige

# might create categories for percentage of women
# explore gender gap in income
# What role does prestige play?

##
##```