MATH 6627 2010-11 Practicum in Statistical Consulting/R/Graphics in R.R

From Wiki1

Jump to: navigation, search
#
# Please feel free to improve this file!
#


    ##
    ##  Basic (classic) plotting
    ##
    
    
       dl = read.csv("http://www.math.yorku.ca/~georges/Data/CigLE.csv")
        
       head(dl)
       tail(dl)
        
       rownames(dl) = dl$Country
       dl$X = NULL
        
       table( LE = !is.na( dl$LE) , CigCon = !is.na(dl$CigCon))
        
       fit = lm( LE ~ CigCon , dl , na.action = na.omit)
       summary(fit)    
        
        
       ## Should always look at data
       ## 'plot' is a 'generic' function in R
        
       plot(fit)  # diagnostics
        
       plot(dl)   # scatterplot matrix
    
       source("http://www.math.yorku.ca/~georges/R/fun.R")
       xqplot( dl )    
       
       # ordinary plot for a scatterplot
    
       plot( dl$CigCon, dl$LE )   # plot( x, y)  
       
       plot( LE ~ CigCon, dl )    # plot( y ~ x , data.frame)
    
       attach( dl )
       plot( CigCon, LE)          # plot( x, y)
       plot( LE ~ CigCon)        # plot( y ~ x )
       detach( dl ) 
    
       plot( LE ~ CigCon, dl, pch = 16, col = 'green3') 
       
       abline( fit ) 
       abline( fit, lwd = 2, col = 'red')    
       
       identify( dl$CigCon, dl$LE, dl$Country)
       
       plot( LE ~ CigCon, dl , pch = 16, col = as.numeric(dl$Continent))
       legend( locator(1), levels(dl$Continent), pch = 16, col = 1:length(levels(dl$Continent)))
      
       
       # don't despair: this will be much easier later with lattice graphics
       # 'car' version by John Fox
       
       library(car)
       scatterplot( LE ~ CigCon, dl)   
       scatterplot( LE ~ CigCon, dl, group = dl$Continent)   
       scatterplot( LE ~ CigCon | Continent, dl)   
       scatterplot( LE ~ CigCon | Continent, dl, cex = 1.5, lwd = 1.5)   
       
    ##
    ## Other 'basic' graphics functions
    ##   
    
       tt = table( dl$Continent )
       tt
       barplot( tt , legend = names(tt), col = rainbow(6))
       names(tt) = abbreviate(names(tt))
       barplot( tt )
       barplot( tt , legend = names(tt), col = heat.colors(6))
       
    # Categorical association
    
       dl$LE.q = cut( dl$LE, 4)
       tt = table( "Life Expectancy"=dl$LE.q, Continent=dl$Continent)
       tt   
       assocplot(tt)
       colnames(tt)
       colnames(tt) = abbreviate( colnames(tt))    
       assocplot(tt) 
    
       mosaicplot(tt, color = TRUE)
       mosaicplot( t(tt) , color = TRUE )
    
    # Conditional density plot:  how factor depends on continuous variable
    
    # Dealing with factors and characters is one of the few vexations in R
    # 'factors' are coded categorical variables created by default when
    # a variable in a data frame has non-numeric data
    #
    # Factors have two personalities: numeric code and a character value
    # Sometimes the numeric code is used to recognize an ordering
    #
    # Some functions use the factors' numeric code, some the character
    # Most of the time all this works extremely well - as we shall see
    # Sometimes, it doesn't and it's hard to figure out why.
    #
    # Many statistical functions like 'factors'
    # Many text manipulation functions like 'characters'
    # You can switch from one to the other with 'as.factor' and 'as.character'
    # If levels of a factor are dropped by, e.g. subsetting, some routines will
    #    get confused,
    # Sometimes, you have to 'strip' a factor of dropped levels
    # Here's an example: we want to abbreviate Continent
    
    
    # Showing a categorical variable vs a continuous variables
    
    
      dl$Cont = factor( abbreviate( as.character( dl$Continent) ))   
          # this is the 'safe' way, explicitly converting the possible factor to character
          # and back to a factor
      
      
      # Using car's recode function. It strays from R syntax but is intuitive and useful
      
      dl$Cont2 = recode( dl$Continent, 
          " 'North America'='N.America';'South America'='S.America'")  
            # recode is smart about factors 
            # Note that the whole 'recode' string is in double quotes, 
            #    you must use single quotes inside the string. 
    
      with( dl, table(Cont, Cont2)) 
    
    # If a categorical variable depends on a continuous variable:
    #
    #           categorical ~ continuous
    #
    
      cdplot( Cont2 ~ LE, dl, main="pretty but not useful here") # This plot would be useful is Cont depended on LE
    
    # If continuous depends on category:
    #
    #           continuous ~ categorical
    #
      
      boxplot( LE ~ Cont2 , dl, main= "better since LE depends on Continent",
            sub="but boxplots can miss structure in distributions, e.g. bimodality") 
      points ( LE ~ Cont2, dl)
     
      identify( dl$Cont2, dl$LE, dl$Country) # here we are happy that 'points' and 'identify' uses the factor's numeric code
     
    # Let's use Continent as a moderator
    
    
      fit.i = lm( LE ~ CigCon * Continent, dl)
      as.numeric(dl$Continent)
    
    
    "
        There are many other 'basic' plotting functions
        including many other that come with various packages.
    
    "
           
    
    # Nicer scatterplot matrix:
    
    library(car)    # by John Fox


    scatterplot.matrix(dl)
    
        
    #   d1 = read.csv("http://www.math.yorku.ca/~georges/Data/CigaretteConsumption.csv")
    #   d2 = read.csv("http://www.math.yorku.ca/~georges/Data/CigaretteConsumptionSample.csv")
    #     d3 = read.csv("http://www.math.yorku.ca/~georges/Data/CigaretteConsumptionSmall.csv")
          
    #  dim(d1)
    #  dim(d2)      
    #  dim(d3)
    #  head( dl )
    
    # Things you can plot
    source("/R/fun.R")
    
    xqplot(dl)      
    
    head(dl)
    rownames(dl) = dl$Country
    head(dl)
      
      
      
      
      
      
      plot( dl$CigCon, dl$LE)        # x, y
      plot( LE ~ CigCon, dl)         # y ~ x, data.frame
      plot( LE ~ CigCon, dl, pch = 16, col = 'blue3')
      plot( LE ~ CigCon, dl, pch = 16, col = 'blue3', ylab = "Life Expectancy in 1996", 
            xlab = "Cigarette Consumption in 1996")
    
      plot( LE ~ CigCon, dl, pch = 16, col = as.numeric(dl$Cont2) , ylab = "Life Expectancy in 1996", 
            xlab = "Cigarette Consumption in 1996")     # here we make sure we are using the code
      legend( locator(1), levels(dl$Cont2), col = 1:length(levels(dl$Cont2)), pch = 16)
      # Could show colours for Continents
      # fit a regression and plot the line
      
      fit = lm( LE ~ CigCon, dl)
      abline(fit) 
      abline(fit, col = 'red', lwd = 2, lty = 2)   # see ?par for information on options
    
      # plotting a fitted line for a more general model
      
      fit2 = lm( LE ~ CigCon + I(CigCon^2) , dl)
    
      # create a 'predictor data frame with predictor values for which you want a plot
    
      pred = expand.grid(  CigCon =  seq(0,4000,10))   # we use expand.grid in anticipation of bigger things
      pred$LE.quad = predict( fit2, newdata = pred)
      head(pred)
      
      lines( LE.quad ~ CigCon, pred , lwd = 2, lty = 3) 
      
      
      library(car)
      scatterplot.matrix( dl)
     
    ##
    ##
    ##
      
    
    
      dc = read.csv("http://www.math.yorku.ca/~georges/Data/HealthExpPerCap.csv")
    
      dc$Country %less% dl$Country
      
      dl$Country %less% dc$Country
      
      ?match
      dl$Country = tran(
         c( 
          "Bahamas, The", 
          #"Cook Islands",
          "Gambia, The", 
          "Macedonia, Republic of",         
          "Micronesia, Federated States of", 
          # "Nauru",
          # "Niue", 
          "Sao Tom and Prncipe",            
          "Serbia and Montenegro"), #"Tuvalu" 
         c( 
          "Bahamas", # "Brunei"                         "Congo, Republic of the"         "East Timor"                    
          "The Gambia",                     
          "Macedonia" ,                     
          "Federated States of Micronesia", 
          "Sao Tome and Principe",         
          "Serbia"),
         dl$Country)            
      
    
        dm = merge( dl, dc, by = 'Country', all = T)  
        dim(dm)
        dim(dl)
        dim(dc)  
    
        head(dm)
        rownames(dm) = dm$Country
        
        library( p3d )
        Plot3d( LE ~ CigCon + HealthExpPC | Continent, dm) 
    
        I3d()
        Axes3d()
        
        fit = lm( LE ~ CigCon + HealthExpPC +I(CigCon^2) + I(HealthExpPC^2) + I(CigCon*HealthExpPC), dm)
        Fit3d( fit )
        
        fit2 = lm( LE ~ Continent*(CigCon + HealthExpPC +I(CigCon^2) + I(HealthExpPC^2) + I(CigCon*HealthExpPC)), dm)
        Fit3d( fit2 )  # who says that art has to have a meaning
    
    
    ###
    ###  Lattice graphics
    ###  Developed as 'Trellis' graphics by Bill Cleveland in early 80s
    ###  Extension of 'coplot'
    ###  Allows more than two dimensions
    ###  Through 
    ###    1) panels
    ###    2) groups
    ###  More challenging to add things than for 'basic' graphics
    ###  e.g. "identify" more awkward but recently made easier
    
    
    library(lattice)    
    
    "
    
    Major lattice functions:
      Univariate:
         barchart
         bwplot
         densityplot
         dotplot
         histogram
         qqmath   qq plot to compare data with hypothetical quantiles
         stripplot
      
      Bivariate:
        qq  qq plot to compare 2 distributions
        xyplot  scatterplot and more
        
      Trivariate:
        levelplot
        contourplot
        cloud
        wireframe
    
      Hypervariate:
        splot
        parallel
    
      Other:
        rfs   residual and fitted value plot( also see oneway )
        tmd   Tukey mean difference plot
        
    "    
 
    xyplot(LE ~ CigCon , dm, pch = 16, col = 'blue3')
    
    
    # Using panels for each continent
    
    xyplot(LE ~ CigCon | Continent, dm, pch = 16, col = 'blue3')
         # note that each panel uses same scales so they are comparable
   
    # Creating a categorical variable from a continuous variable
         
    dm$HE = cut( dm$HealthExpPC, 4)   
    table( dm$HE)
    xyplot( HealthExpPC ~ HE, dm)    # plotting continuous ~ factor
    xyplot( HE ~ HealthExpPC, dm)    # plotting factor ~ continuous 
        # 'cut' creates equal intervals but we might prefer equal groups
    
   
    # Creating 4 equal groups
    quantile(dm$HealthExpPC, c(0,25,50,75,100)/100, na.rm=T)

    dm$HE = cut( dm$HealthExpPC, 
        quantile(dm$HealthExpPC, c(0,25,50,75,100)/100, na.rm=T), 
        include.lowest = TRUE)
    xyplot( HE ~ HealthExpPC, dm) 
    xyplot( HE ~ log(HealthExpPC), dm)      
 
    # Are there natural breaks :
    densityplot( ~ log(HealthExpPC), dm)
    densityplot( ~ log(HealthExpPC) | Continent, dm)
         # perhaps four 'natural' groups but we won't pursue here
    
    
    # Adding another dimension via 'groups'

    xyplot(LE ~ CigCon | Continent, dm, groups = HE)
         # not very informative
    
    # including a 'key':
    xyplot(LE ~ CigCon | Continent, dm, groups = HE, auto.key = list(columns = 2))
    
    # switching panels and groups:

    xyplot(LE ~ CigCon | HE, dm, groups = Continent, auto.key = list(columns = 2))

    xyplot(LE ~ CigCon | HE, dm, groups = Continent, auto.key = list(columns = 2),
        pch = c(16,1,2,3))
        
    td( pch = c(15,1,2,3))    # in fun.R -- sets new defaults
    xyplot(LE ~ CigCon | HE, dm, groups = Continent, auto.key = list(columns = 2))
    
    
    # Doing more than plotting in each panel:
    
    xyplot(LE ~ CigCon | Continent, dm, groups = HE, auto.key = list(columns = 2),
        panel = function(x,y,...) {
            panel.xyplot(x,y,...)
            panel.loess(x,y,...)
        })
        # but this fits a loess line in each panel, can we do it in each group
        
     xyplot(LE ~ CigCon | Continent, dm, groups = HE, auto.key = list(columns = 2),
        panel = panel.superpose,
        panel.groups = function(x,y,...) {
            panel.xyplot(x,y,...)
            panel.loess(x,y,...)
        })
        # doesn't work: too little good data in some groups
        
     # Let's try lmline:
         
     xyplot(LE ~ CigCon | Continent, dm, groups = HE, auto.key = list(columns = 2),
        panel = panel.superpose,
        panel.groups = function(x,y,...) {
            panel.xyplot(x,y,...)
            if( nrow(na.omit(cbind(x,y))) > 3) panel.lmline(x,y,...)
        })
    
    # Using identify with groups: problem getting panel.identify to use right labels
     
    xyplot( LE ~ CigCon | Continent, dm, groups = HE, 
       panel = function( x, y, subscripts, ...) {
          panel.superpose( x,y,subscripts,...)
          panel.identify( x,y, subscripts = subscripts, labels = dm$Country[subscripts], ...)
    })
          # see http://wiki.math.yorku.ca/index.php/R:_panel.identify
          
          
##
##
##  Other lattice functions
##  Use examples at bottom of help files
##     
    
    ?barchart
    
    data(barley)
    
    
    barchart(yield ~ variety | site, data = barley,
         groups = year, layout = c(1,6),
         ylab = "Barley Yield (bushels/acre)",
         scales = list(x = list(abbreviate = TRUE,
                       minlength = 5)),
         auto.key=T)
    
    barchart(yield ~ variety | site, data = barley,
         groups = year, layout = c(1,6), 
         stack = TRUE,     # stack groups 
         auto.key = list(points = FALSE, rectangles = TRUE, space = "right"),
         ylab = "Barley Yield (bushels/acre)",
         scales = list(x = list(rot = 45)))
         
    
    bwplot( LE ~ HE | Continent, dm )        
    bwplot( LE ~ HE | Continent, dm , scales = list( x = list(rot = 45)))    
    bwplot( LE ~ HE , dm , groups = Continent, scales = list( x = list(rot = 45)))
          # groups does not work!
          
    
###
###  Plotting fitted models
###    

    fit = lm( LE ~ (CigCon +I(CigCon^2) ) * log(HealthExpPC), dm) 
    summary(fit)
    
    summary(dm)
    
    pred = expand.grid( CigCon = seq(77,4300, 10), HealthExpPC = c(20,50,100,200,1000, 3000, 6000))
    pred$LE = predict( fit, newdata = pred)
    
    td( lwd = 2)
    xyplot( LE ~ CigCon , pred, groups = HealthExpPC,  type = 'l',
        auto.key = list( columns = 3, lines = T, points = F) )
    
    
    
##
##
##   Explore dotplot and stripplot using help file
##   Explore Prestige data set

     library(car)
     data(Prestige)  

     xqplot( Prestige )
     ?Prestige
     
     # might create categories for percentage of women
     # explore gender gap in income
     # What role does prestige play?
     
##
##
Personal tools