#### Notes on R Syntax ####
# Number signs indicate 'comments', anything following # will not be executed
# Syntax is case sensitive! Incorrect capitalization will cause errors.
# No end-line characters (such as ;) required
#### Useful utility commands ####
getwd() # Returns the current working directory: the folder R reads from / writes to unless another destination is specified
# setwd("insert path ie. c:/...") # Set the working directory
list.files() # Lists the files found in the working directory
# Create a couple of variables so the workspace is not empty
x <- 2
u <- "a"
ls() # Lists every variable currently held in the workspace
rm(list = ls()) # Deletes all variables in the workspace (i.e. the active variables)
#### Basic Data Structures, Operations and Functions ####
## Basic Mathematical Operations ##
2 + 1
(2 + 1) * 5 # The usual order of operations (BEDMAS) applies
3 * 5 - (9 + 1) / 7 - 2
## Numeric Scalars ##
a <- 8 # Store the value 8 in a variable named 'a'
a # Typing a variable's name on its own prints its value in the console
# IMPORTANT: a = 8 works most of the time, but not always!
# It is not recommended to use = anywhere other than in function calls.
(a <- 8) # Wrapping the assignment in parentheses assigns and prints in one step
a <- 8.001
a
## Basic Mathematical Operations with Numeric Values ##
4 * a - 1 # The result is printed, but it is not stored anywhere
b <- 4 * a - 1 # Here the result is stored in variable 'b'
b
3 * (a - 1) + b
## Numeric Vectors ##
c(1, 2, 3, 4) # 'c' stands for 'concatenate'.
1:3
-1:2 # Note precedence: the unary minus is applied before ':', so this is (-1):2
seq(1, 4) # The 'sequence' function returns a vector
seq(2, 8, by = 2)
seq(0, 1, length.out = 11)
x <- c(5.11111, 12.45533, 13.66666) # Assigning to 'x' again completely replaces any 'x' created before.
x
y <- c(1, 2, 3)
y
z <- seq(0, 1, length.out = 11)
z
xx <- c(3, 5, 3, 7, NA, 1, 4, 6) # NA (Not Available) is the placeholder for a missing value
xx
# Particular entries are accessed by subscripting with square brackets.
# The first entry of a vector has index 1 (unlike C/C++, where the first entry of an array has index 0).
x[1]
x[2]
x[3]
x[c(2, 3)] # A vector of positions picks out several entries of 'x' at once
x[2:3] # Same result as the line above: the 2nd and 3rd entries of 'x'
z[c(1, 3, 5, 7)] # Returns the 1st, 3rd, 5th and 7th entries of 'z'
#!@$* Exercise! *$@!#
#!@$* Can you use the seq() function to get the 1st, 3rd, 5th and 7th entries of 'z'?
## Basic Mathematical Operations with Numeric Vectors ##
x + 2 # Adds 2 to every entry of the vector 'x'
x - 2 # Subtracts 2 from every entry of the vector 'x'
x * 2 # Multiplies every entry of the vector 'x' by 2
x / 2 # Divides every entry of the vector 'x' by 2
a
x + a # Adds 'a' to every entry of the vector 'x'
x + y # Vectors are added element by element
x - y # Vectors are subtracted element by element
x * y # *** Element-wise multiplication *** of vectors
x %*% y # *** Matrix multiplication *** of vectors (for two vectors of equal length: the inner product, as a 1x1 matrix)
x / y # Vectors are divided element by element
x + c(10, 100)
# 'x' has length 3
# c(10, 100) has length 2
# R throws a warning, but still performs the operation, repeating elements of c(10, 100) as necessary
# so that a value is added to each element of 'x', like this:
x + c(10, 100, 10)
# This is referred to as R's "Recycling Rule".
## Functions ##
#
# function_name( argument 1, argument 2, argument 3, ...)
#
# Many functions are part of the 'base' program
# and are loaded automatically; others can be added as needed,
# which we will learn more about later.
# The R Reference Card can be a handy resource for basic functions
# http://cran.r-project.org/doc/contrib/Short-refcard.pdf
class(x) # 'class' is a function that returns the class (kind of object) of 'x'
typeof(x) # 'typeof' reports the (R internal) type or storage mode of any object
length(x) # 'length' returns how many elements 'x' has
mean(x) # 'mean' returns the mean of the elements of 'x'
sd(x) # 'sd' returns the standard deviation of the elements of 'x'
min(x) # 'min' returns the smallest element of 'x'
max(x) # 'max' returns the largest element of 'x'
round(x) # 'round' returns the elements of 'x' rounded to whole numbers
round(x, 2) # 'round' returns the elements of 'x' rounded to 2 decimal places
round(x, 0) # Same as round(x): the default is to round to 0 decimal places
# Getting help on functions
?round # Opens the help file for the function round().
round(x, digits = 2) # 'digits' is the second argument, so R doesn't need you to explicitly
# name it when you pass it in second position. But you can name it
# if you want, and it often keeps code tidy, especially with functions
# that have many arguments. More on function arguments later...
# NOTE: A single '=' is used to pass parameters to functions
rnd <- 2 # Create another variable called 'rnd'
round(x, rnd) # Both arguments of 'round' can be variables
rnd <- rnd - 1
round(x, rnd)
round(x, digits = rnd)
(x_rnded <- round(x, rnd)) # Store the rounded values in variable 'x_rnded' (and print them)
xx
mean(xx) # Returns NA as the mean, because one entry of 'xx' is NA
mean(xx, na.rm = TRUE) # Argument 'na.rm' tells 'mean' to remove missing values before computing
## More on Getting Help ##
?length # Works well only if you know the exact name of the function you need!
??length # Searches the whole R help system for the word "length"
# Google searches work surprisingly well
# The first result for "r length of vector" is the same page as ?length
# http://www.rseek.org/
# is a Google-driven search engine for R-related pages, which is sometimes helpful
#!@$* Exercise! *$@!#
#!@$* Can you find the function name for a t-test in R?
#!@$* Is this vector of observations: c(0.06, 0.05, -0.02, 0.01, -0.04, 0.03, 0.08, -0.03)
#!@$* statistically significantly different from zero?
#!@$* Exercise! *$@!#
#!@$* create a vector ex1 = 10, 8, 3, 21, 4, 5, 1, 7, 5, 8, 28, 15, 6
#!@$* Calculate the mean (9.307692), median (7), standard deviation (7.706973) and variance (59.39744)
#!@$* Calculate the 15% trimmed mean (8.363636) (hint: read ?mean and look through the available arguments)
#!@$* Convert ex1 to ranks and save to a new variable ex2. Compute the mean (7) and standard deviation (3.883727) of the ranks.
## Packages / Libraries ##
# Advanced R users can create their own packages of functions and can even
# distribute them publicly through the CRAN (Comprehensive R Archive Network) repository
#
# This is a strength of R: a massive user base of people all contributing to the project.
# New statistical techniques get written up as R functions and made available
# much more quickly than they can be incorporated into commercial products (SAS, SPSS, etc.)
#
# There are also multiple versions of functions that may produce more/less output,
# include different options or be more/less computationally efficient.
#
# As an example, let's look at options for obtaining summary statistics for a vector.
? summary
summary(xx)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 1.000 3.000 4.000 4.143 5.500 7.000 1.000
# Let's assume we aren't satisfied with this summary.
# Try Googling "R summary statistics"
# The first hit: http://www.statmethods.net/stats/descriptives.html
# Suggests describe() in package 'Hmisc', stat.desc() from 'pastecs'
# and describe() in package 'psych'
# To try these alternatives out, we first need to install the packages, either
# using the drop-down menu or the command 'install.packages' -- recall R is case-sensitive.
# Each call is guarded by requireNamespace(), which is FALSE when the package
# cannot be found, so re-running this script does not download a package
# that is already in your library.
if (!requireNamespace("Hmisc", quietly = TRUE)) install.packages("Hmisc")
if (!requireNamespace("pastecs", quietly = TRUE)) install.packages("pastecs")
if (!requireNamespace("psych", quietly = TRUE)) install.packages("psych")
# IMPORTANT: Once you have installed a package on your computer,
# you shouldn't have to use 'install.packages' for it again.
# It is saved in your package library automatically by R.
# R does not automatically load all the packages in your library,
# so before we can use a package it has to be loaded using the function 'library()'
# to make the package functions available
library("Hmisc")
describe(xx)
# xx
# n missing unique Mean
# 7 1 6 4.143
#
# 1 3 4 5 6 7
# Frequency 1 2 1 1 1 1
# % 14 29 14 14 14 14
library("pastecs")
stat.desc(xx)
# nbr.val nbr.null nbr.na min max
# 7.0000000 0.0000000 1.0000000 1.0000000 7.0000000
# range sum median mean SE.mean
# 6.0000000 29.0000000 4.0000000 4.1428571 0.7693093
# CI.mean.0.95 var std.dev coef.var
# 1.8824319 4.1428571 2.0354010 0.4913037
library("psych")
# The following message appears:
# The following object(s) are masked from ‘package:Hmisc’:
# describe
#
# Both 'psych' and 'Hmisc' contain functions named 'describe'.
# Because 'psych' was loaded second, the default version of 'describe'
# will be the one from 'psych'.
describe(xx)
# var n mean sd median trimmed mad min max range skew kurtosis se
# 1 1 7 4.14 2.04 4 4.14 1.48 1 7 6 -0.06 -1.5 0.77
# If we want to use the Hmisc version, or want to ensure
# we don't get confused, we can specify the package R should look in
# for the function by prefixing the call with 'package::'
Hmisc::describe(xx)
## Numeric Matrix ##
# Matrices can be created directly using the function 'matrix'
# matrix(data, nrow, ncol, byrow = FALSE)
# By default 'data' is 'filled in' by column
? matrix
matrix(c(1, 11, 21, 2, 12, 22, 3, 13, 23, 4, 14, 24), 3, 4)
# makes a 3 row x 4 column matrix and
# fills the matrix in with 1, 11, 21, 2, 12, 22, 3, 13, 23, 4, 14, 24
# starting in top left corner, going down first column
matrix(c(1, 2, 3, 4, 11, 12, 13, 14, 21, 22, 23, 24), 3, 4, byrow = TRUE)
# Adding the "byrow = TRUE" argument causes R to fill in
# starting at top left corner and going left to right across rows
matrix(1, 4, 5) # creates a 4x5 matrix of all 1's
matrix(c(1, 2, 3), 4, 5) # NOT all 1's: the values 1, 2, 3 are recycled down the columns
# (R also gives a warning here, because the 3 values do not fit evenly into the matrix)
# Building a matrix by gluing together vectors
p <- rbind(c(1, 2), c(3, 4), c(5, 6)) # 'rbind' == 'row bind', stacks the rows on top of each other
p
m <- cbind(c(1, 2), c(3, 4), c(5, 6)) # 'cbind' == 'column bind', column vectors are glued together
m
rbind(c(1, 2, 3, 4),
      c(11, 12, 13, 14),
      c(21, 22, 23, 24))
#!@$* Exercise! *$@!#
#!@$* Try making the same matrix using the column-bind function cbind()
# rbind and cbind can also be used to glue a (conforming) matrix and a vector together
# or two (conforming) matrices
cbind(p, p)
? colSums # Help page for colSums(); the '?' must come BEFORE the function name
p
rowSums(p)
cbind(p, rowSums(p)) # This line of code will 'glue' a vector of row sums for matrix 'p' to the right of 'p'
cbind(rowSums(p), p) # Row sums glued on the left side of matrix 'p'
#!@$* Exercise! *$@!#
#!@$* Compute column means for matrix 'p', and create a new matrix 'newmat'
#!@$* which is 'p' stacked on top of 'p's column means
# Accessing elements in a matrix requires a row and column number (in that order)
m
m[1, 2]
# The next line fails on purpose. It is wrapped in try() so that the error message
# is still shown but the rest of the script keeps running when the file is source()d.
try(m[4, 2]) # subscript out of bounds, 4 is larger than the actual number of rows.
m[2, ] # if the column number isn't specified, it returns the whole row
m[, 2] # if the row number isn't specified, it returns the whole column
## Basic Mathematical Operations with Numeric Matrices ##
# Math operations involving matrices need to have 'conformable arguments'
# meaning they need to have appropriate dimensions to perform matrix mathematics
try(m + p) # error (shown via try(), see above) because m is (2x3) and p is (3x2)
dim(m) # m (2x3)
dim(p) # p (3x2)
m + m # Matrix addition is element-wise
m - m # Matrix subtraction is element-wise
m * m # *** Element-wise multiplication *** of matrices
# NOTE(review): assumes 'x' is still the length-3 numeric vector created in the
# Numeric Vectors section, so it can be multiplied with the 3x2 matrix 'p'.
x %*% p # *** Matrix multiplication ***
# Putting labels on rows/columns
rownames(m) <- c("Row1", "Row2")
colnames(m) <- c("Col1", "Col2", "Col3")
m
## Characters/Strings ##
y <- "abc"
y
class(y)
y[1]
y[2] # NA, because 'y' is really a vector of length 1 whose only entry is 'abc', so there is no 2nd entry
## Character Vectors ##
y <- c("abc", "de", "123")
y
y[1]
y[2]
y[3] # All entries of a vector must be of the same type,
# so although '123' could be numeric, here it is stored as a character string.
class(y[3])
substring(y, 1, 2) # Returns the first two characters of each entry of 'y'
# Combine strings and other values into one string with paste()
paste("This is a number I like:", a, ". It's a good one, huh?")
paste(y, "is the ", 1:3, "entry in character vector y") # Works element by element: one string per entry of 'y'
## Lists ##
# Like an R vector, an R list is a container for values,
# but its contents can be items of different data types.
# Many R functions return lists, so we need to be able to work with them.
class(a)
class(y)
imalist <- list(a = a, yvar = y, mat = matrix(1:6, 2, 3))
imalist
str(imalist)
imalist[[1]] # An element of a list can be accessed by its position inside *double* square brackets
imalist$a # or by its 'name' after a dollar sign
imalist[[2]]
imalist$yvar
imalist[[3]] # Returns the matrix that is the third item in the list 'imalist'
imalist$mat
# The elements of that matrix can be accessed in turn
imalist[[3]][1, 2]
imalist$mat[, 2]
# As mentioned, many functions return a list
t.test(imalist$mat[1, ], imalist$mat[2, ]) #!@$* Exercise! Do you understand what the t-test is comparing? *$@!#
ttestresult <- t.test(imalist$mat[1, ], imalist$mat[2, ]) # Here we 'catch' the result of the t-test in a variable
str(ttestresult)
ttestresult$statistic
ttestresult$p.value
ttestresult$conf.int # A vector holding the lower & upper limits of the confidence interval (CI)
ttestresult$conf.int[1] # The lower CI limit on the mean difference
ttestresult$conf.int[2] # The upper CI limit on the mean difference
## Data Frames ##
# We will generally want to have our data set up as a data frame
# Like a matrix, but its columns can contain different 'types' of variables
# Data frames can be created directly, but are more commonly read from a file
d <- data.frame(kid = c("Jack", "Jill", "Bob"), age = c(12, 10, 9))
d
str(d) # Compactly displays the internal structure of an object
# Components of the data frame can be accessed by name or by numeric position in square brackets like a matrix
d$kid
d[, 1] # Returns 1st column of 'd', which is 'kid'
d[2, ] # Returns 2nd row of 'd', which is the second subject, named Jill, age 10.
d[3, 2] # Returns the value in the 3rd row and 2nd column of 'd'. Subscripting like a matrix.
# The next line fails on purpose. It is wrapped in try() so that the error message
# is still shown but the rest of the script keeps running when the file is source()d.
try(mean(age)) # Error, because 'age' lives inside 'd' and cannot be seen directly
mean(d$age)
# The questionable practice of 'attach'
attach(d) # Makes the columns of 'd' visible so they can be called by name
mean(age)
detach(d) # Hides them again
# Name conflicts are a common problem with attach(): a workspace variable that
# has the same name as a column is found first, and the column is silently ignored.
# Care should be taken to avoid them.
# Many R geeks say one should *never* use attach()!
## Importing Data ##
# Squid data comes from A Beginner's Guide to R by Gentleman, Hornik & Parmigiani. Springer, 2009.
# NOTE(review): the names above look like the editors of Springer's 'Use R!' series;
# the book's authors are presumably Zuur, Ieno & Meesters -- confirm the citation.
# (This text is available electronically through York's library)
#
# Options exist for importing data sets from other software packages.
# Each install is guarded by requireNamespace(), which is FALSE when the package
# cannot be found, so re-running this script does not download it again.
if (!requireNamespace("gdata", quietly = TRUE)) install.packages("gdata") # May not work! Seems to work on Mac...
library(gdata)
# NOTE(review): read.xls() presumably needs a working Perl installation, which may
# be why it fails on some computers -- see ?read.xls to confirm.
squiddat <- read.xls("data/squid.xls")
head(squiddat) # Head displays the first 6 lines of the data set
head(squiddat, n = 12) # change the number of lines to print, n=12
str(squiddat) # Compactly displays the internal structure of an R object
# This package also has a read.xls function ---> install.packages("xlsReadWrite")
if (!requireNamespace("foreign", quietly = TRUE)) install.packages("foreign")
library(foreign)
squiddat <- read.spss("data/squid.sav") # seems to return a list
squiddat <- data.frame(squiddat) # convert the list to a data frame
head(squiddat)
str(squiddat)
# Though these methods work, they tend to make me uncomfortable:
# I worry things might get mixed up when the other software
# or the import packages change between versions.
# I prefer to save files as comma separated files (.csv) directly in Excel or SPSS
squiddat <- read.csv("data/squid.csv")
head(squiddat)
str(squiddat)
## Logical Vectors and Operations ##
logi <- TRUE
logi
logi <- FALSE
logi
# Logical vector
logivect <- c(FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE)
logivect
# A logical vector can be used to select data from a vector, matrix, data frame, etc.
xx[logivect] # Keeps the entries of 'xx' at the positions where 'logivect' is TRUE
# Logical statements (comparisons) create a logical vector for us
xx
xx < 4 # Less than
xx >= 4 # Greater than, or equal to
xx == 5 # Two equal signs mean 'equal to', not an assignment
xx == 5 | xx == 4 # Pipe | means 'or'
xx == 5 & !is.na(xx) # Ampersand means 'and', ! means 'not', which inverts TRUE <--> FALSE
# These kinds of logical vectors let us pick out the data we want
xx[xx >= 4] # The missing value comes along as NA, because NA >= 4 is itself NA
xx[xx >= 4 & !is.na(xx)] # Adding !is.na(xx) drops the missing value
# The same technique picks out rows of the squid data,
# e.g. those collected at location 3
squiddat[squiddat$Location == 3, ] # [rows, columns]: rows chosen by the condition, all columns kept
# NOTE(review): the next line assumes the 'Sex' column holds the text "female" -- check with str(squiddat)
squiddat[squiddat$Sex == "female", ] # [rows, columns]
#!@$* Exercise! *$@!#
#!@$* Use logical subscripts to pick out data for male squid at location 2
#!@$* collected in the first year. Save this to a new variable called squ_m_loc2_yr1.