# MATH 6627 2012-13 Practicum in Statistical Consulting/Very Basic Regression in R

# From Wiki1

# Jump to: navigation, search
###
###  Very Basic Regression in R
###  MATH 6627
###  September 5, 2012
### 

library(spidadev)  # spida package

# Artificial data
# Index of Heart Damage, Coffee Consumption and Stress in standardized units

# Reading a table of data pasted into this script.
#
# Replace the placeholder line inside the string below with the data: a
# whitespace-separated table whose first row holds the column names (the rest
# of this script uses the columns Heart, Coffee and Stress).
#
# NOTE: cat() writes 'coffee.dat' into the current working directory and
# nothing in this script removes it afterwards, so it is a scratch file
# rather than a true temporary file.
cat("

### Paste some data in here

", file = "coffee.dat")
# header = TRUE: the first row of the file supplies the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
cf <- read.table("coffee.dat", header = TRUE)
cf          # show the data set
head(cf)    # first 6 lines
some(cf)    # random 10 lines

xqplot(cf)  # uniform quantile plots
xqplot(cf, ptype = "normal")  # normal quantile plots

# Example: deriving a categorical variable from a numeric one with cut()
#
# Stress is binned into three ordered classes using the break points
# -Inf < 75 < 150 < Inf; the new factor is stored as cf$Stress.class.

cf[["Stress.class"]] <- cut(
  cf$Stress,
  breaks = c(-Inf, 75, 150, Inf),
  labels = c("Low", "Medium", "High")
)
cf
xqplot(cf)  # quick look at the data set, now including the new factor
            # (exploratory only --- not for formal reports)

# Three functions that draw roughly the same scatterplot matrix
pairs(cf)
scatterplotMatrix(cf)
splom(~ cf)

# Note:
# we will see that even all of the 2-dimensional views taken together do not
# necessarily capture the essential structure of a 3+ dimensional data set

# Regressions: Simple and Multiple

# Simple regression of heart damage on coffee consumption
fit1 <- lm(formula = Heart ~ Coffee, data = cf)
fit1           # brief printout of the fitted model
str(fit1)      # internal structure of the fitted-model object
summary(fit1)  # the usual regression table
plot(fit1)     # basic diagnostic plots


# Simple regression of heart damage on stress
fit2 <- lm(formula = Heart ~ Stress, data = cf)
summary(fit2)


# Regression on a factor (a categorical predictor) with more than 2 levels:
# heart damage as a function of the three-level Stress.class factor

fit2c <- lm(formula = Heart ~ Stress.class, data = cf)
summary(fit2c)

# Overall test of the effect of "Stress.class".
# The string is passed to wald() to pick out the terms to test together
# (presumably matched against the coefficient names --- see the wald() help).

wald(fit2c, "Stress")

# Pairwise comparisons between the levels of Stress.class
#
# Each row of L is one linear combination of the three coefficients of fit2c;
# the row names label the comparison that the row estimates.

L <- matrix(
  c(0,  1, 0,
    0,  0, 1,
    0, -1, 1),
  nrow = 3,
  byrow = TRUE,
  dimnames = list(c("Med - Low", "High - Low", "High - Med"), NULL)
)
L
wald(fit2c, L)

# Multiple regression: heart damage on stress and coffee together

fit3 <- lm(formula = Heart ~ Stress + Coffee, data = cf)
summary(fit3)

plot(fit3)                 # diagnostic plots for the fitted model

avPlots(fit3)              # added-variable plots
shapiro.test(resid(fit3))  # formal test of normality applied to the residuals

#
# Note that the simple regression of Heart on Coffee 
# shows that there is a statistically significant
# positive association of heart damage with coffee consumption.
#
# However, when controlling for 'Stress' the association
# is negative, although not statistically significant.
#
# Does this provide evidence that Coffee consumption may be harmful?
# Why would the sign of the coefficient change when we include Stress in the model?
#
# Personal tools