# SCS 2011: Statistical Analysis and Programming with R/4 Linear Model Example.R

#
#  SCS 2011: Statistical Analysis and Programming with R
#  September-October 2011
#
#  Linear Models in R
#  -- much of the code is borrowed from Fox,
#     Introduction to the R Statistical Computing Environment
#     ICPSR Summer Program, 2010-2011

# multiple regression (Prestige data)

library(car)
library(spida.beta)

names(Prestige)
some(Prestige)  # 10 randomly sampling observations

prestige.mod <- lm(prestige ~ education + log2(income) + women,
data=Prestige)
summary(prestige.mod)

confint(prestige.mod)
wald(prestige.mod)
# dummy regression

Prestige\$type # a factor
class(Prestige\$type)
str(Prestige\$type) # structure

sapply(Prestige, class)      # sapply applies the 'class' function to each variable in Prestige

Prestige.2 <- na.omit(Prestige) # filter out missing data
nrow(Prestige)
nrow(Prestige.2)
levels(Prestige.2\$type)
Prestige.2\$type <- with(Prestige.2, factor(type, levels=c("bc", "wc", "prof"))) # reorder levels
Prestige.2\$type

# generating contrasts from factors

getOption("contrasts")
contrasts(Prestige.2\$type)
model.matrix(~ type, data=Prestige.2)

contrasts(Prestige.2\$type) <- contr.treatment(levels(Prestige.2\$type), base=2)  # changing baseline category
contrasts(Prestige.2\$type)

contrasts(Prestige.2\$type) <- "contr.helmert"  # Helmert contrasts
contrasts(Prestige.2\$type)

contrasts(Prestige.2\$type) <- "contr.sum"  # "deviation" contrasts
contrasts(Prestige.2\$type)

contrasts(Prestige.2\$type) <- NULL  # back to default

Prestige.2\$type.ord <- ordered(Prestige.2\$type, levels=c("bc", "wc", "prof")) # ordered factor
Prestige.2\$type.ord
round(contrasts(Prestige.2\$type.ord), 3)   # orthogonal polynomial contrasts

prestige.mod.1 <- lm(prestige ~ log2(income) + education + type, data=Prestige.2)
summary(prestige.mod.1)

anova(prestige.mod.1)   # sequential ("type-I") tests

prestige.mod.0 <- lm(prestige ~ income + education, data=Prestige.2) # note: NA's filtered!
summary(prestige.mod.0)

prestige.mod.0 <- update(prestige.mod.1, . ~ . - type) # equivalent [in a formula '-' means remove]
anova(prestige.mod.0, prestige.mod.1)   # incremental F-test

Anova(prestige.mod.1)   # "type-II" tests

prestige.mod.3 <- update(prestige.mod.1,
. ~ . + log2(income):type + education:type)     # adding interactions
summary(prestige.mod.3)
Anova(prestige.mod.3)

lm(prestige ~ log2(income*type) + education*type, data=Prestige.2) # equivalent specifications
lm(prestige ~ (log2(income) + education)*type, data=Prestige.2)

# effect displays

library(effects)

# Anova Models

some(Moore)

Moore\$fcategory <- factor(Moore\$fcategory, levels=c("low", "medium", "high"))
Moore\$partner.status <- relevel(Moore\$partner.status, ref="low")

xtabs(~ fcategory + partner.status, data=Moore)

with(Moore, tapply(conformity,
list(Authoritarianism=fcategory, "Partner's Status"=partner.status),
mean))
with(Moore, tapply(conformity,
list(Authoritarianism=fcategory, "Partner's Status"=partner.status),
sd))

# graph of means:

with(Moore, {
interaction.plot(fcategory, partner.status, conformity, type="b",
pch=c(1, 16), cex=2, ylim=range(conformity))
points(jitter(as.numeric(fcategory), factor=0.5), conformity,
pch=ifelse(partner.status == "low", "L", "H"))
identify(fcategory, conformity)
})

# ANOVA tables

contr <- options(contrasts=c("contr.sum", "contr.poly"))   # contr.sum = deviation contrasts
moore.mod <- lm(conformity ~ fcategory*partner.status, data=Moore)
summary(moore.mod)

Anova(moore.mod)    # type II sums of squares
Anova(moore.mod, type="III")    # type III sums of squares

options(contr) # restore defaults

# more on lm

args(lm)

some(Davis)
lm(weight ~ repwt, data=Davis, subset=sex == "F")  # observation selection (women only)
lm(weight ~ repwt, data=Davis, subset=1:100)

lm(prestige ~ income + education, data=Duncan, subset=-c(6, 16))

lm(conformity ~ partner.status*fcategory,  # specifying contrasts
contrasts=list(partner.status=contr.sum, fcategory=contr.poly),
data=Moore)

lm(100*conformity/40 ~ partner.status*fcategory, data=Moore)  # data argument; note computation of y

lm(prestige~I(income + education), data=Duncan)  # "protecting" expresssion on RHS of the model

# Generalized linear models

# binary logit model

some(Mroz)

mroz.mod <- glm(lfp ~ k5 + k618 + age + wc + hc + lwg + inc,
data=Mroz, family=binomial)
summary(mroz.mod)

round(exp(cbind(Estimate=coef(mroz.mod), confint(mroz.mod))), 2) # odds ratios

mroz.mod.2 <- update(mroz.mod, . ~ . - k5 - k618)
anova(mroz.mod.2, mroz.mod, test="Chisq") # likelihood-ratio test

Anova(mroz.mod)  # analysis-of-deviance table

# Poisson regression

some(Ornstein)
nrow(Ornstein)
(tab <- xtabs(~interlocks, data=Ornstein))

x <- as.numeric(names(tab)) # the names are the distinct values of interlocks
plot(x, tab, type="h", xlab="Number of Interlocks", ylab="Frequency")
points(x, tab, pch=16)

mod.ornstein <- glm(interlocks ~ log2(assets) + nation + sector,
family=poisson, data=Ornstein)
summary(mod.ornstein)
Anova(mod.ornstein)

# quasi-Poisson model, allowing for overdispersion

mod.ornstein.q <- update(mod.ornstein, family=quasipoisson)
summary(mod.ornstein.q)

# repeated-measures ANOVA and MANOVA

some(OBrienKaiser)
?OBrienKaiser
contrasts(OBrienKaiser\$treatment)
contrasts(OBrienKaiser\$gender)

# defining the within-subjects design

phase <- factor(rep(c("pretest", "posttest", "followup"), c(5, 5, 5)),
levels=c("pretest", "posttest", "followup"))
hour <- ordered(rep(1:5, 3))
idata <- data.frame(phase, hour)
idata

# fitting the multivariate linear model

mod.ok <- lm(cbind(pre.1, pre.2, pre.3, pre.4, pre.5,
post.1, post.2, post.3, post.4, post.5,
fup.1, fup.2, fup.3, fup.4, fup.5) ~  treatment*gender,
data=OBrienKaiser)
mod.ok

# multivariate and univariate tests

(av.ok <- Anova(mod.ok, idata=idata, idesign=~phase*hour))

summary(av.ok)

# graphing the means

# reshape the data from "wide" to "long"

OBrien.long <- reshape(OBrienKaiser,
varying=c("pre.1", "pre.2", "pre.3", "pre.4", "pre.5",
"post.1", "post.2", "post.3", "post.4", "post.5",
"fup.1", "fup.2", "fup.3",  "fup.4", "fup.5"),
v.names="score",
timevar="phase.hour", direction="long")
OBrien.long\$phase <- ordered(c("pre", "post", "fup")[1 + ((OBrien.long\$phase.hour - 1) %/% 5)],
levels=c("pre", "post", "fup"))
OBrien.long\$hour <- ordered(1 + ((OBrien.long\$phase.hour - 1) %% 5))
dim(OBrien.long)
head(OBrien.long, 25) # first 25 rows

# compute means

Means <- as.data.frame(ftable(with(OBrien.long,
tapply(score, list(treatment=treatment, gender=gender, phase=phase, hour=hour), mean))))
names(Means)[5] <- "score"
dim(Means)