Introduction to Programming in R


" R Programming:
Largely derived from John Fox:
Introduction to the R Statistical Computing Environment
ICPSR Summer Program (2010)

Preliminary programming example

  • Identifying influential outliers in regression using:
    • 1) leverage (hatvalues) -- unusual values of X compared with other observations
    • 2) rstudent -- poor fit relative to the model fitted to the other observations
    • 3) Influence: Cook's distance -- a combination of 1 and 2 above (a quick numerical check is sketched below)

"

 library(car) # for data
 library(spida.beta)
 
   
 # Example of diagnostics for linear model
 # Do it step by step then turn it into a function
 # so it can be applied to new problems
 
 # Example doing it 'by hand' -- then we turn it into a function
 xqplot(Duncan)
 
 
 mod.duncan <- lm(prestige ~ income + education, data=Duncan)
 cook <- sqrt(cooks.distance(mod.duncan))      # using sqrt(D) as the radius makes circle area proportional to Cook's distance
 plot(hatvalues(mod.duncan), rstudent(mod.duncan), 
     cex=(10/max(cook))*cook)
 abline(h=c(-2, 0, 2), lty=2)    # frequent cutoff for rstudent
 abline(v=c(2, 3)*               # cutoffs for leverage  
     length(coef(mod.duncan))/length(rstudent(mod.duncan)), lty=1)
 identify(hatvalues(mod.duncan), rstudent(mod.duncan), 
     rownames(Duncan))
 
 # putting it together in a function:
 inflPlot <- function(model, scale=10, col=c(1, 2),
     identify=TRUE, labels=names(rstud), ... ) {
     # Plot hatvalues, Studentized residuals, and Cook's distances
     #  for a linear or generalized linear model
     # Arguments:
     #  model: an lm or glm model object
     #  scale: a scaling factor for the circles representing 
     #    Cook's D
     #  col: colors for non-noteworthy and noteworthy points
     #  identify: whether to label noteworthy points (TRUE or FALSE)
     #  labels: for identified points
     hatval <- hatvalues(model)
     rstud <- rstudent(model)
     cook <- sqrt(cooks.distance(model))
     scale <- scale/max(cook, na.rm = TRUE)
     p <- length(coef(model))
     n <- length(rstud)
     cutoff <- sqrt(4/(n - p))
     plot(hatval, rstud, xlab = "Hat-Values", 
         ylab = "Studentized Residuals",
         cex=scale*cook, col=ifelse(cook > cutoff, col[2], col[1]), 
         ...)
     abline(v = c(2, 3)*p/n, lty = "dashed")
     bonf <- qt(.025/n, df = n - p - 1, lower.tail=FALSE)
     abline(h=c(-bonf, -2, 0, 2, bonf), lty="dashed")
     if (identify){
         noteworthy <- cook > cutoff | abs(rstud) > bonf | 
             hatval > 2*p/n
         pos <- ifelse(hatval - mean(range(hatval, na.rm=TRUE)) <= 0, 
             4, 2)
         text(hatval[noteworthy], rstud[noteworthy], 
             labels[noteworthy], pos = pos[noteworthy])
         return(which(noteworthy))
     }
     else return(invisible(NULL))
 }
 
 inflPlot(mod.duncan)
 
 inflPlot(mod.duncan, ylim=c(-3, 4), las=1, col=gray(c(0.5, 0)))
 
 # Basic matrix operations
 
 (A <- matrix(c(1, 2, -4, 3, 5, 0), nrow=2, ncol=3))
 (B <- matrix(1:6, 2, 3))
 (C <- matrix(c(2, -2, 0, 1, -1, 1, 4 ,4, -4), 3, 3, byrow=TRUE))
 
 A + B  # addition
 A - B  # subtraction
 2*A    # product of a scalar and a matrix
 -A     # negation
 
 A %*% C  # matrix product
 
 A %*% B  # not conformable for matrix multiplication
 
     # vectors are treated as row or column vectors as needed
     
 (a <- rep(1, 3))
 (b <- c(1, 5, 3))
 C %*% a
 
 a %*% C
 
 a %*% b  # inner product
 
 outer(a, b)  # outer product
 
 t(B)  # matrix transpose
 
 solve(C)  # matrix inverse
 solve(C) %*% C  # check
 
 library(MASS)  # for fractions()
 fractions(solve(C))
 
 solve(C, b)  # solve matrix equation Cx = b
 
 solve(C) %*% b  # equivalent [almost] -- but solve(C, b) avoids forming the inverse and is faster and more accurate
 
     # illustration: naive computation of LS regression
     
 X <- cbind(1, as.matrix(Prestige[,1:3]))  # attach the constant
 y <- Prestige[ , 4]
 head(X)  # first 6 rows
 head(y)
 
 head(Prestige[ , 4, drop=FALSE]) # as a column
 
 solve(t(X) %*% X) %*% t(X) %*% y  # LS coefficients
 
 lm(prestige ~ education + income + women, data=Prestige) # check?
 # The two sets of coefficients agree closely, but 'lm' estimates the
 # regression in a way that is numerically FAR MORE STABLE than plugging
 # into the textbook formula solve(t(X) %*% X) %*% t(X) %*% y.
 # A mathematical formula gives exact answers only with infinite precision;
 # real calculations use finite precision, and a good algorithm can often
 # do a MUCH better job than a direct translation of the formula.
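 # A sketch of the more stable approach: lm works from a QR decomposition
 # of X rather than forming t(X) %*% X (reusing the X and y built above)
 qr.coef(qr(X), y)   # least-squares coefficients via QR
 qr.solve(X, y)      # equivalent shortcut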
   # eigenvalues and eigenvectors
   
 R <- with(Prestige, cor(cbind(education, income, women)))
 R  # correlation matrix
 
 eigen(R)  # principal-components analysis
 
 det(R)  # determinant
 
 svd(R)    # singular value decomposition: R = UDV' [often used for stable computation]
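 # quick check that the three factors reproduce R (difference ~ machine precision)
 s <- svd(R)
 max(abs(s$u %*% diag(s$d) %*% t(s$v) - R))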
 R.qr <- qr(R)
 qr.R(R.qr)    # upper triangular
 
 diag(R)  # extract diagonal  
 
 diag(R) <- NA  # set diagonal  [example of a **replacement** function used
                   # on left side of assignment "<-"]
 R
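 # A toy replacement function of our own (the name 'second<-' is just for
 # illustration): like diag<-, it is called when second() appears on the
 # left side of "<-"
 `second<-` <- function(x, value) {
     x[2] <- value
     x
 }
 z <- 1:5
 second(z) <- 99   # R actually evaluates z <- `second<-`(z, 99)
 z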
 
 
 
 diag(1:3)  # make diagonal matrix
 
 diag(3)  # order-3 identity matrix
         # Note: diag does very different things depending on its argument
 
 # Control structures
 
     # conditionals
     
         # if ... else
         
 abs1 <- function(x) if (x < 0) -x else x
 abs1(-5)
 abs1(5)
 
 abs1(-3:3)  # wrong! only the first element, -3, is tested
         # 'if' tests a single condition; recent versions of R treat a
         # condition of length > 1 as an error (older versions silently
         # used just the first element, with a warning)
         # To work on each element of a vector we need the
         # vectorized version of 'if':
         # ifelse (vectorized)
         
 abs2 <- function(x) ifelse(x < 0, -x, x)
 abs2(-3:3)
 
         # cascading conditionals
         
 sign1 <- function(x){
     if (x < 0) -1
         else if (x > 0) 1
             else 0
 }
 
 sign1(-5)   # can only work on one value at a time
 
 sign2 <- function(x){
     ifelse (x < 0, -1,
         ifelse(x > 0, 1, 0))
 }
 
 sign2(c(-5, 0, 10))
 
         # switch  -- another way to get conditional execution
         
 convert2meters <- function(x, 
     units=c("inches", "feet", "yards", "miles")) {
     
     units <- match.arg(units)
     switch(units,
         inches = x * 0.0254,
         feet = x * 0.3048,
         yards = x * 0.9144,
         miles = x * 1609.344)
 }
 
 convert2meters(10, "inches")
 convert2meters(10, "in")   # abbreviation
 convert2meters(3, "feet")
 convert2meters(100, "yards")
 convert2meters(5, "miles")
 convert2meters(7, "fathoms") # error!
 
     # iteration
     
         # for loops
         
 prod(1:5)    # ok for small numbers
 gamma(5 + 1) # better
 factorial(5) # best
 factorial
 
 # prototype functions
 
 fact1 <- function(x){
     if (x <= 1) return(1)
     f <- 1  # initialize
     for (i in 1:x) f <- f * i  # accumulate product
     f  # return result
 }
 
 fact1(5)
 
 fact1(-5)  # wrong!
 
 # more mature functions include error checking
 
 fact2 <- function(x) {  # with input checks
     if ((!is.numeric(x)) || (x != floor(x))
          || (x < 0) || (length(x) > 1))
         stop("x must be a non-negative integer")
     f <- 1  # initialize
     for (i in 1:x) f <- f * i  # accumulate product
     f  # return result
 }
 
 fact2(5)
 fact2(-5)  # error!
         
         # while loops
         
 fact3 <- function(x){
     if ((!is.numeric(x)) || (x != floor(x))
          || (x < 0) || (length(x) > 1))
         stop("x must be a non-negative integer")
     i <- f <- 1  # initialize
     while (i <= x) {
         f <- f * i  # accumulate product
         i <- i + 1  # increment counter
         }
     f  # return result
 }
 
 fact3(5)
 
         # repeat loops
         
 fact4 <- function(x) {
     if ((!is.numeric(x)) || (x != floor(x))
          || (x < 0) || (length(x) > 1))
         stop("x must be a non-negative integer")
     i <- f <- 1  # initialize
     repeat {
         f <- f * i  # accumulate product
         i <- i + 1  # increment counter
         if (i > x) break  # termination test
     }
     f  # return result
 }
 
 fact4(5)
 
     # recursion
     
 fact5 <- function(x){
     if (x <= 1) 1  # termination condition
     else x * fact5(x - 1)  # recursive call
 }
 
 fact5(5)
 
 fact6 <- fact5
 remove(fact5)
 fact6(5)  # error!
 
 fact7 <- function(x) {
     if (x <= 1) 1
     else x * Recall(x - 1)  # recursive call
 }
 
 fact7(5)
 fact8 <- fact7
 remove(fact7)
 fact8(5)  # still works with fact7 removed
 
 
 # Avoiding loops
 
     # the "apply" family
         
         # apply (arrays)
     
 head(DavisThin, 10)  # first 10 rows
 dim(DavisThin)
 ?DavisThin
 
 DavisThin$thin.drive <- apply(DavisThin, 1, sum) # row sums
 head(DavisThin$thin.drive, 10)
 
 apply(DavisThin, 2, mean) # column means
 
 colMeans(DavisThin) # more efficient (rowMeans, colSums, rowSums too)
 
 DavisThin$thin.drive <- NULL  # remove thin.drive
 DavisThin[1, 2] <- DavisThin[2, 4] <- DavisThin[10, 3] <- NA  # some missing data
 head(DavisThin, 10)
 
 head(apply(DavisThin, 1, sum), 10)
 
 head(apply(DavisThin, 1, function(x) 7*mean(x, na.rm=TRUE)), 10)
 
 DavisThin[1, 2:5] <- NA  # create some more missing data
 head(DavisThin, 10)
 
 make.scale <- function(items) {
     if (sum(is.na(items)) >= 4) NA
     else 7*mean(items, na.rm=TRUE)
 }
 
 head(apply(DavisThin, 1, make.scale), 10)
 
 
         # lapply, sapply (lists)
 
 thin.list <- as.list(DavisThin)
 str(thin.list)  # structure of the result
 
 lapply(thin.list, mean, na.rm=TRUE) # returns list (na.rm passed to mean)
 
 lapply(thin.list, function(x) mean(x, na.rm=TRUE)) # equivalent
 
 sapply(thin.list, mean, na.rm=TRUE) # simplifies result
 
         # mapply (multiple arguments)
         
 integrate(dnorm, lower=-1.96, upper=1.96) # integrate normal density
 
 (low <- c(-Inf, -3:3))
 (high <- c(-3:3, Inf))
 (P <- mapply(function(lo, hi) integrate(dnorm, lo, hi)$value, 
     lo=low, hi=high))
 sum(P)
 
 pnorm(high) - pnorm(low) # check
 
         # Vectorize
     
 Integrate <- Vectorize(
     function(fn, lower, upper) integrate(fn, lower, upper)$value,
     vectorize.args=c("lower", "upper")
     )
 
 Integrate(dnorm, lower=low, upper=high)
 
         # tapply (tables of statistics)
         
 some(Moore)  # randomly sample 10 observations
 
 with(Moore, tapply(conformity,
    list(Status=partner.status, Authoritarianism=fcategory), mean))
    
 Moore$fcategory <- factor(Moore$fcategory, 
     levels=c("low", "medium", "high"))  # reorder levels
 Moore$partner.status <- factor(Moore$partner.status,
     levels=c("low", "high"))
     
 with(Moore, tapply(conformity,
    list(Status=partner.status, Authoritarianism=fcategory), mean))
    
     # to loop or not to loop?
     
 time1 <- function(n){  # inefficient!
     a <- NULL
     for(i in 1:n) a <- c(a, i^2)
     a
 }
 
 system.time(time1(30000))
 
 time2 <- function(n){  # better
     a <- numeric(n) # initialize to final length
     for(i in 1:n) a[i] <- i^2
     a
 }
 
 system.time(time2(30000))
 
 time3 <- function(n){  # best
     a <- (1:n)^2 # vectorize
     a
 }
 
 system.time(time3(30000))
 
 
 time4 <- function(n){  # (slightly) inefficient!
     a <- numeric(n)
     for(i in 1:n) a[i] <- 2 * pi * sin(i)
     a
 }
 
 system.time(time4(100000))
 
 time5 <- function(n){  # better
     a <- numeric(n)
     for(i in 1:n)
         a[i] <- sin(i)
     2 * pi * a  # move outside loop
 }
 system.time(time5(100000))
 
 time6 <- function(n){  # best
     2 * pi * sin(1:n) # fully vectorized
 }
 system.time(time6(100000))
 
         # don't vectorize for its own sake
         
 matrices <- vector(mode="list", length=10000)  # allocate 
 for (i in seq_along(matrices))
     matrices[[i]] <- matrix(rnorm(10000), 100, 100)
 
             # simple
 system.time({ 
     S1 <- matrix(0, 100, 100)  # initialize
     for (M in matrices) S1 <- S1 + M  # accumulate
 })
 
             # clever
 system.time(S2 <- apply(array(unlist(matrices), 
     dim = c(100, 100, 10000)), 1:2, sum))
     
             # a smaller problem
             
 system.time({ 
     S1 <- matrix(0, 100, 100)  # initialize
     for (M in matrices[1:1000]) S1 <- S1 + M  # accumulate
 })
 
             # clever
 system.time(S2 <- apply(array(unlist(matrices[1:1000]), 
     dim = c(100, 100, 1000)), 1:2, sum))
     
 all(S1 == S2)  # are answers EXACTLY equal (bad idea!)?
 
 max(abs(S1 - S2))  # how different?
 
 all.equal(S1, S2)  # equal within tolerance
 ?all.equal
 
     
 # Non-trivial programming examples
 
     # estimation of logistic regression by Newton-Raphson
     
 lreg1 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE){
     # X is the model matrix
     # y is the response vector of 0s and 1s
     # max.iter is the maximum number of iterations
     # tol is a convergence criterion
     # verbose: show iteration history?
     X <- cbind(1, X)  # add constant
     b <- b.last <- rep(0, ncol(X))  # initialize coefficients
     it <- 1  # initialize iteration counter
     while (it <= max.iter){
         if (verbose) cat("\niteration = ", it, ": ", b)
         p <- as.vector(1/(1 + exp(-X %*% b)))
         V <- diag(p * (1 - p))
         var.b <- solve(t(X) %*% V %*% X)
         b <- b + var.b %*% t(X) %*% (y - p)  # update coefficients
         if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) 
             break
         b.last <- b  # update previous coefficients
         it <- it + 1  # increment counter
     }
     if (verbose) cat("\n")  # newline
     if (it > max.iter) warning("maximum iterations exceeded")
     list(coefficients=as.vector(b), var=var.b, iterations=it)
 }
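 # For reference, a sketch of the math the loop above implements: the
 # log-likelihood is l(b) = sum_i [ y_i*log(p_i) + (1 - y_i)*log(1 - p_i) ]
 # with p_i = 1/(1 + exp(-x_i'b)); its gradient is X'(y - p) and its Hessian
 # is -X'VX, where V = diag{p_i*(1 - p_i)}, so each Newton-Raphson step is
 #     b <- b + solve(t(X) %*% V %*% X) %*% t(X) %*% (y - p)
 # which is exactly the update line in lreg1.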
 
 head(Mroz)  # first 6 observations
 
 Mroz$lfp <- with(Mroz, ifelse(lfp == "yes", 1, 0))
 Mroz$wc <- with(Mroz, ifelse(wc == "yes", 1, 0))
 Mroz$hc <- with(Mroz, ifelse(hc == "yes", 1, 0))
 
 mod.mroz.1 <- with(Mroz, 
     lreg1(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
 
 mod.mroz.1$coefficients
 
 sqrt(diag(mod.mroz.1$var)) # std. errors
 
 mod.mroz.glm <- glm(lfp ~ ., family=binomial, data=Mroz) # check
 coef(mod.mroz.glm) - mod.mroz.1$coefficients
 sqrt(diag(vcov(mod.mroz.glm))) - sqrt(diag(mod.mroz.1$var))
 
 
     # estimation of logistic regression by optimization
     
 lreg2 <- function(X, y, method="BFGS"){
     X <- cbind(1, X)
     negLogL <- function(b, X, y){
         p <- as.vector(1/(1 + exp(-X %*% b)))
         - sum(y*log(p) + (1 - y)*log(1 - p))
     }
     grad <- function(b, X, y){
         p <- as.vector(1/(1 + exp(-X %*% b)))
         - apply(((y - p)*X), 2, sum)
     }
     result <- optim(rep(0, ncol(X)), negLogL, gr=grad,
         hessian=TRUE, method=method, X=X, y=y)
     list(coefficients=result$par, var=solve(result$hessian),
         deviance=2*result$value, converged=result$convergence == 0)
 }
 
 
 mod.mroz.2 <- with(Mroz, 
     lreg2(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
 mod.mroz.2$coefficients
 
 sqrt(diag(mod.mroz.2$var))
 
 mod.mroz.2$converged
 
 coef(mod.mroz.glm) - mod.mroz.2$coefficients # check
 sqrt(diag(vcov(mod.mroz.glm))) - sqrt(diag(mod.mroz.2$var))
 
     # translating numbers into words
     
 makeDigits <- function(x) strsplit(as.character(x), "")[[1]]
 
 makeDigits(123456)
 makeDigits(-123456) # whoops!
 makeDigits(1000000000) # whoops!
 
 options("scipen")
 ?options
 options(scipen=100)
 makeDigits(1000000000)
 
 makeNumber <- function(x) as.numeric(paste(x, collapse=""))
 makeNumber(c("1", "2", "3", "4", "5"))
 
 ones <- c("zero", "one", "two", "three", "four", "five", "six", 
     "seven", "eight", "nine")
 teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen", 
     "fifteen", "sixteen", " seventeen", "eighteen", "nineteen")
 names(ones) <- names(teens) <- 0:9
 tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy", 
     "eighty", "ninety")
 names(tens) <- 2:9
 suffixes <- c("thousand,", "million,", "billion,", "trillion,")
 
 ones["5"]
 teens["3"]
 tens["7"]
 
 trim <- function(text){
     gsub("(^\ *)|((\ *|-|,\ zero|-zero)$)", "", text)
 }
 
 trim("twenty-one")
 trim("twenty-zero")
 
 number2words <- function(x){
     negative <- x < 0
     x <- abs(x)
     digits <- makeDigits(x)
     nDigits <- length(digits)
     result <- if (nDigits == 1) as.vector(ones[digits])
         else if (nDigits == 2)
             if (x <= 19) as.vector(teens[digits[2]])
                 else trim(paste(tens[digits[1]], "-", ones[digits[2]], 
                     sep=""))
         else if (nDigits == 3) {
             tail <- makeNumber(digits[2:3])
                 if (tail == 0) paste(ones[digits[1]], "hundred")
                     else trim(paste(ones[digits[1]], "hundred", 
                         number2words(tail)))
         }
         else {
             nSuffix <- ((nDigits + 2) %/% 3) - 1
             if (nSuffix > length(suffixes) || nDigits > 15)
                 stop(paste(x, "is too large!"))
             pick <- 1:(nDigits - 3*nSuffix)
             trim(paste(number2words(makeNumber(digits[pick])),
                 suffixes[nSuffix], 
                 number2words(makeNumber(digits[-pick]))))
         }
     if (negative) paste("minus", result) else result
 }
 
 number2words(123456789)
 
 number2words(-123456789)
 
 number2words(-123456000)
 
 debug(number2words)
 number2words(123456789)
 
 remove(makeDigits, makeNumber, number2words, trim, ones, teens, 
     tens, suffixes)
 options(scipen=0) # back to default
 
 numbers2words <- function(x, billion=c("US", "UK"),
     and=if (billion == "US") "" else "and") {
     billion <- match.arg(billion)
     trim <- function(text){
         gsub("(^\ *)|((\ *|-|,\ zero|-zero)$)", "", text)
     }
     makeNumber <- function(x) as.numeric(paste(x, collapse=""))
     makeDigits <- function(x) strsplit(as.character(x), "")[[1]]
     helper <- function(x) {
         negative <- x < 0
         x <- abs(x)
         digits <- makeDigits(x)
         nDigits <- length(digits)
         result <- if (nDigits == 1) as.vector(ones[digits])
         else if (nDigits == 2)
             if (x <= 19) as.vector(teens[digits[2]])
                 else trim(paste(tens[digits[1]], "-", ones[digits[2]], sep=""))
         else if (nDigits == 3) {
             tail <- makeNumber(digits[2:3])
             if (tail == 0) paste(ones[digits[1]], "hundred")
                 else trim(paste(ones[digits[1]], trim(paste("hundred", and)),
                     helper(tail)))
         }
         else {
             nSuffix <- ((nDigits + 2) %/% 3) - 1
             if (nSuffix > length(suffixes) || nDigits > 15)
                 stop(paste(x, "is too large!"))
             pick <- 1:(nDigits - 3*nSuffix)
             trim(paste(helper(makeNumber(digits[pick])),
                 suffixes[nSuffix], helper(makeNumber(digits[-pick]))))
         }
         if (billion == "UK"){
             words <- strsplit(result, " ")[[1]]
             if (length(grep("million,", words)) > 1)
                 result <- sub(" million, ", ", ", result)
         }
         if (negative) paste("minus", result) else result
     }
     opts <- options(scipen=100)
     on.exit(options(opts))
     ones <- c("zero", "one", "two", "three", "four", "five", "six", "seven",
         "eight", "nine")
     teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
         "sixteen", " seventeen", "eighteen", "nineteen")
     names(ones) <- names(teens) <- 0:9
     tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy",
              "eighty", "ninety")
     names(tens) <- 2:9
     suffixes <- if (billion == "US")
                     c("thousand,", "million,", "billion,", "trillion,")
                 else
                     c("thousand,", "million,", "thousand million,", "billion,")
     x <- round(x)
     if (length(x) > 1) sapply(x, helper) else helper(x)
 }
 
 numbers2words(c(1234567890123, -0123, 1000))
 
 numbers2words(c(1234567890123, -0123, 1000), billion="UK")
 
 # Simulation example: 
 # efficiency of LS and robust regression when errors are normal and non-normal
 
 X <- as.matrix(Prestige[,c("education", "income", "women")])
 mod <- lm(prestige ~ education + income + women, data=Prestige)
 beta <- coef(mod)  # taken as "true" parameters
 sigma <- summary(mod)$sigma  # taken as true scale of errors
 Ey <- fitted(mod)  # taken as expectation of response
 
     # initialize:
 robust.t <- robust.normal <- LS.t <- LS.normal <- matrix(0, 1000, 4)
 
 (seed <- sample(1e6, 1))
 set.seed(seed) # for reproducibility
 
 system.time(
     for (i in 1:1000){
         normal.errors <- sigma*rnorm(102) # sample errors
         t.errors <- sigma*rt(102, 2)
         y.normal <-Ey + normal.errors  # construct observed response
         y.t <- Ey + t.errors
         LS.normal[i,] <- lsfit(X, y.normal)$coef - beta  # LS sampling error
         LS.t[i,] <- lsfit(X, y.t)$coef - beta
         robust.normal[i,] <- coef(rlm(y.normal ~ X, method="MM", maxit=100)) - beta
         robust.t[i,] <- coef(rlm(y.t ~ X, method="MM", maxit=100)) - beta
     }
 )
 
 summary(lm(prestige ~ education + income + women, data=Prestige))
 
     # estimated relative bias (should be close to 0):
 colMeans(LS.normal)/beta
 colMeans(robust.normal)/beta
 
 colMeans(LS.t)/beta
 colMeans(robust.t)/beta
 
     # estimated relative RMSE:
 sqrt(colMeans(LS.normal^2))/abs(beta)
 sqrt(colMeans(robust.normal^2))/abs(beta)
 
 sqrt(colMeans(LS.t^2))/abs(beta)
 sqrt(colMeans(robust.t^2))/abs(beta)
 
 # Debugging
 
 library(car) # for data
 Mroz$lfp <- with(Mroz, ifelse(lfp == "yes", 1, 0))
 Mroz$wc <- with(Mroz, ifelse(wc == "yes", 1, 0))
 Mroz$hc <- with(Mroz, ifelse(hc == "yes", 1, 0))
 
     # bugged!
 lreg3 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) {
     X <- cbind(1, X)
     b <- b.last <- rep(0, ncol(X))
     it <- 1
     while (it <= max.iter){
         if (verbose) cat("\niteration = ", it, ": ", b)
         p <- 1/(1 + exp(-X %*% b))
         V <- diag(p * (1 - p))
         var.b <- solve(t(X) %*% V %*% X)
         b <- b + var.b %*% t(X) %*% (y - p)
         if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) 
             break
         b.last <- b
         it <- it + 1
     }
     if (it > max.iter) warning("maximum iterations exceeded")
     if (verbose) cat("\n")
     list(coefficients=as.vector(b), var=var.b, iterations=it)
 }
 
 mod.mroz.3 <- with(Mroz, 
     lreg3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
     
     # call stack to error: traceback
 
 traceback()
 
     # interrupt execution, examine environment of function: browser
 
 ?browser
 
         # still bugged!
 lreg3 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) {
     X <- cbind(1, X)
     b <- b.last <- rep(0, ncol(X))
     it <- 1
     while (it <= max.iter){
         if (verbose) cat("\niteration = ", it, ": ", b)
         p <- 1/(1 + exp(-X %*% b))
         V <- diag(p * (1 - p))
       browser()
         var.b <- solve(t(X) %*% V %*% X)
         b <- b + var.b %*% t(X) %*% (y - p)
         if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) 
             break
         b.last <- b
         it <- it + 1
       }
     if (it > max.iter) warning("maximum iterations exceeded")
     if (verbose) cat("\n")
     list(coefficients=as.vector(b), var=var.b, iterations=it)
 }
 
 mod.mroz.3 <- with(Mroz,
     lreg3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
 
 # imagined sequence of commands in browser mode:    
 #    t(X) %*% V %*% X
 #    str(X)
 #    head(X)
 #    str(V)
 #    V
 #    str(p)
 #    head(p)
 #    p <- as.vector(p)
 #    str(diag(p * (1 - p)))
 #    V <- diag(p * (1 - p))
 #    V[1:10, 1:10]
 #    
 #    Q
 
     # step-through debugger: debug
     
 ?debug
     
         # bugged! (original bugged function)
 lreg3 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) {
     X <- cbind(1, X)
     b <- b.last <- rep(0, ncol(X))
     it <- 1
     while (it <= max.iter){
         if (verbose) cat("\niteration = ", it, ": ", b)
         p <- 1/(1 + exp(-X %*% b))
         V <- diag(p * (1 - p))
         var.b <- solve(t(X) %*% V %*% X)
         b <- b + var.b %*% t(X) %*% (y - p)
         if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) 
             break
         b.last <- b
         it <- it + 1
     }
     if (it > max.iter) warning("maximum iterations exceeded")
     if (verbose) cat("\n")
     list(coefficients=as.vector(b), var=var.b, iterations=it)
 }
 
 debug(lreg3) # set debug flag
 
 mod.mroz.3 <- with(Mroz,
     lreg3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
     
     # postmortem debugger: debugger
     
 undebug(lreg3)
 
 ?debugger
 
 options(error=dump.frames)
 
 mod.mroz.3 <- with(Mroz,
     lreg3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
     
 debugger()
 
 # Measuring time and memory allocation
 
 options(error=NULL) # restore default
 
 lreg1 # by Newton-Raphson
 lreg2 # by optimization
 
     # a larger (but not large!) problem
 
 set.seed(12345)  # for reproducibility
 X <- matrix(rnorm(5000*10), 5000, 10)
 y <- rbinom(5000, 1, prob=1/(1 + exp(-cbind(1, X) %*% rep(1, 11))))
 
     # measuring time: system.time
     
 system.time(mod.1 <- lreg1(X, y))
 mod.1$coef
 
 system.time(mod.2 <- lreg2(X, y))
 mod.2$coef
     
 system.time(mod.glm <- glm(y ~ X, family=binomial))
 coef(mod.glm)
 
     # profiling time and memory use: Rprof
     
 (tmp <- tempfile()) # create temporary file
 Rprof(tmp, memory.profiling=TRUE) # turn on profiling
 mod.1 <- lreg1(X, y)
 Rprof() # turn off profiling
 summaryRprof(tmp, memory="both") # summarize results
 unlink(tmp) # delete temporary file
 
 tmp <- tempfile() # create temporary file
 Rprof(tmp, memory.profiling=TRUE) # turn on profiling
 mod.2 <- lreg2(X, y)
 Rprof() # turn off profiling
 summaryRprof(tmp, memory="both") # summarize results
 unlink(tmp) # delete temporary file
 
         # new version of Newton-Raphson function
         
 lreg4 <- function(X, y, max.iter=10, tol=1E-6) {
     X <- cbind(1, X)
     b <- b.last <- rep(0, ncol(X))
     it <- 1
     while (it <= max.iter){
         p <- as.vector(1/(1 + exp(-X %*% b)))
         var.b <- solve(crossprod(X, p * (1 - p) * X))
         b <- b + var.b %*% crossprod(X, y - p)
         if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) 
             break
         b.last <- b
         it <- it + 1
     }
     if (it > max.iter) warning("maximum iterations exceeded")
     list(coefficients=as.vector(b), var=var.b, iterations=it)
 }
 
         # explanation:
         
 (XX <- matrix(1, 5, 3)) # don't want to overwrite X
 (V <- diag(1:5))
 V %*% XX
 
 1:5 * XX # same thing!
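 # crossprod(A, B) computes t(A) %*% B without forming the transpose
 # explicitly, which is why lreg4 uses crossprod(X, p * (1 - p) * X);
 # a quick check with the toy matrices above:
 all.equal(crossprod(XX, 1:5 * XX), t(XX) %*% (1:5 * XX))  # TRUE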
 
 system.time(mod.4 <- lreg4(X, y))
 mod.4$coef  # check
 
 tmp <- tempfile() # create temporary file
 Rprof(tmp, memory.profiling=TRUE, interval=0.002) # profiling with smaller sampling interval
 mod.4 <- lreg4(X, y)
 Rprof() # turn off profiling
 summaryRprof(tmp, memory="both") # summarize results
 unlink(tmp) # delete temporary file
 
         # a much larger problem
         
 set.seed(12345)  # for reproducibility
 X <- matrix(rnorm(100000*10), 100000, 10)
 y <- rbinom(100000, 1, prob=1/(1 + exp(-cbind(1, X) %*% rep(1, 11))))
 
 system.time(mod.1 <- lreg1(X, y))
 
 system.time(mod.4 <- lreg4(X, y))
 
 system.time(mod.2 <- lreg2(X, y))
     
 system.time(mod.glm <- glm(y ~ X, family=binomial))
 
 max(abs(mod.4$coef - coef(mod.glm)))
 
 # The S3 object system
 
     # S3 classes
 
 library(car) # for data
 
 mod.prestige <- lm(prestige ~ income + education + women,
     data=Prestige)
 attributes(mod.prestige)
 class(mod.prestige)
 
 
 v <- 1:10
 v
 attributes(v)
 class(v)
 
 class(v) <- "character"
 attributes(v)
 v
 class(v)
 
     # S3 generic functions and methods
     
 print # the print generic
 
 getS3method("print", "lm") # print method for "lm" objects (not exported from the stats namespace)
 
 mod.prestige
 print(mod.prestige) # equivalent
 stats:::print.lm(mod.prestige)  # equivalent, but bad form (print.lm is unexported, so ::: is needed)
 
 methods("print") # print methods
 methods(class="lm") # methods for objects of class "lm"
 
     # S3 "inheritance"
 
 mod.mroz <- glm(lfp ~ ., family=binomial, data=Mroz)
 class(mod.mroz)
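 # Method dispatch searches the class vector left to right, so "lm"
 # methods serve as fallbacks for "glm" objects -- a quick check:
 inherits(mod.mroz, "lm")  # TRUE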
 
     # Example: a logistic-regression function
         
         # generic function
         
 lreg <- function(X, ...){
     UseMethod("lreg")
     }
     
         # default method
 
 lreg.default <- function(X, y, predictors=colnames(X), max.iter=10,
         tol=1E-6, constant=TRUE, ...) {
     if (!is.numeric(X) || !is.matrix(X))
         stop("X must be a numeric matrix")
     if (!is.numeric(y) || !all(y == 0 | y == 1))
         stop("y must contain only 0s and 1s")
     if (nrow(X) != length(y))
         stop("X and y contain different numbers of observations")
     if (constant) {
         X <- cbind(1, X)
         colnames(X)[1] <- "Constant"
     }
     b <- b.last <- rep(0, ncol(X))
     it <- 1
     while (it <= max.iter){
         p <- as.vector(1/(1 + exp(-X %*% b)))
         var.b <- solve(crossprod(X, p * (1 - p) * X))
         b <- b + var.b %*% crossprod(X, y - p)
         if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) 
             break
         b.last <- b
         it <- it + 1
     }
     if (it > max.iter) warning("maximum iterations exceeded")
     dev <- -2*sum(y*log(p) + (1 - y)*log(1 - p))
     result <- list(coefficients=as.vector(b), var=var.b,
         deviance=dev, converged= it <= max.iter,
         predictors=predictors, iterations = it)
     class(result) <- "lreg"
     result
 }
 
 
 mod.mroz.3 <- with(Mroz, 
     lreg(cbind(k5, k618, age, wc, hc, lwg, inc), lfp))
 class(mod.mroz.3)
 mod.mroz.3 # whoops!
 
         # print and summary methods for class "lreg"
 
 print.lreg <- function(x, ...) {
     coef <- x$coefficients
     names(coef) <- x$predictors
     print(coef)
     if (!x$converged) cat("\n *** lreg did not converge ***\n")
     invisible(x)
 }
 summary.lreg <- function(object, ...) {
     b <- object$coefficients
     se <- sqrt(diag(object$var))
     z <- b/se
     table <- cbind(b, se, z, 2*(1-pnorm(abs(z))))
     colnames(table) <- 
         c("Estimate", "Std.Err", "z value", "Pr(>|z|)")
     rownames(table) <- object$predictors
     result <- list(coef=table, deviance=object$deviance,
         converged=object$converged)
     class(result) <- "summary.lreg"
     result
 }
 print.summary.lreg <- function(x, ...) {
     printCoefmat(x$coef, signif.stars=FALSE)
     cat("\nDeviance =", x$deviance,"\n")
     if (!x$converged) 
         cat("\n Note: *** lreg did not converge ***\n")
     invisible(x)
 }
 
 mod.mroz.3
 
 summary(mod.mroz.3)
 
     # Writing a statistical modeling function
     
         # a formula method for lreg
         
 lreg.formula <- function(formula, data, subset, na.action, 
     model = TRUE, contrasts = NULL, ...) {
     call <- match.call()  # returns the function call
     mf <- match.call(expand.dots = FALSE)  # the function call w/o ...
     args <- match(c("formula", "data", "subset", "na.action"),
         names(mf), 0)  # which arguments are present?
     mf <- mf[c(1, args)]
     mf$drop.unused.levels <- TRUE
     mf[[1]] <- as.name("model.frame")  # call model.frame() rather than lreg()
     mf <- eval.parent(mf)  # create a model frame
     terms <- attr(mf, "terms")  # terms object for the model
     y <- model.response(mf)  # response variable
     values <- sort(unique(y))
     if (length(values) != 2) stop("the response variable is not binary")
     y <- as.numeric(y == values[2])
     X <- model.matrix(terms, mf, contrasts)  # model matrix
     mod <- lreg.default(X, y, predictors=colnames(X), 
         constant=FALSE, ...)
     mod$na.action <- attr(mf, "na.action")
     mod$contrasts <- attr(X, "contrasts")
     mod$xlevels <- .getXlevels(terms, mf)
     mod$call <- call
     mod$terms <- terms
     if (model)  mod$model <- mf
     mod
 }
 
 remove(Mroz) # delete modified Mroz data set
 mod.lreg <- lreg(lfp ~ ., data=Mroz) # Mroz will be found in car
 mod.lreg