# install.packages("glmnet")
# install.packages("coefplot")
library(coefplot) # simple extraction of coef for lasso
library(glmnet)
library(splines)
library(Hmisc)
library(randomForest)
library(MASS)
library(ISLR2)
library(ggplot2)
library(dplyr)

theme_set(theme_bw())

# evenly spaced grid of length l spanning the range of x
makeseq <- function(x, l = 20) {
  seq(min(x), max(x), length.out = l)
}
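
For example, a quick sanity check of the grid it produces:

makeseq(c(2, 7, 4, 9), l = 5)
## [1] 2.00 3.75 5.50 7.25 9.00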

Generalizing the GLMs

Again, we are looking at the bias/variance trade-off.

Beyond Linearity

## Data from ISLR
data(Wage)

ggplot(Wage, aes(age, wage)) + geom_point()

Wage <- mutate(Wage, wage125 = as.numeric(wage > 125))

ggplot(Wage, aes(age, wage125)) + 
  geom_jitter(height = 0.05, size = 2, alpha = 0.5) + 
  ylim(0, 1)

table(Wage$wage125)
## 
##    0    1 
## 2132  868

Polynomial Regression

Polynomials of a single variable can be fit directly in R. I() allows you to transform existing variables in place, while poly() creates the basis for a polynomial directly.

fit <- glm(wage125 ~ age, data = Wage, family = "binomial")
fit2 <- glm(wage125 ~ age + I(age^2), data = Wage, family = "binomial")
fit3 <- glm(wage125 ~ poly(age, 3), data = Wage, family = "binomial")

Wage$pred1 <- predict(fit, Wage, type = "response")
Wage$pred2 <- predict(fit2, Wage, type = "response") 
Wage$pred3 <- predict(fit3, Wage, type = "response") 

ggplot(Wage, aes(age, wage125)) + 
  geom_jitter(height = 0.05, size = 2, alpha = 0.5) + 
  ylim(0, 1) + 
  geom_line(aes(age, y = pred1), size = 2, color = 'red') +
  geom_line(aes(age, y = pred2), size = 2, color = 'blue') +
  geom_line(aes(age, y = pred3), size = 2, color = 'orange')

One difference when using poly() is the value of the coefficients: they’re good for fitting but poor for interpretation, since by default poly() creates orthogonal polynomials rather than “raw” polynomials.

fit2 <- glm(wage125 ~ age + I(age^2), data = Wage, family = "binomial")
fit3 <- glm(wage125 ~ poly(age, 2), data = Wage, family = "binomial")

coef(fit2)
## (Intercept)         age    I(age^2) 
##  -8.1237803   0.3107589  -0.0031354
coef(fit3)
##   (Intercept) poly(age, 2)1 poly(age, 2)2 
##       -1.0012       24.9304      -28.2936
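
If interpretable coefficients are needed, poly() can also build the raw (non-orthogonal) basis; this should reproduce the I() coefficients above exactly, up to naming:

fit3_raw <- glm(wage125 ~ poly(age, 2, raw = TRUE), data = Wage, family = "binomial")
coef(fit3_raw) # same fit and same coefficients as fit2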

Step Functions

A useful function to know is cut2() from the Hmisc package. It takes a numeric vector and separates it into an arbitrary number of (approximately) equal-sized groups based on quantiles.

x <- rnorm(20)
cut2(x, g = 2)
##  [1] [-1.862,-0.152) [-1.862,-0.152) [-1.862,-0.152) [-0.152, 1.017]
##  [5] [-1.862,-0.152) [-1.862,-0.152) [-1.862,-0.152) [-0.152, 1.017]
##  [9] [-0.152, 1.017] [-1.862,-0.152) [-0.152, 1.017] [-0.152, 1.017]
## [13] [-0.152, 1.017] [-1.862,-0.152) [-1.862,-0.152) [-0.152, 1.017]
## [17] [-0.152, 1.017] [-0.152, 1.017] [-1.862,-0.152) [-0.152, 1.017]
## Levels: [-1.862,-0.152) [-0.152, 1.017]
cut2(x, g = 4)
##  [1] [-1.862,-1.082) [-1.862,-1.082) [-1.082,-0.152) [ 0.562, 1.017]
##  [5] [-1.082,-0.152) [-1.862,-1.082) [-1.082,-0.152) [ 0.562, 1.017]
##  [9] [-0.152, 0.562) [-1.862,-1.082) [-0.152, 0.562) [ 0.562, 1.017]
## [13] [-0.152, 0.562) [-1.862,-1.082) [-1.082,-0.152) [-0.152, 0.562)
## [17] [ 0.562, 1.017] [ 0.562, 1.017] [-1.082,-0.152) [-0.152, 0.562)
## Levels: [-1.862,-1.082) [-1.082,-0.152) [-0.152, 0.562) [ 0.562, 1.017]
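
For reference, something similar can be done in base R with cut() and quantile breaks, though cut2() handles ties and boundary values more gracefully:

# roughly equivalent to cut2(x, g = 4); interval closure differs slightly
cut(x, breaks = quantile(x, probs = seq(0, 1, 0.25)), include.lowest = TRUE)
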
fit_step1 <- glm(wage125 ~ cut2(age, g = 3), data = Wage, family = "binomial")
fit_step2 <- glm(wage125 ~ cut2(age, g = 6), data = Wage, family = "binomial")

Wage$predstep1 <- predict(fit_step1, Wage, type = "response")
Wage$predstep2 <- predict(fit_step2, Wage, type = "response")

ggplot(Wage, aes(age, wage125)) + 
  geom_jitter(height = 0.05, size = 2, alpha = 0.5) + 
  ylim(0, 1) + 
  geom_line(aes(age, y = predstep1), size = 2, color = 'blue') + 
  geom_line(aes(age, y = predstep2), size = 2, color = 'orange')

Basis Functions

Both step functions and polynomials are examples of basis functions, where for a single covariate \(x\) we let

\[ g(\mu) = \beta_0 + \sum_{k=1}^K \beta_k b_{k}(x) \] where, for each of the examples we have seen, \(b_k(x)\) takes the form \(b_k(x) = x^k\) for polynomial regression or \(b_k(x) = I(c_k \leq x < c_{k+1})\) for step functions. However, any arbitrary collection of functions may be used in their place.
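
To make that point concrete, here is a (purely illustrative) basis of our own choosing; any functions of age could stand in for the \(b_k\):

# an arbitrary two-function basis: b1(x) = log(x), b2(x) = sqrt(x)
fit_custom <- glm(wage125 ~ log(age) + sqrt(age), data = Wage, family = "binomial")
coef(fit_custom)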


Splines and GAMs

Basis splines (bs()) and natural splines (ns()). Notice that for both, the knots argument specifies the locations of the interior knots, not how many there are: bs(age, knots = 3) below places a single knot at age = 3 (outside the observed range of age, so it serves mostly to illustrate the syntax), while ns() gets knots at the quartiles of age.

Along with these, we see the introduction of the gam() function from the gam package, which also provides s(), a smoothing-spline wrapper used inside gam() formulas.

library(splines)
library(gam)
fit_b <- glm(wage125 ~ bs(age, knots = 3), data = Wage, family = "binomial")
fit_n <- glm(wage125 ~ ns(age, knots = quantile(age, probs = c(0.25, 0.5, 0.75))), 
             data = Wage, family = "binomial")
fit_s <- gam(wage125 ~ s(age, df = 3), data = Wage, family = "binomial")


Wage$predb <- predict(fit_b, Wage, type = "response")
Wage$predn <- predict(fit_n, Wage, type = "response")
Wage$preds <- predict(fit_s, Wage, type = "response")

ggplot(Wage, aes(age, wage125)) + 
  geom_jitter(height = 0.05, size = 2, alpha = 0.5) + 
  ylim(0, 1) + 
  geom_line(aes(age, y = predb), size = 2, color = 'blue') + 
  geom_line(aes(age, y = predn), size = 2, color = 'orange') +
  geom_line(aes(age, y = preds), size = 2, color = 'green')

anova(fit_b, fit_n, fit_s)
## Analysis of Deviance Table
## 
## Model 1: wage125 ~ bs(age, knots = 3)
## Model 2: wage125 ~ ns(age, knots = quantile(age, probs = c(0.25, 0.5, 
##     0.75)))
## Model 3: wage125 ~ s(age, df = 3)
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1      2996       3411                        
## 2      2995       3404  1     6.37   0.0116 * 
## 3      2996       3413 -1    -8.54   0.0035 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The function s() works a little differently than the others in that it has a smoothing hyperparameter (spar) that dictates how much smoothness we wish to see. This is again a balance between variance and bias.

fit_s1 <- gam(wage125 ~ s(age, spar = 1), 
             data = Wage, family = "binomial")
fit_s75 <- gam(wage125 ~ s(age, spar = 0.75), 
             data = Wage, family = "binomial")
fit_s5 <- gam(wage125 ~ s(age, spar = 0.5), 
             data = Wage, family = "binomial")
fit_s01 <- gam(wage125 ~ s(age, spar = 0.1), 
             data = Wage, family = "binomial")

Wage$preds1 <- predict(fit_s1, Wage, type = "response")
Wage$preds75 <- predict(fit_s75, Wage, type = "response")
Wage$preds5 <- predict(fit_s5, Wage, type = "response")
Wage$preds01 <- predict(fit_s01, Wage, type = "response")

ggplot(Wage, aes(age, wage125)) + 
  geom_jitter(height = 0.05, size = 2, alpha = 0.5) + 
  ylim(0, 1) + 
  geom_line(aes(age, y = preds1), size = 2, color = 'blue') + 
  geom_line(aes(age, y = preds75), size = 2, color = 'purple') +
  geom_line(aes(age, y = preds5), size = 2, color = 'orange') +
  geom_line(aes(age, y = preds01), size = 2, color = 'green')

The generalized additive model, then, can be any additive combination of such terms,

\[ g(\mu) = \beta_0 + \sum_{j=1}^p f_{j}(x_j) \] where \(f_j(x_j)\), for example, could be

\[ f_j(x_j) = \sum_{k=1}^{K_j} \beta_{jk} b_{jk}(x_j) \]

fit_gam <- gam(wage125 ~ s(age) + s(year) + education, 
               family = "binomial", data = Wage)
summary(fit_gam)
## 
## Call: gam(formula = wage125 ~ s(age) + s(year) + education, family = "binomial", 
##     data = Wage)
## Deviance Residuals:
##    Min     1Q Median     3Q    Max 
## -1.695 -0.694 -0.452  0.847  3.133 
## 
## (Dispersion Parameter for binomial family taken to be 1)
## 
##     Null Deviance: 3609.3 on 2999 degrees of freedom
## Residual Deviance: 2854.8 on 2987 degrees of freedom
## AIC: 2880.8 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##             Df Sum Sq Mean Sq F value               Pr(>F)    
## s(age)       1     26    26.3   25.14           0.00000056 ***
## s(year)      1      5     4.5    4.34                0.037 *  
## education    4    442   110.5  105.77 < 0.0000000000000002 ***
## Residuals 2987   3120     1.0                                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df Npar Chisq              P(Chi)    
## (Intercept)                                           
## s(age)            3       91.6 <0.0000000000000002 ***
## s(year)           3        1.2                0.76    
## education                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
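
The fitted smooths can be inspected directly as well; the plot method for Gam objects draws each term's partial contribution, optionally with pointwise standard-error bands:

par(mfrow = c(1, 3)) # one panel per term: s(age), s(year), education
plot(fit_gam, se = TRUE)
par(mfrow = c(1, 1))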

Penalized Regression

students <- read.table("https://users.stat.ufl.edu/~aa/cat/data/Students.dat", 
                       header = TRUE)

x <- as.matrix(students[, -c(1, 16)]) # glmnet needs a matrix; drop id and response columns
y <- students$abor

fit.lass <- glmnet(x, y,
                   alpha = 1, # lasso
                   family = "binomial")
fit.ridge <- glmnet(x, y, 
                   alpha = 0, # ridge
                   family = "binomial")
plot(fit.lass)

plot(fit.lass, xlim = c(0, 10), ylim = c(-5, 5), main = "zoomed in lasso")

plot(fit.ridge)


cv <- cv.glmnet(x, y, alpha = 1, family = "binomial")
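
Plotting the cv.glmnet object shows the cross-validated deviance along the lambda path, with lambda.min and lambda.1se marked by dotted vertical lines:

plot(cv)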

coef(glmnet(x, y,
            alpha = 1, 
            family = "binomial", lambda = cv$lambda.1se))
## 17 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## (Intercept)  2.1840552
## gender       .        
## age          .        
## hsgpa        .        
## cogpa        .        
## dhome        .        
## dres         .        
## tv           .        
## sport        .        
## news         0.0069066
## aids         .        
## veg          .        
## affil        .        
## ideol       -0.2924576
## relig       -0.2120237
## affirm       .        
## life         0.1921022
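
Note that the refit above is not strictly necessary (and glmnet's documentation discourages fitting at a single lambda); the same sparse coefficients can be read straight off the cross-validation object:

coef(cv, s = "lambda.1se")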