# install.packages("glmnet")
# install.packages("coefplot")
library(coefplot) # simple extraction of coef for lasso
library(glmnet)
library(splines)
library(Hmisc)
library(randomForest)
library(MASS)
library(ISLR2)
library(ggplot2)
library(dplyr)
theme_set(theme_bw())
makeseq <- function(x, l = 20) {
seq(min(x), max(x), length.out = l)
}
Again, we are looking at the bias/variance trade-off
## Data from ISLR
data(Wage)
ggplot(Wage, aes(age, wage)) + geom_point()
Wage <- mutate(Wage, wage125 = as.numeric(wage > 125))
ggplot(Wage, aes(age, wage125)) +
geom_jitter(height = 0.05, size = 2, alpha = 0.5) +
ylim(0, 1)
table(Wage$wage125)
##
## 0 1
## 2132 868
Polynomials of a single variable can be fit directly in R.
I()
will allow you to modify existing vars in place while
poly()
will let you create the basis for a polynomail
directly
fit <- glm(wage125 ~ age, data = Wage, family = "binomial")
fit2 <- glm(wage125 ~ age + I(age^2), data = Wage, family = "binomial")
fit3 <- glm(wage125 ~ poly(age, 3), data = Wage, family = "binomial")
Wage$pred1 <- predict(fit, Wage, type = "response")
Wage$pred2 <- predict(fit2, Wage, type = "response")
Wage$pred3 <- predict(fit3, Wage, type = "response")
ggplot(Wage, aes(age, wage125)) +
geom_jitter(height = 0.05, size = 2, alpha = 0.5) +
ylim(0, 1) +
geom_line(aes(age, y = pred1), size = 2, color = 'red') +
geom_line(aes(age, y = pred2), size = 2, color = 'blue') +
geom_line(aes(age, y = pred3), size = 2, color = 'orange')
One difference when using poly is the value of the coefficients – they’re good for fitting but poor for interpretation (it creates orthogonal polynomials rather than “raw” polynomials)
fit2 <- glm(wage125 ~ age + I(age^2), data = Wage, family = "binomial")
fit3 <- glm(wage125 ~ poly(age, 2), data = Wage, family = "binomial")
coef(fit2)
## (Intercept) age I(age^2)
## -8.1237803 0.3107589 -0.0031354
coef(fit3)
## (Intercept) poly(age, 2)1 poly(age, 2)2
## -1.0012 24.9304 -28.2936
A useful function to know is cut2()
from the Hmisc
package. It will take a numeric vector and separate it into arbitrary
number of equally sized groups
x <- rnorm(20)
cut2(x, g = 2)
## [1] [-1.862,-0.152) [-1.862,-0.152) [-1.862,-0.152) [-0.152, 1.017]
## [5] [-1.862,-0.152) [-1.862,-0.152) [-1.862,-0.152) [-0.152, 1.017]
## [9] [-0.152, 1.017] [-1.862,-0.152) [-0.152, 1.017] [-0.152, 1.017]
## [13] [-0.152, 1.017] [-1.862,-0.152) [-1.862,-0.152) [-0.152, 1.017]
## [17] [-0.152, 1.017] [-0.152, 1.017] [-1.862,-0.152) [-0.152, 1.017]
## Levels: [-1.862,-0.152) [-0.152, 1.017]
cut2(x, g = 4)
## [1] [-1.862,-1.082) [-1.862,-1.082) [-1.082,-0.152) [ 0.562, 1.017]
## [5] [-1.082,-0.152) [-1.862,-1.082) [-1.082,-0.152) [ 0.562, 1.017]
## [9] [-0.152, 0.562) [-1.862,-1.082) [-0.152, 0.562) [ 0.562, 1.017]
## [13] [-0.152, 0.562) [-1.862,-1.082) [-1.082,-0.152) [-0.152, 0.562)
## [17] [ 0.562, 1.017] [ 0.562, 1.017] [-1.082,-0.152) [-0.152, 0.562)
## Levels: [-1.862,-1.082) [-1.082,-0.152) [-0.152, 0.562) [ 0.562, 1.017]
fit_step1 <- glm(wage125 ~ cut2(age, g = 3), data = Wage, family = "binomial")
fit_step2 <- glm(wage125 ~ cut2(age, g = 6), data = Wage, family = "binomial")
Wage$predstep1 <- predict(fit_step1, Wage, type = "response")
Wage$predstep2 <- predict(fit_step2, Wage, type = "response")
ggplot(Wage, aes(age, wage125)) +
geom_jitter(height = 0.05, size = 2, alpha = 0.5) +
ylim(0, 1) +
geom_line(aes(age, y = predstep1), size = 2, color = 'blue') +
geom_line(aes(age, y = predstep2), size = 2, color = 'orange')
Both step functions and polynomials are examples of basis functions, where for a single covariate \(x\) we let
\[ g(\mu) = \beta_0 + \sum_{k=1}^K \beta_k b_{k}(x) \] where, for each of the examples we have seen, \(b_k(x)\) may take the form \(b_k(x) = x^k\) for polynomial regression or \(b_k(x) = I(c_k \leq x \leq c_{k+1})\). However, any arbitrary collection of functions may be used in place.
Basis splines and natural splines. Notice (conveniently) that the
knots
argument for each are totally different.
Along with this, we see the introduction of the gam()
function, contained within the gam
package, and also
containing the function s()
, a spline wrapper function that
assists in smoothing our data
library(splines)
library(gam)
fit_b <- glm(wage125 ~ bs(age, knots = 3), data = Wage, family = "binomial")
fit_n <- glm(wage125 ~ ns(age, knots = quantile(age, probs = c(0.25, 0.5, 0.75))),
data = Wage, family = "binomial")
fit_s <- gam(wage125 ~ s(age, df = 3), data = Wage, family = "binomial")
Wage$predb <- predict(fit_b, Wage, type = "response")
Wage$predn <- predict(fit_n, Wage, type = "response")
Wage$preds <- predict(fit_s, Wage, type = "response")
ggplot(Wage, aes(age, wage125)) +
geom_jitter(height = 0.05, size = 2, alpha = 0.5) +
ylim(0, 1) +
geom_line(aes(age, y = predb), size = 2, color = 'blue') +
geom_line(aes(age, y = predn), size = 2, color = 'orange') +
geom_line(aes(age, y = preds), size = 2, color = 'green')
anova(fit_b, fit_n, fit_s)
## Analysis of Deviance Table
##
## Model 1: wage125 ~ bs(age, knots = 3)
## Model 2: wage125 ~ ns(age, knots = quantile(age, probs = c(0.25, 0.5,
## 0.75)))
## Model 3: wage125 ~ s(age, df = 3)
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2996 3411
## 2 2995 3404 1 6.37 0.0116 *
## 3 2996 3413 -1 -8.54 0.0035 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The function s()
works a little differently than the
others in that it has a smoothing hyperparmater that helps dictate how
much smoothness we wish to see. This again is a balance between variance
and bias
fit_s1 <- gam(wage125 ~ s(age, spar = 1),
data = Wage, family = "binomial")
fit_s75 <- gam(wage125 ~ s(age, spar = 0.75),
data = Wage, family = "binomial")
fit_s5 <- gam(wage125 ~ s(age, spar = 0.5),
data = Wage, family = "binomial")
fit_s01 <- gam(wage125 ~ s(age, spar = 0.1),
data = Wage, family = "binomial")
Wage$preds1 <- predict(fit_s1, Wage, type = "response")
Wage$preds75 <- predict(fit_s75, Wage, type = "response")
Wage$preds5 <- predict(fit_s5, Wage, type = "response")
Wage$preds01 <- predict(fit_s01, Wage, type = "response")
ggplot(Wage, aes(age, wage125)) +
geom_jitter(height = 0.05, size = 2, alpha = 0.5) +
ylim(0, 1) +
geom_line(aes(age, y = preds1), size = 2, color = 'blue') +
geom_line(aes(age, y = preds75), size = 2, color = 'purple') +
geom_line(aes(age, y = preds5), size = 2, color = 'orange') +
geom_line(aes(age, y = preds01), size = 2, color = 'green')
The generalized additive model, then, can be any linear combination of terms,
\[ g(\mu) = \beta_0 + \sum_{j=1}^p f_{j}(x_j) \] where \(f_j(x_j)\), for example, could be
\[ f_j(x_j) = \sum \beta_{jk} b_{jk}(x_j) \]
fit_gam <- gam(wage125 ~ s(age) + s(year) + education,
family = "binomial", data = Wage)
summary(fit_gam)
##
## Call: gam(formula = wage125 ~ s(age) + s(year) + education, family = "binomial",
## data = Wage)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.695 -0.694 -0.452 0.847 3.133
##
## (Dispersion Parameter for binomial family taken to be 1)
##
## Null Deviance: 3609.3 on 2999 degrees of freedom
## Residual Deviance: 2854.8 on 2987 degrees of freedom
## AIC: 2880.8
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(age) 1 26 26.3 25.14 0.00000056 ***
## s(year) 1 5 4.5 4.34 0.037 *
## education 4 442 110.5 105.77 < 0.0000000000000002 ***
## Residuals 2987 3120 1.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar Chisq P(Chi)
## (Intercept)
## s(age) 3 91.6 <0.0000000000000002 ***
## s(year) 3 1.2 0.76
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
students <- read.table("https://users.stat.ufl.edu/~aa/cat/data/Students.dat",
header = TRUE)
x <- students[, -c(1, 16)]
y <- students$abor
fit.lass <- glmnet(x, y,
alpha = 1, # lasso
family = "binomial")
fit.ridge <- glmnet(x, y,
alpha = 0, # ridge
family = "binomial")
plot(fit.lass)
plot(fit.lass, xlim = c(0, 10), ylim = c(-5, 5), main = "zoomed in lasso")
plot(fit.ridge)
x <- as.matrix(x)
cv <- cv.glmnet(x, y, alpha = 1, family = "binomial")
coef(glmnet(x, y,
alpha = 1,
family = "binomial", lambda = cv$lambda.1se))
## 17 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 2.1840552
## gender .
## age .
## hsgpa .
## cogpa .
## dhome .
## dres .
## tv .
## sport .
## news 0.0069066
## aids .
## veg .
## affil .
## ideol -0.2924576
## relig -0.2120237
## affirm .
## life 0.1921022