Logisitic Regression with Categorical Predictors

# install.packages("ISLR2")

library(MASS)
library(ISLR2)
library(ggplot2)
library(dplyr)

theme_set(theme_bw())

makeseq <- function(x, l = 20) {
  seq(min(x), max(x), length.out = l)
}

Complete Separation

df <- data.frame(x = 10*c(1:4,6:9), 
                 y = rep(c(0,1), each = 4))

ggplot(df, aes(x, y)) + geom_point(size = 4)

(fit <- glm(y ~ x, family = binomial, data = df))

## 
## Call:  glm(formula = y ~ x, family = binomial, data = df)
## 
## Coefficients:
## (Intercept)            x  
##     -118.16         2.36  
## 
## Degrees of Freedom: 7 Total (i.e. Null);  6 Residual
## Null Deviance:       11.1 
## Residual Deviance: 0.000000000218    AIC: 4

## Predicted probs
df2 <- data.frame(x = seq(10, 90, length.out = 100))
df2$pred <- predict(fit, df2, type = "response")

ggplot(df2, aes(x, pred)) + geom_line(linewidth = 1.5, color = "gray50") + 
  geom_point(data = df, aes(y = y), size = 4)

Note that the midpoint occurs at

\[ x = -\frac{\alpha}{\beta} = -118.16 / 2.36 = 50 \] If the slope at any point can be approximated by

\[ \beta \times \pi(x) (1-\pi(x)) \] We get an optimal solution of \(0.25\beta \approx \infty\)

Fitting with lda

fit <- lda(y ~ x, df)

df2 <- data.frame(x = seq(10, 90, length.out = 20))

df2$pred <- predict(fit, df2)$class

ggplot(df2, aes(x, y = 0, color = pred)) + geom_point(size = 3)

What is LDA?

set.seed(123)
N <- 50
df1 <- data.frame(x = rnorm(N, 2, sd = 1), y = 1)
df2 <- data.frame(x = rnorm(N, -2, sd = 1), y = 0)
df <- rbind(df1, df2)

ggplot(df, aes(x, fill = factor(y))) + 
  geom_histogram(bins = 15,  color = "black", alpha = 0.6, position = "identity") + 
  ggtitle("Distribution of X, given Y")

(fit.lda <- lda(y ~ x, data = df))

## Call:
## lda(y ~ x, data = df)
## 
## Prior probabilities of groups:
##   0   1 
## 0.5 0.5 
## 
## Group means:
##         x
## 0 -1.8536
## 1  2.0344
## 
## Coefficients of linear discriminants:
##     LD1
## x 1.092

dd <- data.frame(x = makeseq(df$x, l = 50))
dd$pred <- predict(fit.lda, dd)$class

## Breakpoint
bp <- max(dd[dd$pred == 0, ]$x)

ggplot(df, aes(x, fill = factor(y))) + 
  geom_histogram(bins = 15,  color = "black", alpha = 0.6, position = "identity") + 
  ggtitle("Distribution of X, given Y") + 
  geom_vline(xintercept = bp, color = 'red', linewidth = 1.5)

Predicting binary outcome with two continuous vars

college <- read.csv("https://collinn.github.io/data/college2019.csv")

college$Private <- factor(college$Type)

glm(Private ~ Cost + ACT_median, data = college, family = binomial) %>% summary()

## 
## Call:
## glm(formula = Private ~ Cost + ACT_median, family = binomial, 
##     data = college)
## 
## Coefficients:
##               Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)  3.3309310  1.2425185    2.68               0.0073 ** 
## Cost        -0.0004859  0.0000388  -12.52 < 0.0000000000000002 ***
## ACT_median   0.4935508  0.0616114    8.01   0.0000000000000011 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1481.63  on 1094  degrees of freedom
## Residual deviance:  247.72  on 1092  degrees of freedom
## AIC: 253.7
## 
## Number of Fisher Scoring iterations: 8

fit.lda <- lda(Private ~ Cost + ACT_median, data = college)

df <- expand.grid(ACT_median = makeseq(college$ACT_median), 
                  Cost = makeseq(college$Cost))


df$pred <- predict(fit.lda, df)$class

p1 <- ggplot(college, aes(ACT_median, Cost, fill = Private)) + 
  geom_point(size = 3, shape = 21, color = "black")

p2 <- ggplot(df, aes(ACT_median, Cost, fill = pred)) + 
  geom_point(size = 3, shape = 21, color = "black")

gridExtra::grid.arrange(p1, p2, nrow = 1)

Other

data(Default)

Default <- Default[sample(1:nrow(Default), 200), ]

fit.lda <- lda(default ~ balance + income, Default)
inc <- seq(min(Default$income), max(Default$income), length.out = 20)
bal <- seq(min(Default$balance), max(Default$balance), length.out = 20)
df <- expand.grid(income = inc, balance = bal)

pred <- predict(fit.lda, df)
df$pred <- pred$class

p1 <- ggplot(Default, aes(balance, income, color = default)) + geom_point()

p2 <- ggplot(df, aes(balance, income, color = pred)) + 
  geom_point()

gridExtra::grid.arrange(p1, p2, nrow = 1)

crab <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat", 
                   header = TRUE)

ggplot(crab, aes(weight, width, color = factor(y))) + geom_point()

fit.lda <- lda(y ~ weight + width, data = crab)


inc <- seq(min(crab$weight), max(crab$weight), length.out = 20)
bal <- seq(min(crab$width), max(crab$width), length.out = 20)
df <- expand.grid(weight = inc, width = bal)

pred <- predict(fit.lda, df)
df$pred <- pred$class

ggplot(df, aes(weight, width, color = pred)) + 
  geom_point(size = 0.75) + 
  geom_point(data = crab, aes(weight, width, fill = factor(y)), 
             shape = 21, color = 'black', size = 2)

We can also compare prediction with GLM

fit.lda <- lda(y ~ weight + width, data = crab)
fit.glm <- glm(y ~ weight + width, data = crab, family = binomial)

crab$glm <- predict(fit.glm, crab, type = "response") > 0.5
crab$lda <- predict(fit.lda, crab)$class

xtabs(~ glm + y, crab) # prediction at pi = 0.5

##        y
## glm      0  1
##   FALSE 28 16
##   TRUE  34 95

xtabs(~ lda + y, crab)

##    y
## lda  0  1
##   0 27 15
##   1 35 96

Multiple Response Categories

data(iris)

ggplot(iris, aes(Sepal.Length, Sepal.Width, fill = Species)) + 
  geom_point(shape = 21, color = "black", size = 2.5) + 
  scale_fill_brewer(palette = "RdYlBu")

fit.lda <- lda(Species ~ Sepal.Length + Sepal.Width, iris)


## Create predict data
sl <- with(iris, seq(min(Sepal.Length), max(Sepal.Length), length.out = 20))
sw <- with(iris, seq(min(Sepal.Width), max(Sepal.Width), length.out = 20))

df <- expand.grid(Sepal.Length = sl, 
                 Sepal.Width = sw)

df$pred <- predict(fit.lda, df)$class

p1 <- ggplot(iris, aes(Sepal.Length, Sepal.Width, fill = Species)) + 
  geom_point(shape = 21, color = "black", size = 2.5) + 
  scale_fill_brewer(palette = "RdYlBu")

p2 <- ggplot(df, aes(Sepal.Length, Sepal.Width, fill = pred)) +
  geom_point(size = 3, color = "black", shape = 21) + 
  scale_fill_brewer(palette = "RdYlBu")

gridExtra::grid.arrange(p1, p2, nrow = 1)