# install.packages("ISLR2")
library(MASS)
library(ISLR2)
library(ggplot2)
library(dplyr)
theme_set(theme_bw())
makeseq <- function(x, l = 20) {
seq(min(x), max(x), length.out = l)
}
df <- data.frame(x = 10*c(1:4,6:9),
y = rep(c(0,1), each = 4))
ggplot(df, aes(x, y)) + geom_point(size = 4)
(fit <- glm(y ~ x, family = binomial, data = df))
##
## Call: glm(formula = y ~ x, family = binomial, data = df)
##
## Coefficients:
## (Intercept) x
## -118.16 2.36
##
## Degrees of Freedom: 7 Total (i.e. Null); 6 Residual
## Null Deviance: 11.1
## Residual Deviance: 0.000000000218 AIC: 4
## Predicted probs
df2 <- data.frame(x = seq(10, 90, length.out = 100))
df2$pred <- predict(fit, df2, type = "response")
ggplot(df2, aes(x, pred)) + geom_line(linewidth = 1.5, color = "gray50") +
geom_point(data = df, aes(y = y), size = 4)
Note that the midpoint occurs at
\[ x = -\frac{\alpha}{\beta} = -118.16 / 2.36 = 50 \] If the slope at any point can be approximated by
\[ \beta \times \pi(x) (1-\pi(x)) \] We get an optimal solution of \(0.25\beta \approx \infty\)
fit <- lda(y ~ x, df)
df2 <- data.frame(x = seq(10, 90, length.out = 20))
df2$pred <- predict(fit, df2)$class
ggplot(df2, aes(x, y = 0, color = pred)) + geom_point(size = 3)
set.seed(123)
N <- 50
df1 <- data.frame(x = rnorm(N, 2, sd = 1), y = 1)
df2 <- data.frame(x = rnorm(N, -2, sd = 1), y = 0)
df <- rbind(df1, df2)
ggplot(df, aes(x, fill = factor(y))) +
geom_histogram(bins = 15, color = "black", alpha = 0.6, position = "identity") +
ggtitle("Distribution of X, given Y")
(fit.lda <- lda(y ~ x, data = df))
## Call:
## lda(y ~ x, data = df)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## x
## 0 -1.8536
## 1 2.0344
##
## Coefficients of linear discriminants:
## LD1
## x 1.092
dd <- data.frame(x = makeseq(df$x, l = 50))
dd$pred <- predict(fit.lda, dd)$class
## Breakpoint
bp <- max(dd[dd$pred == 0, ]$x)
ggplot(df, aes(x, fill = factor(y))) +
geom_histogram(bins = 15, color = "black", alpha = 0.6, position = "identity") +
ggtitle("Distribution of X, given Y") +
geom_vline(xintercept = bp, color = 'red', linewidth = 1.5)
college <- read.csv("https://collinn.github.io/data/college2019.csv")
college$Private <- factor(college$Type)
glm(Private ~ Cost + ACT_median, data = college, family = binomial) %>% summary()
##
## Call:
## glm(formula = Private ~ Cost + ACT_median, family = binomial,
## data = college)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.3309310 1.2425185 2.68 0.0073 **
## Cost -0.0004859 0.0000388 -12.52 < 0.0000000000000002 ***
## ACT_median 0.4935508 0.0616114 8.01 0.0000000000000011 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1481.63 on 1094 degrees of freedom
## Residual deviance: 247.72 on 1092 degrees of freedom
## AIC: 253.7
##
## Number of Fisher Scoring iterations: 8
fit.lda <- lda(Private ~ Cost + ACT_median, data = college)
df <- expand.grid(ACT_median = makeseq(college$ACT_median),
Cost = makeseq(college$Cost))
df$pred <- predict(fit.lda, df)$class
p1 <- ggplot(college, aes(ACT_median, Cost, fill = Private)) +
geom_point(size = 3, shape = 21, color = "black")
p2 <- ggplot(df, aes(ACT_median, Cost, fill = pred)) +
geom_point(size = 3, shape = 21, color = "black")
gridExtra::grid.arrange(p1, p2, nrow = 1)
data(Default)
Default <- Default[sample(1:nrow(Default), 200), ]
fit.lda <- lda(default ~ balance + income, Default)
inc <- seq(min(Default$income), max(Default$income), length.out = 20)
bal <- seq(min(Default$balance), max(Default$balance), length.out = 20)
df <- expand.grid(income = inc, balance = bal)
pred <- predict(fit.lda, df)
df$pred <- pred$class
p1 <- ggplot(Default, aes(balance, income, color = default)) + geom_point()
p2 <- ggplot(df, aes(balance, income, color = pred)) +
geom_point()
gridExtra::grid.arrange(p1, p2, nrow = 1)
crab <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat",
header = TRUE)
ggplot(crab, aes(weight, width, color = factor(y))) + geom_point()
fit.lda <- lda(y ~ weight + width, data = crab)
inc <- seq(min(crab$weight), max(crab$weight), length.out = 20)
bal <- seq(min(crab$width), max(crab$width), length.out = 20)
df <- expand.grid(weight = inc, width = bal)
pred <- predict(fit.lda, df)
df$pred <- pred$class
ggplot(df, aes(weight, width, color = pred)) +
geom_point(size = 0.75) +
geom_point(data = crab, aes(weight, width, fill = factor(y)),
shape = 21, color = 'black', size = 2)
We can also compare prediction with GLM
fit.lda <- lda(y ~ weight + width, data = crab)
fit.glm <- glm(y ~ weight + width, data = crab, family = binomial)
crab$glm <- predict(fit.glm, crab, type = "response") > 0.5
crab$lda <- predict(fit.lda, crab)$class
xtabs(~ glm + y, crab) # prediction at pi = 0.5
## y
## glm 0 1
## FALSE 28 16
## TRUE 34 95
xtabs(~ lda + y, crab)
## y
## lda 0 1
## 0 27 15
## 1 35 96
data(iris)
ggplot(iris, aes(Sepal.Length, Sepal.Width, fill = Species)) +
geom_point(shape = 21, color = "black", size = 2.5) +
scale_fill_brewer(palette = "RdYlBu")
fit.lda <- lda(Species ~ Sepal.Length + Sepal.Width, iris)
## Create predict data
sl <- with(iris, seq(min(Sepal.Length), max(Sepal.Length), length.out = 20))
sw <- with(iris, seq(min(Sepal.Width), max(Sepal.Width), length.out = 20))
df <- expand.grid(Sepal.Length = sl,
Sepal.Width = sw)
df$pred <- predict(fit.lda, df)$class
p1 <- ggplot(iris, aes(Sepal.Length, Sepal.Width, fill = Species)) +
geom_point(shape = 21, color = "black", size = 2.5) +
scale_fill_brewer(palette = "RdYlBu")
p2 <- ggplot(df, aes(Sepal.Length, Sepal.Width, fill = pred)) +
geom_point(size = 3, color = "black", shape = 21) +
scale_fill_brewer(palette = "RdYlBu")
gridExtra::grid.arrange(p1, p2, nrow = 1)