Logistic Regression 1

Introduction

We’ll be using the R package gam today with install.packages("gam"). Included here are also a collection of datasets

library(ggplot2)
library(dplyr)
library(MASS)
library(gam)


theme_set(theme_bw())

## Smoking
smoke <- expand.grid(Smoking = c("yes", "no"), 
                  Cancer = c("yes", "no"))
smoke$Cases <- c(688, 21, 650, 59)


## Crab
crab <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat", 
                     header = TRUE)

Evaluating Appropriateness of Fit

Generalized additive models or GAMs can demonstrate arbitrary nonlinear relationships

library(gam)

## Fit GAM
gf <- gam(y ~ s(width), family = binomial, data = crab)

df <- data.frame(width = seq(min(crab$width), max(crab$width), 
                             length.out = 200))
df$y <- predict(gf, df, type = "response")

ggplot(df, aes(width, y)) + 
  geom_line() + 
  geom_jitter(data = crab, aes(width, y), height = 0.01)

Retrospective Logistic

with(smoke[rep(1:nrow(smoke), times = smoke$Cases), ], table(Smoking, Cancer)) %>% addmargins(1)

##        Cancer
## Smoking yes  no
##     yes 688 650
##     no   21  59
##     Sum 709 709

Compare this with the model

fit <-  glm(Cancer ~ Smoking, family = binomial, data = smoke, weights = Cases)
summary(fit)

## 
## Call:
## glm(formula = Cancer ~ Smoking, family = binomial, data = smoke, 
##     weights = Cases)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.0568     0.0547   -1.04      0.3    
## Smokingno     1.0898     0.2599    4.19 0.000028 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1965.8  on 3  degrees of freedom
## Residual deviance: 1945.9  on 2  degrees of freedom
## AIC: 1950
## 
## Number of Fisher Scoring iterations: 4

Consider the expression

\[ \hat{\pi}(x) = \frac{e^{\hat{\alpha} + \hat{\beta}x}}{1 + e^{\hat{\alpha} + \hat{\beta}x}} \] Here, things are all backwards because of how they were coded – in particular, the outcome is the probability of not getting cancer (hence \(1 - \pi(x)\)) and the reference variable indicates smokers

## When smoking is yes
syes <- coef(fit)[1]
sno <- sum(coef(fit))

## Prob of cancer for yes smoker
1 - exp(syes) / (1 + exp(syes))

## (Intercept) 
##      0.5142

## Prob of cancer for non smoker
1 - exp(sno) / (1 + exp(sno))

## [1] 0.2625

Note that each of these correspond to \(P(Y|X = x)\)

Confidence Intervals

Model Fit

fit <- glm(y ~ width, family = binomial, data = crab)
summary(fit)

## 
## Call:
## glm(formula = y ~ width, family = binomial, data = crab)
## 
## Coefficients:
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)  -12.351      2.629   -4.70 0.0000026 ***
## width          0.497      0.102    4.89 0.0000010 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 194.45  on 171  degrees of freedom
## AIC: 198.5
## 
## Number of Fisher Scoring iterations: 4

anova(fit)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##       Df Deviance Resid. Df Resid. Dev    Pr(>Chi)    
## NULL                    172        226                
## width  1     31.3       171        194 0.000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

For coefficients

## Construct from model output
bb <- summary(fit) %>% coef()
bb <- bb[2, 1:2] # est and std error for beta
bb <- bb[1] + qnorm(c(0.025, 0.975)) * bb[2]
bb

## [1] 0.29783 0.69663

## Profile Method
(pb <- confint(fit))

##                 2.5 %   97.5 %
## (Intercept) -17.81001 -7.45725
## width         0.30838  0.70902

Use these to construct CI for odds

exp(bb)

## [1] 1.3469 2.0070

exp(pb[2, ])

##  2.5 % 97.5 % 
## 1.3612 2.0320

For response

Note that if we wanted to find a CI for \(\hat{\alpha} + \hat{\beta}x\) we would need the covariance matrix estimated by the glm function using vcov

vcov(fit)

##             (Intercept)    width
## (Intercept)     6.91016 -0.26685
## width          -0.26685  0.01035

However, there are more direct methods for doing this as seen below

df <- data.frame(width = seq(min(crab$width), max(crab$width), 
                             length.out = 200)) # choice of 200 arbitrary

r <- predict(fit, df, type = "response", se.fit = TRUE)
df$ub <- r$fit + qnorm(0.975)* r$se.fit
df$lb <- r$fit - qnorm(0.975)* r$se.fit
df$fit <- r$fit

head(df)

##    width      ub       lb     fit
## 1 21.000 0.24295 0.015240 0.12910
## 2 21.063 0.24780 0.017493 0.13265
## 3 21.126 0.25271 0.019853 0.13628
## 4 21.188 0.25768 0.022324 0.14000
## 5 21.251 0.26270 0.024909 0.14380
## 6 21.314 0.26778 0.027611 0.14769

Getting this ggplot ready involves “melting” the data.frame (putting it in long format)

library(tidyr)
df <- pivot_longer(df, cols = c("ub", "lb", "fit"), 
                   names_to = "ci", values_to = "y")
head(df)

## # A tibble: 6 × 3
##   width ci         y
##   <dbl> <chr>  <dbl>
## 1  21   ub    0.243 
## 2  21   lb    0.0152
## 3  21   fit   0.129 
## 4  21.1 ub    0.248 
## 5  21.1 lb    0.0175
## 6  21.1 fit   0.133

ggplot(df, aes(width, y, color = ci)) + 
  geom_line() + 
  geom_jitter(data = crab, aes(y = y, color = NULL), height = 0.01) + # add points
  scale_color_manual(values = c("blue", "red", "red"))

Why Models?

heart <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Heart.dat", 
                   header = TRUE)
heart

##              snoring yes   no
## 1              never  24 1355
## 2         occasional  35  603
## 3 nearly_every_night  21  192
## 4        every_night  30  224

heart <- within(heart, {
                snorescore <- c(0,1,3,4)
                n <-  yes + no 
                prob <- yes / n})

## Using model
fit <- glm(yes/n ~ snorescore, family = binomial, weights = n, data = heart) 

mp <- predict(fit, heart, type = "response", se.fit = TRUE)
cbind(prob = mp[[1]], se = mp[[2]])

##       prob        se
## 1 0.023832 0.0034806
## 2 0.037363 0.0040400
## 3 0.089336 0.0096449
## 4 0.134918 0.0185497

## Using table
heart <- within(heart, 
                se <- sqrt(prob * (1-prob) / n))
heart[, c("prob", "se", "n")]

##       prob        se    n
## 1 0.017404 0.0035215 1379
## 2 0.054859 0.0090149  638
## 3 0.098592 0.0204264  213
## 4 0.118110 0.0202504  254

Exercises

This is a modified problem 4.7 from the text:

Hastie and Tibshirani described a study to determine risk factors for kyphosis, a severe forward flexion of the spine following surgery. Use the Kyphosis data at the text including 40 observations with \(y=1\) indicating that kyphosis is present

## Kyphosis
kyph <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Kyphosis.dat", 
                   header = TRUE)

Part A Fit a logistic regression model for the data and assess the effect of age

Part B Plot the data against a generalized spline using a GAM model. Does a standard logistic regression seem appropriate here?

Part C Fit the model \(\text{logit}[P(Y = 1)] = \alpha + \beta_1 x + \beta_2 x^2\). Test the significance of the squared term. Plot the predicted probability curve and interpret the effects of age on kyphosis. (Hint: you may need to investigate how to include a squared coefficient in your model)