Marijuana Data

Table 4.2

## Read in grouped marijuana data
mm <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Marijuana.dat", 
                 header = TRUE)
mm

##    race gender yes  no
## 1 white female 420 620
## 2 white   male 483 579
## 3 other female  25  55
## 4 other   male  32  62

fit <- glm(yes / (yes + no) ~ gender + race, weights = yes + no, 
           family = binomial, data = mm)
summary(fit)

## 
## Call:
## glm(formula = yes/(yes + no) ~ gender + race, family = binomial, 
##     data = mm, weights = yes + no)
## 
## Coefficients:
##             Estimate Std. Error z value   Pr(>|z|)    
## (Intercept)  -0.8303     0.1685   -4.93 0.00000084 ***
## gendermale    0.2026     0.0852    2.38     0.0174 *  
## racewhite     0.4437     0.1677    2.65     0.0081 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12.752784  on 3  degrees of freedom
## Residual deviance:  0.057982  on 1  degrees of freedom
## AIC: 30.41
## 
## Number of Fisher Scoring iterations: 3

UCB Admissions Data

Note the use of pivot_wider()

library(tidyr)
ucb <- as.data.frame(UCBAdmissions)
ucb <- pivot_wider(ucb, names_from ="Admit", 
                   values_from = "Freq")

print(ucb)

## # A tibble: 12 × 4
##    Gender Dept  Admitted Rejected
##    <fct>  <fct>    <dbl>    <dbl>
##  1 Male   A          512      313
##  2 Female A           89       19
##  3 Male   B          353      207
##  4 Female B           17        8
##  5 Male   C          120      205
##  6 Female C          202      391
##  7 Male   D          138      279
##  8 Female D          131      244
##  9 Male   E           53      138
## 10 Female E           94      299
## 11 Male   F           22      351
## 12 Female F           24      317

The first model is the homogeneous association model, namely that odds of admission for each department are the same given gender

(fit1 <- glm(Admitted / (Admitted + Rejected) ~ Gender + Dept, family = binomial, 
    weights = Admitted + Rejected, data = ucb)) %>% summary()

## 
## Call:
## glm(formula = Admitted/(Admitted + Rejected) ~ Gender + Dept, 
##     family = binomial, data = ucb, weights = Admitted + Rejected)
## 
## Coefficients:
##              Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)    0.5821     0.0690    8.44 <0.0000000000000002 ***
## GenderFemale   0.0999     0.0808    1.24                0.22    
## DeptB         -0.0434     0.1098   -0.40                0.69    
## DeptC         -1.2626     0.1066  -11.84 <0.0000000000000002 ***
## DeptD         -1.2946     0.1058  -12.23 <0.0000000000000002 ***
## DeptE         -1.7393     0.1261  -13.79 <0.0000000000000002 ***
## DeptF         -3.3065     0.1700  -19.45 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 877.056  on 11  degrees of freedom
## Residual deviance:  20.204  on  5  degrees of freedom
## AIC: 103.1
## 
## Number of Fisher Scoring iterations: 4

Comparison of UCB Models

Two marginal models on department and gender

(fit2 <- glm(Admitted / (Admitted + Rejected) ~ Dept, family = binomial, 
    weights = Admitted + Rejected, data = ucb)) %>% summary()

## 
## Call:
## glm(formula = Admitted/(Admitted + Rejected) ~ Dept, family = binomial, 
##     data = ucb, weights = Admitted + Rejected)
## 
## Coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)   0.5935     0.0684    8.68 <0.0000000000000002 ***
## DeptB        -0.0506     0.1097   -0.46                0.64    
## DeptC        -1.2091     0.0973  -12.43 <0.0000000000000002 ***
## DeptD        -1.2583     0.1015  -12.40 <0.0000000000000002 ***
## DeptE        -1.6830     0.1173  -14.34 <0.0000000000000002 ***
## DeptF        -3.2691     0.1671  -19.57 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 877.056  on 11  degrees of freedom
## Residual deviance:  21.736  on  6  degrees of freedom
## AIC: 102.7
## 
## Number of Fisher Scoring iterations: 4

(fit3 <- glm(Admitted / (Admitted + Rejected) ~ Gender, family = binomial, 
    weights = Admitted + Rejected, data = ucb)) %>% summary()

## 
## Call:
## glm(formula = Admitted/(Admitted + Rejected) ~ Gender, family = binomial, 
##     data = ucb, weights = Admitted + Rejected)
## 
## Coefficients:
##              Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)   -0.2201     0.0388   -5.68          0.000000014 ***
## GenderFemale  -0.6104     0.0639   -9.55 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 877.06  on 11  degrees of freedom
## Residual deviance: 783.61  on 10  degrees of freedom
## AIC: 856.5
## 
## Number of Fisher Scoring iterations: 4

anova(fit1, fit2) # indicates conditional independence of gender

## Analysis of Deviance Table
## 
## Model 1: Admitted/(Admitted + Rejected) ~ Gender + Dept
## Model 2: Admitted/(Admitted + Rejected) ~ Dept
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1         5       20.2                     
## 2         6       21.7 -1    -1.53     0.22

anova(fit1, fit3)

## Analysis of Deviance Table
## 
## Model 1: Admitted/(Admitted + Rejected) ~ Gender + Dept
## Model 2: Admitted/(Admitted + Rejected) ~ Gender
##   Resid. Df Resid. Dev Df Deviance            Pr(>Chi)    
## 1         5         20                                    
## 2        10        784 -5     -763 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Coffee data

coffee <- read.csv("https://collinn.github.io/data/coffee.csv")
coffee <- pivot_wider(coffee, names_from = "cancer", 
                      values_from = "Freq")

coffee

## # A tibble: 4 × 4
##   coffee smoking   yes    no
##   <chr>  <chr>   <int> <int>
## 1 yes    yes        16     4
## 2 no     yes         4     1
## 3 yes    no         12   108
## 4 no     no         15   135

Part 1 Begin by fitting a logistic regression model to cancer with only coffee as a covariate. Why does it make sense to consider this a marginal independence model?

Looking at the Wald statistic for the coefficient, does it seem that marginal independence holds?

Part 2 Based on your model for Part 1, what is the odds ratio comparing the odds of cancer for smokers to non-smokers? Using confint() or the Wald statistic, is there strong evidence that marginal independence does not hold?

Part 3 Now construct a logistic regression model with indicators for both coffee and smoking. Looking at the deviance, is there evidence for homogenous association between our covariates?

Part 4 Consider again the Wald statistic for the coffee covariate as you did in Part B. What can you say about the conditional independence of coffee and cancer, given smoking status?

Logisitic Regression with Categorical Predictors

Marijuana Data

UCB Admissions Data

Comparison of UCB Models

Coffee data