Marijuana Data

Table 4.2

library(tidyr)
library(dplyr)
library(ggplot2)
theme_set(theme_bw(base_size = 14))
## Read in grouped marijuana data
mm <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Marijuana.dat", 
                 header = TRUE)
mm <- group_by(mm, gender) %>% 
  summarize(yes = sum(yes), no = sum(no))


mm2 <- pivot_longer(mm, cols = c("yes", "no"), 
                   names_to = "weed", values_to = "freq")
mm2 <- group_by(mm2, gender, weed) %>% summarize(freq = sum(freq))
mm3 <- mm2[rep(1:nrow(mm2), times = mm2$freq), ]

(tt <- with(mm3, table(gender, weed))[, 2:1])

##         weed
## gender   yes  no
##   female 445 675
##   male   515 641

ttm <- addmargins(tt)

You could also represent this in long format:

mm2

## # A tibble: 4 × 3
## # Groups:   gender [2]
##   gender weed   freq
##   <chr>  <chr> <int>
## 1 female no      675
## 2 female yes     445
## 3 male   no      641
## 4 male   yes     515

fit_bin <- glm(yes / (yes + no) ~ gender, weights = yes + no, 
           family = binomial, data = mm)
summary(fit_bin)

## 
## Call:
## glm(formula = yes/(yes + no) ~ gender, family = binomial, data = mm, 
##     weights = yes + no)
## 
## Coefficients:
##             Estimate Std. Error z value        Pr(>|z|)    
## (Intercept)  -0.4166     0.0611   -6.82 0.0000000000089 ***
## gendermale    0.1978     0.0850    2.33            0.02 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5.4178  on 1  degrees of freedom
## Residual deviance: 0.0000  on 0  degrees of freedom
## AIC: 18.92
## 
## Number of Fisher Scoring iterations: 2

fit_pois <- glm(freq ~ weed + gender, family = poisson, 
               data = mm2)
summary(fit_pois)

## 
## Call:
## glm(formula = freq ~ weed + gender, family = poisson, data = mm2)
## 
## Coefficients:
##             Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)   6.4733     0.0348  185.83 < 0.0000000000000002 ***
## weedyes      -0.3154     0.0424   -7.43     0.00000000000011 ***
## gendermale    0.0316     0.0419    0.75                 0.45    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 61.9002  on 3  degrees of freedom
## Residual deviance:  5.4178  on 1  degrees of freedom
## AIC: 44.09
## 
## Number of Fisher Scoring iterations: 3

fit_pois2 <- glm(freq ~ weed + gender + weed*gender, family = poisson,
               data = mm2, offset = log(rep(2276, 4)))
summary(fit_pois2)

## 
## Call:
## glm(formula = freq ~ weed + gender + weed * gender, family = poisson, 
##     data = mm2, offset = log(rep(2276, 4)))
## 
## Coefficients:
##                    Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         -1.2155     0.0385  -31.58 < 0.0000000000000002 ***
## weedyes             -0.4166     0.0611   -6.82      0.0000000000089 ***
## gendermale          -0.0517     0.0552   -0.94                 0.35    
## weedyes:gendermale   0.1978     0.0850    2.33                 0.02 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 61.900179118757790775  on 3  degrees of freedom
## Residual deviance:  0.000000000000049738  on 0  degrees of freedom
## AIC: 40.67
## 
## Number of Fisher Scoring iterations: 2

fit_pois_off <- glm(freq ~ weed + gender + weed*gender, family = poisson,
               data = mm2, offset = log(rep(2276, 4)))
summary(fit_pois_off)

## 
## Call:
## glm(formula = freq ~ weed + gender + weed * gender, family = poisson, 
##     data = mm2, offset = log(rep(2276, 4)))
## 
## Coefficients:
##                    Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         -1.2155     0.0385  -31.58 < 0.0000000000000002 ***
## weedyes             -0.4166     0.0611   -6.82      0.0000000000089 ***
## gendermale          -0.0517     0.0552   -0.94                 0.35    
## weedyes:gendermale   0.1978     0.0850    2.33                 0.02 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 61.900179118757790775  on 3  degrees of freedom
## Residual deviance:  0.000000000000049738  on 0  degrees of freedom
## AIC: 40.67
## 
## Number of Fisher Scoring iterations: 2

Predictions

pred_bin <- predict(fit_bin, mm, type = "response")
pred_pois <- predict(fit_pois, mm2, type = "response")


matrix(predict(fit_pois_off, mm2, type = "response") / 2276, 
       nrow = 2, byrow = TRUE)

##         [,1]    [,2]
## [1,] 0.29657 0.19552
## [2,] 0.28163 0.22627

prop.table(tt)

##         weed
## gender       yes      no
##   female 0.19552 0.29657
##   male   0.22627 0.28163

Other Covariates

crab <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat", 
                 header = TRUE)
crab$col_fact <- factor(crab$color)

Categorical predictors at each level

fit <- glm(y ~ width + col_fact, family = binomial, data = crab)
summary(fit)

## 
## Call:
## glm(formula = y ~ width + col_fact, family = binomial, data = crab)
## 
## Coefficients:
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept) -11.3852     2.8735   -3.96 0.0000743 ***
## width         0.4680     0.1055    4.43 0.0000093 ***
## col_fact2     0.0724     0.7399    0.10      0.92    
## col_fact3    -0.2238     0.7771   -0.29      0.77    
## col_fact4    -1.3299     0.8525   -1.56      0.12    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 187.46  on 168  degrees of freedom
## AIC: 197.5
## 
## Number of Fisher Scoring iterations: 4

df <- expand.grid(col_fact = factor(1:4), 
                  width = seq(min(crab$width)-2, max(crab$width)+2, 
                              length.out = 100))

df$prob <- predict(fit, df, type = "response")

ggplot(df, aes(width, prob, color = col_fact)) + 
  geom_line(size = 1.2) + scale_color_brewer(palette = "BuGn") + 
  ggtitle("Four factors")

Comparing models with deviance

fit_no_color <- glm(y ~ width, family = binomial, data = crab)

anova(fit_no_color, fit)

## Analysis of Deviance Table
## 
## Model 1: y ~ width
## Model 2: y ~ width + col_fact
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       171        194                       
## 2       168        188  3        7    0.072 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Alternatively

library(car)
Anova(fit)

## Analysis of Deviance Table (Type II tests)
## 
## Response: y
##          LR Chisq Df Pr(>Chisq)    
## width        24.6  1  0.0000007 ***
## col_fact      7.0  3      0.072 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Treating as quantiative variable

fit_col <- glm(y ~ width + color, family = binomial, data = crab)
summary(fit_col)

## 
## Call:
## glm(formula = y ~ width + color, family = binomial, data = crab)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -10.071      2.807   -3.59  0.00033 ***
## width          0.458      0.104    4.41 0.000011 ***
## color         -0.509      0.224   -2.28  0.02286 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 189.12  on 170  degrees of freedom
## AIC: 195.1
## 
## Number of Fisher Scoring iterations: 4

Compare the assumption of (1,2,3,4) for colors with the predicted level for each color in the categorical model

coef(fit)[-c(1:2)]

## col_fact2 col_fact3 col_fact4 
##  0.072417 -0.223798 -1.329919

Constructing binary group

crab$dark <- ifelse(crab$color == 4, 1, 0)

fit_binary <- glm(y ~ width + dark, family = binomial, data = crab)
summary(fit_binary)

## 
## Call:
## glm(formula = y ~ width + dark, family = binomial, data = crab)
## 
## Coefficients:
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)  -11.679      2.693   -4.34 0.0000144 ***
## width          0.478      0.104    4.59 0.0000044 ***
## dark          -1.301      0.526   -2.47     0.013 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 187.96  on 170  degrees of freedom
## AIC: 194
## 
## Number of Fisher Scoring iterations: 4

Why can’t we compare this directly to continuous color model?

anova(fit_binary, fit, test = "LRT")

## Analysis of Deviance Table
## 
## Model 1: y ~ width + dark
## Model 2: y ~ width + col_fact
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       170        188                     
## 2       168        188  2    0.501     0.78

With interactions

fit_interaction <- glm(y ~ width*dark, family = binomial, data = crab)
summary(fit_interaction)

## 
## Call:
## glm(formula = y ~ width * dark, family = binomial, data = crab)
## 
## Coefficients:
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)  -12.812      2.958   -4.33 0.0000148 ***
## width          0.522      0.115    4.56 0.0000052 ***
## dark           6.958      7.318    0.95      0.34    
## width:dark    -0.322      0.286   -1.13      0.26    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 186.79  on 169  degrees of freedom
## AIC: 194.8
## 
## Number of Fisher Scoring iterations: 4

df <- expand.grid(dark = c(0,1), 
                  width = seq(min(crab$width)-2, max(crab$width)+2, 
                              length.out = 100))
df$prob <- predict(fit_interaction, df, type = "response")

ggplot(df, aes(width, prob, color = factor(dark))) + 
  geom_jitter(data = crab, aes(x = width, y = y),
              height = 0.02, size = 2) + 
  geom_line(size = 1.2) + scale_color_brewer(palette = "Dark2") + 
  ggtitle("Interaction")

But perhaps the interaction term is not worth the complexity

Anova(fit_interaction)

## Analysis of Deviance Table (Type II tests)
## 
## Response: y
##            LR Chisq Df Pr(>Chisq)    
## width         26.84  1 0.00000022 ***
## dark           6.49  1      0.011 *  
## width:dark     1.17  1      0.279    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Logisitic Regression with Categorical Predictors