Table 4.2
library(tidyr)
library(dplyr)
library(ggplot2)
theme_set(theme_bw(base_size = 14))
## Read in grouped marijuana data
mm <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Marijuana.dat",
header = TRUE)
mm <- group_by(mm, gender) %>%
summarize(yes = sum(yes), no = sum(no))
mm2 <- pivot_longer(mm, cols = c("yes", "no"),
names_to = "weed", values_to = "freq")
mm2 <- group_by(mm2, gender, weed) %>% summarize(freq = sum(freq))
mm3 <- mm2[rep(1:nrow(mm2), times = mm2$freq), ]
(tt <- with(mm3, table(gender, weed))[, 2:1])
## weed
## gender yes no
## female 445 675
## male 515 641
ttm <- addmargins(tt)
You could also represent this in long format:
mm2
## # A tibble: 4 × 3
## # Groups: gender [2]
## gender weed freq
## <chr> <chr> <int>
## 1 female no 675
## 2 female yes 445
## 3 male no 641
## 4 male yes 515
fit_bin <- glm(yes / (yes + no) ~ gender, weights = yes + no,
family = binomial, data = mm)
summary(fit_bin)
##
## Call:
## glm(formula = yes/(yes + no) ~ gender, family = binomial, data = mm,
## weights = yes + no)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4166 0.0611 -6.82 0.0000000000089 ***
## gendermale 0.1978 0.0850 2.33 0.02 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5.4178 on 1 degrees of freedom
## Residual deviance: 0.0000 on 0 degrees of freedom
## AIC: 18.92
##
## Number of Fisher Scoring iterations: 2
fit_pois <- glm(freq ~ weed + gender, family = poisson,
data = mm2)
summary(fit_pois)
##
## Call:
## glm(formula = freq ~ weed + gender, family = poisson, data = mm2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.4733 0.0348 185.83 < 0.0000000000000002 ***
## weedyes -0.3154 0.0424 -7.43 0.00000000000011 ***
## gendermale 0.0316 0.0419 0.75 0.45
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 61.9002 on 3 degrees of freedom
## Residual deviance: 5.4178 on 1 degrees of freedom
## AIC: 44.09
##
## Number of Fisher Scoring iterations: 3
fit_pois2 <- glm(freq ~ weed + gender + weed*gender, family = poisson,
data = mm2, offset = log(rep(2276, 4)))
summary(fit_pois2)
##
## Call:
## glm(formula = freq ~ weed + gender + weed * gender, family = poisson,
## data = mm2, offset = log(rep(2276, 4)))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2155 0.0385 -31.58 < 0.0000000000000002 ***
## weedyes -0.4166 0.0611 -6.82 0.0000000000089 ***
## gendermale -0.0517 0.0552 -0.94 0.35
## weedyes:gendermale 0.1978 0.0850 2.33 0.02 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 61.900179118757790775 on 3 degrees of freedom
## Residual deviance: 0.000000000000049738 on 0 degrees of freedom
## AIC: 40.67
##
## Number of Fisher Scoring iterations: 2
fit_pois_off <- glm(freq ~ weed + gender + weed*gender, family = poisson,
data = mm2, offset = log(rep(2276, 4)))
summary(fit_pois_off)
##
## Call:
## glm(formula = freq ~ weed + gender + weed * gender, family = poisson,
## data = mm2, offset = log(rep(2276, 4)))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2155 0.0385 -31.58 < 0.0000000000000002 ***
## weedyes -0.4166 0.0611 -6.82 0.0000000000089 ***
## gendermale -0.0517 0.0552 -0.94 0.35
## weedyes:gendermale 0.1978 0.0850 2.33 0.02 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 61.900179118757790775 on 3 degrees of freedom
## Residual deviance: 0.000000000000049738 on 0 degrees of freedom
## AIC: 40.67
##
## Number of Fisher Scoring iterations: 2
pred_bin <- predict(fit_bin, mm, type = "response")
pred_pois <- predict(fit_pois, mm2, type = "response")
matrix(predict(fit_pois_off, mm2, type = "response") / 2276,
nrow = 2, byrow = TRUE)
## [,1] [,2]
## [1,] 0.29657 0.19552
## [2,] 0.28163 0.22627
prop.table(tt)
## weed
## gender yes no
## female 0.19552 0.29657
## male 0.22627 0.28163
crab <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat",
header = TRUE)
crab$col_fact <- factor(crab$color)
fit <- glm(y ~ width + col_fact, family = binomial, data = crab)
summary(fit)
##
## Call:
## glm(formula = y ~ width + col_fact, family = binomial, data = crab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.3852 2.8735 -3.96 0.0000743 ***
## width 0.4680 0.1055 4.43 0.0000093 ***
## col_fact2 0.0724 0.7399 0.10 0.92
## col_fact3 -0.2238 0.7771 -0.29 0.77
## col_fact4 -1.3299 0.8525 -1.56 0.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 187.46 on 168 degrees of freedom
## AIC: 197.5
##
## Number of Fisher Scoring iterations: 4
df <- expand.grid(col_fact = factor(1:4),
width = seq(min(crab$width)-2, max(crab$width)+2,
length.out = 100))
df$prob <- predict(fit, df, type = "response")
ggplot(df, aes(width, prob, color = col_fact)) +
geom_line(size = 1.2) + scale_color_brewer(palette = "BuGn") +
ggtitle("Four factors")
fit_no_color <- glm(y ~ width, family = binomial, data = crab)
anova(fit_no_color, fit)
## Analysis of Deviance Table
##
## Model 1: y ~ width
## Model 2: y ~ width + col_fact
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 171 194
## 2 168 188 3 7 0.072 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Alternatively
library(car)
Anova(fit)
## Analysis of Deviance Table (Type II tests)
##
## Response: y
## LR Chisq Df Pr(>Chisq)
## width 24.6 1 0.0000007 ***
## col_fact 7.0 3 0.072 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_col <- glm(y ~ width + color, family = binomial, data = crab)
summary(fit_col)
##
## Call:
## glm(formula = y ~ width + color, family = binomial, data = crab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.071 2.807 -3.59 0.00033 ***
## width 0.458 0.104 4.41 0.000011 ***
## color -0.509 0.224 -2.28 0.02286 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 189.12 on 170 degrees of freedom
## AIC: 195.1
##
## Number of Fisher Scoring iterations: 4
Compare the assumption of (1,2,3,4) for colors with the predicted level for each color in the categorical model
coef(fit)[-c(1:2)]
## col_fact2 col_fact3 col_fact4
## 0.072417 -0.223798 -1.329919
crab$dark <- ifelse(crab$color == 4, 1, 0)
fit_binary <- glm(y ~ width + dark, family = binomial, data = crab)
summary(fit_binary)
##
## Call:
## glm(formula = y ~ width + dark, family = binomial, data = crab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.679 2.693 -4.34 0.0000144 ***
## width 0.478 0.104 4.59 0.0000044 ***
## dark -1.301 0.526 -2.47 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 187.96 on 170 degrees of freedom
## AIC: 194
##
## Number of Fisher Scoring iterations: 4
Why can’t we compare this directly to continuous color model?
anova(fit_binary, fit, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: y ~ width + dark
## Model 2: y ~ width + col_fact
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 170 188
## 2 168 188 2 0.501 0.78
fit_interaction <- glm(y ~ width*dark, family = binomial, data = crab)
summary(fit_interaction)
##
## Call:
## glm(formula = y ~ width * dark, family = binomial, data = crab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.812 2.958 -4.33 0.0000148 ***
## width 0.522 0.115 4.56 0.0000052 ***
## dark 6.958 7.318 0.95 0.34
## width:dark -0.322 0.286 -1.13 0.26
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 186.79 on 169 degrees of freedom
## AIC: 194.8
##
## Number of Fisher Scoring iterations: 4
df <- expand.grid(dark = c(0,1),
width = seq(min(crab$width)-2, max(crab$width)+2,
length.out = 100))
df$prob <- predict(fit_interaction, df, type = "response")
ggplot(df, aes(width, prob, color = factor(dark))) +
geom_jitter(data = crab, aes(x = width, y = y),
height = 0.02, size = 2) +
geom_line(size = 1.2) + scale_color_brewer(palette = "Dark2") +
ggtitle("Interaction")
But perhaps the interaction term is not worth the complexity
Anova(fit_interaction)
## Analysis of Deviance Table (Type II tests)
##
## Response: y
## LR Chisq Df Pr(>Chisq)
## width 26.84 1 0.00000022 ***
## dark 6.49 1 0.011 *
## width:dark 1.17 1 0.279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1