library(ggplot2)
library(dplyr)
x <- matrix(c(1e6, 0, 1e6, 1), nrow = 2) / 1e6
x
## [,1] [,2]
## [1,] 1 1.000000
## [2,] 0 0.000001
solve(x) # invertible
## [,1] [,2]
## [1,] 1 -1000000
## [2,] 0 1000000
t(x) %*% x
## [,1] [,2]
## [1,] 1 1
## [2,] 1 1
solve(t(x) %*% x)
## [,1] [,2]
## [1,] 999911107321 -999911107320
## [2,] -999911107320 999911107320
crab <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat",
header = TRUE)
fit <- glm(y ~ weight + width + factor(color) + factor(spine),
family = binomial, data = crab)
summary(fit)
##
## Call:
## glm(formula = y ~ weight + width + factor(color) + factor(spine),
## family = binomial, data = crab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.065 3.929 -2.05 0.040 *
## weight 0.826 0.704 1.17 0.241
## width 0.263 0.195 1.35 0.178
## factor(color)2 -0.103 0.783 -0.13 0.895
## factor(color)3 -0.489 0.853 -0.57 0.567
## factor(color)4 -1.609 0.935 -1.72 0.086 .
## factor(spine)2 -0.096 0.703 -0.14 0.891
## factor(spine)3 0.400 0.503 0.80 0.426
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 185.20 on 165 degrees of freedom
## AIC: 201.2
##
## Number of Fisher Scoring iterations: 4
Comparing this with the residual model, we see clear evidence of a fit with the difference between the null and residual deviance being 225.76 - 185.20 = 40.56 on \(df = 7\)
pchisq(40.56, df = 7, lower.tail = FALSE)
## [1] 0.00000098329
anova(fit)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 172 226
## weight 1 30.02 171 196 0.000000043 ***
## width 1 2.85 170 193 0.092 .
## factor(color) 3 6.68 167 186 0.083 .
## factor(spine) 2 1.01 165 185 0.604
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Without weight
fit <- glm(y ~ width + factor(color) + factor(spine),
family = binomial, data = crab)
summary(fit)
##
## Call:
## glm(formula = y ~ width + factor(color) + factor(spine), family = binomial,
## data = crab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.0995 2.9771 -3.73 0.00019 ***
## width 0.4562 0.1078 4.23 0.000023 ***
## factor(color)2 -0.1434 0.7784 -0.18 0.85383
## factor(color)3 -0.5241 0.8469 -0.62 0.53603
## factor(color)4 -1.6683 0.9328 -1.79 0.07371 .
## factor(spine)2 -0.0578 0.7031 -0.08 0.93445
## factor(spine)3 0.3770 0.5019 0.75 0.45254
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 186.61 on 166 degrees of freedom
## AIC: 200.6
##
## Number of Fisher Scoring iterations: 4
anova(fit)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 172 226
## width 1 31.31 171 194 0.000000022 ***
## factor(color) 3 7.00 168 188 0.072 .
## factor(spine) 2 0.85 166 187 0.655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Randomly generate numbers
x <- runif(500, 5, 10)
## Take vector to the 1:6 power
X <- lapply(1:6, function(n) x^n) %>% Reduce(f = cbind)
## Find correlation matrix
cor(X)
## init
## init 1.00000 0.99640 0.98638 0.97129 0.95263 0.93172
## 0.99640 1.00000 0.99674 0.98783 0.97463 0.95845
## 0.98638 0.99674 1.00000 0.99714 0.98940 0.97805
## 0.97129 0.98783 0.99714 1.00000 0.99753 0.99092
## 0.95263 0.97463 0.98940 0.99753 1.00000 0.99790
## 0.93172 0.95845 0.97805 0.99092 0.99790 1.00000
fit1 <- glm(y ~ width, family = binomial, data = crab)
summary(fit1)
##
## Call:
## glm(formula = y ~ width, family = binomial, data = crab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.351 2.629 -4.70 0.0000026 ***
## width 0.497 0.102 4.89 0.0000010 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 194.45 on 171 degrees of freedom
## AIC: 198.5
##
## Number of Fisher Scoring iterations: 4
fit2 <- glm(y ~ width + factor(color),
family = binomial, data = crab)
summary(fit2)
##
## Call:
## glm(formula = y ~ width + factor(color), family = binomial, data = crab)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.3852 2.8735 -3.96 0.0000743 ***
## width 0.4680 0.1055 4.43 0.0000093 ***
## factor(color)2 0.0724 0.7399 0.10 0.92
## factor(color)3 -0.2238 0.7771 -0.29 0.77
## factor(color)4 -1.3299 0.8525 -1.56 0.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 225.76 on 172 degrees of freedom
## Residual deviance: 187.46 on 168 degrees of freedom
## AIC: 197.5
##
## Number of Fisher Scoring iterations: 4
Try
## Difference in AIC (decrease is better)
AIC(fit2) - AIC(fit1)
## [1] -0.99563
## -2(L1 - L2) is difference in residual deviance + 3 covariates
(187.46-194.45) + 2*3
## [1] -0.99
This ends up being very similar to our log-linear models whereby we can check goodness of fit with a standard Pearson \(\chi^2\), with the \(G^2\) statistic, or
mj <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Marijuana.dat",
header = TRUE)
## Add total column
mj$total <- mj$yes + mj$no
fit <- glm(yes / total ~ race + gender, family = binomial,
weights = total, data = mj)
summary(fit)
##
## Call:
## glm(formula = yes/total ~ race + gender, family = binomial, data = mj,
## weights = total)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8303 0.1685 -4.93 0.00000084 ***
## racewhite 0.4437 0.1677 2.65 0.0081 **
## gendermale 0.2026 0.0852 2.38 0.0174 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12.752784 on 3 degrees of freedom
## Residual deviance: 0.057982 on 1 degrees of freedom
## AIC: 30.41
##
## Number of Fisher Scoring iterations: 3
## Get predicted probabilities for each group
mj$prob <- predict(fit, data = mj, type = "response")
## Get expected values
mj$exp_y <- with(mj, total * prob)
mj$exp_n <- with(mj, total * (1-prob))
## See these side-by-side
with(mj, cbind(yes, exp_y))
## yes exp_y
## [1,] 420 420.714
## [2,] 483 482.286
## [3,] 25 24.286
## [4,] 32 32.714
with(mj, cbind(no, exp_n))
## no exp_n
## [1,] 620 619.286
## [2,] 579 579.714
## [3,] 55 55.714
## [4,] 62 61.286
Here is our \(G^2\) and Pearson statistics. Note that we need to compare fitted and observed for both success and failure
## Pearson
((mj$yes - mj$exp_y)^2 / mj$exp_y) %>% sum() +
((mj$no - mj$exp_n)^2 / mj$exp_n) %>% sum()
## [1] 0.058062
## G^2
2 * (sum(mj$yes * log(mj$yes / mj$exp_y)) + sum(mj$no * log(mj$no / mj$exp_n)))
## [1] 0.057982
## Deviance from model (not the same?)
fit$deviance
## [1] 0.057982