Multicollinearity

library(ggplot2)
library(dplyr)

In general

When the columns of the design matrix are nearly collinear, \(X^TX\) is nearly singular even if \(X\) itself is invertible, and the entries of \(\left(X^TX\right)^{-1}\) (which scale the coefficient standard errors) blow up:

x <- matrix(c(1e6, 0, 1e6, 1), nrow = 2) / 1e6
x
##      [,1]     [,2]
## [1,]    1 1.000000
## [2,]    0 0.000001
solve(x) # invertible
##      [,1]     [,2]
## [1,]    1 -1000000
## [2,]    0  1000000
t(x) %*% x
##      [,1] [,2]
## [1,]    1    1
## [2,]    1    1
solve(t(x) %*% x)
##               [,1]          [,2]
## [1,]  999911107321 -999911107320
## [2,] -999911107320  999911107320
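How bad this is can be summarized by the condition number: values near 1 are well-conditioned, while enormous values mean the inverse is numerically unstable. A quick check with base R's kappa():

## Condition number of t(x) %*% x; an enormous value signals that
## solve(t(x) %*% x) is numerically unstable
kappa(t(x) %*% x, exact = TRUE)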

In practice

crab <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat", 
                   header = TRUE)

fit <- glm(y ~ weight + width + factor(color) + factor(spine), 
           family = binomial, data = crab)
summary(fit)
## 
## Call:
## glm(formula = y ~ weight + width + factor(color) + factor(spine), 
##     family = binomial, data = crab)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)      -8.065      3.929   -2.05    0.040 *
## weight            0.826      0.704    1.17    0.241  
## width             0.263      0.195    1.35    0.178  
## factor(color)2   -0.103      0.783   -0.13    0.895  
## factor(color)3   -0.489      0.853   -0.57    0.567  
## factor(color)4   -1.609      0.935   -1.72    0.086 .
## factor(spine)2   -0.096      0.703   -0.14    0.891  
## factor(spine)3    0.400      0.503    0.80    0.426  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 185.20  on 165  degrees of freedom
## AIC: 201.2
## 
## Number of Fisher Scoring iterations: 4

Comparing this with the null model, we see clear evidence that the model fits: the difference between the null and residual deviances is 225.76 - 185.20 = 40.56 on \(df = 7\). Yet no individual predictor is significant at the 0.05 level, a classic symptom of multicollinearity.

pchisq(40.56, df = 7, lower.tail = FALSE)
## [1] 0.00000098329
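One standard numeric check for this is the variance inflation factor. A sketch, assuming the car package is installed (it is not loaded above):

## Generalized VIFs for each term; values well above 1 flag terms that are
## nearly collinear with the rest (expect weight and width to stand out)
car::vif(fit)
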
anova(fit)
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##               Df Deviance Resid. Df Resid. Dev    Pr(>Chi)    
## NULL                            172        226                
## weight         1    30.02       171        196 0.000000043 ***
## width          1     2.85       170        193       0.092 .  
## factor(color)  3     6.68       167        186       0.083 .  
## factor(spine)  2     1.01       165        185       0.604    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
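The deviances above are sequential, so each term is tested given only the terms before it. Under collinearity, testing each term as if it entered last can tell a different story; base R's drop1() does exactly that:

## Likelihood-ratio test for dropping each term from the full model
drop1(fit, test = "LRT")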

Without weight

Refitting without weight, the standard error for width drops from 0.195 to 0.108 and width is now highly significant, even though the residual deviance barely changes.

fit <- glm(y ~ width + factor(color) + factor(spine), 
           family = binomial, data = crab)
summary(fit)
## 
## Call:
## glm(formula = y ~ width + factor(color) + factor(spine), family = binomial, 
##     data = crab)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -11.0995     2.9771   -3.73  0.00019 ***
## width            0.4562     0.1078    4.23 0.000023 ***
## factor(color)2  -0.1434     0.7784   -0.18  0.85383    
## factor(color)3  -0.5241     0.8469   -0.62  0.53603    
## factor(color)4  -1.6683     0.9328   -1.79  0.07371 .  
## factor(spine)2  -0.0578     0.7031   -0.08  0.93445    
## factor(spine)3   0.3770     0.5019    0.75  0.45254    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 186.61  on 166  degrees of freedom
## AIC: 200.6
## 
## Number of Fisher Scoring iterations: 4
anova(fit)
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##               Df Deviance Resid. Df Resid. Dev    Pr(>Chi)    
## NULL                            172        226                
## width          1    31.31       171        194 0.000000022 ***
## factor(color)  3     7.00       168        188       0.072 .  
## factor(spine)  2     0.85       166        187       0.655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
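Dropping weight barely moves the residual deviance (186.61 versus 185.20 with weight included), and a direct nested-model test confirms weight adds little once width is in the model:

## LRT for weight given width, color, and spine: difference in residual
## deviances on 1 df
pchisq(186.61 - 185.20, df = 1, lower.tail = FALSE)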

Powers of a covariate

## Randomly generate 500 values between 5 and 10
x <- runif(500, 5, 10)

## Raise the vector to powers 1 through 6, one column per power
X <- sapply(1:6, function(n) x^n)

## Find correlation matrix
cor(X)
##         [,1]    [,2]    [,3]    [,4]    [,5]    [,6]
## [1,] 1.00000 0.99640 0.98638 0.97129 0.95263 0.93172
## [2,] 0.99640 1.00000 0.99674 0.98783 0.97463 0.95845
## [3,] 0.98638 0.99674 1.00000 0.99714 0.98940 0.97805
## [4,] 0.97129 0.98783 0.99714 1.00000 0.99753 0.99092
## [5,] 0.95263 0.97463 0.98940 0.99753 1.00000 0.99790
## [6,] 0.93172 0.95845 0.97805 0.99092 0.99790 1.00000
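One standard remedy is to use orthogonal polynomials rather than raw powers. A sketch using base R's poly(), whose columns are constructed to be mutually uncorrelated:

## Orthogonal polynomial basis of degree 6; pairwise correlations are ~0
X_orth <- poly(x, degree = 6)
round(cor(X_orth), 5)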

AIC

fit1 <- glm(y ~ width, family = binomial, data = crab)
summary(fit1)
## 
## Call:
## glm(formula = y ~ width, family = binomial, data = crab)
## 
## Coefficients:
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)  -12.351      2.629   -4.70 0.0000026 ***
## width          0.497      0.102    4.89 0.0000010 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 194.45  on 171  degrees of freedom
## AIC: 198.5
## 
## Number of Fisher Scoring iterations: 4
fit2 <- glm(y ~ width + factor(color), 
           family = binomial, data = crab)
summary(fit2)
## 
## Call:
## glm(formula = y ~ width + factor(color), family = binomial, data = crab)
## 
## Coefficients:
##                Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)    -11.3852     2.8735   -3.96 0.0000743 ***
## width            0.4680     0.1055    4.43 0.0000093 ***
## factor(color)2   0.0724     0.7399    0.10      0.92    
## factor(color)3  -0.2238     0.7771   -0.29      0.77    
## factor(color)4  -1.3299     0.8525   -1.56      0.12    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 225.76  on 172  degrees of freedom
## Residual deviance: 187.46  on 168  degrees of freedom
## AIC: 197.5
## 
## Number of Fisher Scoring iterations: 4

Checking by hand

## Difference in AIC (decrease is better)
AIC(fit2) - AIC(fit1)
## [1] -0.99563
## Equivalently: difference in residual deviances plus 2 * (3 extra parameters)
(187.46 - 194.45) + 2 * 3
## [1] -0.99
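This works because the crab response is ungrouped binary data, so the saturated log-likelihood is zero and AIC reduces to the residual deviance plus twice the number of estimated coefficients. A quick check:

## AIC = residual deviance + 2 * (number of coefficients) for binary data
fit1$deviance + 2 * length(coef(fit1)) # equals AIC(fit1), up to rounding
AIC(fit1)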

Model Diagnostics in the Categorical Covariate Case

This ends up being very similar to our log-linear models: with only categorical covariates the data form a contingency table, so we can check goodness of fit with the standard Pearson \(\chi^2\) statistic, with the \(G^2\) statistic, or by directly comparing observed and fitted counts.

mj <- read.table("http://www.stat.ufl.edu/~aa/cat/data/Marijuana.dat", 
                   header = TRUE)

## Add total column
mj$total <- mj$yes + mj$no

fit <- glm(yes / total ~ race + gender, family = binomial, 
           weights = total, data = mj)

summary(fit)
## 
## Call:
## glm(formula = yes/total ~ race + gender, family = binomial, data = mj, 
##     weights = total)
## 
## Coefficients:
##             Estimate Std. Error z value   Pr(>|z|)    
## (Intercept)  -0.8303     0.1685   -4.93 0.00000084 ***
## racewhite     0.4437     0.1677    2.65     0.0081 ** 
## gendermale    0.2026     0.0852    2.38     0.0174 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12.752784  on 3  degrees of freedom
## Residual deviance:  0.057982  on 1  degrees of freedom
## AIC: 30.41
## 
## Number of Fisher Scoring iterations: 3
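As an aside, the same model can be specified with a two-column response instead of a proportion plus weights; the sketch below (fit_cbind is just an illustrative name) gives identical estimates:

## Equivalent grouped-binomial interface: cbind(successes, failures)
fit_cbind <- glm(cbind(yes, no) ~ race + gender, family = binomial, data = mj)
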
## Get predicted probabilities for each group
mj$prob <- predict(fit, newdata = mj, type = "response")

## Get expected values
mj$exp_y <- with(mj, total * prob)
mj$exp_n <- with(mj, total * (1-prob))

## See these side-by-side
with(mj, cbind(yes, exp_y))
##      yes   exp_y
## [1,] 420 420.714
## [2,] 483 482.286
## [3,]  25  24.286
## [4,]  32  32.714
with(mj, cbind(no, exp_n))
##       no   exp_n
## [1,] 620 619.286
## [2,] 579 579.714
## [3,]  55  55.714
## [4,]  62  61.286

Here is our \(G^2\) and Pearson statistics. Note that we need to compare fitted and observed for both success and failure

## Pearson
sum((mj$yes - mj$exp_y)^2 / mj$exp_y) +
  sum((mj$no - mj$exp_n)^2 / mj$exp_n)
## [1] 0.058062
## G^2
2 * (sum(mj$yes * log(mj$yes / mj$exp_y)) + sum(mj$no * log(mj$no / mj$exp_n)))
## [1] 0.057982
## Deviance from the model: matches G^2 exactly (Pearson differs slightly)
fit$deviance
## [1] 0.057982
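To turn either statistic into a formal goodness-of-fit test, compare it to a \(\chi^2\) distribution on the residual degrees of freedom (here \(df = 1\)); a large p-value indicates no evidence of lack of fit:

## p-value for H0: the model fits adequately
pchisq(fit$deviance, df = fit$df.residual, lower.tail = FALSE)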