Logistic regression from scratch (Newton-Raphson and Fisher Scoring)

Feb 15, 2022 · 3 min read

In an earlier post, I showed how to do this using iteratively reweighted least squares (IRLS). This is just an alternative approach using the Newton-Raphson and Fisher scoring algorithm. For further details, you can look here as well.
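For the logistic model with the canonical logit link, Newton-Raphson and Fisher scoring are the same algorithm, and the update implemented below is

$$
\beta^{(t+1)} = \beta^{(t)} + (X^\top W X)^{-1} X^\top (y - \mu), \qquad W = \mathrm{diag}\{\mu_i(1 - \mu_i)\},
$$

where $X^\top W X$ is the Fisher information and $X^\top (y - \mu)$ is the score.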

library(MLMusingR)
data(suspend)

m1 <- glm(sus ~ male + gpa * frpl + fight + frmp.c * pminor.c, 
          data = suspend, 
          family = binomial)

### extracting raw components

dat <- model.frame(m1)
fml <- formula(m1)
X <- model.matrix(fml, dat)
y <- model.response(dat, 'numeric')
k <- ncol(X)
#pp <- function(x) 1 / (1 + exp(-x)) #convert logit to pred prob

beta_0 <- rep(0, k) #initialize with zeroes
eta <- X %*% beta_0 #predicted logit
mu <- plogis(eta) #same as pp but built-in; convert logit to pred prob
vr <- (mu * (1 - mu)) #variance

wts <- diag(as.vector(vr)) #create weight matrix
d2 <- t(X) %*% wts %*% X #the Fisher information matrix
u1 <- t(X) %*% (y - mu) #the score function
iter <- 50 #max number of iterations
tol <- 1e-8 #tolerance

### now iterate

for (i in 1:iter){
  #beta_1 <- beta_0 + solve(d2) %*% u1 #update beta
  beta_1 <- beta_0 + chol2inv(chol(d2)) %*% u1 #update beta, inverting via the Cholesky factorization
  #print(beta_1) #show estimates
  
  if(all(abs((beta_1 - beta_0) / beta_0) < tol)) break #stop once every relative change is below tolerance

  eta <- X %*% beta_1 #predicted logit
  mu <- plogis(eta) #this is just pp(eta)
  vr <- mu * (1 - mu) #variance
  
  wts <- diag(as.vector(vr)) #update the weight matrix
  d2 <- t(X) %*% wts %*% X #information matrix
  u1 <- t(X) %*% (y - mu) #score function
  beta_0 <- beta_1
  cat(i, "::") #show iteration number
}
1 ::2 ::3 ::4 ::5 ::6 ::7 ::
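As an aside, the same update can be wrapped in a small function. This is just a sketch of my own (the name fisher_logit is made up, not from any package); it uses crossprod() so the n × n weight matrix never has to be formed explicitly:

fisher_logit <- function(X, y, iter = 50, tol = 1e-8){
  beta <- rep(0, ncol(X)) #start at zero, as above
  for (i in 1:iter){
    mu <- plogis(X %*% beta) #predicted probabilities
    vr <- as.vector(mu * (1 - mu)) #variance function
    d2 <- crossprod(X, vr * X) #X'WX without building diag(vr)
    u1 <- crossprod(X, y - mu) #score: X'(y - mu)
    step <- solve(d2, u1) #solve the linear system instead of forming an inverse
    beta <- beta + step
    if (max(abs(step)) < tol) break #stop when the step is negligible
  }
  drop(beta)
}
## fisher_logit(X, y) should reproduce coef(m2) up to tolerance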
### compare results

se <- sqrt(diag(solve(d2))) #get standard error, inverse of information matrix
z <- beta_1 / se
p.val <- 2 * pt(-abs(z), df = Inf)
df <- data.frame(beta = beta_1, se = se, z = z, p = format(p.val, digits = 3))
df
                     beta       se      z        p
(Intercept)     -1.592202 0.269404 -5.910 3.42e-09
male             0.324897 0.099384  3.269 1.08e-03
gpa             -0.795479 0.084849 -9.375 6.90e-21
frpl            -0.562734 0.318874 -1.765 7.76e-02
fight            2.078100 0.098472 21.103 7.40e-99
frmp.c           0.003004 0.003189  0.942 3.46e-01
pminor.c        -0.002236 0.002302 -0.971 3.31e-01
gpa:frpl         0.387256 0.109169  3.547 3.89e-04
frmp.c:pminor.c  0.000124 0.000107  1.167 2.43e-01
#### to compare

m2 <- update(m1, epsilon = 1e-16) #change tolerance so results match
summary(m2)$coef
                 Estimate Std. Error z value Pr(>|z|)
(Intercept)     -1.592202   0.269404  -5.910 3.42e-09
male             0.324897   0.099384   3.269 1.08e-03
gpa             -0.795479   0.084849  -9.375 6.90e-21
frpl            -0.562734   0.318874  -1.765 7.76e-02
fight            2.078100   0.098472  21.103 7.40e-99
frmp.c           0.003004   0.003189   0.942 3.46e-01
pminor.c        -0.002236   0.002302  -0.971 3.31e-01
gpa:frpl         0.387256   0.109169   3.547 3.89e-04
frmp.c:pminor.c  0.000124   0.000107   1.167 2.43e-01
## equal deviances
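For Bernoulli (0/1) data the residual deviance is just minus twice the log-likelihood evaluated at the fitted probabilities,

$$
D = -2 \sum_i \left[ y_i \log \hat\mu_i + (1 - y_i) \log(1 - \hat\mu_i) \right],
$$

which is what the line below computes.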

-2 * sum((y * log(mu)) + (1 - y) * log(1 - mu))
[1] 3331
deviance(m2)
[1] 3331

An alternative: maximum likelihood estimation (MLE) with a general-purpose optimizer (optim)
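The function being maximized below is the Bernoulli log-likelihood,

$$
\ell(\beta) = \sum_i \left[ y_i \log \mu_i + (1 - y_i) \log(1 - \mu_i) \right], \qquad \mu_i = \mathrm{logit}^{-1}(x_i^\top \beta),
$$

which is exactly what sum(dbinom(y, 1, pp, log = TRUE)) returns.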

m3 <- glm(sus ~ male + gpa + frpl + gpa * frpl, 
          data = suspend, 
          family = binomial)
X <- model.matrix(m3)

## the log-likelihood function to maximize
mle <- function(b){
  pp <- plogis(X %*% b) #predicted probabilities
  sum(dbinom(y, 1, pp, log = TRUE)) #Bernoulli log likelihood
}

k <- ncol(X) #number of variables
sta <- rep(0, k) #initial values

out <- optim(par = sta, mle, #starting values and function to optimize
            control = list(fnscale = -1, #-1 to maximize
            reltol = 1e-15, #tolerance
            maxit = 1000), #max iterations so it doesn't keep going
            hessian = TRUE) #return the Hessian, used later for standard errors
out$par #coefficients; any small difference from glm is likely due to tolerance
[1] -0.371  0.673 -0.988 -0.543  0.406
coef(m3) #using built in function
(Intercept)        male         gpa        frpl    gpa:frpl 
     -0.371       0.673      -0.988      -0.543       0.406 
out$value #the log likelihood
[1] -1908
logLik(m3)
'log Lik.' -1908 (df=5)
sqrt(diag(solve(-out$hessian))) #standard errors
[1] 0.2386 0.0931 0.0777 0.2941 0.1010
sqrt(diag(vcov(m3)))
(Intercept)        male         gpa        frpl    gpa:frpl 
     0.2386      0.0931      0.0777      0.2941      0.1010 
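Since the Hessian of the log-likelihood at the maximum is minus the observed information, inverting $-H$ gives an approximate covariance matrix for the estimates,

$$
\widehat{\operatorname{Var}}(\hat\beta) \approx (-H)^{-1},
$$

and the square roots of its diagonal are the standard errors compared above.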

– END