# Logistic regression from scratch (Newton Raphson and Fisher Scoring)

In an earlier post, I had shown this using iteratively reweighted least squares (IRLS). This is just an alternative method using Newton Raphson and the Fisher scoring algorithm. For further details, you can look here as well.

library(MLMusingR)
data(suspend)

m1 <- glm(sus ~ male + gpa * frpl + fight + frmp.c * pminor.c,
data = suspend,
family = binomial)

### extracting raw components

dat <- model.frame(m1)
fml <- formula(m1)
X <- model.matrix(fml, dat)
y <- model.response(dat, 'numeric')
k <- ncol(X)
#pp <- function(x) 1 / (1 + exp(-x)) #convert logit to pred prob

beta_0 <- rep(0, k) #initialize with zeroes
eta <- X %*% beta_0 #predicted logit
mu <- plogis(eta) #same as pp but built-in; convert logit to pred prob
vr <- (mu * (1 - mu)) #variance

wts <- diag(as.vector(vr)) #create weight matrix
d2 <- t(X) %*% wts %*% X #the Fisher information matrix
u1 <- t(X) %*% (y - mu) #the score function
iter <- 50 #max number of iterations
tol <- 1e-8 #tolerance

### now iterate

for (i in 1:iter){
#beta_1 <- beta_0 + solve(d2) %*% u1 #update beta
beta_1 <- beta_0 + chol2inv(chol(d2)) %*% u1 #update beta avoiding inversion
#print(beta_1) #show estimates

if(any(abs((beta_1 - beta_0) / beta_0) < tol)) break #stop loop if no change

eta <- X %*% beta_1 #predicted logit
mu <- plogis(eta) #this is just pp(eta)
vr <- mu * (1 - mu) #variance

wts <- diag(as.vector(vr)) #putting in a weight matrix
d2 <- t(X) %*% wts %*% X #information matrix
u1 <- t(X) %*% (y - mu) #score function
beta_0 <- beta_1
cat(i, "::") #show iteration number
}
1 ::2 ::3 ::4 ::5 ::6 ::7 ::
### compare results

se <- sqrt(diag(solve(d2))) #get standard error, inverse of information matrix
z <- beta_1 / se
p.val <- 2 * pt(-abs(z), df = Inf)
df <- data.frame(beta = beta_1, se = se, z = z, p = format(p.val, 3))
df
                         beta          se          z            p
(Intercept)     -1.5922023202 0.269404299 -5.9100851 3.419311e-09
male             0.3248969875 0.099383812  3.2691138 1.078849e-03
gpa             -0.7954794245 0.084849355 -9.3751971 6.904665e-21
frpl            -0.5627344684 0.318873753 -1.7647563 7.760473e-02
fight            2.0780999956 0.098472087 21.1034422 7.395171e-99
frmp.c           0.0030040140 0.003189348  0.9418898 3.462491e-01
pminor.c        -0.0022362792 0.002302005 -0.9714485 3.313250e-01
gpa:frpl         0.3872562271 0.109168665  3.5473204 3.891710e-04
frmp.c:pminor.c  0.0001243666 0.000106535  1.1673780 2.430578e-01
#### to compare

m2 <- update(m1, epsilon = 1e-16) #change tolerance so results match
summary(m2)$coef  Estimate Std. Error z value Pr(>|z|) (Intercept) -1.5922023202 0.269404299 -5.9100851 3.419311e-09 male 0.3248969875 0.099383812 3.2691138 1.078849e-03 gpa -0.7954794245 0.084849355 -9.3751971 6.904665e-21 frpl -0.5627344684 0.318873753 -1.7647563 7.760473e-02 fight 2.0780999956 0.098472087 21.1034422 7.395171e-99 frmp.c 0.0030040140 0.003189348 0.9418898 3.462491e-01 pminor.c -0.0022362792 0.002302005 -0.9714485 3.313250e-01 gpa:frpl 0.3872562271 0.109168665 3.5473204 3.891710e-04 frmp.c:pminor.c 0.0001243666 0.000106535 1.1673780 2.430578e-01 ## equal deviances -2 * sum((y * log(mu)) + (1 - y) * log(1 - mu)) [1] 3331.017 deviance(m2) [1] 3331.017 ## An alternative using maximum likelihood estimation (MLE) m3 <- glm(sus ~ male + gpa + frpl + gpa * frpl, data = suspend, family = binomial) X <- model.matrix(m3) ## this is the function to maximize mle <- function(b){ pp <- plogis(X %*% b) sum(dbinom(y, 1, pp, log = TRUE)) } k <- ncol(X) #number of variables sta <- rep(0, k) #initial values out <- optim(par = sta, mle, #starting values and function to optimize control = list(fnscale = -1, #-1 to maximize reltol = 1e-15, #tolerance maxit = 1000), hessian = TRUE) #so it doesn't keep iterating out$par #coefficients, difference due to tolerance?
[1] -0.3712208  0.6727835 -0.9884962 -0.5433412  0.4056071
coef(m3) #using built in function
(Intercept)        male         gpa        frpl    gpa:frpl
-0.3712205   0.6727835  -0.9884965  -0.5433411   0.4056072 
out$value #the log likelihood [1] -1908.202 logLik(m3) 'log Lik.' -1908.202 (df=5) sqrt(diag(solve(-out$hessian))) #standard errors
[1] 0.23858787 0.09314298 0.07768979 0.29413387 0.10102125
sqrt(diag(vcov(m3)))
(Intercept)        male         gpa        frpl    gpa:frpl
0.23858792  0.09314278  0.07768976  0.29413406  0.10102131 

– END

Previous