library(tidyverse)
library(knitr)
library(tidymodels)
AE 06: Newton Raphson
Go to the course GitHub organization and locate your ae-06 repo to get started.
You do not need to submit this AE.
Introduction
COVID-19 infection prevention practices at food establishments
Researchers at Wollo University in Ethiopia conducted a study in July and August 2020 to understand factors associated with good COVID-19 infection prevention practices at food establishments. Their study is published in Andualem et al. (2022).
They were particularly interested in the understanding implementation of prevention practices at food establishments, given the workers’ increased risk due to daily contact with customers.
We will use the data from Andualem et al. (2022) to explore the association between age, sex, years of service, and whether someone works at a food establishment with access to personal protective equipment (PPE) as of August 2020. We will use access to PPE as a proxy for wearing PPE.
The study participants were selected using a simple random sampling at the selected establishments.
.
<- read_csv("data/covid-prevention-study.csv") |>
covid_df rename(age = "Age of food handlers",
years = "Years of service",
ppe_access = "Availability of PPEs") |>
mutate(sex = factor(if_else(Sex == 2, "Female", "Male")),
ppe_access = as_factor(ppe_access))
Functions for Newton-Raphson
## Calculate the gradient of logL (score function)
<- function(beta, X, y){
calc_first_deriv <- rep(0, length(beta))
first_deriv for(i in 1:length(y)){
<- first_deriv + as.numeric(y[i] - exp(X[i,] %*% beta)/(1 + exp(X[i,] %*% beta))) %*% X[i,]
first_deriv
}return(colSums(first_deriv)) #return values as a vector
}
## Calculate the Hessian matrix
<- function(beta, X, y){
calc_second_deriv <- matrix(0, nrow = length(beta), ncol = length(beta))
second_deriv for(i in 1:length(y)){
<- second_deriv + as.numeric(1/(1 + exp(X[i,] %*% beta)))*
second_deriv as.numeric((exp(X[i,] %*% beta)/(1 + exp(X[i,] %*% beta)))) *
%*% t(X[i,])
(X[i,])
}return(second_deriv)
}
Starting values
# design matrix
<- model.matrix(~ age + sex + years, data = covid_df)
X
# vector of response
<- I(covid_df$ppe_access == 1)
y
# vector of coefficients
<- c(0, 0, 0, 0)
beta
# keep track of iterations
<- 1
iter
# keep track of difference in estimates
<- 1
delta
# keep track of estimates at each iteration
<- matrix(0, nrow = 500, ncol = 4) temp
Estimate
while(delta > 0.000001 & iter < 50){
<- beta
old <- old - solve(-1 * calc_second_deriv(beta = beta, X = X, y = y)) %*%
beta calc_first_deriv(beta = beta, X = X, y = y)
<- beta
temp[iter,] <- sqrt(sum((beta - old)^2))
delta <- iter + 1
iter }
Show results
iter
[1] 7
beta
[,1]
(Intercept) -2.12709823
age 0.05586608
sexMale 0.34070061
years 0.26396205
Note that
#calculate the hessian matrix
<- calc_second_deriv(beta = beta, X = X, y = y)
second_deriv
# take the inverse
<- solve(second_deriv)
inv_second_deriv
# get estimates for SE
<- sqrt(diag(inv_second_deriv))
se_beta
se_beta
[1] 0.45832580 0.01740581 0.22360561 0.06583117
Coefficient estimates from glm
<- glm(ppe_access ~ age + sex + years,
ppe_model data = covid_df, family = binomial)
tidy(ppe_model, conf.int = TRUE) |>
kable(digits = 3)
term | estimate | std.error | statistic | p.value | conf.low | conf.high |
---|---|---|---|---|---|---|
(Intercept) | -2.127 | 0.458 | -4.641 | 0.000 | -3.058 | -1.257 |
age | 0.056 | 0.017 | 3.210 | 0.001 | 0.023 | 0.091 |
sexMale | 0.341 | 0.224 | 1.524 | 0.128 | -0.098 | 0.780 |
years | 0.264 | 0.066 | 4.010 | 0.000 | 0.143 | 0.401 |