OLS with matrices
The linear regression model takes the form: \[y=\beta_0+\beta_1x_1+\beta_2x_2...+\beta_kx_{k}+\varepsilon\]
where \(x_1,x_2...x_k\) are the predictor variables and \(\beta_1,\beta_2...\beta_k\) are the coefficients.
In matrix form: \[y=X\beta+\varepsilon\]
where \(y\) is an \(n \times 1\) vector of the dependent variable, \(X\) is \(n \times (p+1)\) with a first column of 1s that multiplies the intercept \(\beta_0\), \(\beta\) is a \((p+1) \times 1\) vector of regression coefficients (including the intercept), and \(\varepsilon\) is an \(n \times 1\) vector of residuals (or errors).
This model returns a prediction that is essentially a weighted sum of the independent variables, plus an intercept that represents the expected value of the dependent variable when all predictors are set to zero.
In R, let's model api00 as a function of enroll, meals and full:
d <- read.csv("https://stats.idre.ucla.edu/wp-content/uploads/2019/02/elemapi2v2.csv")
fit <- lm(api00 ~ enroll + meals + full, data = d)
X <- as.matrix(cbind(1, d[, c("enroll", "meals", "full")]))
y <- d$api00
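As a quick check of the dimensions described above (the data contain 400 schools, so X should be 400 × 4 and y of length 400):
dim(X)
length(y)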
The vector of coefficients \(\hat{\beta}\) can be obtained using matrix operations: \[\hat{\beta}=(X^TX)^{-1}X^Ty\]
betas <- solve(t(X) %*% X) %*% t(X) %*% y
The variance of \(\hat{\beta}\) is \(Var(\hat{\beta})=\hat{\sigma^2}(X^TX)^{-1}\)
# Get the sigma hat squared (residual_variance)
residuals <- y - (X %*% betas)
k <- ncol(X) - 1
degrees_of_freedom <- nrow(X) - k - 1
residual_variance <- sum(residuals^2) / degrees_of_freedom
# Multiply it as in the equation
betas_cov <- residual_variance * solve(t(X) %*% X)
betas_se <- sqrt(diag(betas_cov))
cbind(c(betas), unname(c(betas_se)))
[,1] [,2]
[1,] 801.8298337 26.42660282
[2,] -0.0514556 0.01383741
[3,] -3.6597345 0.10879550
[4,] 1.0810944 0.23945269
coef(summary(fit))[,1:2]
Estimate Std. Error
(Intercept) 801.8298337 26.42660282
enroll -0.0514556 0.01383741
meals -3.6597345 0.10879550
full 1.0810944 0.23945269
Maximum likelihood estimation
With \(y=X\beta+\varepsilon\):
\[L(\beta, \sigma^2 | Y, X) = \prod_{i=1}^{n} {\frac {1}{\sigma {\sqrt {2\pi }}}}e^{-{\frac {1}{2}}\left({\frac {y_i-X_i\beta }{\sigma }}\right)^{2}}\]
With a little bit of math we can get the log-likelihood:
\[\ln L(\beta, \sigma^2 | Y, X) = -\frac{n}{2}\ln(2\pi)-\frac{n}{2}\ln(\sigma^2)-\frac{1}{2\sigma^2}\sum_{i=1}^{n}(y_i-X_i\beta)^2\]
This is something we can optimize in R:
betas_params_initial <- rnorm(4)
sigma_params_initial <- rlnorm(1)
params <- c(betas_params_initial, sigma_params_initial)
to_optim <- function(pars, y, x1, x2, x3){
  b0 <- pars[1]
  b1 <- pars[2]
  b2 <- pars[3]
  b3 <- pars[4]
  sigma_u <- pars[5]
  sigma <- exp(sigma_u) # optimize on the log scale to keep sigma positive
  mu_vector <- b0 + b1*x1 + b2*x2 + b3*x3
  n <- length(y)
  # return the negative log-likelihood
  -sum(sapply(1:n, function(x) {dnorm(y[x], mu_vector[x], sigma, log=TRUE)}))
  # alternatively without using dnorm
  #log_density <- function(x, mean, sd) {-0.5 * log(2 * pi) - log(sd) - 0.5 * ((x - mean)/sd)^2}
  #-sum(sapply(1:n, function(x) {log_density(y[x], mu_vector[x], sigma)}))
}
optim(params, to_optim, y=d$api00, x1=d$enroll, x2=d$meals, x3=d$full, method="L-BFGS-B")$par
[1] 801.83873933 -0.05146513 -3.65975800 1.08106420 4.06796330
coef(summary(fit))[,1]
(Intercept) enroll meals full
801.8298337 -0.0514556 -3.6597345 1.0810944
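Note that the fifth value returned by optim() is not a coefficient but sigma_u, i.e. \(\log(\sigma)\). Back-transforming it (a quick check, not part of the original output) recovers the maximum likelihood estimate of the residual standard deviation, which divides the residual sum of squares by \(n\) rather than \(n-k-1\) and is therefore slightly smaller than lm()'s residual standard error of 58.73:
exp(4.06796330)                   # about 58.4
sqrt(sum(fit$residuals^2) / 400)  # ML estimate of sigma, also about 58.4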
For fixed \(\sigma\), maximizing the Gaussian likelihood is equivalent to minimizing the sum of squared residuals, so we could optimize that directly:
betas_params_initial <- rnorm(4)
paramsDirectSum <- betas_params_initial
to_optimDirectSum <- function(pars, y, x1, x2, x3){
  b0 <- pars[1]
  b1 <- pars[2]
  b2 <- pars[3]
  b3 <- pars[4]
  mu_vector <- b0 + b1*x1 + b2*x2 + b3*x3
  n <- length(y)
  sum((y - mu_vector)^2)
}
optim(paramsDirectSum, to_optimDirectSum, y=d$api00, x1=d$enroll, x2=d$meals, x3=d$full, method="L-BFGS-B")$par
[1] 801.82908193 -0.05146091 -3.65973002 1.08113458
Optimization does not always work:
(nelder_mead_params <- optim(params, to_optim, y=d$api00, x1=d$enroll, x2=d$meals, x3=d$full, method="Nelder-Mead")$par)
[1] 0.6391398 1.0909668 -8.1725264 5.4775162 5.7315367
(BFGS_params <- optim(params, to_optim, y=d$api00, x1=d$enroll, x2=d$meals, x3=d$full, method="BFGS")$par)
[1] 840.83648191 -0.06076046 -3.76474390 0.74301723 4.07261206
coef(summary(fit))[,1]
(Intercept) enroll meals full
801.8298337 -0.0514556 -3.6597345 1.0810944
I’m not sure why Nelder-Mead is so bad. Maybe it found a local minimum.
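One thing worth checking (I haven't verified it here) is whether Nelder-Mead simply hit its iteration limit: optim() defaults to maxit = 500 for this method, which can be tight for a five-parameter problem started from random values. The convergence code and a rerun with a larger budget would tell:
optim(params, to_optim, y=d$api00, x1=d$enroll, x2=d$meals, x3=d$full,
      method="Nelder-Mead")$convergence  # 1 means the iteration limit was reached
optim(params, to_optim, y=d$api00, x1=d$enroll, x2=d$meals, x3=d$full,
      method="Nelder-Mead", control=list(maxit=50000, reltol=1e-12))$par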
To obtain the standard errors, we need the Hessian matrix of second derivatives of the log-likelihood function, evaluated at the maximum likelihood estimates. Here I need to brush up; I will write something in the future about gradients, Jacobians, Hessians and Newton's method for MLE.
The inverse of this matrix is an estimate of the covariance matrix of the parameter estimates.
fit_optim <- optim(params, to_optim, y=d$api00, x1=d$enroll, x2=d$meals, x3=d$full, method="L-BFGS-B", hessian = TRUE)
cov_beta_hat <- solve(fit_optim$hessian)
(se_beta_hat <- sqrt(diag(cov_beta_hat)))
[1] 26.29256437 0.01376801 0.10824831 0.23823963 0.03535563
coef(summary(fit))[,2]
(Intercept) enroll meals full
26.42660282 0.01383741 0.10879550 0.23945269
We can also do the optimization by hand using Newton-Raphson. It updates the parameters by subtracting the product of the inverse of the Hessian and the gradient vector.
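In symbols, the update implemented in the loop below is: \[\beta^{(t+1)} = \beta^{(t)} - H^{-1}g, \qquad g = X^T(y - X\beta^{(t)}), \qquad H = -X^TX\]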
betas <- params[1:4]
for (i in 1:5){
  yhat <- X %*% betas
  gradients <- t(X) %*% (y - yhat)
  hessian <- - t(X) %*% X
  betas <- betas - solve(hessian) %*% gradients
}
residuals <- y - X %*% betas
sigma2_hat <- sum(residuals^2) / (length(y) - length(betas))
cov_beta_hat <- sigma2_hat * solve(-hessian)
se_beta_hat <- sqrt(diag(cov_beta_hat))
cbind(c(betas), unname(c(se_beta_hat)))
[,1] [,2]
[1,] 801.8298337 26.42660282
[2,] -0.0514556 0.01383741
[3,] -3.6597345 0.10879550
[4,] 1.0810944 0.23945269
coef(summary(fit))[,1:2]
Estimate Std. Error
(Intercept) 801.8298337 26.42660282
enroll -0.0514556 0.01383741
meals -3.6597345 0.10879550
full 1.0810944 0.23945269
Finally, gradient descent is used in machine learning more than in the social sciences, but it's helpful to know; here's how to get the point estimates of the coefficients with it. For gradient descent we first need to scale the variables; otherwise a single learning rate has to cope with predictors that live on very different scales.
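The update applied to each coefficient \(j\) inside the loop below is: \[\beta_j \leftarrow \beta_j - \frac{\alpha}{m}\sum_{i=1}^{m}\left(\hat{y}_i - y_i\right)x_{ij}\]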
X <- as.matrix(cbind(1, d[, c("enroll", "meals", "full")]))
X[,-1] <- scale(X[,-1])
y <- scale(d$api00)
m <- length(y)

# Settings
alpha <- 0.001 # Learning rate
num_iterations <- 10000
betas <- matrix(rnorm(4, 0, 0.1), ncol(X), 1) # Initialize beta values
# Gradient Descent in a for loop
for(i in 1:num_iterations){
  # Compute the prediction
  prediction <- X %*% betas

  # Compute the error
  error <- prediction - matrix(y, ncol=1)

  # Update betas
  for(j in 1:ncol(X)){
    gradient <- t(X[, j]) %*% error
    betas[j,] <- betas[j,] - alpha * (1/m) * gradient
  }
}
round(print(betas),3)
[,1]
[1,] -2.928004e-06
[2,] -8.122462e-02
[3,] -8.179810e-01
[4,] 1.169966e-01
[,1]
[1,] 0.000
[2,] -0.081
[3,] -0.818
[4,] 0.117
round(coef(lm(y~X[,-1])),3)
(Intercept) X[, -1]enroll X[, -1]meals X[, -1]full
0.000 -0.082 -0.821 0.114
It worked, but I had to play with the learning rate (alpha) and the number of iterations.
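Because both X and y were standardized, the estimates above are standardized coefficients. A minimal sketch for mapping them back to the original scale (ignoring the near-zero standardized intercept; sd_y, sd_x, betas_original and intercept_original are names introduced here):
sd_y <- sd(d$api00)
sd_x <- apply(d[, c("enroll", "meals", "full")], 2, sd)
betas_original <- as.numeric(betas[-1]) * sd_y / sd_x
intercept_original <- mean(d$api00) - sum(betas_original * colMeans(d[, c("enroll", "meals", "full")]))
c(intercept_original, betas_original)
This should land close to coef(fit) from the unscaled regression.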
Additional statistics from lm()
summary(fit)
Call:
lm(formula = api00 ~ enroll + meals + full, data = d)
Residuals:
Min 1Q Median 3Q Max
-181.721 -40.802 1.129 39.983 158.774
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 801.82983 26.42660 30.342 < 2e-16 ***
enroll -0.05146 0.01384 -3.719 0.000229 ***
meals -3.65973 0.10880 -33.639 < 2e-16 ***
full 1.08109 0.23945 4.515 8.37e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 58.73 on 396 degrees of freedom
Multiple R-squared: 0.8308, Adjusted R-squared: 0.8295
F-statistic: 648.2 on 3 and 396 DF, p-value: < 2.2e-16
(R2 <- 1 - (sum((fit$model$api00 - predict(fit))^2) /
            sum((fit$model$api00 - mean(fit$model$api00))^2)))
[1] 0.8308122
(R2adj <- 1 - (1 - R2) * ((nrow(fit$model) - 1) /
                          (nrow(fit$model) - length(fit$coefficients[-1]) - 1)))
[1] 0.8295304
(Residuals <- quantile(fit$residuals, c(0, 0.25, 0.5, 0.75, 1)))
0% 25% 50% 75% 100%
-181.720968 -40.801636 1.128517 39.983048 158.774215
(tstats <- coef(summary(fit))[,1] / coef(summary(fit))[,2])
(Intercept) enroll meals full
30.341767 -3.718585 -33.638657 4.514856
(d_free <- nrow(fit$model) - length(fit$coefficients))
[1] 396
(pvalues <- 2 * (1 - pt(abs(tstats), df=d_free)))
(Intercept) enroll meals full
0.000000e+00 2.292395e-04 0.000000e+00 8.369020e-06
(Res_se <- sqrt(sum(fit$residuals^2) /
                (nrow(fit$model) - (1 + length(fit$coefficients[-1])))))
[1] 58.73169
SSE <- sum(fit$residuals^2)
SSyy <- sum((fit$model$api00 - mean(fit$model$api00))^2)
k <- length(fit$coefficients) - 1
Fstat <- ((SSyy - SSE)/k) / (SSE/(400 - (3 + 1)))
(Fstat_pval <- 1 - pf(Fstat, 3, 396))
[1] 0
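The p-value prints as 0 only because it is below what double precision can represent (lm() reports < 2.2e-16). To see how small it actually is, one option is to work on the log scale:
pf(Fstat, 3, 396, lower.tail = FALSE, log.p = TRUE)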
Additional statistics from glm() for the Gaussian family
fit_glm <- glm(api00 ~ enroll + meals + full, data = d, family='gaussian')
summary(fit_glm)
Call:
glm(formula = api00 ~ enroll + meals + full, family = "gaussian",
data = d)
Deviance Residuals:
Min 1Q Median 3Q Max
-181.721 -40.802 1.129 39.983 158.774
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 801.82983 26.42660 30.342 < 2e-16 ***
enroll -0.05146 0.01384 -3.719 0.000229 ***
meals -3.65973 0.10880 -33.639 < 2e-16 ***
full 1.08109 0.23945 4.515 8.37e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 3449.412)
Null deviance: 8073672 on 399 degrees of freedom
Residual deviance: 1365967 on 396 degrees of freedom
AIC: 4399.5
Number of Fisher Scoring iterations: 2
quantile(fit_glm$residuals,c(0,0.25,0.5,0.75,1))
0% 25% 50% 75% 100%
-181.720968 -40.801636 1.128517 39.983048 158.774215
(Dispersion <- sum(fit$residuals^2) /
               (nrow(fit$model) - (1 + length(fit$coefficients[-1]))))
[1] 3449.412
It's possible to get the loglik by brute force:
(loglikMLE <- sum(
  sapply(1:400, function(i){
    log(dnorm(fit_glm$model$api00[i],
              coef(fit_glm)[[1]] + sum(fit_glm$model[i,2:4] * coef(fit_glm)[2:4]),
              sqrt(Dispersion)))
  }) ))
[1] -2194.767
logLik(fit_glm)
'log Lik.' -2194.757 (df=5)
It's also possible to get the loglik from the closed-form expression obtained by simplifying the summation:
(nulldeviance <- sum((d$api00 - mean(d$api00))^2))
[1] 8073672
(residualdeviance <- sum(residuals(fit_glm)^2))
[1] 1365967
(loglikDirect <-
  (-400/2) * log(2*pi) - (400/2) * log(Dispersion) -
  (1/(2*Dispersion)) * residualdeviance)
[1] -2194.767
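The small gap between this value (and the brute-force one) and logLik(fit_glm) comes from the variance estimate: Dispersion divides the residual sum of squares by the residual degrees of freedom (396), while logLik() uses the maximum likelihood estimate that divides by n = 400. Plugging in the ML dispersion (Dispersion_ML is a name introduced here) should reproduce logLik(fit_glm) up to rounding:
Dispersion_ML <- sum(residuals(fit_glm)^2) / 400
(-400/2) * log(2*pi) - (400/2) * log(Dispersion_ML) -
  (1/(2*Dispersion_ML)) * sum(residuals(fit_glm)^2)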
4 coefficient parameters + the Gaussian dispersion = 5 parameters, so:
(AIC = 2*5 - 2*logLik(fit_glm))
'log Lik.' 4399.514 (df=5)
BIC(fit_glm)
[1] 4419.472
unname(5*log(400) - 2*logLik(fit_glm))
'log Lik.' 4419.472 (df=5)
I will discuss Fisher Scoring in another entry.
Confidence and prediction intervals
(as.matrix(cbind(1, fit$model[,-1])) %*% coef(fit))[1:10]
[1] 626.0813 526.7168 500.0250 545.0005 543.4118 855.6927 876.0496 820.4431
[9] 857.6799 780.8557
predict(fit)[1:10]
1 2 3 4 5 6 7 8
626.0813 526.7168 500.0250 545.0005 543.4118 855.6927 876.0496 820.4431
9 10
857.6799 780.8557
When using predict(fit, se.fit = TRUE), we can derive either a confidence interval or a prediction interval from the standard error of the prediction. They are different: the confidence interval reflects the uncertainty surrounding the estimated average value at the population level, and it would narrow indefinitely with infinite data, whereas the prediction interval describes individual data points and therefore also has to account for their inherent randomness.
First we need to calculate the standard error of the prediction.
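Concretely, for a row of predictor values \(x_0\), the variance of the fitted mean and the variance of a new observation are: \[\widehat{Var}(\hat{y}_0) = x_0^T\,\hat{\sigma}^2(X^TX)^{-1}x_0, \qquad \widehat{Var}(y_0^{new}) = \hat{\sigma}^2 + x_0^T\,\hat{\sigma}^2(X^TX)^{-1}x_0\] Since vcov(fit) returns \(\hat{\sigma}^2(X^TX)^{-1}\), the code below computes \(x_0^T V x_0\) row by row.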
fit <- lm(api00 ~ enroll + meals + full, data = d)
Xp <- as.matrix(cbind(1, fit$model[,-1]))
b <- coef(fit)
yh <- Xp %*% b
V <- vcov(fit)
var.fit <- rowSums((Xp %*% V) * Xp)
sqrt(var.fit)[1:10]
1 2 3 4 5 6 7 8
5.137250 4.268921 5.436181 4.717847 4.544710 5.561061 5.994418 16.628745
9 10
7.047945 4.265484
predict(fit,se.fit=TRUE)$se.fit[1:10]
[1] 5.137250 4.268921 5.436181 4.717847 4.544710 5.561061 5.994418
[8] 16.628745 7.047945 4.265484
Once we have the standard error of the prediction, we can get the confidence and prediction intervals as follows. The confidence interval is self-explanatory:
predit_fit <- predict(fit, se.fit=TRUE)
rbind(predit_fit$fit[1:10] - predit_fit$se.fit[1:10]*qt(0.975,396),
      predit_fit$fit[1:10] + predit_fit$se.fit[1:10]*qt(0.975,396))
1 2 3 4 5 6 7 8
[1,] 615.9816 518.3242 489.3377 535.7253 534.4770 844.7598 864.2647 787.7515
[2,] 636.1810 535.1094 510.7124 554.2757 552.3465 866.6256 887.8344 853.1348
9 10
[1,] 843.8238 772.4698
[2,] 871.5360 789.2415
head(predict(fit,interval="confidence"),10)
fit lwr upr
1 626.0813 615.9816 636.1810
2 526.7168 518.3242 535.1094
3 500.0250 489.3377 510.7124
4 545.0005 535.7253 554.2757
5 543.4118 534.4770 552.3465
6 855.6927 844.7598 866.6256
7 876.0496 864.2647 887.8344
8 820.4431 787.7515 853.1348
9 857.6799 843.8238 871.5360
10 780.8557 772.4698 789.2415
The prediction interval is similar, but adds the residual error variance to the variance of the fit:
rbind(predit_fit$fit[1:10] - qt(0.975,396)*sqrt(predit_fit$se.fit[1:10]^2+sum(fit$residuals ^ 2) / fit$df.residual),
      predit_fit$fit[1:10] + qt(0.975,396)*sqrt(predit_fit$se.fit[1:10]^2+sum(fit$residuals ^ 2) / fit$df.residual))
1 2 3 4 5 6 7 8
[1,] 510.1755 410.9473 384.0666 429.1637 427.6017 739.7113 759.9848 700.4394
[2,] 741.9870 642.4863 615.9835 660.8373 659.2218 971.6740 992.1143 940.4468
9 10
[1,] 741.3866 665.0867
[2,] 973.9732 896.6247
head(predict(fit,interval="prediction"),10)
Warning in predict.lm(fit, interval = "prediction"): predictions on current data refer to _future_ responses
fit lwr upr
1 626.0813 510.1755 741.9870
2 526.7168 410.9473 642.4863
3 500.0250 384.0666 615.9835
4 545.0005 429.1637 660.8373
5 543.4118 427.6017 659.2218
6 855.6927 739.7113 971.6740
7 876.0496 759.9848 992.1143
8 820.4431 700.4394 940.4468
9 857.6799 741.3866 973.9732
10 780.8557 665.0867 896.6247