8 Algorithms
8.1 Maximum likelihood estimation
8.1.1 Likelihood estimation
(without random effects)
With the normal distribution of errors, likelihood can be expressed explicitly as the product of the densities of each of the independent normal observations.
the negative log-likelihood then,
for matrix format
where derived from .
- minimize by taking the derivative
taking the derivatives of the negative log-likelihood function.
setting the derivatives equal to zero and solving for the parameters
but for REML
8.1.2 R demonstration
- toy data
set.seed(123)
<- 20000
n <- rnorm(n, 2, sqrt(2))
x <- rnorm(n, 0, 0.8)
s <- 1.5+x*3+s
y <- data.frame(y,x) mydata
- using linear regression
<- lm(y~., data=mydata)
lmfit # logLik(lmfit)
coefficients(lmfit)
## (Intercept) x
## 1.487701 3.003695
summary(lmfit)$sigma**2) (
## [1] 0.6397411
- using
-log max likelihood estimate
formula
notice, using vector and matrix notation
## Using the mathematical expression:
<- function(param){
minusloglik <- param[-1] #Regression Coefficients
beta <- param[1] #Variance
sigma <- as.vector(mydata$y) #DV
y <- cbind(1, mydata$x) #IV
x <- x%*%beta #multiply matrices
mu 0.5*(n*log(2*pi) + n*log(sigma) + sum((y-mu)^2)/sigma)
}
<- optim( c (1, 1, 1 ), minusloglik)
MLoptimize ## The results:
$par MLoptimize
## [1] 0.6397201 1.4876186 3.0038675
- using max likelihood estimate directly (normal distribution)
# max
library(maxLik)
<- function(param) {
ols.lf <- param[-1] #Regression Coefficients
beta <- param[1] #Variance
sigma <- as.vector(mydata$y) #DV
y <- cbind(1, mydata$x) #IV
x <- x%*%beta #multiply matrices
mu sum(dnorm(y, mu, sqrt(sigma), log = TRUE)) #normal distribution(vector of observations, mean, sd)
}
<- maxLik(logLik = ols.lf, start = c(sigma = 1, beta1 = 1, beta2 = 1 ))
mle_ols summary(mle_ols)
## --------------------------------------------
## Maximum Likelihood estimation
## Newton-Raphson maximisation, 11 iterations
## Return code 8: successive function values within relative tolerance limit (reltol)
## Log-Likelihood: -23910.85
## 3 free parameters
## Estimates:
## Estimate Std. error t value Pr(> t)
## sigma 0.639677 0.006396 100.0 <2e-16 ***
## beta1 1.487701 0.009768 152.3 <2e-16 ***
## beta2 3.003695 0.003999 751.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------
- another example
<- function(param) {
ols.lf <- param[-1] #Regression Coefficients
beta <- param[1] #Variance
sigma <- as.vector(mtcars$mpg) #DV
y <- cbind(1, mtcars$cyl, mtcars$disp) #IV
x <- x%*%beta #multiply matrices
mu sum(dnorm(y, mu, sqrt(sigma), log = TRUE)) #normal distribution(vector of observations, mean, sd)
}
<- maxLik(logLik = ols.lf, start = c(sigma = 1, beta1 = 1, beta2 = 1, beta3=1))
mle_ols summary(mle_ols)
## --------------------------------------------
## Maximum Likelihood estimation
## Newton-Raphson maximisation, 28 iterations
## Return code 2: successive function values within tolerance limit (tol)
## Log-Likelihood: -79.57282
## 4 free parameters
## Estimates:
## Estimate Std. error t value Pr(> t)
## sigma 8.460632 2.037359 4.153 0.0000329 ***
## beta1 34.661013 2.395871 14.467 < 2e-16 ***
## beta2 -1.587281 0.673675 -2.356 0.0185 *
## beta3 -0.020584 0.009757 -2.110 0.0349 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------
#Checking against linear regression
<- (lm(mpg~cyl+disp, data=mtcars))
lmfit2 lmfit2
##
## Call:
## lm(formula = mpg ~ cyl + disp, data = mtcars)
##
## Coefficients:
## (Intercept) cyl disp
## 34.66099 -1.58728 -0.02058
summary(lmfit2)$sigma**2) (
## [1] 9.335872
8.1.3 Estimate confidence intervals using the likelihood
Most classical confidence intervals for parameters are estimated using the likelihood approach, the Wald interval (or the asymptotic normality property).
where the standard error is from the second derivative of the log-likelihood function. This is the Hessian matrix/(observed) Information matrix if there is more than one single model parameter.
- take second derivative of the log-likelihood function
e.g. for one independent variable (with parameters: and in the model).
Here, giving the four entries of the Hessian matrix
therefore
the inverse of the Fisher information is just each diagonal element, so
For
Thus, the Wald confidence interval for the mean would be
For variance
Thus, the Wald confidence interval for the variance would be
However, the following approach confidence interval will generally have much better small sample properties than the Wald interval.
8.1.4 The profile likelihood
It can be profiled by maximizing the likelihood function with respect to all the other parameters.
In the following equation, expressed by
Similarly, the profile likelihood for the variance can be expressed
This becomes particularly simple, as the -estimate does not depend on the .
8.1.5 Maximum likelihood estimate practice
question: eatimate mean and variance
<-c(1.38, 3.96, -0.16, 8.12, 6.30, 2.61, -1.35, 0.03, 3.94, 1.11)
sample<-length(sample)
n<-mean(sample)
muhat<-sum((sample-muhat)^2)/n
sigsqhat muhat
## [1] 2.594
sigsqhat
## [1] 8.133884
<-function(theta){
loglike<--n/2*log(2*pi)-n/2*log(theta[2])-sum((sample-theta[1])^2)/(2*theta[2])
areturn(-a)
}optim(c(2,2),loglike,method="BFGS")$par
## [1] 2.593942 8.130340
8.2 Gradient descent
- linear regression
library("ggplot2")
# fit a linear model
<- lm( hwy ~ cty ,data=mpg)
res summary(res)
##
## Call:
## lm(formula = hwy ~ cty, data = mpg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3408 -1.2790 0.0214 1.0338 4.0461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.89204 0.46895 1.902 0.0584 .
## cty 1.33746 0.02697 49.585 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.752 on 232 degrees of freedom
## Multiple R-squared: 0.9138, Adjusted R-squared: 0.9134
## F-statistic: 2459 on 1 and 232 DF, p-value: < 2.2e-16
- compute regression coefficients
# squared error cost function
<- function(X, y, theta) {
cost sum( (X %*% theta - y)^2 ) / (2*length(y))
}
# learning rate and iteration limit
<- 0.005
alpha <- 20000
num_iters
# keep history
<- double(num_iters)
cost_history <- list(num_iters)
theta_history
# initialize coefficients
<- matrix(c(0,0), nrow=2)
theta
=mpg$cty
x=mpg$hwy
y
# add a column of 1's for the intercept coefficient
<- cbind(1, matrix(x))
X
# gradient descent
for (i in 1:num_iters) {
<- (X %*% theta - y)
error <- t(X) %*% error / length(y) #derivation
delta
<- theta - alpha * delta
theta
<- cost(X, y, theta)
cost_history[i] <- theta
theta_history[[i]]
}
print(theta)
## [,1]
## [1,] 0.8899161
## [2,] 1.3375742
tail(cost_history)
## [1] 1.522137 1.522137 1.522137 1.522137 1.522137 1.522137
- plot the cost function
plot(cost_history, type='line', col='red', lwd=2, main='Cost function', ylab='cost', xlab='Iterations')
- compare two ways (linear regresion vs. gradient descent)
=mpg$cty
x=mpg$cty*theta[2] + theta[1]
yplot(x,y, main='Linear regression by gradient descent')
# line(x,y ,col=3)
abline(lm(mpg$hwy ~ mpg$cty),col="blue",lwd = 4)
abline(res, col='red')