Consider a standard linear model.
There are four standard assumptions associated with a linear regression model: linearity of the mean function, independence of the errors, constant error variance (homoscedasticity), and normality of the errors.
In addition, for a Bayesian analysis we specify prior distributions for the model parameters.
Now, consider a linear model with $k$ predictors (including the constant) and $N$ observations.
$$ y=X\beta+e, \qquad e\sim N(0,\sigma_{e}^{2}I_{N}) $$
The likelihood function is
$$ \begin{aligned} L(\beta,\sigma_{e}^{2}\mid X,y) &= \prod_{i=1}^{N}(2\pi\sigma_{e}^{2})^{-\frac{1}{2}}\exp\left\{-\frac{1}{2\sigma_{e}^{2}}(y_{i}-X_{i}^{T}\beta)^{2}\right\}\\ &= (2\pi\sigma_{e}^{2})^{-\frac{N}{2}}\exp\left\{-\frac{1}{2\sigma_{e}^{2}}(y-X\beta)^{T}(y-X\beta)\right\} \end{aligned} $$
The maximum likelihood solution is
$$ \hat{\beta}=(X^{T}X)^{-1}X^{T}y, \qquad \hat{\sigma}_{e}^{2}=\frac{1}{N}\hat{e}^{T}\hat{e} $$
where $\hat{e}=y-X\hat{\beta}$ is the residual vector. The estimated asymptotic covariance matrix of the regression coefficients, whose diagonal gives their squared standard errors, is
$$ ACOV(\hat{\beta})=\hat{\sigma}_{e}^{2}(X^{T}X)^{-1} $$
and the theoretical standard error of the error variance estimate is
$$ SE(\hat{\sigma}_{e}^{2})=\left(\frac{2\hat{\sigma}_{e}^{4}}{N}\right)^{\frac{1}{2}} $$
Note: this standard error is almost never reported, in part because the associated normal-theory interval can include negative values for the variance.
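As a quick illustration, these closed-form quantities can be computed directly in R. This is a minimal sketch, assuming a design matrix X (with a leading column of ones) and a response vector y are already defined, for example the simulated data produced by dgp() further below.
## Closed-form ML / OLS estimates (sketch; assumes X and y are already defined)
beta_hat   <- solve(t(X) %*% X) %*% (t(X) %*% y)         ## (X'X)^{-1} X'y
e_hat      <- y - X %*% beta_hat                         ## residual vector
sigma2_hat <- as.numeric(t(e_hat) %*% e_hat) / nrow(X)   ## MLE of the error variance
acov_beta  <- sigma2_hat * solve(t(X) %*% X)             ## ACOV(beta_hat)
se_beta    <- sqrt(diag(acov_beta))                      ## standard errors of beta_hat
se_sigma2  <- sqrt(2 * sigma2_hat^2 / nrow(X))           ## theoretical SE of sigma2_hat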
Assuming an improper uniform prior $p(\beta)\propto 1$ for $\beta$ and $p(\sigma_{e}^{2})\propto\frac{1}{\sigma_{e}^{2}}$ for $\sigma_{e}^{2}$, the posterior is proportional to the likelihood times the priors:
$$ p(\beta,\sigma_{e}^{2}\mid X,y)\propto(\sigma_{e}^{2})^{-\left(\frac{N}{2}+1\right)}\exp\left\{-\frac{1}{2\sigma_{e}^{2}}(y-X\beta)^{T}(y-X\beta)\right\} $$
The R code below draws from this posterior with a Metropolis-Hastings sampler.
library(MASS)
library(magrittr)
library(ggplot2)
library(gridExtra)
## Metropolis-Hastings Algorithm for Standard Linear Regression
set.seed(2019)
dgp <- function() {
####### Data Generating Process
N <- 1000; ## nb. of observations
k <- 5; ## nb. of features (including constant)
x1.m <- 0; x1.sd <- 1
x2.m <- 0; x2.sd <- 1
x3.m <- 0; x3.sd <- 1
x4.m <- 0; x4.sd <- 1
x1 <- rnorm(n = N, mean = x1.m, sd = x1.sd) ## feature 1
x2 <- rnorm(n = N, mean = x2.m, sd = x2.sd) ## feature 2
x3 <- rnorm(n = N, mean = x3.m, sd = x3.sd) ## feature 3
x4 <- rnorm(n = N, mean = x4.m, sd = x4.sd) ## feature 4
btrue <- c(1,2,3,4,5); ## True values of regression coefficients
X <- matrix(cbind(rep(1, N),x1,x2,x3,x4), ncol = k); ## Explanatory variables as a matrix
Y <- X %*% matrix(btrue, k, 1) + mvrnorm(1,mu = rep(0,N), Sigma = diag(1, N))
result <- list()
result$X <- X
result$Y <- Y
result$N <- N
result$k <- k
return(result)
}
## Compute log of unnormalized posterior probability
lpost <- function(x_arg, y_arg, b_arg, s2_arg, N_arg) {
log0 <- function(x){result <- ifelse(x > 0, log(x), 0); return(result)} ## guard: non-positive variance values return 0 instead of NaN; such proposals are rejected later anyway
result <- - (0.5 * N_arg + 1) * log0(s2_arg) -
(0.5/s2_arg) * (t(y_arg - x_arg %*% b_arg) %*% (y_arg - x_arg %*% b_arg))
return(result)
}
myData <- dgp();
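As a quick sanity check of lpost (purely illustrative), the unnormalized log posterior evaluated at the data-generating values used in dgp() should exceed its value at a clearly wrong point:
## lpost at the true coefficients (btrue = c(1,2,3,4,5), error variance 1) vs. at zero coefficients
lpost(myData$X, myData$Y, matrix(c(1, 2, 3, 4, 5), ncol = 1), 1, myData$N)
lpost(myData$X, myData$Y, matrix(rep(0, myData$k), ncol = 1), 1, myData$N)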
linearMH <- function(T_burnin = 3000, T_retained = 7000, data = myData) {
## read data
X <- data$X
Y <- data$Y
N <- data$N
k <- data$k
## Specify number of iterations.
T0 <- T_burnin;
T1 <- T_retained;
T <- T0 + T1;
## Store result
b <- matrix(0, T, k); s2 <- matrix(0.01, T);
b.accrate <- matrix(0, T, k);
s2.accrate <- matrix(0, T);
## specify jumping/proposal probability standard deviation
c <- 0.5 ## scale
b.sd <- sqrt(diag(vcov(lm(Y~X-1)))) * c;
s2.sd <- sqrt(var(residuals(lm(Y~X-1))) * (N-1)/(N-k)) * c;
## Note:
## 1. var(.) has default denominator (N-1),
## so (N-1)/(N-k) is included to adjust the sample variance.
## 2. scale is included to adjust acceptance rate.
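## 3. Acceptance rates of roughly 0.2-0.5 are commonly targeted for random-walk
##    proposals, so the stored b.accrate / s2.accrate can be used to tune c.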
for (t in 2:T) {
b[t, ] <- b[t-1, ] # temporarily set b; components are updated one at a time below
## filtering b[t, ]
for (j in 1:k) {
# generate candidate, assuming accepted
b[t, j] <- b[t-1,j] + rnorm(n=1, mean = 0, sd = b.sd[j]);
# indicator of acceptance
ind.b <- 1
b.AcceptanceRatio <- lpost(X, Y, matrix(b[t,], ncol = 1), s2[t-1, 1], N) -
lpost(X, Y, matrix(b[t-1,], ncol = 1), s2[t-1, 1], N)
if (min(b.AcceptanceRatio,0) < log(runif(n = 1,min = 0,max = 1))) {
ind.b <- 0
b[t, j] <- b[t-1, j]
}
b.accrate[t, j] <- (b.accrate[t-1,j] * (t-1) + ind.b)/t
}
## filtering s2[t, 1]
s2[t, 1] <- s2[t-1, 1] + rnorm(n = 1, mean = 0, sd = s2.sd);
ind.s2 <- 1;
## obtain the acceptance ratio
s2.AcceptanceRatio <- lpost(X, Y, matrix(b[t,], ncol = 1), s2[t, 1], N) -
lpost(X, Y, matrix(b[t,], ncol = 1), s2[t-1, 1], N)
## note: the condition s2[t, 1] > 0 is necessary; a non-positive proposal must be rejected outright, otherwise the lpost result is not valid
if (s2[t,1] <= 0 || min(s2.AcceptanceRatio,0) < log(runif(n = 1,min = 0,max = 1))) {
ind.s2 <- 0;
s2[t,1] <- s2[t-1,1];
}
s2.accrate[t] <- (s2.accrate[t-1] * (t-1) + ind.s2)/t
}
result <- list()
result$b <- b
result$s2 <- s2
return(result)
}
myEst <- linearMH()
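With the draws in hand, the retained (post burn-in) samples can be summarized directly. A minimal sketch, assuming the default burn-in of 3,000 and 7,000 retained iterations; with the data-generating values above, the posterior means should be close to c(1, 2, 3, 4, 5) for the coefficients and 1 for the error variance.
## Posterior summaries from the retained draws (iterations 3001-10000)
keep <- 3001:10000
round(colMeans(myEst$b[keep, ]), 3)       ## posterior means of the coefficients
round(apply(myEst$b[keep, ], 2, sd), 3)   ## posterior standard deviations
round(mean(myEst$s2[keep]), 3)            ## posterior mean of the error variance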
## Calculate High Posterior Density Region
unimodalHPD <- function(data, q_arg = 0.95) {
## Step 1: obtain all feasible left cuts of HPD region,
## using the knowledge that quantile is the inverse of empirical cumulative function
q_left <- quantile(x = data, probs = 1 - q_arg, type = 3);
# q_right <- quantile(x = data, probs = q_arg, type = 3);
## Step 2: sort data, define the ecdf function and the density function
d <- sort(data)
f0 <- ecdf(d)
f1 <- approxfun(density(d))
## Step 3: loop through feasible values of left cuts,
## find all possible right cuts with the following properties:
## 1. The probability mass between the left cut and its right cut is q_arg (0.95 by default).
## 2. The densities at the left cut and the right cut are approximately equal.
cut_pairs <- list() ## store left and right cut which satisfy the above conditions
dens <- list() ## store the density for selected pairs
for (i in 1:length(d)){
left_cut <- d[i]
if (left_cut < q_left) {
right_cut <- quantile(x = d, probs = f0(left_cut) + q_arg, type = 3);
if (abs(f1(right_cut) - f1(left_cut))<0.01) {
cut_pairs[[i]] <- list(left_cut, right_cut)
if (!is.null(cut_pairs[[i]])) {
dens[[i]] <- (f1(left_cut) + f1(right_cut))/2
}
}
}
}
return(list(cut_pairs, dens))
}
## Get the HPD parameters for beta4
myHPD<- unimodalHPD(data = myEst$b[3001:10000, 4])
cat("The cutoffs for beta4 are:", unlist(Filter(Negate(is.null), myHPD[[1]])))
cat("\n")
cat("The cutoff density is:", unlist(Filter(Negate(is.null), myHPD[[2]])))
The following graphs show the sampled posterior distributions for $\beta_4$ and $\beta_3$, together with trace plots as evidence of convergence. Plots for the other parameters can be produced in the same way.
myHPD <- unimodalHPD(data = myEst$b[3001:10000, 4])
graphresult <- function(data = myEst$b[, 4], obj = 'beta4',
hpd_horizontal = unlist(Filter(Negate(is.null), myHPD[[2]])),
hpd_vertical = unlist(Filter(Negate(is.null), myHPD[[1]]))) {
p1 <- ggplot(data = NULL, mapping = aes(x = data[3001:10000])) +
geom_histogram(aes(y=..density..), color="darkblue", fill="lightblue") +
geom_density(alpha=.2, fill="#FF6666") +
geom_hline(yintercept = hpd_horizontal, linetype="dashed") +
geom_vline(xintercept = hpd_vertical) +
xlab(label = obj) + ggtitle(label = 'Histogram, Density and HPD region')
p2 <- ggplot(data = NULL, mapping = aes(x = c(1:10000), y = data)) + geom_line() +
xlab(label = 'number of runs') + ylab(label = obj) +
labs(title = 'Evidence of Convergence')
grid.arrange(p1, p2, nrow = 2)
}
graphresult()
## Get the HPD parameters for beta3
myHPD <- unimodalHPD(data = myEst$b[3001:10000, 3])
cat("The cutoffs for beta3 are:", unlist(Filter(Negate(is.null), myHPD[[1]])))
cat("\n")
cat("The realted cutoff densities are:", unlist(Filter(Negate(is.null), myHPD[[2]])))
graphresult(data = myEst$b[, 3], obj = 'beta3')