Last time, we used the Metropolis-Hastings (MH) algorithm to solve a standard linear regression problem. This time, we are going to approach the same problem using Gibbs sampling.
The idea is that since we can derive the conditional posterior distributions for $\beta$ and $\sigma_{e}^{2}$ analytically, Gibbs sampling can be employed.
There are at least two ways to derive a Gibbs sampler. The first is the full conditionals method, where we work with the full conditional distributions of (1) the regression parameters and (2) the error variance parameter.
For the error variance parameter $\sigma_{e}^{2}$, the full conditional is
$$ p(\sigma_{e}^{2}|\beta,X,Y)\propto(\sigma_{e}^{2})^{-\frac{N}{2}-1}\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(y-X\beta)^{T}(y-X\beta)\} $$
This is an inverse gamma distribution with shape parameter $\frac{N}{2}$ and scale parameter $\frac{1}{2}(y-X\beta)^{T}(y-X\beta)$.
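To make the link to the code below concrete, here is a minimal sketch of one such draw, assuming `X`, `Y`, `N` and a current coefficient vector `beta` are already in the workspace (the object names are only illustrative): we draw the precision $1/\sigma_{e}^{2}$ from the corresponding gamma distribution and invert it.
## Sketch only: one draw of sigma^2 from its inverse gamma full conditional.
## Assumes X, Y, N and a current coefficient vector beta exist; names are illustrative.
sse.beta <- t(Y - X %*% beta) %*% (Y - X %*% beta)  ## residual sum of squares at beta
s2.draw <- 1 / rgamma(n = 1, shape = N / 2, rate = 0.5 * sse.beta)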
As for the regression parameters, we have
$$ \begin{array}{rcl} p(\beta|\sigma_{e}^{2},X,Y) & \propto & \text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(y-X\beta)^{T}(y-X\beta)\}\\ & \propto & \text{exp}\{-\frac{1}{2\sigma_{e}^{2}}[\beta^{T}X^{T}X\beta-2\beta^{T}X^{T}y]\}\\ & \propto & \text{exp}\{-\frac{1}{2\sigma_{e}^{2}}[\beta^{T}X^{T}X\beta-2\beta^{T}X^{T}X(X^{T}X)^{-1}X^{T}y]\}\\ & \propto & \text{exp}\{-\frac{1}{2}(\beta-(X^{T}X)^{-1}X^{T}y)^{T}[\sigma_{e}^{2}(X^{T}X)^{-1}]^{-1}(\beta-(X^{T}X)^{-1}X^{T}y)\} \end{array} $$
where the last line follows by completing the square in $\beta$ (the term not involving $\beta$ is absorbed into the proportionality constant). This is a multivariate normal distribution with mean $(X^{T}X)^{-1}X^{T}y$ (the OLS estimate $\hat{\beta}$) and covariance matrix $\sigma_{e}^{2}(X^{T}X)^{-1}$.
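Again as a preview of the sampler, a draw of $\beta$ from this multivariate normal can be made with `mvrnorm` from MASS. A minimal sketch, assuming `X`, `Y` and a current value `s2` of $\sigma_{e}^{2}$ are available (names are illustrative):
## Sketch only: one draw of beta from its multivariate normal full conditional.
xx.inverse <- solve(t(X) %*% X)
b.hat <- drop(xx.inverse %*% t(X) %*% Y)  ## conditional mean = OLS estimate
b.draw <- MASS::mvrnorm(n = 1, mu = b.hat, Sigma = s2 * xx.inverse)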
The Gibbs sampler then proceeds in 3 steps: (1) choose starting values for $\beta$ and $\sigma_{e}^{2}$; (2) draw $\beta$ from its normal full conditional using the current value of $\sigma_{e}^{2}$; (3) draw $\sigma_{e}^{2}$ from its inverse gamma full conditional using the $\beta$ just drawn. Steps (2) and (3) are repeated until the desired number of draws has been collected.
Now, the code.
library(tidyverse) # metapackage with lots of helpful functions
library(magrittr)
library(ggplot2)
library(MASS)      # for mvrnorm()
library(gridExtra) # for grid.arrange()
set.seed(2019)
dgp <- function() {
  ####### Data Generating Process
  N <- 1000; ## nb. of observations
  k <- 5;    ## nb. of features (including constant)
  x1.m <- 0; x1.sd <- 1
  x2.m <- 0; x2.sd <- 1
  x3.m <- 0; x3.sd <- 1
  x4.m <- 0; x4.sd <- 1
  x1 <- rnorm(n = N, mean = x1.m, sd = x1.sd) ## feature 1
  x2 <- rnorm(n = N, mean = x2.m, sd = x2.sd) ## feature 2
  x3 <- rnorm(n = N, mean = x3.m, sd = x3.sd) ## feature 3
  x4 <- rnorm(n = N, mean = x4.m, sd = x4.sd) ## feature 4
  btrue <- c(1,2,3,4,5); ## True values of regression coefficients
  X <- matrix(cbind(rep(1, N), x1, x2, x3, x4), ncol = k); ## Explanatory variables as a matrix
  Y <- X %*% matrix(btrue, k, 1) + mvrnorm(1, mu = rep(0, N), Sigma = diag(1, N)); ## Response with N(0,1) errors
  result <- list(); result$X <- X; result$Y <- Y; result$N <- N; result$k <- k; result$btrue <- btrue;
  return(result)
}
myData <- dgp();
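As an optional sanity check on the simulated data (not part of the sampler), the OLS fit should already sit close to `btrue`; it is also exactly the conditional posterior mean used below. The object name `ols.check` is only illustrative:
## Optional check: OLS estimates on the simulated data vs. the true coefficients.
ols.check <- lm(myData$Y ~ myData$X - 1)
cbind(true = myData$btrue, ols = coef(ols.check))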
linear.GibbsS_I <- function(d = myData, T_arg = 5000) {
  X <- d$X; Y <- d$Y; N <- d$N; k <- d$k; T <- T_arg;
  ## Matrices to store the draws (row 1 holds the starting values: b = 0, s2 = 3).
  b <- matrix(0, T, k); s2 <- matrix(data = 3, nrow = T);
  xx.inverse <- solve(t(X) %*% X);
  ## OLS estimate; it is also the mean of the full conditional of beta, (X'X)^{-1}X'Y.
  b.init <- coef(lm(Y ~ X - 1));
  ## Gibbs sampling begins
  for (t in 2:T) {
    ## draw beta from its multivariate normal full conditional, given s2[t-1]
    b[t,] <- mvrnorm(n = 1, mu = b.init, Sigma = s2[t-1, 1] * xx.inverse);
    ## or equivalently, we can obtain b via the Cholesky decomposition:
    ## b[t,] <- b.init + t(rnorm(k, mean = 0, sd = 1)) %*% chol(s2[t-1, 1] * xx.inverse);
    ## draw sigma squared from its inverse gamma full conditional, given b[t,]
    s2[t] <- 1 / rgamma(n = 1, shape = N / 2,
                        rate = 0.5 * t(Y - X %*% b[t,]) %*% (Y - X %*% b[t,]));
  }
  result <- list(); result$b <- b; result$s2 <- s2; result$T <- T;
  return(result)
}
temp <- linear.GibbsS_I()
## examine result for parameter b1.
graph.m <- function(d, col) {
  lab <- paste('b', col, sep = '');
  ## histogram of the draws with the posterior median, 95% interval bounds and true value
  p.b1.hist <- ggplot(data = NULL, mapping = aes(x = d$b[,col])) +
    geom_histogram(alpha = 0.3, bins = 80) +
    geom_vline(xintercept = quantile(x = d$b[,col], probs = 0.5)) +
    geom_vline(xintercept = quantile(x = d$b[,col], probs = c(0.025, 0.975)), linetype = 'dashed', color = 'red') +
    geom_vline(xintercept = myData$btrue[col], color = 'blue') +
    xlab(lab) + ylab('Count') + xlim(quantile(x = d$b[,col], probs = c(0.001, 0.999))) +
    annotate("text", x = myData$btrue[col], y = 50, label = "True value") +
    annotate("text", x = quantile(x = d$b[,col], probs = 0.5), y = 30, label = "Median") +
    annotate("text", x = quantile(x = d$b[,col], probs = c(0.025, 0.975)), y = 30, label = c('2.5% quantile', '97.5% quantile'));
  ## trace plot of the draws
  p.b1.convg <- ggplot(data = NULL, mapping = aes(x = 1:d$T, y = d$b[,col])) + geom_line() +
    xlab('Number of Iterations') + ylab(lab);
  grid.arrange(p.b1.hist, p.b1.convg, ncol = 1)
}
graph.m(temp, 1);
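Beyond the plots, the draws can be summarised numerically. A small sketch, where the 500-draw burn-in is an arbitrary choice rather than anything prescribed by the algorithm:
## Posterior summaries after discarding an (arbitrary) burn-in of 500 draws.
burn <- 500
keep <- (burn + 1):temp$T
round(cbind(true = myData$btrue,
            post.mean = colMeans(temp$b[keep, ]),
            post.sd = apply(temp$b[keep, ], 2, sd)), 3)
mean(temp$s2[keep])  ## posterior mean of the error variance (true value is 1)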
The second method we are going to discuss is called the composition method.
In this method, we integrate out $\beta$ to obtain the marginal posterior distribution of $\sigma_{e}^{2}$; we can then draw $\sigma_{e}^{2}$ from this marginal and, for each such draw, draw $\beta$ from its conditional.
$$ p(\sigma_{e}^{2}|X,Y)=\int p(\beta,\sigma_{e}^{2}|X,Y)\,d\beta $$
More specifically, we know that
$$ \begin{array}{rcl} p(\beta,\sigma_{e}^{2}|X,Y) & \propto & \frac{(\sigma_{e}^{2})^{-\frac{N}{2}-1}}{(2\pi)^{N/2}}\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(y-X\hat{\beta}+X\hat{\beta}-X\beta)^{T}(y-X\hat{\beta}+X\hat{\beta}-X\beta)\}\\ & = & \frac{(\sigma_{e}^{2})^{-\frac{N}{2}-1}}{(2\pi)^{N/2}}\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(\beta-\hat{\beta})^{T}X^{T}X(\beta-\hat{\beta})\}\\ & & \times\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(y-X\hat{\beta})^{T}(y-X\hat{\beta})\} \end{array} $$
where $\hat{\beta}=(X^{T}X)^{-1}X^{T}y$ and the cross term vanishes because $X^{T}(y-X\hat{\beta})=0$. So that
$$ \begin{array}{rcl} p(\sigma_{e}^{2}|X,Y) & = & \int p(\beta,\sigma_{e}^{2}|X,Y)\,d\beta\\ & \propto & \frac{(\sigma_{e}^{2})^{-\frac{N}{2}-1}}{(2\pi)^{N/2}}\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(y-X\hat{\beta})^{T}(y-X\hat{\beta})\}\int\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(\beta-\hat{\beta})^{T}X^{T}X(\beta-\hat{\beta})\}\,d\beta\\ & \propto & (\frac{1}{\sigma_{e}^{2}})^{\frac{N-k}{2}+1}\times\text{exp}\{-\frac{(N-k)}{2\sigma_{e}^{2}}\times\frac{(y-X\hat{\beta})^{T}(y-X\hat{\beta})}{(N-k)}\} \end{array} $$
since the Gaussian integral over $\beta$ contributes a factor proportional to $(\sigma_{e}^{2})^{\frac{k}{2}}$. This is an inverse gamma distribution with shape parameter $\frac{N-k}{2}$ and scale parameter $\frac{(y-X\hat{\beta})^{T}(y-X\hat{\beta})}{2}$. Or equivalently, $\sigma_{e}^{2}$ follows a scaled inverse chi-square distribution with $N-k$ degrees of freedom and scale parameter $\frac{(y-X\hat{\beta})^{T}(y-X\hat{\beta})}{(N-k)}$.
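The equivalence between the two parameterisations is easy to check by simulation: drawing $1/\sigma_{e}^{2}$ from the gamma distribution above, or dividing the residual sum of squares by a chi-square draw, targets the same distribution. A sketch, assuming `N`, `k` and the residual sum of squares `sse` are available in the workspace as they are inside the sampler below:
## Sketch only: two equivalent ways to draw sigma^2 from its marginal posterior.
s2.ig <- 1 / rgamma(n = 10000, shape = (N - k) / 2, rate = 0.5 * drop(sse))  ## inverse gamma form
s2.chisq <- drop(sse) / rchisq(n = 10000, df = N - k)                        ## scaled inverse chi-square form
## compare, e.g., quantile(s2.ig, c(0.025, 0.5, 0.975)) with quantile(s2.chisq, c(0.025, 0.5, 0.975))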
The implementation does not differ much:
linear.GibbsS_II <- function(d = myData, T_arg = 5000) {
  X <- d$X; Y <- d$Y; N <- d$N; k <- d$k; T <- T_arg;
  ## Matrices to store the draws.
  b <- matrix(0, T, k); s2 <- matrix(data = 3, nrow = T);
  xx.inverse <- solve(t(X) %*% X);
  ## OLS estimate (the conditional posterior mean of beta) and the residual sum of squares
  ols <- lm(Y ~ X - 1);
  b.init <- coef(ols);
  sse <- t(resid(ols)) %*% resid(ols);
  ## draw sigma squared from its marginal posterior, an inverse gamma
  s2[, 1] <- 1 / rgamma(n = T, shape = (N - k) / 2, rate = 0.5 * sse);
  ## composition step: for each draw of sigma squared, simulate beta from its conditional
  for (t in 1:T) {
    b[t,] <- mvrnorm(n = 1, mu = b.init, Sigma = s2[t, 1] * xx.inverse);
  }
  result <- list(); result$b <- b; result$s2 <- s2; result$T <- T;
  return(result)
}
temp <- linear.GibbsS_II()
## examine result for parameter b1.
graph.m(temp, 1);
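As a final check, the two samplers should agree. A sketch comparing the posterior means of the regression coefficients from a fresh run of each (the 500-draw burn-in for the full-conditionals sampler is an arbitrary choice):
## Optional comparison of the two samplers; both should recover btrue closely.
run1 <- linear.GibbsS_I()
run2 <- linear.GibbsS_II()
round(cbind(true = myData$btrue,
            full.conditionals = colMeans(run1$b[-(1:500), ]),  ## drop an arbitrary burn-in
            composition = colMeans(run2$b)), 3)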