Bayesian Linear Regression I by Sheng BI¶

Consider a standard linear model.

There are four assumptions associated with a linear regression model:

  1. Linearity: The relationship between $X$ and the mean of $Y$ is linear.
  2. Homoscedasticity: The variance of residual is the same for any value of $X$.
  3. Independence: Observations are independent of each other.
  4. Normality: For any fixed value of $X$, $Y$ is normally distributed.

In addition, for Bayesian studies, we specify

  1. Indpependent improper priors for $\beta$ and reference prior $p(\sigma^2) = \frac{1}{\sigma^2}$ for $\sigma^2$.

Now, consider a linear model with k predictors and N observations.

$$ y=X\beta+e $$$$ e\sim N(0,\sigma_{e}^{2}I_{N}) $$

The likelihood function yields

$$ \begin{array}{ccc} L(\beta,\sigma_{e}^{2}|X,Y) & = & \prod_{i-1}^{N} (2\pi\sigma_{e}^{2})^{-\frac{1}{2}}\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(y_{i}-X_{i}^{T}\beta)^{2}\}\\ & = & (2\pi\sigma_{e}^{2})^{-\frac{N}{2}}\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(y-X\beta)^{T}(y-X\beta)\} \end{array} $$

The maximum likelihood solution suggests:

$$ \hat{\beta}=(X^{T}X)^{-1}(X^{T}y) $$$$ \hat{\sigma}_{e}^{2}=\frac{1}{N}e^{T}e $$

The estimated standard errors of the regression parameters are

$$ ACOV(\hat{\beta})=\hat{\sigma}_{e}^{2}(X^{T}X)^{-1} $$

And the theoretical standar error for the error variance is

$$ SE(\hat{\sigma}_{e}^{2})=(\frac{2\hat{\sigma}_{e}^{2}}{N})^{\frac{1}{2}} $$

Note: This standard error is almost never reported, in part because the associated normal theory tests include negative values for variance.

Assuming improper uniform prior for \beta and $p(\sigma_{e}^{2})=\frac{1}{\sigma_{e}^{2}}$ for $\sigma_{e}^{2}$, we obtain the following posterior distribution:

$$ p(\beta,\sigma_{e}^{2}|X,Y)\propto(\sigma_{e}^{2})^{-\frac{N}{2}+1}\text{exp}\{-\frac{1}{2\sigma_{e}^{2}}(y-X\beta)^{T}(y-X\beta)\} $$
In [24]:
library(MASS)
library(magrittr)
library(ggplot2)
library(gridExtra)

## Metropolis-Hastings Algorithm for Standard Linear Regression

set.seed(2019)

dgp <- function() {
    
    ####### Data Generating Process
    N <- 1000; ## nb. of observations
    k <- 5;    ## nb. of features (including constant) 
    x1.m <- 0; x1.sd <- 1
    x2.m <- 0; x2.sd <- 1
    x3.m <- 0; x3.sd <- 1
    x4.m <- 0; x4.sd <- 1
    x1 <- rnorm(n = 1000, mean = x1.m, sd = x1.sd) ## feature 1
    x2 <- rnorm(n = 1000, mean = x2.m, sd = x2.sd) ## feature 2
    x3 <- rnorm(n = 1000, mean = x3.m, sd = x3.sd) ## feature 3
    x4 <- rnorm(n = 1000, mean = x4.m, sd = x4.sd) ## feature 4
    
    btrue <- c(1,2,3,4,5); ## True values of regression coefficients
        
    X <- matrix(cbind(rep(1, N),x1,x2,x3,x4), ncol = k); ## Explanatory variables as a matrix
    
    Y <- X %*% matrix(btrue, k, 1) + mvrnorm(1,mu = rep(0,N), Sigma = diag(1, N))
    
    result <- list()
    result$X <- X
    result$Y <- Y
    result$N <- N
    result$k <- k
    return(result)
}

## Compute log of unnormalized posterior probability
lpost <- function(x_arg, y_arg, b_arg, s2_arg, N_arg) {
    
    log0 <- function(x){result <- ifelse(x>0, log(x), 0); return(result)}

    result <- - (0.5 * N_arg + 1) * log0(s2_arg) - 
    (0.5/s2_arg) * (t(y_arg - x_arg %*% b_arg) %*% (y_arg - x_arg %*% b_arg))

    return(result)
}

myData <- dgp();
In [25]:
linearMH <- function(T_burnin = 3000, T_retained = 7000, data = myData) {
    
    ## read data
    X <- data$X
    Y <- data$Y
    N <- data$N
    k <- data$k
    
    ## Specify number of iterations.
    T0 <- T_burnin;
    T1 <- T_retained;
    T  <- T0 + T1;
    
    ## Store result
    b <- matrix(0, T, k); s2 <- matrix(0.01, T);
    b.accrate <- matrix(0, T, k); 
    s2.accrate <- matrix(0, T);
    
    ## specify jumping/proposal probability standard deviation
    c <-  0.5 ## scale 
    b.sd <- sqrt(diag(vcov(lm(Y~X-1)))) * c;
    s2.sd <- sqrt(var(residuals(lm(Y~X-1))*(N-1)/(N-k))) * c;
    ## Note: 
    ## 1. var(.) has default denominator (N-1),
    ##    so (N-1)/(N-k) is included to adjust the sample variance.
    ## 2. scale is included to adjust acceptance rate.
    
    
    for (t in 2:T) {
        
        b[t, ] <- b[t-1, ] # temperarily set b.
        
        ## filtering b[t, ]
        for (j in 1:k) {
            # generate candidate, assuming accepted
            b[t, j] <- b[t-1,j] + rnorm(n=1, mean = 0, sd = b.sd[j]);
            
            # indicator of acceptance
            ind.b <- 1
                        
            b.AcceptanceRatio <- lpost(X, Y, matrix(b[t,], ncol = 1), s2[t-1, 1], N) - 
            lpost(X, Y, matrix(b[t-1,], ncol = 1), s2[t-1, 1], N)
            
            if (min(b.AcceptanceRatio,0) < log(runif(n = 1,min = 0,max = 1))) {
                ind.b <- 0
                b[t, j] <- b[t-1, j]
            }
            
            b.accrate[t, j] <- (b.accrate[t-1,j] * (t-1) + ind.b)/t
            
        }
        
        ## filtering s2[t, 1]
        s2[t, 1] <- s2[t-1, 1] + rnorm(n = 1, mean = 0, sd = s2.sd);
        ind.s2 <- 1;
        
        ## obtain the acceptance ratio
        s2.AcceptanceRatio <- lpost(X, Y, matrix(b[t,], ncol = 1), s2[t, 1], N) - 
            lpost(X, Y, matrix(b[t,], ncol = 1), s2[t-1, 1], N)
        
        ## watch out! the condition s2[t,1] > 0 is necessary, otherwise the lpost result will go north!
        if (s2[t,1] <= 0 || min(s2.AcceptanceRatio,0) < log(runif(n = 1,min = 0,max = 1))) {
            ind.s2 <- 0;
            s2[t,1] <- s2[t-1,1];
        }
        s2.accrate[t] <- (s2.accrate[t] * (t-1) + ind.s2)/t
        
    }
    
    result <- list()
    result$b  <- b
    result$s2 <- s2
    return(result)
    
}

myEst <- linearMH()
In [51]:
## Calculate High Posterior Density Region

unimodalHPD <- function(data, q_arg = 0.95) {
    
    ## Step 1: obtain all feasible left cuts of HPD region, 
    ##         using the knowledge that quantile is the inverse of empirical cumulative function
    q_left <- quantile(x = data, probs = 1 - q_arg, type = 3);
    # q_right <- quantile(x = data, probs = q_arg, type = 3);

    ## Step 2: sort data, define ecdf function, define density funcction
    d  <- sort(data)
    f0 <- ecdf(d)
    f1 <- approxfun(density(d))

    ## Step 3: loop through feasible values of left cuts, 
    ##         find all possible rights cuts with the following properties:
    ##         1. Area between left cut and its corresponding right cut is 0.95.
    ##         2. The density of left cut and right cut should be approximately equal. 
    
    cut_pairs <- list() ## store left and right cut which satisfy the above conditions
    dens <- list()  ## store the density for selected pairs
    
    for (i in 1:length(d)){
        left_cut <- d[i]
        if (left_cut < q_left) {
            right_cut <- quantile(x = d, probs = f0(left_cut) + q_arg, type = 3);
            if (abs(f1(right_cut) - f1(left_cut))<0.01) {
                cut_pairs[[i]] <- list(left_cut, right_cut)
                if (!is.null(cut_pairs[[i]])) {
                    dens[[i]] <- (f1(left_cut) + f1(right_cut))/2
                }
            }
                
        }
    }
    return(list(cut_pairs, dens))
}

## Get the HPD parameters for beta4
myHPD<- unimodalHPD(data = myEst$b[3001:10000, 4])

cat("The cutoffs for beta4 are:", unlist(Filter(Negate(is.null), myHPD[[1]])))
cat("\n")
cat("The cutoff density is:", unlist(Filter(Negate(is.null), myHPD[[2]])))
The cutoffs for beta4 are: 3.957304 4.086007
The cutoff density is: 1.73025

Following are graphics of sampling posterior distribution for $\beta_3$ and $\beta_4$. Examples for other parameters can be similarly derived.

In [43]:
myHPD<- unimodalHPD(data = myEst$b[3000:10000, 4])

graphresult <- function(data = myEst$b[, 4], obj = 'beta4', 
                        hpd_horizontal = unlist(Filter(Negate(is.null), myHPD[[2]])), 
                        hpd_vertical = unlist(Filter(Negate(is.null), myHPD[[1]]))) {
    p1 <- ggplot(data = NULL, mapping = aes(x = data[3001:10000])) +
    geom_histogram(aes(y=..density..), color="darkblue", fill="lightblue") + 
    geom_density(alpha=.2, fill="#FF6666") + 
    geom_hline(yintercept = hpd_horizontal, linetype="dashed") + 
    geom_vline(xintercept = hpd_vertical) + 
    xlab(label = obj) + ggtitle(label = 'Histogram, Density and HPD region')
    
    p2 <- ggplot(data = NULL, mapping = aes(x = c(1:10000), y = data)) + geom_line() + 
    xlab(label = 'number of runs') + ylab(label = obj) + 
    labs(title = 'Evidence of Convergence')
                 
    grid.arrange(p1, p2, nrow = 2)
}
                 
graphresult()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
In [55]:
## Get the HPD parameters for beta4
myHPD <- unimodalHPD(data = myEst$b[3001:10000, 3])

cat("The cutoffs for beta3 are:", unlist(Filter(Negate(is.null), myHPD[[1]])))
cat("\n")
cat("The realted cutoff densities are:", unlist(Filter(Negate(is.null), myHPD[[2]])))

graphresult(data = myEst$b[, 3], obj = 'beta3')
The cutoffs for beta3 are: 2.908529 3.031627 2.90857 3.031632 2.90857 3.031632
The realted cutoff densities are: 2.102526 2.104708 2.104708
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.