by Sheng BI
library(MASS)
library(magrittr)
library(ggplot2)
library(gridExtra)
Today we are going to solve a linear regression problem with groupwise heteroscedasticity. We assume that we know which group each observation belongs to.
Now some math. We consider a linear model $y=X\beta+e$, where
$$ E[ee^{'}]\equiv\Sigma=\left[\begin{array}{ccccccccc} \sigma_{1}^{2} & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0\\ \vdots & \ddots\\ \vdots & 0 & \sigma_{1}^{2} & 0 & 0 & 0 & 0 & 0 & 0\\ \vdots & 0 & 0 & \sigma_{2}^{2} & 0 & 0 & 0 & 0 & 0\\ \vdots & & & & \ddots\\ \vdots & & & & & \sigma_{2}^{2} & & & 0\\ 0 & & & & & & \ddots\\ 0 & & & & & & & \ddots\\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & \sigma_{J}^{2} \end{array}\right] $$So we have groupwise heteroscedasticity with $J$ groups of variance parameters: the variance parameter differs across groups but is identical within a group.
An extreme case would be a different variance parameter for every observation. We now assume a normal prior for the regression parameter $\beta$ and independent inverse gamma priors for the variance parameters $\{\sigma_{j}^{2}\}_{j=1,..,J}$.
$$\beta\sim N(\underline{\beta},\underline{V_{\beta}})$$$$\sigma_{j}^{2}\sim IG(a_{j},b_{j})$$The notation here is standard, as in Bayesian textbooks such as Koop (2003). Parameterizations of the gamma distribution differ across sources; I follow the shape-rate convention of the Wikipedia page on the gamma distribution, because it matches the rgamma function in R.
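As a quick check of this convention: if $\sigma^{2}\sim IG(a,b)$, then $1/\sigma^{2}\sim Gamma(a,b)$ in shape-rate form, and $E[\sigma^{2}]=\frac{b}{a-1}$ for $a>1$. A small simulation (not part of the estimation below) confirms it:
a <- 3; b <- 4
sigma2.draws <- 1/rgamma(n = 1e5, shape = a, rate = b) ## draws from IG(a, b)
mean(sigma2.draws) ## should be close to b/(a - 1) = 2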
Now the likelihood can be written as
$$ L(\beta,\{\sigma_{j}^{2}\}|X,Y)=\prod_{j=1}^{J}(2\pi\sigma_{j}^{2})^{-\frac{N_{j}}{2}}\text{exp}(-\frac{1}{2\sigma_{j}^{2}}(Y_{j}-X_{j}\beta)^{'}(Y_{j}-X_{j}\beta)) $$Now, the derivation of the posterior of $\beta$ conditional on $\{\sigma_{j}^{2}\}_{j=1,..,J}$ is almost analogous to the standard normal case, except that we account for the heteroscedasticity by weighting with $\Sigma^{-1}$.
$$ \beta|\{\sigma_{j}^{2}\}_{j=1,..,J},y\sim N(AB,A) $$where $A=(X^{'}\Sigma^{-1}X+(\underline{V_{\beta}})^{-1})^{-1}$ and $B=(X^{'}\Sigma^{-1}y+(\underline{V_{\beta}})^{-1}\underline{\beta})$.
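To see where $A$ and $B$ come from, combine the exponents of the likelihood and the prior and complete the square in $\beta$:
$$ (y-X\beta)^{'}\Sigma^{-1}(y-X\beta)+(\beta-\underline{\beta})^{'}(\underline{V_{\beta}})^{-1}(\beta-\underline{\beta})=\beta^{'}A^{-1}\beta-2\beta^{'}B+\text{const}=(\beta-AB)^{'}A^{-1}(\beta-AB)+\text{const}, $$where the constant terms do not involve $\beta$; the last expression is the kernel of $N(AB,A)$.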
The posterior for $\sigma_{j}^{2}$ can be written as:
$$ \sigma_{j}^{2}|\sigma_{-j}^{2},\beta,y\sim IG(\frac{N_{j}}{2}+a_{j},b_{j}+\frac{1}{2}\sum_{i=1}^{N_{j}}(y_{j,i}-x_{j,i}\beta)^{2}) $$The derivation is a straightforward generalization of the standard linear regression with normal-gamma priors. It suffices to collect the observations according to which group they belong to.
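To see this, multiply the prior density by the group-$j$ factor of the likelihood; everything that does not involve $\sigma_{j}^{2}$ is absorbed into the proportionality constant:
$$ p(\sigma_{j}^{2}|\cdot)\propto(\sigma_{j}^{2})^{-a_{j}-1}e^{-b_{j}/\sigma_{j}^{2}}\times(\sigma_{j}^{2})^{-\frac{N_{j}}{2}}\text{exp}(-\frac{1}{2\sigma_{j}^{2}}(Y_{j}-X_{j}\beta)^{'}(Y_{j}-X_{j}\beta))\propto(\sigma_{j}^{2})^{-(\frac{N_{j}}{2}+a_{j})-1}\text{exp}(-\frac{b_{j}+\frac{1}{2}(Y_{j}-X_{j}\beta)^{'}(Y_{j}-X_{j}\beta)}{\sigma_{j}^{2}}), $$which is the kernel of the inverse gamma distribution above.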
Now I generate the data: 1000 observations divided into 3 groups, where the first group contains 300 observations, the second 400, and the last 300.
## Data Generating Process
set.seed(2019)
dgp <- function() {
####### Data Generating Process
N <- 1000; ## nb. of observations
k <- 5; ## nb. of features (including constant)
x1.m <- 0; x1.sd <- 1
x2.m <- 0; x2.sd <- 1
x3.m <- 0; x3.sd <- 1
x4.m <- 0; x4.sd <- 1
x1 <- rnorm(n = N, mean = x1.m, sd = x1.sd) ## feature 1
x2 <- rnorm(n = N, mean = x2.m, sd = x2.sd) ## feature 2
x3 <- rnorm(n = N, mean = x3.m, sd = x3.sd) ## feature 3
x4 <- rnorm(n = N, mean = x4.m, sd = x4.sd) ## feature 4
btrue <- c(1,2,3,4,5); ## True values of regression coefficients
X <- matrix(cbind(rep(1, N),x1,x2,x3,x4), ncol = k); ## Explanatory variables as a matrix
## Here we specify the groupwise heteroskedasticity (the variance-covariance matrix).
heterSigma <- diag(1,N);
diag(heterSigma)[1:300] <- 2; diag(heterSigma)[301:700] <- 3; diag(heterSigma)[701:1000] <- 4;
Y <- X %*% matrix(btrue, k, 1) + mvrnorm(1,mu = rep(0,N), Sigma = heterSigma);
result <- list();
result$X <- X; result$Y <- Y;
result$N <- N; result$k <- k;
result$e <- diag(heterSigma); ## true variance of each observation (also serves as the group label)
result$btrue <- btrue
return(result)
}
myData <- dgp();
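As a quick sanity check (not part of the estimation itself), the group-wise average squared error of the simulated data should be close to the true variances 2, 3, and 4:
## Sanity check: average squared error per group should be near 2, 3, 4.
resid.true <- as.vector(myData$Y - myData$X %*% myData$btrue)
tapply(resid.true^2, myData$e, mean)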
The specification of the scale parameters of the inverse gamma priors takes into account the fact that the mean of an $IG(a,b)$ distribution is $\frac{b}{a-1}$ (for $a>1$): with shape $a_{j}=2$, the prior mean of $\sigma_{j}^{2}$ equals the scale $b_{j}$.
# Gibbs Sampling Algorithm for Linear Regression with Groupwise Heteroscedasticity
hetGroupwise <- function(d = myData, T = 3000) {
X <- d$X; Y <- d$Y; k <- d$k; N <- d$N;
g <- d$e; J <- length(unique(g));
## store result
b <- matrix(0, nrow = T, ncol = k);
sigY <- matrix(0, nrow = T, ncol = J);
## hyperparameters of the inverse gamma priors (naive: prior mean = group sample variance, since a = 2)
grp <- list(1:300, 301:700, 701:1000); ## group index sets
a0 <- c(2, 2, 2);
b0 <- sapply(grp, function(i) var(Y[i, 1]));
sigY[1, ] <- b0; ## starting values: Y variance for each group
b.mean <- coef(lm(Y ~ X - 1)); ## assumed mean of the prior of regression parameters beta (OLS fit)
xx <- t(X) %*% X / mean(b0); ## assumed prior precision (inverse prior variance) of beta
for (t in 2:T) {
s.inv <- diag(1, N); ## Sigma^{-1} given the previous draw of the group variances
for (j in 1:J) diag(s.inv)[grp[[j]]] <- 1/sigY[t-1, j];
A <- solve(t(X) %*% s.inv %*% X + xx); ## conditional posterior covariance of beta
B <- (t(X) %*% s.inv %*% Y + xx %*% b.mean);
b[t, ] <- mvrnorm(1, A %*% B, A); ## draw beta | sigma^2, y ~ N(AB, A)
## draw each group variance from its conditional inverse gamma posterior
for (j in 1:J) {
  resid <- Y[grp[[j]], 1] - X[grp[[j]], ] %*% b[t, ];
  sigY[t, j] <- 1/rgamma(n = 1, shape = length(grp[[j]])/2 + a0[j],
                         rate = b0[j] + sum(resid^2)/2);
}
}
result <- list()
result$b <- b
result$sigY <- sigY
return(result)
}
myresult <- hetGroupwise()
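Before plotting, it is worth glancing at the posterior means after discarding the first 1000 draws as burn-in; they should sit near the true coefficients c(1,2,3,4,5) and the true group variances c(2,3,4):
## posterior means after burn-in
colMeans(myresult$b[1001:3000, ])
colMeans(myresult$sigY[1001:3000, ])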
The results are plotted below. The dashed red lines are the 2.5% and 97.5% quantiles, the solid black lines are the medians, and the blue lines are the true values of the corresponding parameters.
The specification of the prior variance of $\beta$ is crucial. If it is set too small, the range between the 2.5% and 97.5% quantiles may be so narrow that it does not include the true value. However, that is not a big problem, since the narrower range reflects the higher precision implied by the prior.
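One simple way to check this is to print the 95% credible intervals of all five coefficients from the post-burn-in draws and compare them with the true values:
## 95% credible intervals for the five regression coefficients (post-burn-in draws)
apply(myresult$b[1001:3000, ], 2, quantile, probs = c(0.025, 0.975))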
## examine results for parameters b1, b5, and the group 2 and group 3 variances.
graph.m1 <- function(d = myresult) {
  t0 <- 1000 + 1; T <- dim(d$b)[1]; ## discard the first 1000 draws as burn-in
  b1.post <- d$b[t0:T, 1]; ## post-burn-in draws of b1
  p.b1.hist <- ggplot(data = NULL, mapping = aes(x = b1.post)) +
    geom_histogram(alpha = 0.3, bins = 80) +
    geom_vline(xintercept = quantile(x = b1.post, probs = 0.5)) +
    geom_vline(xintercept = quantile(x = b1.post, probs = c(0.025, 0.975)), linetype = 'dashed', color = 'red') +
    xlab('b1') + ylab('Count') +
    geom_vline(xintercept = myData$btrue[1], color = 'blue') +
    xlim(quantile(x = b1.post, probs = c(0.001, 0.999))) +
    annotate("text", x = myData$btrue[1], y = 50, label = "True value") +
    annotate("text", x = quantile(x = b1.post, probs = 0.5), y = 30, label = "Median") +
    annotate("text", x = quantile(x = b1.post, probs = c(0.025, 0.975)), y = 30, label = c('2.5% quantile', '97.5% quantile'));
  b5.post <- d$b[t0:T, 5]; ## post-burn-in draws of b5
  p.b5.hist <- ggplot(data = NULL, mapping = aes(x = b5.post)) +
    geom_histogram(alpha = 0.3, bins = 80) +
    geom_vline(xintercept = quantile(x = b5.post, probs = 0.5)) +
    geom_vline(xintercept = quantile(x = b5.post, probs = c(0.025, 0.975)), linetype = 'dashed', color = 'red') +
    xlab('b5') + ylab('Count') +
    geom_vline(xintercept = myData$btrue[5], color = 'blue');
  sigtrue <- unique(myData$e); ## true group variances: 2, 3, 4
  g2.post <- d$sigY[t0:T, 2]; g3.post <- d$sigY[t0:T, 3]; ## post-burn-in variance draws
  p.g2.hist <- ggplot(data = NULL, mapping = aes(x = g2.post)) +
    geom_histogram(alpha = 0.3, bins = 80) +
    geom_vline(xintercept = quantile(x = g2.post, probs = 0.5)) +
    geom_vline(xintercept = quantile(x = g2.post, probs = c(0.025, 0.975)), linetype = 'dashed', color = 'red') +
    xlab('variance of group 2') + ylab('Count') +
    geom_vline(xintercept = sigtrue[2], color = 'blue');
  p.g3.hist <- ggplot(data = NULL, mapping = aes(x = g3.post)) +
    geom_histogram(alpha = 0.3, bins = 80) +
    geom_vline(xintercept = quantile(x = g3.post, probs = 0.5)) +
    geom_vline(xintercept = quantile(x = g3.post, probs = c(0.025, 0.975)), linetype = 'dashed', color = 'red') +
    xlab('variance of group 3') + ylab('Count') +
    geom_vline(xintercept = sigtrue[3], color = 'blue');
grid.arrange(p.b1.hist, p.b5.hist, p.g2.hist, p.g3.hist, nrow = 2, ncol = 2)
}
graph.m1()
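As an additional, optional diagnostic (not in the original script), a trace plot of the b1 chain makes it easy to see that the sampler settles well before the 1000-draw burn-in:
## optional diagnostic: trace plot of the b1 chain across all Gibbs iterations
trace.df <- data.frame(iter = seq_len(nrow(myresult$b)), b1 = myresult$b[, 1])
ggplot(trace.df, aes(x = iter, y = b1)) + geom_line() +
  xlab('iteration') + ylab('b1')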