## By Sheng BI
# library(tidyverse) # metapackage with lots of helpful functions
library(magrittr)
library(ggplot2)
# library(gridExtra)
library(truncnorm)
list.files(path = "../input")
Adaptive rejection sampling is a generalization of the accept-reject method.
In this method, a series of piecewise exponential densities are built up during iterations, such that an envelope of the target density is achieved.
This method helps tackle the difficulty of choosing an efficient proposal density. To apply this method, there is a techinical restrition, that is the target density should be differentiable and log-concave.
This method could be fairly easily understood via graph. Our first example is the standard normal density. Let us first take the log transformation of the density function.
# Piecewise linear functions as envelopes:
z <- function(A,B) {
## Input: Coordinates of Point A and Point B
## Output: slope and y intercept.
m <- (B[2]- A[2])/(B[1]- A[1]) # slope
b <- A[2] - m * A[1] # y-intercept
return(c(unname(m),unname(b)))
}
l <- function(x, pars) {
## Linear Function
## Input: x - argument, pars - c(slope, y-intercept)
## Output: y value
return(pars[1]*x + pars[2])
}
## The LOG density function that we want to marjorize. The log density function must be concave.
myFunc <- function(x){(exp(-(x^2)/2))/sqrt(2*pi)} %>% log
## break points. Should be adjusted accoording to the domain of our density function
myCuts <- seq(-2,2,by = 0.5)
graph.b <- function(myDens = myFunc, myXcoords = myCuts, ext = 0.4) {
## ext: a parameter by which which we adjust the extension of our piecewise linear functions
df <- data.frame(x = myXcoords, y = myDens(myXcoords),
slope = rep(NA, length(myXcoords)), y_intercept = rep(NA, length(myXcoords)))
N <- dim(df)[1] ## N == 9
## the 3rd and 4th column stores the slope and y-intercept of the lower envelope,
## which passes through points df[row, 1:2] and df[row+1, 1:2]
for (row in 1:(N-1)) {
df[row, 3:4] <- z(unlist(df[row,1:2]),unlist(df[row+1,1:2]))
}
# print(df)
p1 <- ggplot(data = df, mapping = aes(x=df$x, y=df$y)) + geom_point() + stat_function(fun = myDens)
p1 <- p1 + geom_segment(aes(x = df[1,1], y = df[1,2],
xend = df[1,1], yend = df[1,2] + ext))
for (i in 1:(N-1)) {
b <- z(unlist(df[i, ]), unlist(df[i+1, ]))
p1 <- p1 + annotate('segment', x = df[i,1], y = df[i,2],
xend = df[i,1] - ext,
yend = l(df[i, 1] - ext, b))
p1 <- p1 + annotate('segment', x = df[i+1,1], y = df[i+1,2],
xend = df[i+1,1] + ext, yend = l(df[i+1, 1] + ext, b))
p1 <- p1 + annotate('segment', x = df[i,1], y = df[i,2],
xend = df[i+1,1], yend = df[i+1,2],
linetype = 'dashed', color = 'red')
}
p1 <- p1 + geom_segment(aes(x = df[N,1], y = df[N,2],
xend = df[N,1], yend = df[N,2] + ext ))
p1 + xlab("x") + ylab("log density") # + xlim(-2.5,2.5)
}
graph.b()
Our second example is the log transformation of a beta density.
myFunc <- function(x0){return(dbeta(x = x0, shape1 = 3, shape2 = 1.3, log = TRUE))}
myCuts <- seq(0.1,0.9,by = 0.1)
graph.b()
The red dashed lines represent the lower envelope, that is, the part of the linear function which is under the log density function.
The black lines are simply extensions of these linear functions.
Now, let us trim the unnecessary parts of the linear extensions to make sure that linear upper envelople is well defined in each interval. We take standard normal density as an example.
P.S.
You could simply change the myFunc and myCuts functions to examine your own probability density function.
The following code provide more details than needed to produce the graph.
## The LOG density function that is majorized by piecewise linear functions.
myFunc <- function(x){(exp(-(x^2)/2))/sqrt(2*pi)} %>% log
## break points
myCuts <- seq(-2,2,by = 0.5)
graph.b1 <- function(myDens = myFunc, myXcoords = myCuts) {
df <- data.frame(x = myXcoords, y = myDens(myXcoords),
slope = rep(NA, length(myXcoords)), y_intercept = rep(NA, length(myXcoords)))
N <- dim(df)[1] ## N == 9
for (row in 1:(N-1)) {
df[row, 3:4] <- z(unlist(df[row,1:2]),unlist(df[row+1,1:2]))
}
p1 <- ggplot(data = df, mapping = aes(x=df$x, y=df$y)) + geom_point() + stat_function(fun = myDens)
## store all the intersection points of linear functions ABOVE the log density
intersections <- matrix(NA, nrow = N-1, ncol = 4)
intersections[ , 1:2] <- data.matrix(df[1:(N-1), 1:2], rownames.force = NA) ## left break point
intersections[1, 3:4] <- c(df[1,1], l(df[1,1], z(unlist(df[2, 1:2]),unlist(df[3, 1:2]))))
intersections[N-1, 3:4] <- c(df[N,1], l(df[N,1], z(unlist(df[N-1,1:2]),unlist(df[N-2,1:2]))))
## store all the intersecting lines using matrix
## Again, the first 2 columns store the abscissa and ordinate or the lest break point respectively
crosslines <- matrix(NA, nrow = 2 * (N-1), ncol = 4)
crosslines[1, 3:4] <- c(NA,NA);
crosslines[2, 3:4] <- z(unlist(df[2, 1:2]), unlist(df[3, 1:2]));
crosslines[2*(N-1)-1, 3:4] <- z(unlist(df[N-1, 1:2]), unlist(df[N-2, 1:2]));
crosslines[2*(N-1) , 3:4] <- c(NA,NA);
crosslines[1, 1:2] <- unlist(df[1,1:2]);
crosslines[2, 1:2] <- unlist(df[1,1:2]);
crosslines[2*(N-1)-1, 1:2] <- unlist(df[N-1,1:2]);
crosslines[2*(N-1), 1:2] <- unlist(df[N-1,1:2]);
for (row in 2:(N-2)) {
line1 <- z(unlist(df[row-1, 1:2]), unlist(df[row, 1:2] ))
line2 <- z(unlist(df[row+1, 1:2]), unlist(df[row+2, 1:2]))
pt_x <- (line2[2] - line1[2])/(line1[1] - line2[1])
pt_y <- l(pt_x, line2)
pt <- c(pt_x, pt_y)
intersections[row, 3:4] <- pt
crosslines[2*row-1, 3:4] <- line1;
crosslines[2*row , 3:4] <- line2;
crosslines[2*row-1, 1:2] <- unlist(df[row,1:2]);
crosslines[2*row , 1:2] <- unlist(df[row,1:2]);
}
## store ALL the intersection points.
all_points <- matrix(NA, nrow = 2*N-1, ncol = 2)
for (row in 1:N) {
all_points[2*row-1, ] <- unlist(df[row, 1:2])
if (row < N) {all_points[2*row , ] <- intersections[row, 3:4];}
}
# Or equivalently, all_points could also be obtained via by these steps:
#all_points <- rbind(intersections, data.matrix(df)[2:(N-1), ])
#all_points <-
#all_points[do.call(order, c(decreasing = FALSE, data.frame(all_points[,1]))),]
#all_points[order(all_points[,1], all_points[,2] ,decreasing = FALSE),]
result <- list()
result[[1]] <- intersections
result[[2]] <- all_points
result[[3]] <- crosslines
# print(all_points)
# create the piecewise linear function
piecewise <- function(x, A = all_points) {
# the input x will be a vector, under ggplot2::stat_func().
i = findInterval(x, A[,1])
# i <- i[i>0 & i<=dim(A)[1]]
b <- matrix(NA, ncol = 2, nrow = length(i))
for (each in 1:(length(i)-1)) {
b[each, ] <- z(unlist(A[i[each]+1, ]), unlist(A[i[each],]))
}
N1 <- dim(A)[1];
b[length(i), ] <- z(unlist(A[N1-2, ]), unlist(A[N1-1,]))
y <- b[,1]*x + b[,2]
return(y)
}
p1 <- p1 + geom_segment(aes(x = df[1,1], y = df[1,2],
xend = intersections[1,3], yend = intersections[1,4]))
p1 <- p1 + geom_segment(aes(x = df[N,1], y = df[N,2],
xend = intersections[N-1,3], yend = intersections[N-1,4]))
for (i in 1:(N-1)) {
p1 <- p1 + annotate('segment', x = df[i,1], y = df[i,2],
xend = df[i+1,1], yend = df[i+1,2],
linetype = 'dashed', color = 'red')
}
p1 <- p1 + stat_function(fun = piecewise)
result[[4]] <- p1 + xlab('x') + ylab('log density')
return(result)
}
temp <- graph.b1()
temp[[4]]
## The LOG density function that is majorized by piecewise linear functions.
myFuncExp <- function(x){(exp(-(x^2)/2))/sqrt(2*pi)}
## break points
myCuts <- seq(-2,2,by = 0.5)
graph.b2 <- function(myDens = myFuncExp, myXcoords = myCuts) {
df <- data.frame(x = myXcoords, y = myDens(myXcoords) %>% log,
slope = rep(NA, length(myXcoords)), y_intercept = rep(NA, length(myXcoords)),
yexp = myDens(myXcoords))
# print(df)
N <- dim(df)[1] ## N == 9
for (row in 1:(N-1)) {
df[row, 3:4] <- z(unlist(df[row,1:2]),unlist(df[row+1,1:2]))
}
p1 <- ggplot(data = df, mapping = aes(x=df$x, y=df$yexp)) +
geom_point() + stat_function(fun = myDens)
## store all the intersection points of linear functions ABOVE the log density
intersections <- matrix(NA, nrow = N-1, ncol = 4)
intersections[ , 1:2] <- data.matrix(df[1:(N-1), 1:2], rownames.force = NA) ## left break point
intersections[1, 3:4] <- c(df[1,1], l(df[1,1], z(unlist(df[2, 1:2]),unlist(df[3, 1:2]))))
intersections[N-1, 3:4] <- c(df[N,1], l(df[N,1], z(unlist(df[N-1,1:2]),unlist(df[N-2,1:2]))))
for (row in 2:(N-2)) {
line1 <- z(unlist(df[row-1, 1:2]), unlist(df[row, 1:2] ))
line2 <- z(unlist(df[row+1, 1:2]), unlist(df[row+2, 1:2]))
pt_x <- (line2[2] - line1[2])/(line1[1] - line2[1])
pt_y <- l(pt_x, line2)
pt <- c(pt_x, pt_y)
intersections[row, 3:4] <- pt
}
## store ALL the intersection points.
all_points <- matrix(NA, nrow = 2*N-1, ncol = 2)
for (row in 1:N) {
all_points[2*row-1, ] <- unlist(df[row, 1:2])
if (row < N) {all_points[2*row , ] <- intersections[row, 3:4];}
}
# create the piecewise linear function
piecewise <- function(x, A = all_points) {
# the input x will be a vector, under ggplot2::stat_func().
i = findInterval(x, A[,1])
b <- matrix(NA, ncol = 2, nrow = length(i))
for (each in 1:(length(i)-1)) {
b[each, ] <- z(unlist(A[i[each]+1, ]), unlist(A[i[each],]))
}
N1 <- dim(A)[1];
b[length(i), ] <- z(unlist(A[N1-2, ]), unlist(A[N1-1,]))
y <- exp(b[, 1]*x + b[, 2])
return(y)
}
p1 <- p1 + stat_function(fun = piecewise)
p1 + xlab('x') + ylab('Density')
}
graph.b2()
Now we take the beta distribution as an example. Again, the first step is to trim the unnecessary linear extensions.
myFunc <- function(x0){return(dbeta(x = x0, shape1 = 3, shape2 = 1.3, log = TRUE))}
myCuts <- seq(0.1,0.9,by = 0.1)
temp <- graph.b1()
temp[[4]]
The second step is to remove the log.
myFunc <- function(x0){return(dbeta(x = x0, shape1 = 3, shape2 = 1.3, log = FALSE))}
myCuts <- seq(0,1,by = 0.1)
graph.b2()
Here are math on the model specifications:
Let $\{\theta_{0},...,\theta_{J}\}$ denote the set of break points in the support of $\theta$.
Let $m(\theta)=\text{log}(f(\theta))$, that is, $m(\theta)$ is the log transformation of the density function.
Let $\mathcal{P}=\{(\theta_{0},m(\theta_{0})),...,(\theta_{j},m(\theta_{j})),...,(\theta_{J},m(\theta_{J}))\}$ be the set of ordered pairs in the coordinate plane, wherer each $m(\theta)$ is the corresponding ordinate of $\theta$.
Let $L_{j,j+1}$ be the linear function which is the extension of the chord connecting $P_{j}$ and $P_{j+1}$.
Now, we define the upper envelope $\overline{m}$:
$\overline{m}(\theta)\equiv\text{min}\{L_{j,j+1}|L_{j,j+1}(\theta)\geq m(\theta)\}$ for $j=0,...,J$ and any $\theta$.
That is, $\overline{m}(.)$ is defined by connecting those lines $L_{j,j+1}$ which majorize the log density $m(.)$ most closely.
Similarly, we can define the lower envelope as:
$\underline{m}(\theta)\equiv L_{j,j+1}(\theta),\theta\in[\theta_{j},\theta_{j+1}]\}$ for $j=0,...,J$
Define the proposal density as the piecewise exponential function:
$$ g(\theta)=G^{-1}[\sum_{j}\text{exp}(m_{j}\theta+b_{j})\times I(\theta\in[\theta_{j},\theta_{j+1}])], $$where $G$ is the normalizing constant with
$$ G=\sum_{j}\int_{\theta_{j}}^{\theta_{j+1}}\text{exp}(m_{j}\theta+b_{j})d\theta=\sum_{j}\text{exp}(b_{j})\times\frac{\text{exp}(m_{j}\theta_{j+1})-\text{exp}(m_{j}\theta_{j})}{m_{j}} $$The first step would be to find the linear lines which majorize the log transformation of the density in the corresponding intervals. The second step is to consider the original graph with the linear lines being expontialized.
Next time, we will use this algorithm to define the sampling algorithm for piecewise exponential functions, and most importantly, introduce the adaptive rejection sampling algorithm.