Posterior Interval by ‘Numerical Method’



### Placenta Previa example
y=seq(0,1,by=0.001)

plot(y,dbeta(y,438,544),type="l",xlim=c(0.3,0.6))  #Posterior distribution

post.y=rbeta(1000,438,544)  #Taking 1000 samples from the Post. dstn

dev.new()
hist(post.y,freq=F)  #Plotting the sample


mean(post.y)  #The posterior mean
[1] 0.4461668
quantile(post.y,probs=c(0.025,0.975))  # 95% Posterior Interval using Numerical Method
     2.5%     97.5% 
0.4183153 0.4766519 







Normal Model With Discrete Prior Distribution (Estimating the mean)


options(scipen=999)  #To disable printing your results in scientific notation.

mu=c(20,30,40,50,60,70)  #Parameter of interest
prior=c(0.1,0.15,0.25,0.25,0.15,0.1)  #Discrete Prior Distribution
y=c(38.6,42.4,57.5,40.5,51.7,67.1,33.4,60.9,64.1,40.1,40.7,6.4)  #Observations (n=10)

ybar=mean(y)
sigma=10  #Known population variance
n=length(y)
like=exp((-n/(2*sigma^2))*(mu-ybar)^2)  #Likelihood function (Refer to the notes)

unn.post=prior*like  #Unnormalized Discrete Posterior dstn 
unn.post
[1] 0.00000000000000000220148 0.00000012289485905573538 0.04683563061571816704687
[4] 0.06580160638607958356605 0.00000034081137965149324 0.00000000000000001205079
nor.post=unn.post/sum(unn.post)  #Normalized Discrete Posterior dstn (We have to Normalize it to make the Probability Distribution integrate to 1)
nor.post  #We can see that the Discrete Post. dstn is [20: 0, 30: 0, 40: 0.42, 50: 0.58, 60: 0, 70: 0]
[1] 0.00000000000000001954479 0.00000109106327884201111 0.41580776526252849478738
[4] 0.58418811794322056396567 0.00000302573097203836150 0.00000000000000010698715
dev.new()
plot(mu,nor.post)  #We can see that the most likely values are 40 and 50


s.post=sample(mu,1000,prob=nor.post,replace=T)  #Take a sample from the Post. dstn
table(s.post)  #The number of observations from values of 40 and 50
s.post
 40  50 
385 615 
mean(s.post)
[1] 46.15
sd(s.post)
[1] 4.868388







Frequentist vs Bayesian Estimators (by MSE)


#Sample Size is SMALL
n=10
theta=seq(0,1,by=0.01)  #Generate values of theta from (0,1) to see the MSE at each value of theta
#Refer to the notes for the equation of the MSEs
mse.f=theta*(1-theta)/n  #MSE of the Frequentist estimator at each value of theta
mse.b=((4-n)*theta^2-(4-n)*theta+1)/(n+2)^2  #MSE of the Bayesian estimator(=Posterior Mean) at each value of theta

dev.new()
plot(theta,mse.f,type="l")
lines(theta,mse.b,col="red")
legend(x="topright", legend=c("Freq.","Bayes."), lty=c(1,1), col=c("black","red"))  

We can see that the Bayes’ MSEs are smaller in general when the sample size is small

#Sample Size is LARGE
n=1000
theta=seq(0,1,by=0.01)  #Generate values of theta from (0,1) to see the MSE at each value of theta
#Refer to the notes for the equation of the MSEs
mse.f=theta*(1-theta)/n  #MSE of the Frequentist estimator at each value of theta
mse.b=((4-n)*theta^2-(4-n)*theta+1)/(n+2)^2  #MSE of the Bayesian estimator(=Posterior Mean) at each value of theta

dev.new()
plot(theta,mse.f,type="l")
lines(theta,mse.b,col="red")
legend(x="topright", legend=c("Freq.","Bayes."), lty=c(1,1), col=c("black","red"))  

We can see that the MSEs of the two estimators are almost identical when the sample size is large







Highest Posterior Density (HPD) Credible Set



95% Credible Interval

n = 10
y = 2

ci = qbeta(c(0.025,0.975), y+1, n-y+1)
ci
[1] 0.06021773 0.51775585
Error in dev.off() : 
  QuartzBitmap_Output - unable to open file '/Users/jaeyonglee/.local/share/rstudio/notebooks/E15CF856-Bayesian/1/13BF2920BFE883ED/ce7ecaqvv8bt5_t/_rs_chunk_plot_001.png'


95% HPD interval

# install.packages("TeachingDemos")  # hpd function is in there
library(TeachingDemos)

hpd=hpd(qbeta,shape1=y+1,shape2=n-y+1)
hpd
[1] 0.04055517 0.48372366


Visualization

theta = seq(0, 1, by=0.01)

plot(theta,dbeta(theta,y+1,n-y+1),type="l",xlab=expression(theta),ylab="Posterior Density Function", cex.lab=1)

segments(ci[1],0,ci[1],dbeta(ci[1],y+1,n-y+1),col="red",lwd=2)
segments(ci[2],0,ci[2],dbeta(ci[2],y+1,n-y+1),col="red",lwd=2)
segments(ci[1],dbeta(ci[1],y+1,n-y+1),ci[2],dbeta(ci[2],y+1,n-y+1),col="red",lwd=2)

# HPD
segments(hpd[1],0,hpd[1],dbeta(hpd[1],y+1,n-y+1),col="blue",lwd=2,lty=2)
segments(hpd[2],0,hpd[2],dbeta(hpd[2],y+1,n-y+1),col="blue",lwd=2,lty=2)
segments(hpd[1],dbeta(hpd[1],y+1,n-y+1),hpd[2],dbeta(hpd[2],y+1,n-y+1),col="blue",lwd=2,lty=2)
legend(x=0.6,y=3,legend=c("95% CI=(0.06,0.52)", "95% HPD CI=(0.04,0.48)"),col=c("red","blue"),lty=c(1,2),cex=0.8)







Calculating Posterior Predictive P-values (Bayesian P-Values)



nsim=100

y=c(1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0)
T.y=sum(diff(y)!=0)
theta=rep(NA,nsim)
T.y.rep=rep(NA,nsim)

for (i in 1:nsim){
  theta[i]<-rbeta(1,8,14)
  y.rep<-rbinom(20,1,theta[i])
  T.y.rep[i]<-sum(diff(y.rep)!=0)
}

sum(T.y.rep<=T.y)/nsim  # Bayesian P-value
[1] 0

This is the Bayesian p-value (after 100 simulations)


hist(T.y.rep)  # Posterior predictive distribution of the replicated test statistic
abline(v=3,col="red")  # The value left to the red line is the p-value







Posterior dstn for Multinomial model for Categorical data



Let’s calculate the post. dstn for theta1 - theta2. (Let’s call this diff)


We’re gonna take samples from the post. dstn of (theta1,theta2,theta3) and calculate it.


library(MCMCpack)
Loading required package: coda
Loading required package: MASS
##
## Markov Chain Monte Carlo Package (MCMCpack)
## Copyright (C) 2003-2021 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
##
## Support provided by the U.S. National Science Foundation
## (Grants SES-0350646 and SES-0350613)
##
n=1447

y1=727
y2=583
y3=137

alpha=c(y1+1,y2+1,y3+1)  # the parameters of post. dstn of (theta1,theta2,theta3)

nsim=100  # number of simulations    

theta=rdirichlet(nsim,alpha)  # Taking samples from the post. dstn of (theta1,theta2,theta3)
diff=theta[,1]-theta[,2]
diff  # theta1 - theta2
  [1] 0.10562810 0.08850064 0.11133521 0.14880062 0.09170576 0.09004244 0.04849895 0.07814048
  [9] 0.11238263 0.09523778 0.07019925 0.07856047 0.08240375 0.09766831 0.10575962 0.10521868
 [17] 0.12542080 0.10559810 0.13319903 0.10775616 0.13554330 0.08357416 0.08696108 0.10125090
 [25] 0.13818495 0.12471304 0.09711644 0.12440268 0.07473838 0.13054324 0.11836621 0.13410864
 [33] 0.07671566 0.11473167 0.11137363 0.09173639 0.06048886 0.13933621 0.11845024 0.10651097
 [41] 0.12411735 0.07698650 0.13703001 0.08163398 0.10304243 0.07565535 0.12622887 0.13891881
 [49] 0.08686959 0.11644592 0.10733023 0.14259136 0.10831884 0.10159467 0.11397080 0.11186925
 [57] 0.09299134 0.08475031 0.09747366 0.09730852 0.12853930 0.07998293 0.07742454 0.07187104
 [65] 0.09298987 0.07714006 0.09282218 0.07540104 0.09250740 0.11414998 0.11282102 0.07785968
 [73] 0.12050435 0.09884799 0.11860044 0.10809613 0.03751695 0.10312953 0.10039184 0.07973068
 [81] 0.13225262 0.08227616 0.06086792 0.09202436 0.10271909 0.08772210 0.09410449 0.08521208
 [89] 0.09947449 0.11700026 0.11808139 0.15889028 0.09612751 0.11579798 0.11020222 0.08929781
 [97] 0.09390641 0.13181758 0.08235398 0.12179827


Histogram plot of dstn for diff


hist(diff)


Point Estimate


mean(diff)
[1] 0.1020829


Interval Estimate (alpha = 0.05)


quantile(diff,probs=c(0.025,0.975))  # Equal tailed interval for alpha = 0.05
      2.5%      97.5% 
0.06066891 0.14104516 


The count of theta1 < theta2


sum(diff<0)
[1] 0


We can see that theta1 is larger than theta2.







Monte Carlo Integration


Case I



True Value


2/3
[1] 0.6666667


Direct Computation

N<-100000
theta<-runif(N,-1,1)  # Drawing N number of values
h.theta<-2*theta^2   # The h() function

mu.h.theta<-mean(h.theta)   # MC estimate
mu.h.theta
[1] 0.6688993

Pretty close to the true value.


Iteration Method

N=10000
theta=rep(0,N)  # Initializing the places to save the values I guess
h.theta=rep(0,N)
mu.h.theta=rep(0,N)

for (i in 1:N){
  theta[i]=runif(1,-1,1)  # Drawing 1 value at a time
  h.theta[i]=2*theta[i]^2
  mu.h.theta[i]=mean(h.theta[1:i])
}

mu.h.theta[N]  # MC estimate
[1] 0.6572248
dev.new()
plot(1:N,mu.h.theta,type="l")

The h(theta) eventually converges to the true value.




Case II



True Value

1-exp(-1)
[1] 0.6321206


Direct Computation

N=10000
x=runif(N)  # Drawing N number of values

# g(x) = exp(-x)
theta.hat=mean(exp(-x))  # MC estimate
theta.hat
[1] 0.6311722

Pretty close to the true value.


Iteration method

N=10000
x=rep(0,N)  # Initializing the places to save the values I guess
h.x=rep(0,N)
mu.h.x=rep(0,N)

for (i in 1:N){
  x[i]=runif(1)
  h.x[i]=exp(-x[i])
  mu.h.x[i]=mean(h.x[1:i])
}

mu.h.x[N]  # MC estimate
[1] 0.6354287
dev.new()
plot(1:N,mu.h.x,type="l")

It eventually converges to the true value.








Rejection Sampling


Example1


Target Density, p

p <- function(x) {
  if (x >= 0 && x < 0.25)
     8 * x
  else if (x >= 0.25 && x <= 1)
     8/3 - 8 * x/3
   else 0
  }


Proposal Density, g and Constant M

g <- function(x) {
   if (x >= 0 && x <= 1)
     1
   else 0
  }

M<-3  # Try out different values for M (M is better to be smaller)


Accept/Reject algorithm

m <- 1000  # Total number of accepted values needed
n.draws <- 0  # For counting the number of accepted values
draws <- c()  # Place to store accepted values

while (n.draws < m) {
   x.c <- runif(1, 0, 1)  # Drawing a candidate from the proposal density
   accept.prob <- p(x.c)/(M * g(x.c))  # The prob. of accepting for each candidate
   u <- runif(1, 0, 1)  # The auxiliary variable used for accept/reject algo.
   
   if (accept.prob >= u) {
     draws <- c(draws, x.c)  # Storing the accepted candidate
     n.draws <- n.draws + 1  # Counting the accepted candidate
     }
   }


The accepted values

head(draws)
[1] 0.5807693 0.2013411 0.6341995 0.1950827 0.2677502 0.3024960


Visualization

# Initializing variables for the plots
x <- seq(0, 1, by = 0.01)
y=rep(0,length(x))
for (i in 1:length(x)){
  y[i]=p(x[i])
}

dev.new()
par(mfrow=c(1,3))
plot(x,y,type="l",ylim=c(0,4),col="black", main="Plot of p() and Mg()")
abline(h=M,col="red")
legend("topright", legend=c("p()", "Mg()"), col=c("black", "red"), lty=1, cex=0.8)

plot(x,y,type="l",ylim=c(0,4),col="black", main="Plot of p() and the density\n of the accepted values")
lines(density(draws),col="red")
legend("topright", legend=c("p()", "acc"), col=c("black", "red"), lty=1, cex=0.8)

hist(draws,freq=F,ylim=c(0,4), xlab="x", ylab="y", main="Histogram of accepted values")
lines(x,y,type="l",col="black")




Example2


Target Density, p

x.grid=seq(-5,5,by=0.01)
p=dnorm(x.grid,0,1)


Proposal Density, g and Constant M.

g=dnorm(x.grid,0,2)
M=2


Accept/Reject algorithm

N=10000  # Total number of iterations
         # This time we will be checking the general acceptance rate
x=rep(0,N)  # Places to store the accepted values
            # The unaccepted ones will be 0 so we can count how many nonzeros there
            # out of N to calculate the general acceptance rate

for (i in 1:(N)){
  x_star=rnorm(1,0,2)  # Drawing a candidate from the proposal density
  accept.prob=dnorm(x_star,0,1)/(M*dnorm(x_star,0,2))  # Prob. of accepting
  U=runif(1,0,1)  # The auxiliary variable
  if (accept.prob>=U){x[i]=x_star}
}


The general acceptance rate

i.e. how many values were accepted out of N candidates

accept.rate=sum(x!=0)/N
accept.rate
[1] 0.5078

From this we can deduce that the number of accepted values is about half of N


Visualization

dev.new()
par(mfrow=c(1,3))
plot(x.grid,p,type="l",ylim=c(0,0.6), xlab="x", ylab="y", main="Plot of p() and Mg()")
lines(x.grid,M*g,type="l",col="red")
legend("topright", legend=c("p()", "Mg()"), col=c("black", "red"), lty=1, cex=0.8)


nonzero = x[x!=0]  # We don't need 0s to plot the density

plot(x.grid,p,type="l", main="Plot of p() and the density\n of the accepted values")
lines(density(nonzero),col="red")
legend("topright", legend=c("p()", "acc"), col=c("black", "red"), lty=1, cex=0.8)

hist(nonzero,freq=F,ylim=c(0,0.5), ylab="y", main="Histogram of accepted values")
lines(x.grid, dnorm(x.grid,0,1))








Importance Sampling


True Value

1-pnorm(3)
[1] 0.001349898


Importance Sampling Estimate

N=100000  # Number of samples to extract from the proposal dstn
x=rnorm(N,4,1)  # Samples from the proposal dstn
f.x=dnorm(x,0,1)  # To calculate the weight ftn
g.x=dnorm(x,4,1)  # To calculate the weight ftn
h.x=as.numeric(x>=3)  # Target
w.x=f.x/g.x  # Weight ftn

h.x_IS=mean(h.x*w.x)  # Importance Sampling estimate
h.x_IS
[1] 0.001357118

We can see that it is pretty close


Sampling Error of the IS estimate

sd(h.x*w.x)
[1] 0.00312042

We can see that the sampling error is very small which is an indicator of a good estimate.



Comparison to MC (CaseII)


Direct Computation

N=10000  # Number of samples to extract
x=runif(N,0,3)  # Samples extracted from Unif(0,3)
g.x=1/sqrt(2*pi)*exp(-0.5*x^2)  # The g() ftn
1/2-3*mean(g.x)  # MC estimate
[1] -0.004296541


Sampling Error of the MC estimate

sd(g.x)
[1] 0.1394672

We can see that the sampling error is a lot bigger than IS estimate’s.


Iteration Method

N=10000  # Number of samples to extract
x=rep(0,N)  # Storage for samples
g.x=rep(0,N)  # Storage for values of g()
mu.g.x=rep(0,N)  # Storage for values of mean of g()

for (i in 1:N){
  x[i]=runif(1,0,3)  # Extracting a sample
  g.x[i]=1/sqrt(2*pi)*exp(-1/2*x[i]^2)
  mu.g.x[i]=mean(g.x[1:i])
}

1/2-3*mu.g.x[N]  # MC estimate
[1] -0.005483052


Visualization

dev.new()
plot(1:N,0.5-3*mu.g.x,type="l", ylim=c(0,0.15))


Sampling Error of the MC estimate (Iteration method)

sd(g.x)
[1] 0.1393593


Comparison tp MC (Case I)

N=100000  # Number of samples to extract
x=rnorm(N)  # Samples taken from N(0,1)
h.x=as.numeric(x>=3)  # h() = P(X>=3) ; X ~ N(0,1)
h.x_MC=mean(h.x)  # MC estimate
h.x_MC
[1] 0.00132


Sample Error of MC (Case I)

sd(h.x)
[1] 0.036308

Case I has a better sample error than case II.








Markov Chain



Prediction

nsim=100
P=matrix(c(0.9,0.5,0.1,0.5),2,2)  # Transition matrix
x=matrix(c(0,1),nsim,2,byrow=T)  # Initial states

for(i in 1:(nsim-1)){
  x[i+1,]=x[i,]%*%P  # Matrix multiplication
}

dev.new()
plot(1:nsim,x[,1],type="l",ylim=c(0,1),col="red")
lines(1:nsim,x[,2],type="l",col="blue")
legend("topright", legend=c("Sunny", "Rainy"), col=c("red", "blue"), lty=1, cex=0.8)

We can see that it converges.








Gibbs Sampler (One of MCMC methods)



N=10000  # Total number of iterations(time)
burn=N/2  # Burn-in period is half of N I guess

# data (a single obs.)
y1=0
y2=0

# correlation
rho=0.8

# initial values
theta.1=rep(1,N)  # Starting point is 1 for theta1
theta.2=rep(-10,N)  # Starting point is -10 for theta2
theta=cbind(theta.1,theta.2)  # Theta binded

# iterations
for (i in 1:(N-1)){
  # Taking samples from the conditional dstns of theta1 and theta2 respectively
  theta.1[i+1]=rnorm(1,y1+rho*(theta.2[i]-y2),sqrt(1-rho^2))
  theta.2[i+1]=rnorm(1,y2+rho*(theta.1[i+1]-y1),sqrt(1-rho^2))
  # Samples binded
  theta[i+1,]=c(theta.1[i+1],theta.2[i+1])
}

head(theta)
        theta.1    theta.2
[1,]  1.0000000 -10.000000
[2,] -7.4826999  -5.727806
[3,] -4.8542972  -3.540300
[4,] -3.4723206  -1.958079
[5,] -1.5681786  -0.440435
[6,] -0.2940323   0.667433


Visualizations

# check traceplots
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")

The values of theta1 and theta2 converge.


dev.new()
par(mfrow=c(1,1))
plot(theta)

A simple visualization.


ntr=1000 
dev.new()
par(mfrow=c(1,1))
plot(theta.1[1:ntr],theta.2[1:ntr],type="l")
text(theta.1[1:ntr],theta.2[1:ntr],labels=1:10,cex=1, font=5)

With this plot, we can see the movement of the convergence I guess.


The averages of thetas: these are the estimates

colMeans(theta)
    theta.1     theta.2 
-0.01851583 -0.02379488 


The correlation of thetas from the samples using conditional dstns

cor(theta)
          theta.1   theta.2
theta.1 1.0000000 0.7815042
theta.2 0.7815042 1.0000000


The correlation of thetas without the Burn-in period

cor(theta[burn:N,])
          theta.1   theta.2
theta.1 1.0000000 0.7764459
theta.2 0.7764459 1.0000000

Excluding the Burn-in period might and might not improve, I guess.


The actual dstn

library(mvtnorm)  # To sample from an actual BVN dstn

joint.s=rmvnorm(N/2,mean=c(y1,y2),sig=matrix(c(1,rho,rho,1),2,2))  # The actual dstn

head(joint.s)
           [,1]        [,2]
[1,] -1.4267623 -2.27127038
[2,] -1.4242776 -1.08538234
[3,]  1.6142546  1.97821223
[4,]  0.1976566  0.13710940
[5,] -0.5978071 -0.07247639
[6,]  1.2196402  0.23673397

Visualization

plot(xlim=c(-10,4), ylim=c(-10,4), joint.s)

The averages of thetas : these are the estimates

colMeans(joint.s)
[1] 0.01367316 0.02207730

The correlation of thetas from the samples using the actual dstn

cor(joint.s)
          [,1]      [,2]
[1,] 1.0000000 0.8009714
[2,] 0.8009714 1.0000000








Metropolis algorithm


Example1


N=10000
burn.in=N/2  # Burn-in period is half of N I guess

# data (a single obs.)
y1=0
y2=0

# correlation
rho=0.8

# initial value
sig2=1-rho^2    # variance for posterior density
sig2.prop=0.1   # variance for proposal density
theta.1=rep(1,N)  # Initial value is 1 I guess
theta.2=rep(1,N)


# iterations
for (i in 1:(N-1)){
  
  # for theta.1
  theta.1.star=rnorm(1,theta.1[i],sqrt(sig2.prop))  # Sample from the proposal dstn
  U.1=runif(1,0,1)  # U: auxiliary variable
  
  # Ratio of the two densities
  ratio.1=dnorm(theta.1.star,y1+rho*(theta.2[i]-y2),1-rho^2)/dnorm(theta.1[i],y1+rho*(theta.2[i]-y2),1-rho^2)
  
  # Probability of move
  alpha.1=min(ratio.1,1)
  
  if (U.1<=alpha.1){theta.1[i+1]=theta.1.star}  # Accepted
  else theta.1[i+1]=theta.1[i]
  
  
  ## for theta.2
  theta.2.star=rnorm(1,theta.2[i],sqrt(sig2.prop))
  U.2=runif(1,0,1)
  
  # Ratio of the two densities
  ratio.2=dnorm(theta.2.star,y2+rho*(theta.1[i+1]-y1),1-rho^2)/dnorm(theta.2[i],y2+rho*(theta.1[i+1]-y1),1-rho^2)
  
  # Probability of move
  alpha.2=min(ratio.2,1)
  
  if (U.2<=alpha.2){theta.2[i+1]=theta.2.star}  # Accepted
  else theta.2[i+1]=theta.2[i]
  
}

theta=cbind(theta.1,theta.2)  # Binding the thetas together

head(theta)
       theta.1   theta.2
[1,] 1.0000000 1.0000000
[2,] 1.0768556 1.0000000
[3,] 1.3101948 0.8304043
[4,] 1.3101948 0.5861109
[5,] 1.2283990 0.5861109
[6,] 0.7631203 0.5752664


Visualizations

# traceplots
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")


dev.new()
par(mfrow=c(1,2))
hist(theta[(burn.in+1):N,1])
hist(theta[(burn.in+1):N,1])


dev.new()
par(mfrow=c(1,1))
plot(theta[(burn.in+1):N,])


The averages of theta1 and theta2: These are the estimates

colMeans(theta[(burn.in+1):N,])
   theta.1    theta.2 
0.01982588 0.02451556 


The correlation of theta1 and theta2

cor(theta[(burn.in+1):N,])
          theta.1   theta.2
theta.1 1.0000000 0.8035538
theta.2 0.8035538 1.0000000




Example2


The metropolis algorithm made into a function (because we need to run for 4 different variances)

mh=function(v,sigma,x0,N){
  x=numeric(N)  # Making N number of 0s
  x[1]=x0  # Starting point
  k=0
  for (i in 2:N){  # Starts at 2 because 1 is the starting point
    y=rnorm(1,x[i-1],sigma)  # Sampling the candidate from the proposal dstn with 
                             # mean: previous value and variance: sigma
    u=runif(1)  # The auxiliary variable
    alpha=min(dt(y,v)/dt(x[i-1],v),1)  # Calculating the probability of move
                             # The ratio r is p(candidate|y)/p(previous value|y)
                             # Where p() is in this case the t dstn with df=v
    if (u<=alpha){x[i]=y}  # Accepting the candidate
    else {x[i]=x[i-1]
    k=k+1  # Counting the number of rejections
    }
  }
  return(list(x=x,k=k))
}


Let’s run the algorithm

N=1000  # Number of samples
burn.in=N/2  # The Burn-in period
v=4  # df of t dstn
x0=1  # Starting point
sigma=c(0.05,0.5,2,16)  # Different variances for proposal dstns

mh1=mh(v,sigma[1],x0,N)
mh2=mh(v,sigma[2],x0,N)
mh3=mh(v,sigma[3],x0,N)
mh4=mh(v,sigma[4],x0,N)


The number of candidates rejected by each variance

c(mh1$k,mh2$k,mh3$k,mh4$k)/N
[1] 0.023 0.146 0.462 0.892

We can see that, when the variance(low) => Few rejected, variance(high) => Many rejected.


Visualizations

dev.new()
par(mfrow=c(2,2))
plot(1:N,mh1$x,type="l")  # The trace plots
plot(1:N,mh2$x,type="l")
plot(1:N,mh3$x,type="l")
plot(1:N,mh4$x,type="l")

The frequent horizontal lines in the trace plot means that there were many rejections. Horizontal line means that previous values were used many times because candidates were rejected.


a=seq(-5,5,by=0.01)  # To plot the t dstn

dev.new()
par(mfrow=c(2,2))

hist(mh1$x[(burn.in+1):N],freq=F,ylim=c(0,0.7))  # Histogram of the samples
                                                 # without the Burn-in period
lines(a,dt(a,df=4))  # t dstn with df=4: The actual dstn

hist(mh2$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))

hist(mh3$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))

hist(mh4$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))

We can see that when the variance is too low or high, the dstn of the samples were not approximate to the actual dstn. Whereas when the variance is modest, the samples were pretty approximate to the actual dstn.








Metropolis-Hastings algorithm


Example1


The M-H algorithm

N=10000  # Number of samples
burn.in=N/2  # The Burn-in period
sigma=4  # of the Rayleigh dstn
x=numeric(N)  # Initializing the spaces for the chain
x[1]=rchisq(1,df=1)  # Starting point
k=0  # Number of rejected candidates

for (i in 2:N){  # Starts from 2 because 1 is the starting point
  x.star=rchisq(1,df=x[i-1])  # Candidate taken from chisq dstn with df=previous value
  u=runif(1)  # Auxiliary variable
  
  # For calculating the ratio r
  num=(x.star/(sigma^2)*exp(-x.star^2/(2*sigma^2)))/dchisq(x.star,df=x[i-1])
  den=(x[i-1]/(sigma^2)*exp(-x[i-1]^2/(2*sigma^2)))/dchisq(x[i-1],df=x.star)
  
  alpha=min(num/den,1)  # The probability of move
  if (u<=alpha){x[i]=x.star}  # Accepting the candidate
  else {x[i]=x[i-1]
  k=k+1 # Counting the number of rejected candidates
  }
}


Visualizations

dev.new()
plot(1:N,x,type="l")  # The trace plot


# install.packages("VGAM")
library(VGAM) # For drayleigh()
Loading required package: stats4
Loading required package: splines

Attaching package: ‘VGAM’

The following object is masked from ‘package:coda’:

    nvar
xgrid=seq(0,20,by=0.1)
ytrue=drayleigh(xgrid,scale=sigma)  # The density of the actual dstn

dev.new()
hist(x[(burn.in+1):N],freq=F, main="Comparison to the actual dstn")  # freq=F: to display density on y axis not the frequency
lines(xgrid,ytrue,type="l")  # The actual dstn

We can see that the samples(the histogram) is pretty approximate to the actual dstn(the line).




Example2


N=100000  # Number of samples
burn.in=N/2  # Burn-in period

## initial value
sig2=100  # variance of the proposal dstn
theta=rep(0,N)  # Initializing the space for theta

p.th=function(th){   # The actual dstn
  0.3*exp(-0.2*th^2)+0.7*exp(-0.2*(th-10)^2)
}

## iterations
for (i in 1:(N-1)){  # No value for starting point was given so I guess it's 0
  
  theta.star=rnorm(1,theta[i],sqrt(sig2))  # candidate
  U=runif(1,0,1)  # auxiliary variable
  
  ratio=p.th(theta.star)/p.th(theta[i])  # the ratio r
  # Note that since the proposal dstn is symmetric, the ratio r is just like Metropolis'
  
  alpha=min(ratio,1)  # The probability of move
  
  if (U<=alpha){theta[i+1]=theta.star}  # Accepting the candidate
  else theta[i+1]=theta[i]
  
}


Visualizations

dev.new()
par(mfrow=c(1,1))
plot(burn.in+1:N,theta,type="l")  # the trace plot


th.grid=seq(-10,20,by=1)
th.true=rep(0,length(th.grid))
for (i in 1:length(th.grid)){
  th.true[i]=p.th(th.grid[i])
}
th.true=th.true/sum(th.true)  # Normalizing I guess? This makes sum(th.true) = 1

dev.new()
par(mfrow=c(1,1))
hist(theta,freq=FALSE, main="Comparison to the actual dstn")
lines(th.grid,th.true,type="l")


Estimate for the mean of the target dstn I guess?

mean(theta[(burn.in+1):N])
[1] 6.886633




---
title: "Bayesian Statistics - pt.1"
output:
  html_notebook:
    toc: yes
---

\
\
\
\

# Posterior Interval by 'Numerical Method'

\

```{r}

### Placenta Previa example
y=seq(0,1,by=0.001)

plot(y,dbeta(y,438,544),type="l",xlim=c(0.3,0.6))  #Posterior distribution

post.y=rbeta(1000,438,544)  #Taking 1000 samples from the Post. dstn

dev.new()
hist(post.y,freq=F)  #Plotting the sample

mean(post.y)  #The posterior mean
quantile(post.y,probs=c(0.025,0.975))  # 95% Posterior Interval using Numerical Method

```

\
\
\
\
\
\

# Normal Model With Discrete Prior Distribution (Estimating the mean)

\

```{r}
options(scipen=999)  #To disable printing your results in scientific notation.

mu=c(20,30,40,50,60,70)  #Parameter of interest
prior=c(0.1,0.15,0.25,0.25,0.15,0.1)  #Discrete Prior Distribution
y=c(38.6,42.4,57.5,40.5,51.7,67.1,33.4,60.9,64.1,40.1,40.7,6.4)  #Observations (n=10)

ybar=mean(y)
sigma=10  #Known population variance
n=length(y)
like=exp((-n/(2*sigma^2))*(mu-ybar)^2)  #Likelihood function (Refer to the notes)

unn.post=prior*like  #Unnormalized Discrete Posterior dstn 
unn.post

nor.post=unn.post/sum(unn.post)  #Normalized Discrete Posterior dstn (We have to Normalize it to make the Probability Distribution integrate to 1)
nor.post  #We can see that the Discrete Post. dstn is [20: 0, 30: 0, 40: 0.42, 50: 0.58, 60: 0, 70: 0]

dev.new()
plot(mu,nor.post)  #We can see that the most likely values are 40 and 50

s.post=sample(mu,1000,prob=nor.post,replace=T)  #Take a sample from the Post. dstn
table(s.post)  #The number of observations from values of 40 and 50
mean(s.post)
sd(s.post)

```

\
\
\
\
\
\

# Frequentist vs Bayesian Estimators (by MSE)

\

```{r}
#Sample Size is SMALL
n=10
theta=seq(0,1,by=0.01)  #Generate values of theta from (0,1) to see the MSE at each value of theta
#Refer to the notes for the equation of the MSEs
mse.f=theta*(1-theta)/n  #MSE of the Frequentist estimator at each value of theta
mse.b=((4-n)*theta^2-(4-n)*theta+1)/(n+2)^2  #MSE of the Bayesian estimator(=Posterior Mean) at each value of theta

dev.new()
plot(theta,mse.f,type="l")
lines(theta,mse.b,col="red")
legend(x="topright", legend=c("Freq.","Bayes."), lty=c(1,1), col=c("black","red"))  
```

#### We can see that the Bayes' MSEs are smaller in general when the sample size is small


```{r}
#Sample Size is LARGE
n=1000
theta=seq(0,1,by=0.01)  #Generate values of theta from (0,1) to see the MSE at each value of theta
#Refer to the notes for the equation of the MSEs
mse.f=theta*(1-theta)/n  #MSE of the Frequentist estimator at each value of theta
mse.b=((4-n)*theta^2-(4-n)*theta+1)/(n+2)^2  #MSE of the Bayesian estimator(=Posterior Mean) at each value of theta

dev.new()
plot(theta,mse.f,type="l")
lines(theta,mse.b,col="red")
legend(x="topright", legend=c("Freq.","Bayes."), lty=c(1,1), col=c("black","red"))  
```

#### We can see that the MSEs of the two estimators are almost identical when the sample size is large

\
\
\
\
\
\

# Highest Posterior Density (HPD) Credible Set

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0086.jpeg)

\

### 95% Credible Interval

```{r}
n = 10
y = 2

ci = qbeta(c(0.025,0.975), y+1, n-y+1)
ci
```

\

### 95% HPD interval

```{r}
# install.packages("TeachingDemos")  # hpd function is in there
library(TeachingDemos)

hpd=hpd(qbeta,shape1=y+1,shape2=n-y+1)
hpd
```

\

### Visualization

```{r}
theta = seq(0, 1, by=0.01)

plot(theta,dbeta(theta,y+1,n-y+1),type="l",xlab=expression(theta),ylab="Posterior Density Function", cex.lab=1)

segments(ci[1],0,ci[1],dbeta(ci[1],y+1,n-y+1),col="red",lwd=2)
segments(ci[2],0,ci[2],dbeta(ci[2],y+1,n-y+1),col="red",lwd=2)
segments(ci[1],dbeta(ci[1],y+1,n-y+1),ci[2],dbeta(ci[2],y+1,n-y+1),col="red",lwd=2)

# HPD
segments(hpd[1],0,hpd[1],dbeta(hpd[1],y+1,n-y+1),col="blue",lwd=2,lty=2)
segments(hpd[2],0,hpd[2],dbeta(hpd[2],y+1,n-y+1),col="blue",lwd=2,lty=2)
segments(hpd[1],dbeta(hpd[1],y+1,n-y+1),hpd[2],dbeta(hpd[2],y+1,n-y+1),col="blue",lwd=2,lty=2)
legend(x=0.6,y=3,legend=c("95% CI=(0.06,0.52)", "95% HPD CI=(0.04,0.48)"),col=c("red","blue"),lty=c(1,2),cex=0.8)
```

\
\
\
\
\
\

# Calculating Posterior Predictive P-values (Bayesian P-Values)

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0083.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0084.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0085.jpeg)

\

```{r}
nsim=100

y=c(1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0)
T.y=sum(diff(y)!=0)
theta=rep(NA,nsim)
T.y.rep=rep(NA,nsim)

for (i in 1:nsim){
  theta[i]<-rbeta(1,8,14)
  y.rep<-rbinom(20,1,theta[i])
  T.y.rep[i]<-sum(diff(y.rep)!=0)
}

sum(T.y.rep<=T.y)/nsim  # Bayesian P-value

```

#### This is the Bayesian p-value (after 100 simulations)

\

```{r}
hist(T.y.rep)  # Posterior predictive distribution of the replicated test statistic
abline(v=3,col="red")  # The value left to the red line is the p-value
```

\
\
\
\
\
\

# Posterior dstn for Multinomial model for Categorical data

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/ss.png)

\

#### Let's calculate the post. dstn for theta1 - theta2. (Let's call this diff)

\

#### We're gonna take samples from the post. dstn of (theta1,theta2,theta3) and calculate it.

\
```{r}
library(MCMCpack)

n=1447

y1=727
y2=583
y3=137

alpha=c(y1+1,y2+1,y3+1)  # the parameters of post. dstn of (theta1,theta2,theta3)

nsim=100  # number of simulations    

theta=rdirichlet(nsim,alpha)  # Taking samples from the post. dstn of (theta1,theta2,theta3)
diff=theta[,1]-theta[,2]
diff  # theta1 - theta2
```

\

### Histogram plot of dstn for diff

\

```{r}
hist(diff)
```

\

### Point Estimate

\

```{r}
mean(diff)
```

\

### Interval Estimate (alpha = 0.05)

\

```{r}
quantile(diff,probs=c(0.025,0.975))  # Equal tailed interval for alpha = 0.05
```

\

### The count of theta1 < theta2

\

```{r}
sum(diff<0)
```

\

### We can see that theta1 is larger than theta2.

\
\
\
\
\
\

# Monte Carlo Integration

\

## Case I

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0153.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0152.jpeg)

\

### True Value

\

```{r}
2/3
```

\

### Direct Computation

```{r}
N<-100000
theta<-runif(N,-1,1)  # Drawing N number of values
h.theta<-2*theta^2   # The h() function

mu.h.theta<-mean(h.theta)   # MC estimate
mu.h.theta
```

#### Pretty close to the true value.

\

### Iteration Method

```{r}
N=10000
theta=rep(0,N)  # Initializing the places to save the values I guess
h.theta=rep(0,N)
mu.h.theta=rep(0,N)

for (i in 1:N){
  theta[i]=runif(1,-1,1)  # Drawing 1 value at a time
  h.theta[i]=2*theta[i]^2
  mu.h.theta[i]=mean(h.theta[1:i])
}

mu.h.theta[N]  # MC estimate
```

```{r}
dev.new()
plot(1:N,mu.h.theta,type="l")
```

#### The h(theta) eventually converges to the true value.

\
\
\

## Case II

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0154.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0155.jpeg)

\

### True Value

```{r}
1-exp(-1)
```

\

### Direct Computation

```{r}
N=10000
x=runif(N)  # Drawing N number of values

# g(x) = exp(-x)
theta.hat=mean(exp(-x))  # MC estimate
theta.hat
```

#### Pretty close to the true value.

\

### Iteration method

```{r}
N=10000
x=rep(0,N)  # Initializing the places to save the values I guess
h.x=rep(0,N)
mu.h.x=rep(0,N)

for (i in 1:N){
  x[i]=runif(1)
  h.x[i]=exp(-x[i])
  mu.h.x[i]=mean(h.x[1:i])
}

mu.h.x[N]  # MC estimate
```

```{r}
dev.new()
plot(1:N,mu.h.x,type="l")
```

#### It eventually converges to the true value.

\
\
\
\
\
\
\

# Rejection Sampling

\

## Example1

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0157.jpeg)

\

### Target Density, p

```{r}
p <- function(x) {
  if (x >= 0 && x < 0.25)
     8 * x
  else if (x >= 0.25 && x <= 1)
     8/3 - 8 * x/3
   else 0
  }
```

\

### Proposal Density, g and Constant M

```{r}
g <- function(x) {
   if (x >= 0 && x <= 1)
     1
   else 0
  }

M<-3  # Try out different values for M (M is better to be smaller)
```

\

### Accept/Reject algorithm

```{r}
m <- 1000  # Total number of accepted values needed
n.draws <- 0  # For counting the number of accepted values
draws <- c()  # Place to store accepted values

while (n.draws < m) {
   x.c <- runif(1, 0, 1)  # Drawing a candidate from the proposal density
   accept.prob <- p(x.c)/(M * g(x.c))  # The prob. of accepting for each candidate
   u <- runif(1, 0, 1)  # The auxiliary variable used for accept/reject algo.
   
   if (accept.prob >= u) {
     draws <- c(draws, x.c)  # Storing the accepted candidate
     n.draws <- n.draws + 1  # Counting the accepted candidate
     }
   }
```

\

### The accepted values

```{r}
head(draws)
```

\

### Visualization

```{r}
# Initializing variables for the plots
x <- seq(0, 1, by = 0.01)
y=rep(0,length(x))
for (i in 1:length(x)){
  y[i]=p(x[i])
}

dev.new()
par(mfrow=c(1,3))
plot(x,y,type="l",ylim=c(0,4),col="black", main="Plot of p() and Mg()")
abline(h=M,col="red")
legend("topright", legend=c("p()", "Mg()"), col=c("black", "red"), lty=1, cex=0.8)

plot(x,y,type="l",ylim=c(0,4),col="black", main="Plot of p() and the density\n of the accepted values")
lines(density(draws),col="red")
legend("topright", legend=c("p()", "acc"), col=c("black", "red"), lty=1, cex=0.8)

hist(draws,freq=F,ylim=c(0,4), xlab="x", ylab="y", main="Histogram of accepted values")
lines(x,y,type="l",col="black")
```

\
\
\

## Example2

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0158.jpeg)

\

### Target Density, p

```{r}
x.grid=seq(-5,5,by=0.01)
p=dnorm(x.grid,0,1)
```

\

### Proposal Density, g and Constant M.

```{r}
g=dnorm(x.grid,0,2)
M=2
```

\

### Accept/Reject algorithm

```{r}
N=10000  # Total number of iterations
         # This time we will be checking the general acceptance rate
x=rep(0,N)  # Places to store the accepted values
            # The unaccepted ones will be 0 so we can count how many nonzeros there
            # out of N to calculate the general acceptance rate

for (i in 1:(N)){
  x_star=rnorm(1,0,2)  # Drawing a candidate from the proposal density
  accept.prob=dnorm(x_star,0,1)/(M*dnorm(x_star,0,2))  # Prob. of accepting
  U=runif(1,0,1)  # The auxiliary variable
  if (accept.prob>=U){x[i]=x_star}
}
```

\

### The general acceptance rate

#### i.e. how many values were accepted out of N candidates

```{r}
accept.rate=sum(x!=0)/N
accept.rate
```

#### From this we can deduce that the number of accepted values is about half of N

\

### Visualization

```{r}
dev.new()
par(mfrow=c(1,3))
plot(x.grid,p,type="l",ylim=c(0,0.6), xlab="x", ylab="y", main="Plot of p() and Mg()")
lines(x.grid,M*g,type="l",col="red")
legend("topright", legend=c("p()", "Mg()"), col=c("black", "red"), lty=1, cex=0.8)


nonzero = x[x!=0]  # We don't need 0s to plot the density

plot(x.grid,p,type="l", main="Plot of p() and the density\n of the accepted values")
lines(density(nonzero),col="red")
legend("topright", legend=c("p()", "acc"), col=c("black", "red"), lty=1, cex=0.8)

hist(nonzero,freq=F,ylim=c(0,0.5), ylab="y", main="Histogram of accepted values")
lines(x.grid, dnorm(x.grid,0,1))
```

\
\
\
\
\
\
\

# Importance Sampling

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0160.jpeg)

\

### True Value

```{r}
1-pnorm(3)
```

\

### Importance Sampling Estimate

```{r}
N=100000  # Number of samples to extract from the proposal dstn
x=rnorm(N,4,1)  # Samples from the proposal dstn
f.x=dnorm(x,0,1)  # To calculate the weight ftn
g.x=dnorm(x,4,1)  # To calculate the weight ftn
h.x=as.numeric(x>=3)  # Target
w.x=f.x/g.x  # Weight ftn

h.x_IS=mean(h.x*w.x)  # Importance Sampling estimate
h.x_IS
```

#### We can see that it is pretty close

\

### Sampling Error of the IS estimate

```{r}
sd(h.x*w.x)
```

#### We can see that the sampling error is very small which is an indicator of a good estimate.

\
\

## Comparison to MC (CaseII)

\

### Direct Computation

```{r}
N=10000  # Number of samples to extract
x=runif(N,0,3)  # Samples extracted from Unif(0,3)
g.x=1/sqrt(2*pi)*exp(-0.5*x^2)  # The g() ftn
1/2-3*mean(g.x)  # MC estimate
```

\

### Sampling Error of the MC estimate

```{r}
sd(g.x)
```

#### We can see that the sampling error is a lot bigger than IS estimate's.

\

### Iteration Method

```{r}
N=10000  # Number of samples to extract
x=rep(0,N)  # Storage for samples
g.x=rep(0,N)  # Storage for values of g()
mu.g.x=rep(0,N)  # Storage for values of mean of g()

for (i in 1:N){
  x[i]=runif(1,0,3)  # Extracting a sample
  g.x[i]=1/sqrt(2*pi)*exp(-1/2*x[i]^2)
  mu.g.x[i]=mean(g.x[1:i])
}

1/2-3*mu.g.x[N]  # MC estimate
```

\

### Visualization

```{r}
dev.new()
plot(1:N,0.5-3*mu.g.x,type="l", ylim=c(0,0.15))
```

\

### Sampling Error of the MC estimate (Iteration method)

```{r}
sd(g.x)
```

\

## Comparison tp MC (Case I)

```{r}
N=100000  # Number of samples to extract
x=rnorm(N)  # Samples taken from N(0,1)
h.x=as.numeric(x>=3)  # h() = P(X>=3) ; X ~ N(0,1)
h.x_MC=mean(h.x)  # MC estimate
h.x_MC
```

\

### Sample Error of MC (Case I)

```{r}
sd(h.x)
```

#### Case I has a better sample error than case II.

\
\
\
\
\
\
\

# Markov Chain

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0187.jpeg)

\

### Prediction

```{r}
nsim=100
P=matrix(c(0.9,0.5,0.1,0.5),2,2)  # Transition matrix
x=matrix(c(0,1),nsim,2,byrow=T)  # Initial states

for(i in 1:(nsim-1)){
  x[i+1,]=x[i,]%*%P  # Matrix multiplication
}

dev.new()
plot(1:nsim,x[,1],type="l",ylim=c(0,1),col="red")
lines(1:nsim,x[,2],type="l",col="blue")
legend("topright", legend=c("Sunny", "Rainy"), col=c("red", "blue"), lty=1, cex=0.8)
```

#### We can see that it converges.

\
\
\
\
\
\
\

# Gibbs Sampler (One of MCMC methods)

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0189.jpeg)

\

```{r}
N=10000  # Total number of iterations(time)
burn=N/2  # Burn-in period is half of N I guess

# data (a single obs.)
y1=0
y2=0

# correlation
rho=0.8

# initial values
theta.1=rep(1,N)  # Starting point is 1 for theta1
theta.2=rep(-10,N)  # Starting point is -10 for theta2
theta=cbind(theta.1,theta.2)  # Theta binded

# iterations
for (i in 1:(N-1)){
  # Taking samples from the conditional dstns of theta1 and theta2 respectively
  theta.1[i+1]=rnorm(1,y1+rho*(theta.2[i]-y2),sqrt(1-rho^2))
  theta.2[i+1]=rnorm(1,y2+rho*(theta.1[i+1]-y1),sqrt(1-rho^2))
  # Samples binded
  theta[i+1,]=c(theta.1[i+1],theta.2[i+1])
}

head(theta)
```

\

### Visualizations

```{r}
# check traceplots
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")
```

#### The values of theta1 and theta2 converge.

\

```{r}
dev.new()
par(mfrow=c(1,1))
plot(theta)
```

#### A simple visualization.

\

```{r}
ntr=1000 
dev.new()
par(mfrow=c(1,1))
plot(theta.1[1:ntr],theta.2[1:ntr],type="l")
text(theta.1[1:ntr],theta.2[1:ntr],labels=1:10,cex=1, font=5)
```

#### With this plot, we can see the movement of the convergence I guess.

\

### The averages of thetas: these are the estimates

```{r}
colMeans(theta)
```

\

### The correlation of thetas from the samples using conditional dstns

```{r}
cor(theta)
```

\

### The correlation of thetas without the Burn-in period

```{r}
cor(theta[burn:N,])
```

#### Excluding the Burn-in period might and might not improve, I guess.

\

### The actual dstn

```{r}
library(mvtnorm)  # To sample from an actual BVN dstn

joint.s=rmvnorm(N/2,mean=c(y1,y2),sig=matrix(c(1,rho,rho,1),2,2))  # The actual dstn

head(joint.s)
```

### Visualization

```{r}
plot(xlim=c(-10,4), ylim=c(-10,4), joint.s)
```

### The averages of thetas : these are the estimates

```{r}
colMeans(joint.s)
```

### The correlation of thetas from the samples using the actual dstn

```{r}
cor(joint.s)
```

\
\
\
\
\
\
\

# Metropolis algorithm

\

## Example1

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0191.jpg)

\

```{r}
N=10000
burn.in=N/2  # Burn-in period is half of N I guess

# data (a single obs.)
y1=0
y2=0

# correlation
rho=0.8

# initial value
sig2=1-rho^2    # variance for posterior density
sig2.prop=0.1   # variance for proposal density
theta.1=rep(1,N)  # Initial value is 1 I guess
theta.2=rep(1,N)


# iterations
for (i in 1:(N-1)){
  
  # for theta.1
  theta.1.star=rnorm(1,theta.1[i],sqrt(sig2.prop))  # Sample from the proposal dstn
  U.1=runif(1,0,1)  # U: auxiliary variable
  
  # Ratio of the two densities
  ratio.1=dnorm(theta.1.star,y1+rho*(theta.2[i]-y2),1-rho^2)/dnorm(theta.1[i],y1+rho*(theta.2[i]-y2),1-rho^2)
  
  # Probability of move
  alpha.1=min(ratio.1,1)
  
  if (U.1<=alpha.1){theta.1[i+1]=theta.1.star}  # Accepted
  else theta.1[i+1]=theta.1[i]
  
  
  ## for theta.2
  theta.2.star=rnorm(1,theta.2[i],sqrt(sig2.prop))
  U.2=runif(1,0,1)
  
  # Ratio of the two densities
  ratio.2=dnorm(theta.2.star,y2+rho*(theta.1[i+1]-y1),1-rho^2)/dnorm(theta.2[i],y2+rho*(theta.1[i+1]-y1),1-rho^2)
  
  # Probability of move
  alpha.2=min(ratio.2,1)
  
  if (U.2<=alpha.2){theta.2[i+1]=theta.2.star}  # Accepted
  else theta.2[i+1]=theta.2[i]
  
}

theta=cbind(theta.1,theta.2)  # Binding the thetas together

head(theta)
```

\

### Visualizations

```{r}
# traceplots
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")
```

\

```{r}
dev.new()
par(mfrow=c(1,2))
hist(theta[(burn.in+1):N,1])
hist(theta[(burn.in+1):N,1])
```

\

```{r}
dev.new()
par(mfrow=c(1,1))
plot(theta[(burn.in+1):N,])
```

\

### The averages of theta1 and theta2: These are the estimates

```{r}
colMeans(theta[(burn.in+1):N,])
```

\

### The correlation of theta1 and theta2

```{r}
cor(theta[(burn.in+1):N,])
```

\
\
\

## Example2

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0192.jpeg)

\

### The metropolis algorithm made into a function (because we need to run for 4 different variances)

```{r}
mh=function(v,sigma,x0,N){
  x=numeric(N)  # Making N number of 0s
  x[1]=x0  # Starting point
  k=0
  for (i in 2:N){  # Starts at 2 because 1 is the starting point
    y=rnorm(1,x[i-1],sigma)  # Sampling the candidate from the proposal dstn with 
                             # mean: previous value and variance: sigma
    u=runif(1)  # The auxiliary variable
    alpha=min(dt(y,v)/dt(x[i-1],v),1)  # Calculating the probability of move
                             # The ratio r is p(candidate|y)/p(previous value|y)
                             # Where p() is in this case the t dstn with df=v
    if (u<=alpha){x[i]=y}  # Accepting the candidate
    else {x[i]=x[i-1]
    k=k+1  # Counting the number of rejections
    }
  }
  return(list(x=x,k=k))
}
```

\

### Let's run the algorithm

```{r}
N=1000  # Number of samples
burn.in=N/2  # The Burn-in period
v=4  # df of t dstn
x0=1  # Starting point
sigma=c(0.05,0.5,2,16)  # Different variances for proposal dstns

mh1=mh(v,sigma[1],x0,N)
mh2=mh(v,sigma[2],x0,N)
mh3=mh(v,sigma[3],x0,N)
mh4=mh(v,sigma[4],x0,N)
```

\

### The number of candidates rejected by each variance

```{r}
c(mh1$k,mh2$k,mh3$k,mh4$k)/N
```

#### We can see that, when the variance(low) => Few rejected, variance(high) => Many rejected.

\

### Visualizations

```{r}
dev.new()
par(mfrow=c(2,2))
plot(1:N,mh1$x,type="l")  # The trace plots
plot(1:N,mh2$x,type="l")
plot(1:N,mh3$x,type="l")
plot(1:N,mh4$x,type="l")
```

#### The frequent horizontal lines in the trace plot means that there were many rejections. Horizontal line means that previous values were used many times because candidates were rejected.

\

```{r}
a=seq(-5,5,by=0.01)  # To plot the t dstn

dev.new()
par(mfrow=c(2,2))

hist(mh1$x[(burn.in+1):N],freq=F,ylim=c(0,0.7))  # Histogram of the samples
                                                 # without the Burn-in period
lines(a,dt(a,df=4))  # t dstn with df=4: The actual dstn

hist(mh2$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))

hist(mh3$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))

hist(mh4$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))
```

#### We can see that when the variance is too low or high, the dstn of the samples were not approximate to the actual dstn. Whereas when the variance is modest, the samples were pretty approximate to the actual dstn.

\
\
\
\
\
\
\

# Metropolis-Hastings algorithm

\

## Example1

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0193.jpeg)

\

### The M-H algorithm

```{r}
N=10000  # Number of samples
burn.in=N/2  # The Burn-in period
sigma=4  # of the Rayleigh dstn
x=numeric(N)  # Initializing the spaces for the chain
x[1]=rchisq(1,df=1)  # Starting point
k=0  # Number of rejected candidates

for (i in 2:N){  # Starts from 2 because 1 is the starting point
  x.star=rchisq(1,df=x[i-1])  # Candidate taken from chisq dstn with df=previous value
  u=runif(1)  # Auxiliary variable
  
  # For calculating the ratio r
  num=(x.star/(sigma^2)*exp(-x.star^2/(2*sigma^2)))/dchisq(x.star,df=x[i-1])
  den=(x[i-1]/(sigma^2)*exp(-x[i-1]^2/(2*sigma^2)))/dchisq(x[i-1],df=x.star)
  
  alpha=min(num/den,1)  # The probability of move
  if (u<=alpha){x[i]=x.star}  # Accepting the candidate
  else {x[i]=x[i-1]
  k=k+1 # Counting the number of rejected candidates
  }
}
```

\

### Visualizations

```{r}
dev.new()
plot(1:N,x,type="l")  # The trace plot
```

\

```{r}
# install.packages("VGAM")
library(VGAM) # For drayleigh()

xgrid=seq(0,20,by=0.1)
ytrue=drayleigh(xgrid,scale=sigma)  # The density of the actual dstn

dev.new()
hist(x[(burn.in+1):N],freq=F, main="Comparison to the actual dstn")  # freq=F: to display density on y axis not the frequency
lines(xgrid,ytrue,type="l")  # The actual dstn
```

#### We can see that the samples(the histogram) is pretty approximate to the actual dstn(the line).

\
\
\

## Example2

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0194.jpeg)

\

```{r}
N=100000  # Number of samples
burn.in=N/2  # Burn-in period

## initial value
sig2=100  # variance of the proposal dstn
theta=rep(0,N)  # Initializing the space for theta

p.th=function(th){   # The actual dstn
  0.3*exp(-0.2*th^2)+0.7*exp(-0.2*(th-10)^2)
}

## iterations
for (i in 1:(N-1)){  # No value for starting point was given so I guess it's 0
  
  theta.star=rnorm(1,theta[i],sqrt(sig2))  # candidate
  U=runif(1,0,1)  # auxiliary variable
  
  ratio=p.th(theta.star)/p.th(theta[i])  # the ratio r
  # Note that since the proposal dstn is symmetric, the ratio r is just like Metropolis'
  
  alpha=min(ratio,1)  # The probability of move
  
  if (U<=alpha){theta[i+1]=theta.star}  # Accepting the candidate
  else theta[i+1]=theta[i]
  
}
```

\

### Visualizations

```{r}
dev.new()
par(mfrow=c(1,1))
plot(burn.in+1:N,theta,type="l")  # the trace plot
```

\

```{r}
th.grid=seq(-10,20,by=1)
th.true=rep(0,length(th.grid))
for (i in 1:length(th.grid)){
  th.true[i]=p.th(th.grid[i])
}
th.true=th.true/sum(th.true)  # Normalizing I guess? This makes sum(th.true) = 1

dev.new()
par(mfrow=c(1,1))
hist(theta,freq=FALSE, main="Comparison to the actual dstn")
lines(th.grid,th.true,type="l")
```

\

### Estimate for the mean of the target dstn I guess?

```{r}
mean(theta[(burn.in+1):N])
```

\
\
\
