Posterior Interval by ‘Numerical Method’
### Placenta Previa example
y=seq(0,1,by=0.001)
plot(y,dbeta(y,438,544),type="l",xlim=c(0.3,0.6)) #Posterior distribution
post.y=rbeta(1000,438,544) #Taking 1000 samples from the Post. dstn
dev.new()
hist(post.y,freq=F) #Plotting the sample
mean(post.y) #The posterior mean
[1] 0.4461668
quantile(post.y,probs=c(0.025,0.975)) # 95% Posterior Interval using Numerical Method
2.5% 97.5%
0.4183153 0.4766519
Normal Model With Discrete Prior Distribution (Estimating the mean)
options(scipen=999) #To disable printing your results in scientific notation.
mu=c(20,30,40,50,60,70) #Parameter of interest
prior=c(0.1,0.15,0.25,0.25,0.15,0.1) #Discrete Prior Distribution
y=c(38.6,42.4,57.5,40.5,51.7,67.1,33.4,60.9,64.1,40.1,40.7,6.4) #Observations (n=10)
ybar=mean(y)
sigma=10 #Known population variance
n=length(y)
like=exp((-n/(2*sigma^2))*(mu-ybar)^2) #Likelihood function (Refer to the notes)
unn.post=prior*like #Unnormalized Discrete Posterior dstn
unn.post
[1] 0.00000000000000000220148 0.00000012289485905573538 0.04683563061571816704687
[4] 0.06580160638607958356605 0.00000034081137965149324 0.00000000000000001205079
nor.post=unn.post/sum(unn.post) #Normalized Discrete Posterior dstn (We have to Normalize it to make the Probability Distribution integrate to 1)
nor.post #We can see that the Discrete Post. dstn is [20: 0, 30: 0, 40: 0.42, 50: 0.58, 60: 0, 70: 0]
[1] 0.00000000000000001954479 0.00000109106327884201111 0.41580776526252849478738
[4] 0.58418811794322056396567 0.00000302573097203836150 0.00000000000000010698715
dev.new()
plot(mu,nor.post) #We can see that the most likely values are 40 and 50
s.post=sample(mu,1000,prob=nor.post,replace=T) #Take a sample from the Post. dstn
table(s.post) #The number of observations from values of 40 and 50
s.post
40 50
385 615
mean(s.post)
[1] 46.15
sd(s.post)
[1] 4.868388
Frequentist vs Bayesian Estimators (by MSE)
#Sample Size is SMALL
n=10
theta=seq(0,1,by=0.01) #Generate values of theta from (0,1) to see the MSE at each value of theta
#Refer to the notes for the equation of the MSEs
mse.f=theta*(1-theta)/n #MSE of the Frequentist estimator at each value of theta
mse.b=((4-n)*theta^2-(4-n)*theta+1)/(n+2)^2 #MSE of the Bayesian estimator(=Posterior Mean) at each value of theta
dev.new()
plot(theta,mse.f,type="l")
lines(theta,mse.b,col="red")
legend(x="topright", legend=c("Freq.","Bayes."), lty=c(1,1), col=c("black","red"))
We can see that the Bayes’ MSEs are smaller in general when the sample size is small
#Sample Size is LARGE
n=1000
theta=seq(0,1,by=0.01) #Generate values of theta from (0,1) to see the MSE at each value of theta
#Refer to the notes for the equation of the MSEs
mse.f=theta*(1-theta)/n #MSE of the Frequentist estimator at each value of theta
mse.b=((4-n)*theta^2-(4-n)*theta+1)/(n+2)^2 #MSE of the Bayesian estimator(=Posterior Mean) at each value of theta
dev.new()
plot(theta,mse.f,type="l")
lines(theta,mse.b,col="red")
legend(x="topright", legend=c("Freq.","Bayes."), lty=c(1,1), col=c("black","red"))
We can see that the MSEs of the two estimators are almost identical when the sample size is large
Highest Posterior Density (HPD) Credible Set
95% Credible Interval
n = 10
y = 2
ci = qbeta(c(0.025,0.975), y+1, n-y+1)
ci
[1] 0.06021773 0.51775585
Error in dev.off() :
QuartzBitmap_Output - unable to open file '/Users/jaeyonglee/.local/share/rstudio/notebooks/E15CF856-Bayesian/1/13BF2920BFE883ED/ce7ecaqvv8bt5_t/_rs_chunk_plot_001.png'
95% HPD interval
# install.packages("TeachingDemos") # hpd function is in there
library(TeachingDemos)
hpd=hpd(qbeta,shape1=y+1,shape2=n-y+1)
hpd
[1] 0.04055517 0.48372366
Visualization
theta = seq(0, 1, by=0.01)
plot(theta,dbeta(theta,y+1,n-y+1),type="l",xlab=expression(theta),ylab="Posterior Density Function", cex.lab=1)
segments(ci[1],0,ci[1],dbeta(ci[1],y+1,n-y+1),col="red",lwd=2)
segments(ci[2],0,ci[2],dbeta(ci[2],y+1,n-y+1),col="red",lwd=2)
segments(ci[1],dbeta(ci[1],y+1,n-y+1),ci[2],dbeta(ci[2],y+1,n-y+1),col="red",lwd=2)
# HPD
segments(hpd[1],0,hpd[1],dbeta(hpd[1],y+1,n-y+1),col="blue",lwd=2,lty=2)
segments(hpd[2],0,hpd[2],dbeta(hpd[2],y+1,n-y+1),col="blue",lwd=2,lty=2)
segments(hpd[1],dbeta(hpd[1],y+1,n-y+1),hpd[2],dbeta(hpd[2],y+1,n-y+1),col="blue",lwd=2,lty=2)
legend(x=0.6,y=3,legend=c("95% CI=(0.06,0.52)", "95% HPD CI=(0.04,0.48)"),col=c("red","blue"),lty=c(1,2),cex=0.8)
Calculating Posterior Predictive P-values (Bayesian P-Values)
nsim=100
y=c(1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0)
T.y=sum(diff(y)!=0)
theta=rep(NA,nsim)
T.y.rep=rep(NA,nsim)
for (i in 1:nsim){
theta[i]<-rbeta(1,8,14)
y.rep<-rbinom(20,1,theta[i])
T.y.rep[i]<-sum(diff(y.rep)!=0)
}
sum(T.y.rep<=T.y)/nsim # Bayesian P-value
[1] 0
This is the Bayesian p-value (after 100 simulations)
hist(T.y.rep) # Posterior predictive distribution of the replicated test statistic
abline(v=3,col="red") # The value left to the red line is the p-value
Posterior dstn for Multinomial model for Categorical data
Let’s calculate the post. dstn for theta1 - theta2. (Let’s call this diff)
We’re gonna take samples from the post. dstn of (theta1,theta2,theta3) and calculate it.
library(MCMCpack)
Loading required package: coda
Loading required package: MASS
##
## Markov Chain Monte Carlo Package (MCMCpack)
## Copyright (C) 2003-2021 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
##
## Support provided by the U.S. National Science Foundation
## (Grants SES-0350646 and SES-0350613)
##
n=1447
y1=727
y2=583
y3=137
alpha=c(y1+1,y2+1,y3+1) # the parameters of post. dstn of (theta1,theta2,theta3)
nsim=100 # number of simulations
theta=rdirichlet(nsim,alpha) # Taking samples from the post. dstn of (theta1,theta2,theta3)
diff=theta[,1]-theta[,2]
diff # theta1 - theta2
[1] 0.10562810 0.08850064 0.11133521 0.14880062 0.09170576 0.09004244 0.04849895 0.07814048
[9] 0.11238263 0.09523778 0.07019925 0.07856047 0.08240375 0.09766831 0.10575962 0.10521868
[17] 0.12542080 0.10559810 0.13319903 0.10775616 0.13554330 0.08357416 0.08696108 0.10125090
[25] 0.13818495 0.12471304 0.09711644 0.12440268 0.07473838 0.13054324 0.11836621 0.13410864
[33] 0.07671566 0.11473167 0.11137363 0.09173639 0.06048886 0.13933621 0.11845024 0.10651097
[41] 0.12411735 0.07698650 0.13703001 0.08163398 0.10304243 0.07565535 0.12622887 0.13891881
[49] 0.08686959 0.11644592 0.10733023 0.14259136 0.10831884 0.10159467 0.11397080 0.11186925
[57] 0.09299134 0.08475031 0.09747366 0.09730852 0.12853930 0.07998293 0.07742454 0.07187104
[65] 0.09298987 0.07714006 0.09282218 0.07540104 0.09250740 0.11414998 0.11282102 0.07785968
[73] 0.12050435 0.09884799 0.11860044 0.10809613 0.03751695 0.10312953 0.10039184 0.07973068
[81] 0.13225262 0.08227616 0.06086792 0.09202436 0.10271909 0.08772210 0.09410449 0.08521208
[89] 0.09947449 0.11700026 0.11808139 0.15889028 0.09612751 0.11579798 0.11020222 0.08929781
[97] 0.09390641 0.13181758 0.08235398 0.12179827
Histogram plot of dstn for diff
hist(diff)
Point Estimate
mean(diff)
[1] 0.1020829
Interval Estimate (alpha = 0.05)
quantile(diff,probs=c(0.025,0.975)) # Equal tailed interval for alpha = 0.05
2.5% 97.5%
0.06066891 0.14104516
The count of theta1 < theta2
sum(diff<0)
[1] 0
We can see that theta1 is larger than theta2.
Monte Carlo Integration
Case I
True Value
2/3
[1] 0.6666667
Direct Computation
N<-100000
theta<-runif(N,-1,1) # Drawing N number of values
h.theta<-2*theta^2 # The h() function
mu.h.theta<-mean(h.theta) # MC estimate
mu.h.theta
[1] 0.6688993
Pretty close to the true value.
Iteration Method
N=10000
theta=rep(0,N) # Initializing the places to save the values I guess
h.theta=rep(0,N)
mu.h.theta=rep(0,N)
for (i in 1:N){
theta[i]=runif(1,-1,1) # Drawing 1 value at a time
h.theta[i]=2*theta[i]^2
mu.h.theta[i]=mean(h.theta[1:i])
}
mu.h.theta[N] # MC estimate
[1] 0.6572248
dev.new()
plot(1:N,mu.h.theta,type="l")
The h(theta) eventually converges to the true value.
Case II
True Value
1-exp(-1)
[1] 0.6321206
Direct Computation
N=10000
x=runif(N) # Drawing N number of values
# g(x) = exp(-x)
theta.hat=mean(exp(-x)) # MC estimate
theta.hat
[1] 0.6311722
Pretty close to the true value.
Iteration method
N=10000
x=rep(0,N) # Initializing the places to save the values I guess
h.x=rep(0,N)
mu.h.x=rep(0,N)
for (i in 1:N){
x[i]=runif(1)
h.x[i]=exp(-x[i])
mu.h.x[i]=mean(h.x[1:i])
}
mu.h.x[N] # MC estimate
[1] 0.6354287
dev.new()
plot(1:N,mu.h.x,type="l")
It eventually converges to the true value.
Rejection Sampling
Example1
Target Density, p
p <- function(x) {
if (x >= 0 && x < 0.25)
8 * x
else if (x >= 0.25 && x <= 1)
8/3 - 8 * x/3
else 0
}
Proposal Density, g and Constant M
g <- function(x) {
if (x >= 0 && x <= 1)
1
else 0
}
M<-3 # Try out different values for M (M is better to be smaller)
Accept/Reject algorithm
m <- 1000 # Total number of accepted values needed
n.draws <- 0 # For counting the number of accepted values
draws <- c() # Place to store accepted values
while (n.draws < m) {
x.c <- runif(1, 0, 1) # Drawing a candidate from the proposal density
accept.prob <- p(x.c)/(M * g(x.c)) # The prob. of accepting for each candidate
u <- runif(1, 0, 1) # The auxiliary variable used for accept/reject algo.
if (accept.prob >= u) {
draws <- c(draws, x.c) # Storing the accepted candidate
n.draws <- n.draws + 1 # Counting the accepted candidate
}
}
The accepted values
head(draws)
[1] 0.5807693 0.2013411 0.6341995 0.1950827 0.2677502 0.3024960
Visualization
# Initializing variables for the plots
x <- seq(0, 1, by = 0.01)
y=rep(0,length(x))
for (i in 1:length(x)){
y[i]=p(x[i])
}
dev.new()
par(mfrow=c(1,3))
plot(x,y,type="l",ylim=c(0,4),col="black", main="Plot of p() and Mg()")
abline(h=M,col="red")
legend("topright", legend=c("p()", "Mg()"), col=c("black", "red"), lty=1, cex=0.8)
plot(x,y,type="l",ylim=c(0,4),col="black", main="Plot of p() and the density\n of the accepted values")
lines(density(draws),col="red")
legend("topright", legend=c("p()", "acc"), col=c("black", "red"), lty=1, cex=0.8)
hist(draws,freq=F,ylim=c(0,4), xlab="x", ylab="y", main="Histogram of accepted values")
lines(x,y,type="l",col="black")
Example2
Target Density, p
x.grid=seq(-5,5,by=0.01)
p=dnorm(x.grid,0,1)
Proposal Density, g and Constant M.
g=dnorm(x.grid,0,2)
M=2
Accept/Reject algorithm
N=10000 # Total number of iterations
# This time we will be checking the general acceptance rate
x=rep(0,N) # Places to store the accepted values
# The unaccepted ones will be 0 so we can count how many nonzeros there
# out of N to calculate the general acceptance rate
for (i in 1:(N)){
x_star=rnorm(1,0,2) # Drawing a candidate from the proposal density
accept.prob=dnorm(x_star,0,1)/(M*dnorm(x_star,0,2)) # Prob. of accepting
U=runif(1,0,1) # The auxiliary variable
if (accept.prob>=U){x[i]=x_star}
}
The general acceptance rate
i.e. how many values were accepted out of N candidates
accept.rate=sum(x!=0)/N
accept.rate
[1] 0.5078
From this we can deduce that the number of accepted values is about half of N
Visualization
dev.new()
par(mfrow=c(1,3))
plot(x.grid,p,type="l",ylim=c(0,0.6), xlab="x", ylab="y", main="Plot of p() and Mg()")
lines(x.grid,M*g,type="l",col="red")
legend("topright", legend=c("p()", "Mg()"), col=c("black", "red"), lty=1, cex=0.8)
nonzero = x[x!=0] # We don't need 0s to plot the density
plot(x.grid,p,type="l", main="Plot of p() and the density\n of the accepted values")
lines(density(nonzero),col="red")
legend("topright", legend=c("p()", "acc"), col=c("black", "red"), lty=1, cex=0.8)
hist(nonzero,freq=F,ylim=c(0,0.5), ylab="y", main="Histogram of accepted values")
lines(x.grid, dnorm(x.grid,0,1))
Importance Sampling
True Value
1-pnorm(3)
[1] 0.001349898
Importance Sampling Estimate
N=100000 # Number of samples to extract from the proposal dstn
x=rnorm(N,4,1) # Samples from the proposal dstn
f.x=dnorm(x,0,1) # To calculate the weight ftn
g.x=dnorm(x,4,1) # To calculate the weight ftn
h.x=as.numeric(x>=3) # Target
w.x=f.x/g.x # Weight ftn
h.x_IS=mean(h.x*w.x) # Importance Sampling estimate
h.x_IS
[1] 0.001357118
We can see that it is pretty close
Sampling Error of the IS estimate
sd(h.x*w.x)
[1] 0.00312042
We can see that the sampling error is very small which is an indicator of a good estimate.
Comparison to MC (CaseII)
Direct Computation
N=10000 # Number of samples to extract
x=runif(N,0,3) # Samples extracted from Unif(0,3)
g.x=1/sqrt(2*pi)*exp(-0.5*x^2) # The g() ftn
1/2-3*mean(g.x) # MC estimate
[1] -0.004296541
Sampling Error of the MC estimate
sd(g.x)
[1] 0.1394672
We can see that the sampling error is a lot bigger than IS estimate’s.
Iteration Method
N=10000 # Number of samples to extract
x=rep(0,N) # Storage for samples
g.x=rep(0,N) # Storage for values of g()
mu.g.x=rep(0,N) # Storage for values of mean of g()
for (i in 1:N){
x[i]=runif(1,0,3) # Extracting a sample
g.x[i]=1/sqrt(2*pi)*exp(-1/2*x[i]^2)
mu.g.x[i]=mean(g.x[1:i])
}
1/2-3*mu.g.x[N] # MC estimate
[1] -0.005483052
Visualization
dev.new()
plot(1:N,0.5-3*mu.g.x,type="l", ylim=c(0,0.15))
Sampling Error of the MC estimate (Iteration method)
sd(g.x)
[1] 0.1393593
Comparison tp MC (Case I)
N=100000 # Number of samples to extract
x=rnorm(N) # Samples taken from N(0,1)
h.x=as.numeric(x>=3) # h() = P(X>=3) ; X ~ N(0,1)
h.x_MC=mean(h.x) # MC estimate
h.x_MC
[1] 0.00132
Sample Error of MC (Case I)
sd(h.x)
[1] 0.036308
Case I has a better sample error than case II.
Markov Chain
Prediction
nsim=100
P=matrix(c(0.9,0.5,0.1,0.5),2,2) # Transition matrix
x=matrix(c(0,1),nsim,2,byrow=T) # Initial states
for(i in 1:(nsim-1)){
x[i+1,]=x[i,]%*%P # Matrix multiplication
}
dev.new()
plot(1:nsim,x[,1],type="l",ylim=c(0,1),col="red")
lines(1:nsim,x[,2],type="l",col="blue")
legend("topright", legend=c("Sunny", "Rainy"), col=c("red", "blue"), lty=1, cex=0.8)
We can see that it converges.
Gibbs Sampler (One of MCMC methods)
N=10000 # Total number of iterations(time)
burn=N/2 # Burn-in period is half of N I guess
# data (a single obs.)
y1=0
y2=0
# correlation
rho=0.8
# initial values
theta.1=rep(1,N) # Starting point is 1 for theta1
theta.2=rep(-10,N) # Starting point is -10 for theta2
theta=cbind(theta.1,theta.2) # Theta binded
# iterations
for (i in 1:(N-1)){
# Taking samples from the conditional dstns of theta1 and theta2 respectively
theta.1[i+1]=rnorm(1,y1+rho*(theta.2[i]-y2),sqrt(1-rho^2))
theta.2[i+1]=rnorm(1,y2+rho*(theta.1[i+1]-y1),sqrt(1-rho^2))
# Samples binded
theta[i+1,]=c(theta.1[i+1],theta.2[i+1])
}
head(theta)
theta.1 theta.2
[1,] 1.0000000 -10.000000
[2,] -7.4826999 -5.727806
[3,] -4.8542972 -3.540300
[4,] -3.4723206 -1.958079
[5,] -1.5681786 -0.440435
[6,] -0.2940323 0.667433
Visualizations
# check traceplots
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")
The values of theta1 and theta2 converge.
dev.new()
par(mfrow=c(1,1))
plot(theta)
A simple visualization.
ntr=1000
dev.new()
par(mfrow=c(1,1))
plot(theta.1[1:ntr],theta.2[1:ntr],type="l")
text(theta.1[1:ntr],theta.2[1:ntr],labels=1:10,cex=1, font=5)
With this plot, we can see the movement of the convergence I guess.
The averages of thetas: these are the estimates
colMeans(theta)
theta.1 theta.2
-0.01851583 -0.02379488
The correlation of thetas from the samples using conditional dstns
cor(theta)
theta.1 theta.2
theta.1 1.0000000 0.7815042
theta.2 0.7815042 1.0000000
The correlation of thetas without the Burn-in period
cor(theta[burn:N,])
theta.1 theta.2
theta.1 1.0000000 0.7764459
theta.2 0.7764459 1.0000000
Excluding the Burn-in period might and might not improve, I guess.
The actual dstn
library(mvtnorm) # To sample from an actual BVN dstn
joint.s=rmvnorm(N/2,mean=c(y1,y2),sig=matrix(c(1,rho,rho,1),2,2)) # The actual dstn
head(joint.s)
[,1] [,2]
[1,] -1.4267623 -2.27127038
[2,] -1.4242776 -1.08538234
[3,] 1.6142546 1.97821223
[4,] 0.1976566 0.13710940
[5,] -0.5978071 -0.07247639
[6,] 1.2196402 0.23673397
Visualization
plot(xlim=c(-10,4), ylim=c(-10,4), joint.s)
The averages of thetas : these are the estimates
colMeans(joint.s)
[1] 0.01367316 0.02207730
The correlation of thetas from the samples using the actual dstn
cor(joint.s)
[,1] [,2]
[1,] 1.0000000 0.8009714
[2,] 0.8009714 1.0000000
Metropolis algorithm
Example1
N=10000
burn.in=N/2 # Burn-in period is half of N I guess
# data (a single obs.)
y1=0
y2=0
# correlation
rho=0.8
# initial value
sig2=1-rho^2 # variance for posterior density
sig2.prop=0.1 # variance for proposal density
theta.1=rep(1,N) # Initial value is 1 I guess
theta.2=rep(1,N)
# iterations
for (i in 1:(N-1)){
# for theta.1
theta.1.star=rnorm(1,theta.1[i],sqrt(sig2.prop)) # Sample from the proposal dstn
U.1=runif(1,0,1) # U: auxiliary variable
# Ratio of the two densities
ratio.1=dnorm(theta.1.star,y1+rho*(theta.2[i]-y2),1-rho^2)/dnorm(theta.1[i],y1+rho*(theta.2[i]-y2),1-rho^2)
# Probability of move
alpha.1=min(ratio.1,1)
if (U.1<=alpha.1){theta.1[i+1]=theta.1.star} # Accepted
else theta.1[i+1]=theta.1[i]
## for theta.2
theta.2.star=rnorm(1,theta.2[i],sqrt(sig2.prop))
U.2=runif(1,0,1)
# Ratio of the two densities
ratio.2=dnorm(theta.2.star,y2+rho*(theta.1[i+1]-y1),1-rho^2)/dnorm(theta.2[i],y2+rho*(theta.1[i+1]-y1),1-rho^2)
# Probability of move
alpha.2=min(ratio.2,1)
if (U.2<=alpha.2){theta.2[i+1]=theta.2.star} # Accepted
else theta.2[i+1]=theta.2[i]
}
theta=cbind(theta.1,theta.2) # Binding the thetas together
head(theta)
theta.1 theta.2
[1,] 1.0000000 1.0000000
[2,] 1.0768556 1.0000000
[3,] 1.3101948 0.8304043
[4,] 1.3101948 0.5861109
[5,] 1.2283990 0.5861109
[6,] 0.7631203 0.5752664
Visualizations
# traceplots
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")
dev.new()
par(mfrow=c(1,2))
hist(theta[(burn.in+1):N,1])
hist(theta[(burn.in+1):N,1])
dev.new()
par(mfrow=c(1,1))
plot(theta[(burn.in+1):N,])
The averages of theta1 and theta2: These are the estimates
colMeans(theta[(burn.in+1):N,])
theta.1 theta.2
0.01982588 0.02451556
The correlation of theta1 and theta2
cor(theta[(burn.in+1):N,])
theta.1 theta.2
theta.1 1.0000000 0.8035538
theta.2 0.8035538 1.0000000
Example2
The metropolis algorithm made into a function (because we need to run for 4 different variances)
mh=function(v,sigma,x0,N){
x=numeric(N) # Making N number of 0s
x[1]=x0 # Starting point
k=0
for (i in 2:N){ # Starts at 2 because 1 is the starting point
y=rnorm(1,x[i-1],sigma) # Sampling the candidate from the proposal dstn with
# mean: previous value and variance: sigma
u=runif(1) # The auxiliary variable
alpha=min(dt(y,v)/dt(x[i-1],v),1) # Calculating the probability of move
# The ratio r is p(candidate|y)/p(previous value|y)
# Where p() is in this case the t dstn with df=v
if (u<=alpha){x[i]=y} # Accepting the candidate
else {x[i]=x[i-1]
k=k+1 # Counting the number of rejections
}
}
return(list(x=x,k=k))
}
Let’s run the algorithm
N=1000 # Number of samples
burn.in=N/2 # The Burn-in period
v=4 # df of t dstn
x0=1 # Starting point
sigma=c(0.05,0.5,2,16) # Different variances for proposal dstns
mh1=mh(v,sigma[1],x0,N)
mh2=mh(v,sigma[2],x0,N)
mh3=mh(v,sigma[3],x0,N)
mh4=mh(v,sigma[4],x0,N)
The number of candidates rejected by each variance
c(mh1$k,mh2$k,mh3$k,mh4$k)/N
[1] 0.023 0.146 0.462 0.892
We can see that, when the variance(low) => Few rejected, variance(high) => Many rejected.
Visualizations
dev.new()
par(mfrow=c(2,2))
plot(1:N,mh1$x,type="l") # The trace plots
plot(1:N,mh2$x,type="l")
plot(1:N,mh3$x,type="l")
plot(1:N,mh4$x,type="l")
The frequent horizontal lines in the trace plot means that there were many rejections. Horizontal line means that previous values were used many times because candidates were rejected.
a=seq(-5,5,by=0.01) # To plot the t dstn
dev.new()
par(mfrow=c(2,2))
hist(mh1$x[(burn.in+1):N],freq=F,ylim=c(0,0.7)) # Histogram of the samples
# without the Burn-in period
lines(a,dt(a,df=4)) # t dstn with df=4: The actual dstn
hist(mh2$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))
hist(mh3$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))
hist(mh4$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))
We can see that when the variance is too low or high, the dstn of the samples were not approximate to the actual dstn. Whereas when the variance is modest, the samples were pretty approximate to the actual dstn.
Metropolis-Hastings algorithm
Example1
The M-H algorithm
N=10000 # Number of samples
burn.in=N/2 # The Burn-in period
sigma=4 # of the Rayleigh dstn
x=numeric(N) # Initializing the spaces for the chain
x[1]=rchisq(1,df=1) # Starting point
k=0 # Number of rejected candidates
for (i in 2:N){ # Starts from 2 because 1 is the starting point
x.star=rchisq(1,df=x[i-1]) # Candidate taken from chisq dstn with df=previous value
u=runif(1) # Auxiliary variable
# For calculating the ratio r
num=(x.star/(sigma^2)*exp(-x.star^2/(2*sigma^2)))/dchisq(x.star,df=x[i-1])
den=(x[i-1]/(sigma^2)*exp(-x[i-1]^2/(2*sigma^2)))/dchisq(x[i-1],df=x.star)
alpha=min(num/den,1) # The probability of move
if (u<=alpha){x[i]=x.star} # Accepting the candidate
else {x[i]=x[i-1]
k=k+1 # Counting the number of rejected candidates
}
}
Visualizations
dev.new()
plot(1:N,x,type="l") # The trace plot
# install.packages("VGAM")
library(VGAM) # For drayleigh()
Loading required package: stats4
Loading required package: splines
Attaching package: ‘VGAM’
The following object is masked from ‘package:coda’:
nvar
xgrid=seq(0,20,by=0.1)
ytrue=drayleigh(xgrid,scale=sigma) # The density of the actual dstn
dev.new()
hist(x[(burn.in+1):N],freq=F, main="Comparison to the actual dstn") # freq=F: to display density on y axis not the frequency
lines(xgrid,ytrue,type="l") # The actual dstn
We can see that the samples(the histogram) is pretty approximate to the actual dstn(the line).
Example2
N=100000 # Number of samples
burn.in=N/2 # Burn-in period
## initial value
sig2=100 # variance of the proposal dstn
theta=rep(0,N) # Initializing the space for theta
p.th=function(th){ # The actual dstn
0.3*exp(-0.2*th^2)+0.7*exp(-0.2*(th-10)^2)
}
## iterations
for (i in 1:(N-1)){ # No value for starting point was given so I guess it's 0
theta.star=rnorm(1,theta[i],sqrt(sig2)) # candidate
U=runif(1,0,1) # auxiliary variable
ratio=p.th(theta.star)/p.th(theta[i]) # the ratio r
# Note that since the proposal dstn is symmetric, the ratio r is just like Metropolis'
alpha=min(ratio,1) # The probability of move
if (U<=alpha){theta[i+1]=theta.star} # Accepting the candidate
else theta[i+1]=theta[i]
}
Visualizations
dev.new()
par(mfrow=c(1,1))
plot(burn.in+1:N,theta,type="l") # the trace plot
th.grid=seq(-10,20,by=1)
th.true=rep(0,length(th.grid))
for (i in 1:length(th.grid)){
th.true[i]=p.th(th.grid[i])
}
th.true=th.true/sum(th.true) # Normalizing I guess? This makes sum(th.true) = 1
dev.new()
par(mfrow=c(1,1))
hist(theta,freq=FALSE, main="Comparison to the actual dstn")
lines(th.grid,th.true,type="l")
Estimate for the mean of the target dstn I guess?
mean(theta[(burn.in+1):N])
[1] 6.886633
---
title: "Bayesian Statistics - pt.1"
output:
  html_notebook:
    toc: yes
---

\
\
\
\

# Posterior Interval by 'Numerical Method'

\

```{r}

### Placenta Previa example
y=seq(0,1,by=0.001)

plot(y,dbeta(y,438,544),type="l",xlim=c(0.3,0.6))  #Posterior distribution

post.y=rbeta(1000,438,544)  #Taking 1000 samples from the Post. dstn

dev.new()
hist(post.y,freq=F)  #Plotting the sample

mean(post.y)  #The posterior mean
quantile(post.y,probs=c(0.025,0.975))  # 95% Posterior Interval using Numerical Method

```

\
\
\
\
\
\

# Normal Model With Discrete Prior Distribution (Estimating the mean)

\

```{r}
options(scipen=999)  #To disable printing your results in scientific notation.

mu=c(20,30,40,50,60,70)  #Parameter of interest
prior=c(0.1,0.15,0.25,0.25,0.15,0.1)  #Discrete Prior Distribution
y=c(38.6,42.4,57.5,40.5,51.7,67.1,33.4,60.9,64.1,40.1,40.7,6.4)  #Observations (n=10)

ybar=mean(y)
sigma=10  #Known population variance
n=length(y)
like=exp((-n/(2*sigma^2))*(mu-ybar)^2)  #Likelihood function (Refer to the notes)

unn.post=prior*like  #Unnormalized Discrete Posterior dstn 
unn.post

nor.post=unn.post/sum(unn.post)  #Normalized Discrete Posterior dstn (We have to Normalize it to make the Probability Distribution integrate to 1)
nor.post  #We can see that the Discrete Post. dstn is [20: 0, 30: 0, 40: 0.42, 50: 0.58, 60: 0, 70: 0]

dev.new()
plot(mu,nor.post)  #We can see that the most likely values are 40 and 50

s.post=sample(mu,1000,prob=nor.post,replace=T)  #Take a sample from the Post. dstn
table(s.post)  #The number of observations from values of 40 and 50
mean(s.post)
sd(s.post)

```

\
\
\
\
\
\

# Frequentist vs Bayesian Estimators (by MSE)

\

```{r}
#Sample Size is SMALL
n=10
theta=seq(0,1,by=0.01)  #Generate values of theta from (0,1) to see the MSE at each value of theta
#Refer to the notes for the equation of the MSEs
mse.f=theta*(1-theta)/n  #MSE of the Frequentist estimator at each value of theta
mse.b=((4-n)*theta^2-(4-n)*theta+1)/(n+2)^2  #MSE of the Bayesian estimator(=Posterior Mean) at each value of theta

dev.new()
plot(theta,mse.f,type="l")
lines(theta,mse.b,col="red")
legend(x="topright", legend=c("Freq.","Bayes."), lty=c(1,1), col=c("black","red"))  
```

#### We can see that the Bayes' MSEs are smaller in general when the sample size is small


```{r}
#Sample Size is LARGE
n=1000
theta=seq(0,1,by=0.01)  #Generate values of theta from (0,1) to see the MSE at each value of theta
#Refer to the notes for the equation of the MSEs
mse.f=theta*(1-theta)/n  #MSE of the Frequentist estimator at each value of theta
mse.b=((4-n)*theta^2-(4-n)*theta+1)/(n+2)^2  #MSE of the Bayesian estimator(=Posterior Mean) at each value of theta

dev.new()
plot(theta,mse.f,type="l")
lines(theta,mse.b,col="red")
legend(x="topright", legend=c("Freq.","Bayes."), lty=c(1,1), col=c("black","red"))  
```

#### We can see that the MSEs of the two estimators are almost identical when the sample size is large

\
\
\
\
\
\

# Highest Posterior Density (HPD) Credible Set

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0086.jpeg)

\

### 95% Credible Interval

```{r}
n = 10
y = 2

ci = qbeta(c(0.025,0.975), y+1, n-y+1)
ci
```

\

### 95% HPD interval

```{r}
# install.packages("TeachingDemos")  # hpd function is in there
library(TeachingDemos)

hpd=hpd(qbeta,shape1=y+1,shape2=n-y+1)
hpd
```

\

### Visualization

```{r}
theta = seq(0, 1, by=0.01)

plot(theta,dbeta(theta,y+1,n-y+1),type="l",xlab=expression(theta),ylab="Posterior Density Function", cex.lab=1)

segments(ci[1],0,ci[1],dbeta(ci[1],y+1,n-y+1),col="red",lwd=2)
segments(ci[2],0,ci[2],dbeta(ci[2],y+1,n-y+1),col="red",lwd=2)
segments(ci[1],dbeta(ci[1],y+1,n-y+1),ci[2],dbeta(ci[2],y+1,n-y+1),col="red",lwd=2)

# HPD
segments(hpd[1],0,hpd[1],dbeta(hpd[1],y+1,n-y+1),col="blue",lwd=2,lty=2)
segments(hpd[2],0,hpd[2],dbeta(hpd[2],y+1,n-y+1),col="blue",lwd=2,lty=2)
segments(hpd[1],dbeta(hpd[1],y+1,n-y+1),hpd[2],dbeta(hpd[2],y+1,n-y+1),col="blue",lwd=2,lty=2)
legend(x=0.6,y=3,legend=c("95% CI=(0.06,0.52)", "95% HPD CI=(0.04,0.48)"),col=c("red","blue"),lty=c(1,2),cex=0.8)
```

\
\
\
\
\
\

# Calculating Posterior Predictive P-values (Bayesian P-Values)

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0083.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0084.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0085.jpeg)

\

```{r}
nsim=100

y=c(1,1,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0)
T.y=sum(diff(y)!=0)
theta=rep(NA,nsim)
T.y.rep=rep(NA,nsim)

for (i in 1:nsim){
  theta[i]<-rbeta(1,8,14)
  y.rep<-rbinom(20,1,theta[i])
  T.y.rep[i]<-sum(diff(y.rep)!=0)
}

sum(T.y.rep<=T.y)/nsim  # Bayesian P-value

```

#### This is the Bayesian p-value (after 100 simulations)

\

```{r}
hist(T.y.rep)  # Posterior predictive distribution of the replicated test statistic
abline(v=3,col="red")  # The value left to the red line is the p-value
```

\
\
\
\
\
\

# Posterior dstn for Multinomial model for Categorical data

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/ss.png)

\

#### Let's calculate the post. dstn for theta1 - theta2. (Let's call this diff)

\

#### We're gonna take samples from the post. dstn of (theta1,theta2,theta3) and calculate it.

\
```{r}
library(MCMCpack)

n=1447

y1=727
y2=583
y3=137

alpha=c(y1+1,y2+1,y3+1)  # the parameters of post. dstn of (theta1,theta2,theta3)

nsim=100  # number of simulations    

theta=rdirichlet(nsim,alpha)  # Taking samples from the post. dstn of (theta1,theta2,theta3)
diff=theta[,1]-theta[,2]
diff  # theta1 - theta2
```

\

### Histogram plot of dstn for diff

\

```{r}
hist(diff)
```

\

### Point Estimate

\

```{r}
mean(diff)
```

\

### Interval Estimate (alpha = 0.05)

\

```{r}
quantile(diff,probs=c(0.025,0.975))  # Equal tailed interval for alpha = 0.05
```

\

### The count of theta1 < theta2

\

```{r}
sum(diff<0)
```

\

### We can see that theta1 is larger than theta2.

\
\
\
\
\
\

# Monte Carlo Integration

\

## Case I

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0153.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0152.jpeg)

\

### True Value

\

```{r}
2/3
```

\

### Direct Computation

```{r}
N<-100000
theta<-runif(N,-1,1)  # Drawing N number of values
h.theta<-2*theta^2   # The h() function

mu.h.theta<-mean(h.theta)   # MC estimate
mu.h.theta
```

#### Pretty close to the true value.

\

### Iteration Method

```{r}
N=10000
theta=rep(0,N)  # Initializing the places to save the values I guess
h.theta=rep(0,N)
mu.h.theta=rep(0,N)

for (i in 1:N){
  theta[i]=runif(1,-1,1)  # Drawing 1 value at a time
  h.theta[i]=2*theta[i]^2
  mu.h.theta[i]=mean(h.theta[1:i])
}

mu.h.theta[N]  # MC estimate
```

```{r}
dev.new()
plot(1:N,mu.h.theta,type="l")
```

#### The h(theta) eventually converges to the true value.

\
\
\

## Case II

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0154.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0155.jpeg)

\

### True Value

```{r}
1-exp(-1)
```

\

### Direct Computation

```{r}
N=10000
x=runif(N)  # Drawing N number of values

# g(x) = exp(-x)
theta.hat=mean(exp(-x))  # MC estimate
theta.hat
```

#### Pretty close to the true value.

\

### Iteration method

```{r}
N=10000
x=rep(0,N)  # Initializing the places to save the values I guess
h.x=rep(0,N)
mu.h.x=rep(0,N)

for (i in 1:N){
  x[i]=runif(1)
  h.x[i]=exp(-x[i])
  mu.h.x[i]=mean(h.x[1:i])
}

mu.h.x[N]  # MC estimate
```

```{r}
dev.new()
plot(1:N,mu.h.x,type="l")
```

#### It eventually converges to the true value.

\
\
\
\
\
\
\

# Rejection Sampling

\

## Example1

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0157.jpeg)

\

### Target Density, p

```{r}
p <- function(x) {
  if (x >= 0 && x < 0.25)
     8 * x
  else if (x >= 0.25 && x <= 1)
     8/3 - 8 * x/3
   else 0
  }
```

\

### Proposal Density, g and Constant M

```{r}
g <- function(x) {
   if (x >= 0 && x <= 1)
     1
   else 0
  }

M<-3  # Try out different values for M (M is better to be smaller)
```

\

### Accept/Reject algorithm

```{r}
m <- 1000  # Total number of accepted values needed
n.draws <- 0  # For counting the number of accepted values
draws <- c()  # Place to store accepted values

while (n.draws < m) {
   x.c <- runif(1, 0, 1)  # Drawing a candidate from the proposal density
   accept.prob <- p(x.c)/(M * g(x.c))  # The prob. of accepting for each candidate
   u <- runif(1, 0, 1)  # The auxiliary variable used for accept/reject algo.
   
   if (accept.prob >= u) {
     draws <- c(draws, x.c)  # Storing the accepted candidate
     n.draws <- n.draws + 1  # Counting the accepted candidate
     }
   }
```

\

### The accepted values

```{r}
head(draws)
```

\

### Visualization

```{r}
# Initializing variables for the plots
x <- seq(0, 1, by = 0.01)
y=rep(0,length(x))
for (i in 1:length(x)){
  y[i]=p(x[i])
}

dev.new()
par(mfrow=c(1,3))
plot(x,y,type="l",ylim=c(0,4),col="black", main="Plot of p() and Mg()")
abline(h=M,col="red")
legend("topright", legend=c("p()", "Mg()"), col=c("black", "red"), lty=1, cex=0.8)

plot(x,y,type="l",ylim=c(0,4),col="black", main="Plot of p() and the density\n of the accepted values")
lines(density(draws),col="red")
legend("topright", legend=c("p()", "acc"), col=c("black", "red"), lty=1, cex=0.8)

hist(draws,freq=F,ylim=c(0,4), xlab="x", ylab="y", main="Histogram of accepted values")
lines(x,y,type="l",col="black")
```

\
\
\

## Example2

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0158.jpeg)

\

### Target Density, p

```{r}
x.grid=seq(-5,5,by=0.01)
p=dnorm(x.grid,0,1)
```

\

### Proposal Density, g and Constant M.

```{r}
g=dnorm(x.grid,0,2)
M=2
```

\

### Accept/Reject algorithm

```{r}
N=10000  # Total number of iterations
         # This time we will be checking the general acceptance rate
x=rep(0,N)  # Places to store the accepted values
            # The unaccepted ones will be 0 so we can count how many nonzeros there
            # out of N to calculate the general acceptance rate

for (i in 1:(N)){
  x_star=rnorm(1,0,2)  # Drawing a candidate from the proposal density
  accept.prob=dnorm(x_star,0,1)/(M*dnorm(x_star,0,2))  # Prob. of accepting
  U=runif(1,0,1)  # The auxiliary variable
  if (accept.prob>=U){x[i]=x_star}
}
```

\

### The general acceptance rate

#### i.e. how many values were accepted out of N candidates

```{r}
accept.rate=sum(x!=0)/N
accept.rate
```

#### From this we can deduce that the number of accepted values is about half of N

\

### Visualization

```{r}
dev.new()
par(mfrow=c(1,3))
plot(x.grid,p,type="l",ylim=c(0,0.6), xlab="x", ylab="y", main="Plot of p() and Mg()")
lines(x.grid,M*g,type="l",col="red")
legend("topright", legend=c("p()", "Mg()"), col=c("black", "red"), lty=1, cex=0.8)


nonzero = x[x!=0]  # We don't need 0s to plot the density

plot(x.grid,p,type="l", main="Plot of p() and the density\n of the accepted values")
lines(density(nonzero),col="red")
legend("topright", legend=c("p()", "acc"), col=c("black", "red"), lty=1, cex=0.8)

hist(nonzero,freq=F,ylim=c(0,0.5), ylab="y", main="Histogram of accepted values")
lines(x.grid, dnorm(x.grid,0,1))
```

\
\
\
\
\
\
\

# Importance Sampling

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0160.jpeg)

\

### True Value

```{r}
1-pnorm(3)
```

\

### Importance Sampling Estimate

```{r}
N=100000  # Number of samples to extract from the proposal dstn
x=rnorm(N,4,1)  # Samples from the proposal dstn
f.x=dnorm(x,0,1)  # To calculate the weight ftn
g.x=dnorm(x,4,1)  # To calculate the weight ftn
h.x=as.numeric(x>=3)  # Target
w.x=f.x/g.x  # Weight ftn

h.x_IS=mean(h.x*w.x)  # Importance Sampling estimate
h.x_IS
```

#### We can see that it is pretty close

\

### Sampling Error of the IS estimate

```{r}
sd(h.x*w.x)
```

#### We can see that the sampling error is very small which is an indicator of a good estimate.

\
\

## Comparison to MC (CaseII)

\

### Direct Computation

```{r}
N=10000  # Number of samples to extract
x=runif(N,0,3)  # Samples extracted from Unif(0,3)
g.x=1/sqrt(2*pi)*exp(-0.5*x^2)  # The g() ftn
1/2-3*mean(g.x)  # MC estimate
```

\

### Sampling Error of the MC estimate

```{r}
sd(g.x)
```

#### We can see that the sampling error is a lot bigger than IS estimate's.

\

### Iteration Method

```{r}
N=10000  # Number of samples to extract
x=rep(0,N)  # Storage for samples
g.x=rep(0,N)  # Storage for values of g()
mu.g.x=rep(0,N)  # Storage for values of mean of g()

for (i in 1:N){
  x[i]=runif(1,0,3)  # Extracting a sample
  g.x[i]=1/sqrt(2*pi)*exp(-1/2*x[i]^2)
  mu.g.x[i]=mean(g.x[1:i])
}

1/2-3*mu.g.x[N]  # MC estimate
```

\

### Visualization

```{r}
dev.new()
plot(1:N,0.5-3*mu.g.x,type="l", ylim=c(0,0.15))
```

\

### Sampling Error of the MC estimate (Iteration method)

```{r}
sd(g.x)
```

\

## Comparison tp MC (Case I)

```{r}
N=100000  # Number of samples to extract
x=rnorm(N)  # Samples taken from N(0,1)
h.x=as.numeric(x>=3)  # h() = P(X>=3) ; X ~ N(0,1)
h.x_MC=mean(h.x)  # MC estimate
h.x_MC
```

\

### Sample Error of MC (Case I)

```{r}
sd(h.x)
```

#### Case I has a better sample error than case II.

\
\
\
\
\
\
\

# Markov Chain

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0187.jpeg)

\

### Prediction

```{r}
nsim=100
P=matrix(c(0.9,0.5,0.1,0.5),2,2)  # Transition matrix
x=matrix(c(0,1),nsim,2,byrow=T)  # Initial states

for(i in 1:(nsim-1)){
  x[i+1,]=x[i,]%*%P  # Matrix multiplication
}

dev.new()
plot(1:nsim,x[,1],type="l",ylim=c(0,1),col="red")
lines(1:nsim,x[,2],type="l",col="blue")
legend("topright", legend=c("Sunny", "Rainy"), col=c("red", "blue"), lty=1, cex=0.8)
```

#### We can see that it converges.

\
\
\
\
\
\
\

# Gibbs Sampler (One of MCMC methods)

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0189.jpeg)

\

```{r}
N=10000  # Total number of iterations(time)
burn=N/2  # Burn-in period is half of N I guess

# data (a single obs.)
y1=0
y2=0

# correlation
rho=0.8

# initial values
theta.1=rep(1,N)  # Starting point is 1 for theta1
theta.2=rep(-10,N)  # Starting point is -10 for theta2
theta=cbind(theta.1,theta.2)  # Theta binded

# iterations
for (i in 1:(N-1)){
  # Taking samples from the conditional dstns of theta1 and theta2 respectively
  theta.1[i+1]=rnorm(1,y1+rho*(theta.2[i]-y2),sqrt(1-rho^2))
  theta.2[i+1]=rnorm(1,y2+rho*(theta.1[i+1]-y1),sqrt(1-rho^2))
  # Samples binded
  theta[i+1,]=c(theta.1[i+1],theta.2[i+1])
}

head(theta)
```

\

### Visualizations

```{r}
# check traceplots
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")
```

#### The values of theta1 and theta2 converge.

\

```{r}
dev.new()
par(mfrow=c(1,1))
plot(theta)
```

#### A simple visualization.

\

```{r}
ntr=1000 
dev.new()
par(mfrow=c(1,1))
plot(theta.1[1:ntr],theta.2[1:ntr],type="l")
text(theta.1[1:ntr],theta.2[1:ntr],labels=1:10,cex=1, font=5)
```

#### With this plot, we can see the movement of the convergence I guess.

\

### The averages of thetas: these are the estimates

```{r}
colMeans(theta)
```

\

### The correlation of thetas from the samples using conditional dstns

```{r}
cor(theta)
```

\

### The correlation of thetas without the Burn-in period

```{r}
cor(theta[burn:N,])
```

#### Excluding the Burn-in period might and might not improve, I guess.

\

### The actual dstn

```{r}
library(mvtnorm)  # To sample from an actual BVN dstn

joint.s=rmvnorm(N/2,mean=c(y1,y2),sig=matrix(c(1,rho,rho,1),2,2))  # The actual dstn

head(joint.s)
```

### Visualization

```{r}
plot(xlim=c(-10,4), ylim=c(-10,4), joint.s)
```

### The averages of thetas : these are the estimates

```{r}
colMeans(joint.s)
```

### The correlation of thetas from the samples using the actual dstn

```{r}
cor(joint.s)
```

\
\
\
\
\
\
\

# Metropolis algorithm

\

## Example1

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0191.jpg)

\

```{r}
N=10000
burn.in=N/2  # Burn-in period is half of N I guess

# data (a single obs.)
y1=0
y2=0

# correlation
rho=0.8

# initial value
sig2=1-rho^2    # variance for posterior density
sig2.prop=0.1   # variance for proposal density
theta.1=rep(1,N)  # Initial value is 1 I guess
theta.2=rep(1,N)


# iterations
for (i in 1:(N-1)){
  
  # for theta.1
  theta.1.star=rnorm(1,theta.1[i],sqrt(sig2.prop))  # Sample from the proposal dstn
  U.1=runif(1,0,1)  # U: auxiliary variable
  
  # Ratio of the two densities
  ratio.1=dnorm(theta.1.star,y1+rho*(theta.2[i]-y2),1-rho^2)/dnorm(theta.1[i],y1+rho*(theta.2[i]-y2),1-rho^2)
  
  # Probability of move
  alpha.1=min(ratio.1,1)
  
  if (U.1<=alpha.1){theta.1[i+1]=theta.1.star}  # Accepted
  else theta.1[i+1]=theta.1[i]
  
  
  ## for theta.2
  theta.2.star=rnorm(1,theta.2[i],sqrt(sig2.prop))
  U.2=runif(1,0,1)
  
  # Ratio of the two densities
  ratio.2=dnorm(theta.2.star,y2+rho*(theta.1[i+1]-y1),1-rho^2)/dnorm(theta.2[i],y2+rho*(theta.1[i+1]-y1),1-rho^2)
  
  # Probability of move
  alpha.2=min(ratio.2,1)
  
  if (U.2<=alpha.2){theta.2[i+1]=theta.2.star}  # Accepted
  else theta.2[i+1]=theta.2[i]
  
}

theta=cbind(theta.1,theta.2)  # Binding the thetas together

head(theta)
```

\

### Visualizations

```{r}
# traceplots
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")
```

\

```{r}
dev.new()
par(mfrow=c(1,2))
hist(theta[(burn.in+1):N,1])
hist(theta[(burn.in+1):N,1])
```

\

```{r}
dev.new()
par(mfrow=c(1,1))
plot(theta[(burn.in+1):N,])
```

\

### The averages of theta1 and theta2: These are the estimates

```{r}
colMeans(theta[(burn.in+1):N,])
```

\

### The correlation of theta1 and theta2

```{r}
cor(theta[(burn.in+1):N,])
```

\
\
\

## Example2

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0192.jpeg)

\

### The metropolis algorithm made into a function (because we need to run for 4 different variances)

```{r}
mh=function(v,sigma,x0,N){
  x=numeric(N)  # Making N number of 0s
  x[1]=x0  # Starting point
  k=0
  for (i in 2:N){  # Starts at 2 because 1 is the starting point
    y=rnorm(1,x[i-1],sigma)  # Sampling the candidate from the proposal dstn with 
                             # mean: previous value and variance: sigma
    u=runif(1)  # The auxiliary variable
    alpha=min(dt(y,v)/dt(x[i-1],v),1)  # Calculating the probability of move
                             # The ratio r is p(candidate|y)/p(previous value|y)
                             # Where p() is in this case the t dstn with df=v
    if (u<=alpha){x[i]=y}  # Accepting the candidate
    else {x[i]=x[i-1]
    k=k+1  # Counting the number of rejections
    }
  }
  return(list(x=x,k=k))
}
```

\

### Let's run the algorithm

```{r}
N=1000  # Number of samples
burn.in=N/2  # The Burn-in period
v=4  # df of t dstn
x0=1  # Starting point
sigma=c(0.05,0.5,2,16)  # Different variances for proposal dstns

mh1=mh(v,sigma[1],x0,N)
mh2=mh(v,sigma[2],x0,N)
mh3=mh(v,sigma[3],x0,N)
mh4=mh(v,sigma[4],x0,N)
```

\

### The number of candidates rejected by each variance

```{r}
c(mh1$k,mh2$k,mh3$k,mh4$k)/N
```

#### We can see that, when the variance(low) => Few rejected, variance(high) => Many rejected.

\

### Visualizations

```{r}
dev.new()
par(mfrow=c(2,2))
plot(1:N,mh1$x,type="l")  # The trace plots
plot(1:N,mh2$x,type="l")
plot(1:N,mh3$x,type="l")
plot(1:N,mh4$x,type="l")
```

#### The frequent horizontal lines in the trace plot means that there were many rejections. Horizontal line means that previous values were used many times because candidates were rejected.

\

```{r}
a=seq(-5,5,by=0.01)  # To plot the t dstn

dev.new()
par(mfrow=c(2,2))

hist(mh1$x[(burn.in+1):N],freq=F,ylim=c(0,0.7))  # Histogram of the samples
                                                 # without the Burn-in period
lines(a,dt(a,df=4))  # t dstn with df=4: The actual dstn

hist(mh2$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))

hist(mh3$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))

hist(mh4$x[(burn.in+1):N],freq=F,ylim=c(0,0.4))
lines(a,dt(a,df=4))
```

#### We can see that when the variance is too low or high, the dstn of the samples were not approximate to the actual dstn. Whereas when the variance is modest, the samples were pretty approximate to the actual dstn.

\
\
\
\
\
\
\

# Metropolis-Hastings algorithm

\

## Example1

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0193.jpeg)

\

### The M-H algorithm

```{r}
N=10000  # Number of samples
burn.in=N/2  # The Burn-in period
sigma=4  # of the Rayleigh dstn
x=numeric(N)  # Initializing the spaces for the chain
x[1]=rchisq(1,df=1)  # Starting point
k=0  # Number of rejected candidates

for (i in 2:N){  # Starts from 2 because 1 is the starting point
  x.star=rchisq(1,df=x[i-1])  # Candidate taken from chisq dstn with df=previous value
  u=runif(1)  # Auxiliary variable
  
  # For calculating the ratio r
  num=(x.star/(sigma^2)*exp(-x.star^2/(2*sigma^2)))/dchisq(x.star,df=x[i-1])
  den=(x[i-1]/(sigma^2)*exp(-x[i-1]^2/(2*sigma^2)))/dchisq(x[i-1],df=x.star)
  
  alpha=min(num/den,1)  # The probability of move
  if (u<=alpha){x[i]=x.star}  # Accepting the candidate
  else {x[i]=x[i-1]
  k=k+1 # Counting the number of rejected candidates
  }
}
```

\

### Visualizations

```{r}
dev.new()
plot(1:N,x,type="l")  # The trace plot
```

\

```{r}
# install.packages("VGAM")
library(VGAM) # For drayleigh()

xgrid=seq(0,20,by=0.1)
ytrue=drayleigh(xgrid,scale=sigma)  # The density of the actual dstn

dev.new()
hist(x[(burn.in+1):N],freq=F, main="Comparison to the actual dstn")  # freq=F: to display density on y axis not the frequency
lines(xgrid,ytrue,type="l")  # The actual dstn
```

#### We can see that the samples(the histogram) is pretty approximate to the actual dstn(the line).

\
\
\

## Example2

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0194.jpeg)

\

```{r}
N=100000  # Number of samples
burn.in=N/2  # Burn-in period

## initial value
sig2=100  # variance of the proposal dstn
theta=rep(0,N)  # Initializing the space for theta

p.th=function(th){   # The actual dstn
  0.3*exp(-0.2*th^2)+0.7*exp(-0.2*(th-10)^2)
}

## iterations
for (i in 1:(N-1)){  # No value for starting point was given so I guess it's 0
  
  theta.star=rnorm(1,theta[i],sqrt(sig2))  # candidate
  U=runif(1,0,1)  # auxiliary variable
  
  ratio=p.th(theta.star)/p.th(theta[i])  # the ratio r
  # Note that since the proposal dstn is symmetric, the ratio r is just like Metropolis'
  
  alpha=min(ratio,1)  # The probability of move
  
  if (U<=alpha){theta[i+1]=theta.star}  # Accepting the candidate
  else theta[i+1]=theta[i]
  
}
```

\

### Visualizations

```{r}
dev.new()
par(mfrow=c(1,1))
plot(burn.in+1:N,theta,type="l")  # the trace plot
```

\

```{r}
th.grid=seq(-10,20,by=1)
th.true=rep(0,length(th.grid))
for (i in 1:length(th.grid)){
  th.true[i]=p.th(th.grid[i])
}
th.true=th.true/sum(th.true)  # Normalizing I guess? This makes sum(th.true) = 1

dev.new()
par(mfrow=c(1,1))
hist(theta,freq=FALSE, main="Comparison to the actual dstn")
lines(th.grid,th.true,type="l")
```

\

### Estimate for the mean of the target dstn I guess?

```{r}
mean(theta[(burn.in+1):N])
```

\
\
\
