Convergence Diagnostics



# install.packages("geoR")
library(coda)  # Our main convergence diagnostic package
library(geoR)
'RandomFieldsUtils' will use OMP
'RandomFields' will use OMP
--------------------------------------------------------------
 Analysis of Geostatistical Data
 For an Introduction to geoR go to http://www.leg.ufpr.br/geoR
 geoR version 1.8-1 (built on 2020-02-08) is now loaded
--------------------------------------------------------------
library(MCMCpack)


Running Metropolis algorithm

library(mvtnorm)

N=10000
burn.in=N/2

# data
y=c(0,0)

# correlation
rho=0.8

# initial value
cov.var=matrix(c(1,rho,rho,1),2,2)        # cov for posterior density
cov.var.prop=matrix(c(1,rho,rho,1),2,2)   # cov for proposal density
theta=matrix(0,nrow=N,ncol=2)

# iterations
for (i in 1:(N-1)){
  theta.star=rmvnorm(1,mean=theta[i,],sigma=cov.var.prop)  # Candidate
  U=runif(1,0,1)  # Auxiliary variable
  
  # The ratio r
  ratio=dmvnorm(theta.star,mean=y,sigma=cov.var)/dmvnorm(theta[i,],mean=y,sigma=cov.var)
  
  # The probability of move
  alpha=min(ratio,1)
  
  if (U<=alpha){theta[i+1,]=theta.star}  # Accepting the candidate
  else theta[i+1,]=theta[i,]
}


Visual inspection: Traceplot

dev.new()
par(mfrow=c(2,1))
plot(1:N,theta[,1],type="l")
plot(1:N,theta[,2],type="l")

We can see that the chains don’t get stuck in certain areas. So these are kind of good I guess.


Visual inspection: Running mean plot

mean.th1=rep(0,N)  # To store the mean values
mean.th2=rep(0,N)  # To store the mean values

# Get the mean of the draws UP TO EACH ITERATION
for (i in 1:N){
  mean.th1[i]=mean(theta[1:i,1])
  mean.th2[i]=mean(theta[1:i,2])
}

dev.new()
par(mfrow=c(2,1))
plot(1:N,mean.th1,type="l")
plot(1:N,mean.th2,type="l")

Both thetas’ means seem to converge.



Convergence diagnostic using ‘coda’ package


Before we use the diagnostics, we must TURN OUR CHAINS INTO MCMC objects.

# mcmc() is from 'coda' package
theta=mcmc(theta)  # We must turn our chains into MCMC objects

summary(theta)  # Summary statistics for the distribution

Iterations = 1:10000
Thinning interval = 1 
Number of chains = 1 
Sample size per chain = 10000 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

           Mean     SD Naive SE Time-series SE
[1,] -0.0002463 0.9831 0.009831        0.02998
[2,]  0.0052000 0.9770 0.009770        0.02962

2. Quantiles for each variable:

       2.5%     25%      50%    75% 97.5%
var1 -1.866 -0.6817 -0.02747 0.6463 1.987
var2 -1.944 -0.6377  0.01751 0.6711 1.907

Thinning interval here means keeping only every kth sample of the chain (an interval of 1 means every draw was kept).


Visual inspection using ‘coda’ package

plot(theta)  # This gives us both dense plot and traceplot


# densplot(theta)
# traceplot(theta)

Note that these plots are plotted against the variance of theta instead of the values of theta directly.








Gelman and Rubin Diagnostic









Adaptive Simulation algorithm



Let’s monitor the acceptance rate of the Metropolis algo. using adaptive simulation algo.


library(mvtnorm)  # for mvnorm

N=10000  # Increase iterations for better performance
burn.in=N/2

## data
y=c(0,0)

## correlation
rho=0.8
c=2.4/sqrt(2)  # d=2 since it's a bivariate model

## initial value
cov.var=matrix(c(1,rho,rho,1),2,2)        #cov for posterior density
cov.var.prop=matrix(c(1,rho,rho,1),2,2)   #cov for proposal density
theta=matrix(0,nrow=N,ncol=2)

theta.1_out=rep(1,N)
c_out=NULL

## iterations
for (i in 1:(N-1)){
  
  # The adaptive simulation algo. part
  if (i>1 & ((i-1)%%100==0)){  # %% gets the remainder
                               # so this is saying every 100 iterations
    ar=sum(diff(theta.1_out[(i-100):(i-1)])!=0)/99
    # ar is the sum of the nonzeros of the sequence's differences?
    
    if (ar<0.44){c=c/sqrt(2)}  # we need smaller variance
    else if (ar>0.44){c=c*sqrt(2)}  # we need larger variance
  }
  
  
  # The Metropolis algo. part
  theta.star=rmvnorm(1,mean=theta[i,],sigma=c^2*cov.var.prop)
  # taking the candidate from the proposal dstn
  # c: scaling factor
  
  U=runif(1,0,1)  
  
  ratio=dmvnorm(theta.star,mean=y,sigma=cov.var)/dmvnorm(theta[i,],mean=y,sigma=cov.var)
  alpha=min(ratio,1)
  
  if (U<=alpha){theta[i+1,]=theta.star}
  else theta[i+1,]=theta[i,]
  
  theta.1_out[i]=theta[i,1]
  c_out[i]=c
}


Visualization


dev.new()
par(mfrow=c(2,1))
plot(1:N,theta[,1],type="l")
plot(1:N,theta[,2],type="l")


dev.new()
par(mfrow=c(1,3))
plot(theta[,1],theta[,2])
hist(theta[,1])
hist(theta[,2])


Estimated means of theta1 and theta2

mean(theta[(burn.in+1):N,1])
[1] 0.06777173
mean(theta[(burn.in+1):N,2])
[1] 0.06194615

Both are pretty close to 0.


The acceptance rates after iterations

library(coda)
library(MCMCpack)

theta=mcmc(theta)  # we need to turn it into an mcmc object first
1-rejectionRate(theta) # The acceptance rate
     var1      var2 
0.4463446 0.4463446 

Both are pretty close to 0.44 which is great I guess.








Adaptive Metropolis algorithm


Ignore the “Change the ~” from the image


Also, I guess you have to use conditional post. dstn in this case?


N=10000
burn.in=N/2

## data
y=c(0,0)

## correlation
rho=0.8

## scaling factor
c1=2.4/sqrt(1)
c2=2.4/sqrt(1)
eps=0.01  # This is to prevent the variance from shrinking to zero

## initial value
sig2=1-rho^2                   # variance for posterior density
var1.prop=1                    # variance for proposal density
var2.prop=1                    # variance for proposal density

theta.1=rep(10,N)
theta.2=rep(0,N)

c1_out=NULL
c2_out=NULL


#### iterations
for (i in 1:(N-1)){
  
  # Adaptive algorithm part
  
  ## update variance of proposal
  if (i>100){
    var1.prop=var(theta.1[1:(i-1)])  
    var2.prop=var(theta.2[1:(i-1)])  
  }
  
  ## control acceptance rate every 100 iterations
  if (i>1 & ((i-1)%%100==0)){  
    ar1=sum(diff(theta.1[(i-100):(i-1)])!=0)/99
    if (ar1<0.44){c1=c1/sqrt(2)}
    else if (ar1>0.44){c1=c1*sqrt(2)}
    
    ar2=sum(diff(theta.2[(i-100):(i-1)])!=0)/99
    if (ar2<0.44){c2=c2/sqrt(2)}
    else if (ar2>0.44){c2=c2*sqrt(2)}
  }
  
  
  # Metropolis part
  
  ## for theta.1
  theta.1.star=rnorm(1,theta.1[i],c1^2*var1.prop+c1^2*eps)
  U.1=runif(1,0,1)
  
  ratio.1=dnorm(theta.1.star,y[1]+rho*(theta.2[i]-y[2]),sqrt(sig2))/dnorm(theta.1[i],y[1]+rho*(theta.2[i]-y[2]),sqrt(sig2))
  alpha.1=min(ratio.1,1)
  
  if (U.1<=alpha.1){theta.1[i+1]=theta.1.star} else theta.1[i+1]=theta.1[i]
  
  ## for theta.2
  theta.2.star=rnorm(1,theta.2[i],c2^2*var2.prop+c2^2*eps)
  U.2=runif(1,0,1)
  
  ratio.2=dnorm(theta.2.star,y[2]+rho*(theta.1[i]-y[1]),sqrt(sig2))/dnorm(theta.2[i],y[2]+rho*(theta.1[i]-y[1]),sqrt(sig2))
  alpha.2=min(ratio.2,1)
  
  if (U.2<=alpha.2){theta.2[i+1]=theta.2.star} else theta.2[i+1]=theta.2[i]
  
  c1_out[i]=c1
  c2_out[i]=c2
}


Visualizations

dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")


par(mfrow=c(1,3))
plot(theta.1,theta.2)
hist(theta.1)
hist(theta.2)


Estimated means of theta1 and theta2

mean(theta.1[(burn.in+1):N])
[1] 0.05575984
mean(theta.2[(burn.in+1):N])
[1] 0.03889123

Both are pretty close to 0


Acceptance rate after iterations

library(coda)
library(MCMCpack)

theta.1=mcmc(theta.1)
theta.2=mcmc(theta.2)
1-rejectionRate(theta.1)
     var1 
0.4528453 
1-rejectionRate(theta.2)
     var1 
0.4461446 

Both have acceptance rates pretty close to 0.44








Slice Sampler



nsim=100000  # Increase iterations for better performance

u=rep(1,nsim)
x=rep(1,nsim)

for(i in 2:nsim){
  u[i]<-runif(1,0,0.5*exp(-sqrt(x[i-1])))
  x[i]<-runif(1,0,(log(2*u[i]))^2)
}


dev.new()
par(mfrow=c(1,1))

a=seq(0,50,0.01)
fa=0.5*exp(-sqrt(a))

hist(x,freq=F,xlim=c(0,50),breaks=200)
lines(a,fa,type="l",col="red")

The red line is the actual density, and the histogram is of samples from the slice sampler. It’s pretty close indeed.








Bayesian Linear Regression model


Example 1


Loading data

library(LearnBayes)  # for the 'birdextinct' data

Attaching package: ‘LearnBayes’

The following object is masked from ‘package:VGAM’:

    laplace

The following object is masked from ‘package:MCMCpack’:

    rdirichlet

The following object is masked from ‘package:TeachingDemos’:

    triplot
data("birdextinct")
attach(birdextinct)

birdextinct

time is the y variable, and nesting, size, and status are the x variables. Note that size and status are binary.


EDA

N <- dim(birdextinct)[1]  # Sample size

logtime=log(time)  # time is very skewed and log makes it less skewed

dev.new()
par(mfrow=c(1,2))
hist(time)
hist(logtime)

Since time is very skewed, we used log. We can see that it has lessened the skewness but not much to be honest.


Frequentist linear regression

lm.fit=lm(logtime~nesting+size+status,data=birdextinct,x=TRUE,y=TRUE)
summary(lm.fit)

Call:
lm(formula = logtime ~ nesting + size + status, data = birdextinct, 
    x = TRUE, y = TRUE)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8410 -0.2932 -0.0709  0.2165  2.5167 

Coefficients:
            Estimate Std. Error t value      Pr(>|t|)    
(Intercept)  0.43087    0.20706   2.081      0.041870 *  
nesting      0.26501    0.03679   7.203 0.00000000133 ***
size        -0.65220    0.16667  -3.913      0.000242 ***
status       0.50417    0.18263   2.761      0.007712 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.6524 on 58 degrees of freedom
Multiple R-squared:  0.5982,    Adjusted R-squared:  0.5775 
F-statistic: 28.79 on 3 and 58 DF,  p-value: 0.00000000001577


Residual plot

dev.new()
par(mfrow=c(1,1))
plot(lm.fit$fitted.values,lm.fit$residuals,ylim=c(-1.5,1.5),xlab="Fitted Values",ylab="Residuals")
abline(h=0)
abline(h=2*summary(lm.fit)$sigma,lty=2)   
abline(h=-2*summary(lm.fit)$sigma,lty=2)

# sigma is the residual standard error (or residual standard deviation) which is a measure used to assess how well a linear regression model fits the data, smaller meaning the model fits the data better.

There are no violations of equal variance of error terms, or the linearity of the model. Also there seem to be no extreme outliers although there is one that is outside of the 2 times the residual standard error.




Bayesian linear regression


Let’s use blinreg() to sample from the joint post. dstn of beta and sigma_squared

theta.sample=blinreg(lm.fit$y,lm.fit$x,1000)

dev.new()
par(mfrow=c(2,2))
hist(theta.sample$beta[,2],main="Nesting",xlab=expression(beta[1]))
hist(theta.sample$beta[,3],main="Size",xlab=expression(beta[2]))
hist(theta.sample$beta[,4],main="Status",xlab=expression(beta[3]))
hist(theta.sample$sigma,main="Error SD",xlab=expression(sigma))

These are the samples from the marginal posterior distribution of each coefficient. The "Error SD" panel shows the posterior draws of the error standard deviation sigma. The distributions look roughly normal, so the mean and the median are nearly the same and either can be used as a point estimate.


Comparison of coefficients

apply(theta.sample$beta,2,quantile,c(0.025,0.5,0.975))  # 95% credible intervals
      X(Intercept)  Xnesting      Xsize   Xstatus
2.5%   0.002096505 0.1935873 -0.9666888 0.1415132
50%    0.430099242 0.2665441 -0.6566102 0.4938495
97.5%  0.816603110 0.3423739 -0.3243618 0.8661174
# We can think the median(50%) as the bayes estimates of the coefficients since they seem like they follow normal distribution
lm.fit$coefficients  # freq's estimators of coefficients
(Intercept)     nesting        size      status 
  0.4308716   0.2650140  -0.6521982   0.5041655 

We can see that both freq’s and bayes’ results are similar. This is due to bayes linear model using a noninformative prior. Bayes provides more information because we can clearly see the distribution of each variables.


Sigma comparison

quantile(theta.sample$sigma,c(0.025,0.5,0.975))  # 50% is the bayes estimate
     2.5%       50%     97.5% 
0.5566649 0.6597165 0.8047768 
summary(lm.fit)$sigma
[1] 0.6524084

We can see that sigma in both cases are similar too.




Sampling using marginal post. dstn of sigma_squared


EDA

##Determine explanatory variables
X <- as.matrix(birdextinct[,3:5])
cov.names <- names(birdextinct[,c(-1,-2)])

##Pairs plot
pairs(cbind(logtime,X))


##Add a column of 1s for the intercepts
X.int <- cbind(rep(1,N),X)

##Determine the number of explanatory variables
k <- dim(X.int)[2]


Functions to use to sample from cond. post. dstn of beta and marginal post. dstn of sigma_squared

library(MASS)  # for mvrnorm

##Function for sampling from p(beta|sigma2,X,y) : cond. post. dstn of beta
sample.beta <- function(sigma2,X,y){
  V.beta <- solve(t(X)%*%X)             
  beta.hat <- V.beta%*%t(X)%*%y
  return(mvrnorm(1,beta.hat,sigma2*V.beta))
}

#Function for sampling from p(sigma2|X,y) : marginal post. dstn of sigma_squared
sample.sigma2 <- function(X,y,N,k){
  V.beta <- solve(t(X)%*%X)            
  beta.hat <- V.beta%*%t(X)%*%y
  s2 <- t(y-X%*%beta.hat)%*%(y-X%*%beta.hat)/(N-k)
  return(1/rgamma(1,shape=(N-k)/2,rate=((N-k)/2)*s2))
}


Sampling

# Initialize variables for the posterior samples
nsim=1000
beta.post.samples = matrix(0,nsim,k)
sigma2.post.samples = rep(0,nsim)

# Iterations
for(i in 1:nsim){
  sigma2.post.samples[i] <- sample.sigma2(X.int,logtime,N,k-1)
  beta.post.samples[i,] <- sample.beta(sigma2.post.samples[i],X.int,logtime)
}


Visualization

#Examine posterior samples
dev.new()
par(mfrow=c(3,2))
hist(sigma2.post.samples)
boxplot(as.data.frame(beta.post.samples[,-1]),names=cov.names)
abline(h=0)

plot(1:nsim,sigma2.post.samples,type="l")
plot(1:nsim,beta.post.samples[,2],type="l")
plot(1:nsim,beta.post.samples[,3],type="l")
plot(1:nsim,beta.post.samples[,4],type="l")

The histogram is of sigma_squared, and the boxplot is of the betas. The traceplot is to show the convergence of the samples of sigma_squared and each beta.


The bayes estimates of sigma_squared

mean(sigma2.post.samples)
[1] 0.4288413

The bayes estimates of betas

colMeans(beta.post.samples)
[1]  0.4445125  0.2648737 -0.6600526  0.4931243

Comparison to the freq’s estimates

lm.fit$coefficients  # The betas
(Intercept)     nesting        size      status 
  0.4308716   0.2650140  -0.6521982   0.5041655 
(summary(lm.fit)$sigma)^2  # Sigma_squared
[1] 0.4256367

We can see that the Bayesian and frequentist results are similar, so our Bayesian model appears to be doing well.


Correlation map of the betas

#Look at the correlation between the betas
pairs(beta.post.samples,labels=c("Intercept",cov.names))




Sampling using the conditional dstn of simga_squared (Gibbs sampler I guess)


Functions to sample sigma_squared from the cond. post. dstn of sigma_squared and beta from the cond. post. dstn of beta (same one as above)

##Function for sampling from p(beta|sigma2,X,y) : cond. post. dstn of beta
sample.beta <- function(sigma2,X,y){
  V.beta <- solve(t(X)%*%X)             
  beta.hat <- V.beta%*%t(X)%*%y
  return(mvrnorm(1,beta.hat,sigma2*V.beta))
}

#Function for sampling from p(sigma2|beta,X,y) : marginal post. dstn of sigma_squared
sample.sigma2.cond <- function(X,y,N,beta){
  par1=N/2
  par2=0.5*t(y-X%*%beta)%*%(y-X%*%beta)
  return(1/rgamma(1,shape=par1,rate=par2))
}


Sampling

#Initialize variables for the posterior samples
nsim=1000
beta.post.samples = matrix(0,nsim,k)
sigma2.cond.post.samples = rep(0,nsim)

for(i in 1:nsim){
  beta.post.samples[i,]=sample.beta(sigma2.post.samples[i],X.int,logtime)
  sigma2.cond.post.samples[i]=sample.sigma2.cond(X.int,logtime,N,beta.post.samples[i,])
}


Visualization

dev.new()
par(mfrow=c(2,1))
hist(sigma2.cond.post.samples)

plot(1:nsim,sigma2.cond.post.samples,type="l")

These are the results of the samples from the cond. post. dstn of sigma squared


Bayes estimate of sigma_squared

mean(sigma2.post.samples)
[1] 0.4288413


Bayes estimate of betas

colMeans(beta.post.samples)
[1]  0.4240287  0.2646643 -0.6447407  0.5092205


Comparison to the freq’s estimates

lm.fit$coefficients  # The betas
(Intercept)     nesting        size      status 
  0.4308716   0.2650140  -0.6521982   0.5041655 
(summary(lm.fit)$sigma)^2  # Sigma_squared
[1] 0.4256367

Again, the Bayesian and frequentist results are similar, so our Bayesian model appears to be doing well.



Conclusion

Samples from the blinreg() are almost the same as samples using the marginal post. dstn or the samples using the cond. post. dstn, which are all in turn very similar to the freq’s result (due to using noninformative prior).





Example 2


Loading data

baby=read.table("http://www.stat.berkeley.edu/~statlabs/data/babies.data",head=TRUE)
attach(baby)


Performing bayesian linear regression using MCMCregress()

library(MCMCpack)  # for MCMCregress
library(coda)  # for mcmc

options(scipen = 1)  # to remove scientific notation

reg<-MCMCregress(bwt~age+weight,b0=0,B0=0,burnin=1000,mcmc=10000)
summary(reg)

Iterations = 1001:11000
Thinning interval = 1 
Number of chains = 1 
Sample size per chain = 10000 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

                  Mean        SD   Naive SE Time-series SE
(Intercept) 116.683971  2.283267 0.02283267     0.02279083
age           0.074110  0.080186 0.00080186     0.00080328
weight        0.005596  0.003552 0.00003552     0.00003552
sigma2      332.647666 13.449063 0.13449063     0.13449063

2. Quantiles for each variable:

                  2.5%        25%        50%        75%     97.5%
(Intercept) 112.280607 115.129890 116.674374 118.241310 121.16086
age          -0.082259   0.019546   0.074589   0.128984   0.22803
weight       -0.001408   0.003216   0.005631   0.007999   0.01249
sigma2      307.227120 323.355276 332.277383 341.446596 360.23436

We can use either the median(50%) or the mean as the bayes estimate


Visualization

dev.new()
plot(reg)

We can see that the density of the variables look like normal dstn so we can use either mean or median.


Comparison to freq.

freq=lm(bwt~age+weight,data=baby)
summary(freq)

Call:
lm(formula = bwt ~ age + weight, data = baby)

Residuals:
   Min     1Q Median     3Q    Max 
-65.06 -11.11   0.44  11.25  56.90 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 116.693894   2.294315  50.862   <2e-16 ***
age           0.074025   0.080475   0.920    0.358    
weight        0.005564   0.003514   1.584    0.114    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.23 on 1233 degrees of freedom
Multiple R-squared:  0.002871,  Adjusted R-squared:  0.001254 
F-statistic: 1.775 on 2 and 1233 DF,  p-value: 0.1699
(summary(freq)$sigma)^2
[1] 332.1512

We can see that the results are very similar








Logistic regression for binary response



Loading data

library(MASS)
data(birthwt)
attach(birthwt)
The following objects are masked from baby:

    age, bwt, smoke
head(birthwt)

‘low’ is the binary response for low birth weight


Post. dstn of beta (with prior dstn for beta as default improper uniform prior)

library(coda)  # for mcmc
library(MCMCpack)  # for MCMClogit

# default improper uniform prior
logit.1 <- MCMClogit(low~age+as.factor(race)+smoke, data=birthwt, beta.start=rep(0.7,5))
# I think you have to make categorical variables into a factor first
# However, smoke is a binary variable, and you don't have to factor a binary variable

# beta.start is the starting values for the beta vector. There are total 5 betas including the intercept: intercept, age, race2, race3, smoke

# Here, b0 and B0 values were not given => this is an improper uniform prior

summary(logit.1)

Iterations = 1001:11000
Thinning interval = 1 
Number of chains = 1 
Sample size per chain = 10000 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

                    Mean      SD  Naive SE Time-series SE
(Intercept)      -0.9883 0.89476 0.0089476       0.036991
age              -0.0382 0.03425 0.0003425       0.001416
as.factor(race)2  1.0488 0.51366 0.0051366       0.021548
as.factor(race)3  1.0972 0.42240 0.0042240       0.019524
smoke             1.1453 0.38656 0.0038656       0.015945

2. Quantiles for each variable:

                     2.5%     25%      50%      75%   97.5%
(Intercept)      -2.66847 -1.6161 -1.00037 -0.35902 0.82157
age              -0.10923 -0.0607 -0.03668 -0.01464 0.02724
as.factor(race)2  0.02845  0.6990  1.04471  1.40430 2.04175
as.factor(race)3  0.26664  0.7974  1.10880  1.38758 1.91430
smoke             0.37771  0.8948  1.13697  1.41022 1.91549

Visualization

dev.new()
plot(logit.1)


Convergence Diagnostic

logit.1<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(-0.7,5))
logit.2<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(-0.3,5))
logit.3<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(0,5))
logit.4<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(0.3,5))
logit.5<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(0.7,5))

logit.list<-mcmc.list(list(logit.1,logit.2,logit.3,logit.4,logit.5))
gelman.diag(logit.list)
Potential scale reduction factors:

                 Point est. Upper C.I.
(Intercept)               1          1
age                       1          1
as.factor(race)2          1          1
as.factor(race)3          1          1
smoke                     1          1

Multivariate psrf

1

Since the potential scale reduction factors for each variable are all below 1.1, we can say that the MCMC convergence is good for each variable.

gelman.plot(logit.list)

The plots indicate good convergence as well.


Post. dstn of beta (with prior dstn for beta as MVN(0,1000I))

(large variance => noninformative prior)

## multivariate normal prior
logit.2 <- MCMClogit(low~age+as.factor(race)+smoke, b0=0, B0=0.001,data=birthwt)
# b0 is the prior mean of the MVN
# B0 is the prior precision(= 1/prior variance) of the MVN
# This means that the prior variance we used here is 1000
# Since the prior variance is very large, this is a noninformative prior

summary(logit.2)

Iterations = 1001:11000
Thinning interval = 1 
Number of chains = 1 
Sample size per chain = 10000 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

                     Mean      SD  Naive SE Time-series SE
(Intercept)      -1.04275 0.91241 0.0091241       0.038374
age              -0.03591 0.03414 0.0003414       0.001384
as.factor(race)2  1.03163 0.49963 0.0049963       0.020275
as.factor(race)3  1.08843 0.41973 0.0041973       0.017280
smoke             1.13528 0.38761 0.0038761       0.015654

2. Quantiles for each variable:

                    2.5%      25%      50%      75%   97.5%
(Intercept)      -2.7988 -1.69138 -1.06366 -0.40923 0.79022
age              -0.1074 -0.05856 -0.03523 -0.01299 0.03033
as.factor(race)2  0.0360  0.69870  1.02284  1.37216 2.01679
as.factor(race)3  0.2908  0.80183  1.08516  1.36941 1.92019
smoke             0.3795  0.87363  1.13227  1.40031 1.92418

Visualization

plot(logit.2)








Poisson regression for count data


Loading data

data(epil)
attach(epil)
The following object is masked _by_ .GlobalEnv:

    y

The following object is masked from birthwt:

    age

The following object is masked from baby:

    age
head(epil)


Post. dstn of beta

## MCMCpack
pois=MCMCpoisson(y~lbase*trt+lage+V4,data = epil)
summary(pois)

Iterations = 1001:11000
Thinning interval = 1 
Number of chains = 1 
Sample size per chain = 10000 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

                      Mean      SD  Naive SE Time-series SE
(Intercept)         1.8952 0.04213 0.0004213       0.001910
lbase               0.9481 0.04234 0.0004234       0.001898
trtprogabide       -0.3494 0.06029 0.0006029       0.002746
lage                0.8948 0.11668 0.0011668       0.005371
V4                 -0.1578 0.05384 0.0005384       0.002402
lbase:trtprogabide  0.5657 0.06225 0.0006225       0.002771

2. Quantiles for each variable:

                      2.5%     25%     50%     75%    97.5%
(Intercept)         1.8130  1.8680  1.8966  1.9221  1.97764
lbase               0.8648  0.9192  0.9490  0.9756  1.03506
trtprogabide       -0.4742 -0.3864 -0.3465 -0.3100 -0.23288
lage                0.6695  0.8168  0.8912  0.9689  1.14595
V4                 -0.2682 -0.1920 -0.1567 -0.1221 -0.05663
lbase:trtprogabide  0.4407  0.5227  0.5664  0.6059  0.68952

Visualization

dev.new()
plot(pois)


Comparison to the Freq. result

summary(glm(y ~ lbase*trt + lage + V4, family = poisson, data = epil), cor = TRUE)

Call:
glm(formula = y ~ lbase * trt + lage + V4, family = poisson, 
    data = epil)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-5.0915  -1.4126  -0.2739   0.7580  10.7711  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)         1.89791    0.04260  44.552  < 2e-16 ***
lbase               0.94862    0.04360  21.759  < 2e-16 ***
trtprogabide       -0.34588    0.06100  -5.670 1.42e-08 ***
lage                0.88760    0.11650   7.619 2.56e-14 ***
V4                 -0.15977    0.05458  -2.927  0.00342 ** 
lbase:trtprogabide  0.56154    0.06352   8.841  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 2517.83  on 235  degrees of freedom
Residual deviance:  869.07  on 230  degrees of freedom
AIC: 1647

Number of Fisher Scoring iterations: 5

Correlation of Coefficients:
                   (Intercept) lbase trtprogabide lage  V4   
lbase              -0.56                                     
trtprogabide       -0.63        0.39                         
lage               -0.18       -0.01  0.06                   
V4                 -0.28        0.00  0.00         0.00      
lbase:trtprogabide  0.33       -0.69 -0.60         0.32  0.00

We can see that the results are very similar.




---
title: "Bayesian Statistics - pt.2"
output:
  html_notebook:
    toc: yes
---

\
\
\
\

# Convergence Diagnostics

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/ss2.png)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0195.jpeg)

\

```{r}
# install.packages("geoR")
library(coda)  # Our main convergence diagnostic package
library(geoR)
library(MCMCpack)
```

\

### Running Metropolis algorithm

```{r}
library(mvtnorm)

N <- 10000        # chain length
burn.in <- N / 2  # first half is discarded as burn-in in later chunks

# data: the target (posterior) is a bivariate normal centred at y
y <- c(0, 0)

# correlation between the two components
rho <- 0.8

# initial values
cov.var      <- matrix(c(1, rho, rho, 1), 2, 2)  # cov for posterior density
cov.var.prop <- matrix(c(1, rho, rho, 1), 2, 2)  # cov for proposal density
theta <- matrix(0, nrow = N, ncol = 2)

# Metropolis iterations
for (i in 1:(N - 1)) {
  # candidate drawn from the symmetric Gaussian proposal centred at the current state
  theta.star <- rmvnorm(1, mean = theta[i, ], sigma = cov.var.prop)
  # auxiliary uniform for the accept/reject decision
  U <- runif(1, 0, 1)

  # ratio of target densities (the symmetric proposal cancels out)
  ratio <- dmvnorm(theta.star, mean = y, sigma = cov.var) /
    dmvnorm(theta[i, ], mean = y, sigma = cov.var)

  # probability of move
  alpha <- min(ratio, 1)

  # accept the candidate with probability alpha, otherwise repeat the state
  if (U <= alpha) {
    theta[i + 1, ] <- theta.star
  } else {
    theta[i + 1, ] <- theta[i, ]
  }
}
```

\

### Visual inspection: Traceplot

```{r}
# Traceplots: sampled value at each iteration, one panel per component.
# A chain that wanders freely (does not get stuck) suggests good mixing.
dev.new()
par(mfrow=c(2,1))  # stack the two traceplots vertically
plot(1:N,theta[,1],type="l")  # trace of component 1
plot(1:N,theta[,2],type="l")  # trace of component 2
```

#### We can see that the chains don't get stuck in certain areas. So these are kind of good I guess.

\

### Visual inspection: Running mean plot

```{r}
# Running-mean plot: the mean of all draws up to each iteration should
# stabilise as the chain converges.
#
# The original loop recomputed mean(theta[1:i, ]) at every i, which is
# O(N^2) overall; cumsum() divided by the iteration index gives the same
# running means in O(N).
mean.th1 <- cumsum(theta[, 1]) / seq_len(N)  # running mean, component 1
mean.th2 <- cumsum(theta[, 2]) / seq_len(N)  # running mean, component 2

dev.new()
par(mfrow = c(2, 1))
plot(1:N, mean.th1, type = "l")
plot(1:N, mean.th2, type = "l")
```

#### Both thetas' means seem to converge.

\
\

### Convergence diagnostic using 'coda' package

\

#### Before we use the diagnostics, we must TURN OUR CHAINS INTO MCMC objects.

```{r}
# mcmc() is from 'coda' package
# Wrapping the raw matrix of draws as an mcmc object lets coda's
# diagnostics (summary, plot, rejectionRate, ...) treat it as a chain.
theta=mcmc(theta)  # We must turn our chains into MCMC objects

summary(theta)  # Summary statistics: means, SDs, SEs, and quantiles per column
```

#### Thinning interval here means keeping only every kth sample of the chain (an interval of 1 means every draw was kept).

\

### Visual inspection using 'coda' package

```{r}
plot(theta)  # plot on an mcmc object gives both a density plot and a traceplot

# densplot(theta)   # density plot only
# traceplot(theta)  # traceplot only
```

#### Note that these plots are plotted against the variance of theta instead of the values of theta directly.

\
\
\
\
\
\
\

# Gelman and Rubin Diagnostic

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0296.jpeg)

\
\
\
\
\
\
\

# Adaptive Simulation algorithm

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0301.jpeg)

\

#### Let's monitor the acceptance rate of the Metropolis algo. using adaptive simulation algo.

\

```{r}
library(mvtnorm)  # for rmvnorm/dmvnorm

N <- 10000  # number of iterations (more iterations -> better performance)
burn.in <- N / 2

## data (mean of the bivariate normal target)
y <- c(0, 0)

## correlation of the target
rho <- 0.8

## proposal scaling factor, 2.4/sqrt(d) with d = 2 for a bivariate model.
## Named c.scale (the original used `c`, which shadows base::c).
c.scale <- 2.4 / sqrt(2)

## initial values
cov.var      <- matrix(c(1, rho, rho, 1), 2, 2)  # cov for posterior density
cov.var.prop <- matrix(c(1, rho, rho, 1), 2, 2)  # cov for proposal density
theta <- matrix(0, nrow = N, ncol = 2)

theta.1_out <- rep(1, N)   # record of component 1, used to estimate the acceptance rate
c_out <- numeric(N - 1)    # record of the scaling factor (preallocated, not grown)

## iterations
for (i in seq_len(N - 1)) {

  # Adaptive part: every 100 iterations, estimate the recent acceptance
  # rate and rescale the proposal so the rate moves toward the 0.44 target.
  # (scalar condition, so && rather than the elementwise &)
  if (i > 1 && (i - 1) %% 100 == 0) {
    # A rejected move repeats the previous state, so a nonzero difference
    # between consecutive recorded states marks an accepted move; there are
    # 99 transitions among the 100 states in the window.
    ar <- sum(diff(theta.1_out[(i - 100):(i - 1)]) != 0) / 99

    if (ar < 0.44) {
      c.scale <- c.scale / sqrt(2)  # accepting too rarely: shrink the proposal
    } else if (ar > 0.44) {
      c.scale <- c.scale * sqrt(2)  # accepting too often: widen the proposal
    }
  }

  # Metropolis part: symmetric Gaussian proposal scaled by c.scale
  theta.star <- rmvnorm(1, mean = theta[i, ], sigma = c.scale^2 * cov.var.prop)

  U <- runif(1, 0, 1)

  ratio <- dmvnorm(theta.star, mean = y, sigma = cov.var) /
    dmvnorm(theta[i, ], mean = y, sigma = cov.var)
  alpha <- min(ratio, 1)

  if (U <= alpha) {
    theta[i + 1, ] <- theta.star
  } else {
    theta[i + 1, ] <- theta[i, ]
  }

  theta.1_out[i] <- theta[i, 1]
  c_out[i] <- c.scale
}
```

\

### Visualization

\

```{r}
# Traceplots of the adaptive-scaling chain, one panel per component.
dev.new()
par(mfrow=c(2,1))  # two stacked panels
plot(1:N,theta[,1],type="l")  # trace of component 1
plot(1:N,theta[,2],type="l")  # trace of component 2
```

\

```{r}
# Joint scatter of the two components plus a marginal histogram for each.
dev.new()
par(mfrow=c(1,3))  # three panels side by side
plot(theta[,1],theta[,2])  # joint draws; should show the rho = 0.8 correlation
hist(theta[,1])
hist(theta[,2])
```

\

### Estimated means of theta1 and theta2

```{r}
# Posterior mean estimates after discarding the first burn.in draws;
# both should be close to the target mean y = (0, 0).
mean(theta[(burn.in+1):N,1])
mean(theta[(burn.in+1):N,2])
```

#### Both are pretty close to 0.

\

### The acceptance rates after iterations

```{r}
library(coda)
library(MCMCpack)

theta=mcmc(theta)  # we need to turn it into an mcmc object first
# rejectionRate() gives the fraction of repeated (rejected) states,
# so its complement is the acceptance rate; target is roughly 0.44.
1-rejectionRate(theta) # The acceptance rate
```

#### Both are pretty close to 0.44 which is great I guess.

\
\
\
\
\
\
\

# Adaptive Metropolis algorithm

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0299.jpeg)

#### Ignore the "Change the ~" from the image

\

#### Also, I guess you have to use conditional post. dstn in this case?

\

```{r}
N <- 10000
burn.in <- N / 2

## data (mean of the bivariate normal target)
y <- c(0, 0)

## correlation of the target
rho <- 0.8

## scaling factors, 2.4/sqrt(d) with d = 1 (each component is updated alone)
c1 <- 2.4 / sqrt(1)
c2 <- 2.4 / sqrt(1)
eps <- 0.01  # keeps the adaptive proposal variance away from zero

## initial values
sig2 <- 1 - rho^2  # conditional variance of one component given the other
var1.prop <- 1     # variance for proposal density, component 1
var2.prop <- 1     # variance for proposal density, component 2

theta.1 <- rep(10, N)  # deliberately far-off start, to make the burn-in visible
theta.2 <- rep(0, N)

c1_out <- numeric(N - 1)  # preallocated (original grew these inside the loop)
c2_out <- numeric(N - 1)

#### iterations
for (i in seq_len(N - 1)) {

  # Adaptive algorithm part

  ## update the proposal variances from the chain history (after a warm-up)
  if (i > 100) {
    var1.prop <- var(theta.1[1:(i - 1)])
    var2.prop <- var(theta.2[1:(i - 1)])
  }

  ## every 100 iterations, steer the acceptance rate toward the 0.44 target
  ## (scalar condition, so && rather than the elementwise &)
  if (i > 1 && (i - 1) %% 100 == 0) {
    # a nonzero difference between consecutive states marks an accepted move;
    # 99 transitions occur among the 100 states in the window
    ar1 <- sum(diff(theta.1[(i - 100):(i - 1)]) != 0) / 99
    if (ar1 < 0.44) {
      c1 <- c1 / sqrt(2)
    } else if (ar1 > 0.44) {
      c1 <- c1 * sqrt(2)
    }

    ar2 <- sum(diff(theta.2[(i - 100):(i - 1)]) != 0) / 99
    if (ar2 < 0.44) {
      c2 <- c2 / sqrt(2)
    } else if (ar2 > 0.44) {
      c2 <- c2 * sqrt(2)
    }
  }


  # Metropolis part: each component targets its full conditional
  # N(y_j + rho * (theta_k - y_k), sig2).

  ## for theta.1
  ## FIX: rnorm's third argument is a STANDARD DEVIATION. The intended
  ## proposal variance is c1^2 * var1.prop + c1^2 * eps, so we pass its
  ## square root; the original passed the variance itself as the sd.
  theta.1.star <- rnorm(1, theta.1[i], sqrt(c1^2 * (var1.prop + eps)))
  U.1 <- runif(1, 0, 1)

  ratio.1 <- dnorm(theta.1.star, y[1] + rho * (theta.2[i] - y[2]), sqrt(sig2)) /
    dnorm(theta.1[i], y[1] + rho * (theta.2[i] - y[2]), sqrt(sig2))
  alpha.1 <- min(ratio.1, 1)

  if (U.1 <= alpha.1) theta.1[i + 1] <- theta.1.star else theta.1[i + 1] <- theta.1[i]

  ## for theta.2
  ## NOTE(review): this conditions on theta.1[i], not the freshly updated
  ## theta.1[i + 1] as a systematic-scan update would — confirm intended.
  theta.2.star <- rnorm(1, theta.2[i], sqrt(c2^2 * (var2.prop + eps)))
  U.2 <- runif(1, 0, 1)

  ratio.2 <- dnorm(theta.2.star, y[2] + rho * (theta.1[i] - y[1]), sqrt(sig2)) /
    dnorm(theta.2[i], y[2] + rho * (theta.1[i] - y[1]), sqrt(sig2))
  alpha.2 <- min(ratio.2, 1)

  if (U.2 <= alpha.2) theta.2[i + 1] <- theta.2.star else theta.2[i + 1] <- theta.2[i]

  c1_out[i] <- c1
  c2_out[i] <- c2
}
```

\

### Visualizations

```{r}
# Traceplots of the adaptively tuned chains; theta.1 is initialized at 10,
# so its trace should drop toward 0 and then mix.
dev.new()
par(mfrow=c(2,1))
plot(1:N,theta.1,type="l")
plot(1:N,theta.2,type="l")
```

\

```{r}
# Joint scatter and marginal histograms of the adaptive-Metropolis draws.
par(mfrow=c(1,3))
plot(theta.1,theta.2)
hist(theta.1)
hist(theta.2)
```

\

### Estimated means of theta1 and theta2

```{r}
# Post-burn-in posterior means; both should be close to y = (0, 0).
mean(theta.1[(burn.in+1):N])
mean(theta.2[(burn.in+1):N])
```

#### Both are pretty close to 0

\

### Acceptance rate after iterations

```{r}
library(coda)
library(MCMCpack)

# Wrap each chain as an mcmc object, then convert coda's rejection rate
# into an acceptance rate (1 - rejection); the adaptation targets 0.44.
theta.1=mcmc(theta.1)
theta.2=mcmc(theta.2)
1-rejectionRate(theta.1)
1-rejectionRate(theta.2)
```

#### Both have acceptance rates pretty close to 0.44

\
\
\
\
\
\
\

# Slice Sampler

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0300.jpeg)

\

```{r}
# Slice sampler for the density f(x) proportional to exp(-sqrt(x)), x > 0.
# Alternates two uniform draws: u | x ~ U(0, f(x)) (vertical step) and
# x | u ~ U over the slice {x : f(x) > u} (horizontal step).
nsim <- 100000  # Increase iterations for better performance

u <- rep(1, nsim)
x <- rep(1, nsim)

for (i in 2:nsim) {
  # vertical step: uniform height under the curve at the previous x
  slice.height <- 0.5 * exp(-sqrt(x[i - 1]))
  u[i] <- runif(1, 0, slice.height)
  # horizontal step: f(x) > u[i]  <=>  x < (log(2 * u[i]))^2
  x[i] <- runif(1, 0, (log(2 * u[i]))^2)
}
```

\

```{r}
# Overlay the (unnormalized) target density f(a) = 0.5*exp(-sqrt(a)),
# drawn in red, on the histogram of the slice-sampler draws.
dev.new()
par(mfrow=c(1,1))

a=seq(0,50,0.01)
fa=0.5*exp(-sqrt(a))

hist(x,freq=F,xlim=c(0,50),breaks=200)
lines(a,fa,type="l",col="red")
```

#### The red line is the actual density, and the histogram is of samples from the slice sampler. It's pretty close indeed.

\
\
\
\
\
\
\

# Bayesian Linear Regression model

\

## Example 1

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0377.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0378.jpeg)

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0379.jpeg)

\

### Loading data

```{r}
library(LearnBayes)  # for the 'birdextinct' data

data("birdextinct")
# NOTE(review): attach() puts the columns on the search path; later
# chunks rely on bare names like 'time', 'nesting', 'size', 'status'.
attach(birdextinct)

birdextinct
```

#### time is the y variable, and nesting, size, and status are the x variables. Note that size and status are binary.

\

### EDA

```{r}
N <- dim(birdextinct)[1]  # Sample size

# time is right-skewed, so the regression below models log(time)
logtime=log(time)  # time is very skewed and log makes it less skewed

dev.new()
par(mfrow=c(1,2))
hist(time)
hist(logtime)
```

#### Since time is very skewed, we used log. We can see that it has lessened the skewness but not much to be honest.

\

### Frequentist linear regression

```{r}
# Frequentist fit kept for later comparison; x=TRUE/y=TRUE store the
# design matrix and response inside the fit so blinreg() can reuse them.
lm.fit=lm(logtime~nesting+size+status,data=birdextinct,x=TRUE,y=TRUE)
summary(lm.fit)
```

\

### Residual plot

```{r}
# Residuals vs fitted values with +/- 2 residual-SD reference bands
# (dashed) to spot heteroscedasticity, nonlinearity, and outliers.
dev.new()
par(mfrow=c(1,1))
plot(lm.fit$fitted.values,lm.fit$residuals,ylim=c(-1.5,1.5),xlab="Fitted Values",ylab="Residuals")
abline(h=0)
abline(h=2*summary(lm.fit)$sigma,lty=2)   
abline(h=-2*summary(lm.fit)$sigma,lty=2)
# sigma is the residual standard error (or residual standard deviation) which is a measure used to assess how well a linear regression model fits the data, smaller meaning the model fits the data better.
```

There are no violations of equal variance of error terms, or the linearity of the model. Also there seem to be no extreme outliers although there is one that is outside of the 2 times the residual standard error.

\
\
\

### Bayesian linear regression

\

#### Let's use blinreg() to sample from the joint post. dstn of beta and sigma_squared

```{r}
# blinreg() draws from the joint posterior of (beta, sigma) under the
# standard noninformative prior; 1000 draws using the stored y and X.
theta.sample=blinreg(lm.fit$y,lm.fit$x,1000)

# Marginal posterior histograms; beta column 1 is the intercept,
# so columns 2-4 are nesting, size, status.
dev.new()
par(mfrow=c(2,2))
hist(theta.sample$beta[,2],main="Nesting",xlab=expression(beta[1]))
hist(theta.sample$beta[,3],main="Size",xlab=expression(beta[2]))
hist(theta.sample$beta[,4],main="Status",xlab=expression(beta[3]))
hist(theta.sample$sigma,main="Error SD",xlab=expression(sigma))
```

#### These are the samples from marginal posterior distribution of each variable. Error SD is the error terms calculated from the samples(?). They look like normal distribution so mean and median are pretty much the same, so we can choose either.

\

### Comparison of coefficients

```{r}
# Column-wise posterior quantiles of the sampled betas.
apply(theta.sample$beta,2,quantile,c(0.025,0.5,0.975))  # 95% credible intervals
# We can think the median(50%) as the bayes estimates of the coefficients since they seem like they follow normal distribution
```

```{r}
# Frequentist point estimates, for side-by-side comparison.
lm.fit$coefficients  # freq's estimators of coefficients
```


#### We can see that both freq's and bayes' results are similar. This is due to bayes linear model using a noninformative prior. Bayes provides more information because we can clearly see the distribution of each variables.

\

### Sigma comparison

```{r}
# Posterior quantiles of the error-SD draws.
quantile(theta.sample$sigma,c(0.025,0.5,0.975))  # 50% is the bayes estimate
```

```{r}
# Frequentist residual standard error, for comparison.
summary(lm.fit)$sigma
```

#### We can see that sigma in both cases are similar too.

\
\
\

## Sampling using marginal post. dstn of sigma_squared

\

### EDA

```{r}
##Determine explanatory variables
X <- as.matrix(birdextinct[,3:5])
cov.names <- names(birdextinct[,c(-1,-2)])

##Pairs plot
pairs(cbind(logtime,X))

##Add a column of 1s for the intercepts
X.int <- cbind(rep(1,N),X)

##Determine the number of explanatory variables
# NOTE: k counts the intercept column too, so the residual degrees of
# freedom of the model are N - k.
k <- dim(X.int)[2]
```

\

### Functions to use to sample from cond. post. dstn of beta and marginal post. dstn of sigma_squared

```{r}
library(MASS)  # for mvrnorm

## Draw one beta from p(beta | sigma2, X, y), which under the standard
## noninformative prior is N(beta.hat, sigma2 * (X'X)^{-1}).
sample.beta <- function(sigma2,X,y){
  xtx.inv <- solve(t(X)%*%X)
  post.mean <- xtx.inv%*%t(X)%*%y   # the OLS estimate beta.hat
  mvrnorm(1,post.mean,sigma2*xtx.inv)
}

## Draw one sigma2 from its marginal posterior p(sigma2 | X, y):
## inverse-gamma with shape (N-k)/2 and rate (N-k)*s2/2, where s2 is the
## classical unbiased residual-variance estimate.
sample.sigma2 <- function(X,y,N,k){
  xtx.inv <- solve(t(X)%*%X)
  b.hat <- xtx.inv%*%t(X)%*%y
  resid <- y-X%*%b.hat
  s2 <- t(resid)%*%resid/(N-k)
  half.df <- (N-k)/2
  1/rgamma(1,shape=half.df,rate=half.df*s2)
}
```

\

### Sampling

```{r}
# Initialize variables for the posterior samples
nsim=1000
beta.post.samples = matrix(0,nsim,k)
sigma2.post.samples = rep(0,nsim)

# Composition sampling: sigma2 from its marginal posterior, then beta
# from its conditional posterior given that sigma2.
for(i in 1:nsim){
  # BUG FIX: X.int has k columns (intercept included), so the residual
  # degrees of freedom are N - k; the original passed k - 1, which
  # slightly inflated the degrees of freedom and understated sigma2.
  sigma2.post.samples[i] <- sample.sigma2(X.int,logtime,N,k)
  beta.post.samples[i,] <- sample.beta(sigma2.post.samples[i],X.int,logtime)
}
```

\

### Visualization

```{r}
#Examine posterior samples
dev.new()
par(mfrow=c(3,2))
hist(sigma2.post.samples)
# Boxplots of the slope coefficients (intercept column dropped);
# the h=0 line marks "no effect".
boxplot(as.data.frame(beta.post.samples[,-1]),names=cov.names)
abline(h=0)

# Traceplots to eyeball convergence of each parameter's draws.
plot(1:nsim,sigma2.post.samples,type="l")
plot(1:nsim,beta.post.samples[,2],type="l")
plot(1:nsim,beta.post.samples[,3],type="l")
plot(1:nsim,beta.post.samples[,4],type="l")
```

#### The histogram is of sigma_squared, and the boxplot is of the betas. The traceplot is to show the convergence of the samples of sigma_squared and each beta.

\

### The bayes estimates of sigma_squared

```{r}
# Posterior mean of sigma^2 from the marginal-composition sampler.
mean(sigma2.post.samples)
```

### The bayes estimates of betas

```{r}
# Posterior means of the regression coefficients (intercept first).
colMeans(beta.post.samples)
```

### Comparison to the freq's estimates

```{r}
# Frequentist coefficient estimates, for comparison.
lm.fit$coefficients  # The betas
```

```{r}
# Frequentist estimate of the error variance, for comparison.
(summary(lm.fit)$sigma)^2  # Sigma_squared
```

#### We can see that the Bayesian and frequentist results are similar, which suggests our Bayesian model is behaving sensibly.


\

### Correlation map of the betas

```{r}
# Pairwise scatterplots of the posterior draws show the posterior
# correlation between the coefficients.
#Look at the correlation between the betas
pairs(beta.post.samples,labels=c("Intercept",cov.names))
```

\
\
\

## Sampling using the conditional dstn of sigma_squared (Gibbs sampler)

\

### Functions to sample sigma_squared from the cond. post. dstn of sigma_squared and beta from the cond. post. dstn of beta (same one as above)

```{r}
##Function for sampling from p(beta|sigma2,X,y): the conditional
##posterior N(beta.hat, sigma2 * (X'X)^{-1}).
sample.beta <- function(sigma2,X,y){
  xtx.inv <- solve(t(X)%*%X)
  post.mean <- xtx.inv%*%t(X)%*%y   # OLS estimate beta.hat
  mvrnorm(1,post.mean,sigma2*xtx.inv)
}

##Function for sampling from p(sigma2|beta,X,y): inverse-gamma with
##shape N/2 and rate equal to half the residual sum of squares.
sample.sigma2.cond <- function(X,y,N,beta){
  resid <- y-X%*%beta
  shape.par <- N/2
  rate.par <- 0.5*t(resid)%*%resid
  1/rgamma(1,shape=shape.par,rate=rate.par)
}
```

\

### Sampling

```{r}
#Initialize variables for the posterior samples
nsim=1000
beta.post.samples = matrix(0,nsim,k)
sigma2.cond.post.samples = rep(0,nsim)

# Gibbs sampler: alternate draws from the two full conditionals.
# BUG FIX: the original conditioned beta on sigma2.post.samples[i]
# (draws left over from the previous, marginal sampler), so the two
# updates never formed a Gibbs chain. Condition each beta draw on this
# chain's most recent sigma2 draw instead.
sigma2.current=1  # starting value for sigma2
for(i in 1:nsim){
  beta.post.samples[i,]=sample.beta(sigma2.current,X.int,logtime)
  sigma2.cond.post.samples[i]=sample.sigma2.cond(X.int,logtime,N,beta.post.samples[i,])
  sigma2.current=sigma2.cond.post.samples[i]
}
```

\

### Visualization

```{r}
# Histogram and traceplot of the sigma^2 draws from this sampler.
dev.new()
par(mfrow=c(2,1))
hist(sigma2.cond.post.samples)

plot(1:nsim,sigma2.cond.post.samples,type="l")
```

#### These are the results of the samples from the cond. post. dstn of sigma squared

\

### Bayes estimate of sigma_squared

```{r}
# BUG FIX: report the estimate from THIS sampler's draws. The original
# averaged sigma2.post.samples, which are the earlier marginal sampler's
# draws, not the conditional (Gibbs) draws produced in this section.
mean(sigma2.cond.post.samples)
```

\

### Bayes estimate of betas

```{r}
# Posterior means of the betas from this sampler.
colMeans(beta.post.samples)
```

\

### Comparison to the freq's estimates

```{r}
# Frequentist coefficient estimates, for comparison.
lm.fit$coefficients  # The betas
```

```{r}
# Frequentist error-variance estimate, for comparison.
(summary(lm.fit)$sigma)^2  # Sigma_squared
```

#### Again, the Bayesian and frequentist results are similar, so our Bayesian model appears to be doing well.

\
\

### Conclusion

#### Samples from the blinreg() are almost the same as samples using the marginal post. dstn or the samples using the cond. post. dstn, which are all in turn very similar to the freq's result (due to using noninformative prior).


\
\
\
\

## Example 2

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/IMG_0380.jpeg)

\

### Loading data

```{r}
# Load the Berkeley babies data over HTTP.
# NOTE(review): this is a plain http:// URL and may move or disappear;
# consider caching the file locally.
baby=read.table("http://www.stat.berkeley.edu/~statlabs/data/babies.data",head=TRUE)
attach(baby)
```

\

### Performing bayesian linear regression using MCMCregress()

```{r}
library(MCMCpack)  # for MCMCregress
library(coda)  # for mcmc

options(scipen = 1)  # to remove scientific notation

# b0/B0 are the prior mean and precision of beta; B0 = 0 gives an
# improper flat prior. 1000 burn-in iterations, then 10000 kept draws.
reg<-MCMCregress(bwt~age+weight,b0=0,B0=0,burnin=1000,mcmc=10000)
summary(reg)
```

#### We can use either the median(50%) or the mean as the bayes estimate

\

### Visualization

```{r}
# Trace and density plots for each parameter of the regression.
dev.new()
plot(reg)
```

#### We can see that the density of the variables look like normal dstn so we can use either mean or median.

\

### Comparison to freq.

```{r}
# Frequentist fit of the same model, for comparison.
freq=lm(bwt~age+weight,data=baby)
summary(freq)
```

```{r}
# Frequentist error-variance estimate.
(summary(freq)$sigma)^2
```

#### We can see that the results are very similar

\
\
\
\
\
\
\

# Logistic regression for binary response

\

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/ss3.png)

\

### Loading data

```{r}
library(MASS)
data(birthwt)
# NOTE(review): attach() exposes the columns as bare names (low, age,
# race, smoke) used by the chunks below.
attach(birthwt)
head(birthwt)
```

#### 'low' is the binary response for low birth weight

\

### Post. dstn of beta (with prior dstn for beta as default improper uniform prior)

```{r}
library(coda)  # for mcmc
library(MCMCpack)  # for MCMClogit

# Bayesian logistic regression for the binary response 'low'.
# default improper uniform prior
logit.1 <- MCMClogit(low~age+as.factor(race)+smoke, data=birthwt, beta.start=rep(0.7,5))
# I think you have to make categorical variables into a factor first
# However, smoke is a binary variable, and you don't have to factor a binary variable

# beta.start is the starting values for the beta vector. There are total 5 betas including the intercept: intercept, age, race2, race3, smoke

# Here, b0 and B0 values were not given => this is an improper uniform prior

summary(logit.1)
```

### Visualization

```{r}
# Trace and density plots of the sampled coefficients.
dev.new()
plot(logit.1)
```

\

### Convergence Diagnostic

```{r}
# Run five chains from dispersed starting values and compute the
# Gelman-Rubin potential scale reduction factor (values near 1 = good).
logit.1<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(-0.7,5))
logit.2<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(-0.3,5))
logit.3<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(0,5))
logit.4<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(0.3,5))
logit.5<-MCMClogit(low~age+as.factor(race)+smoke, data=birthwt,beta.start=rep(0.7,5))

# gelman.diag() needs an mcmc.list of the parallel chains
logit.list<-mcmc.list(list(logit.1,logit.2,logit.3,logit.4,logit.5))
gelman.diag(logit.list)
```

#### Since the potential scale reduction factors for each variable are all below 1.1, we can say that the MCMC convergence is good for each variable.

```{r}
# Evolution of the shrink factor as the chains lengthen.
gelman.plot(logit.list)
```

#### The plots indicate good convergence as well.

\

### Post. dstn of beta (with prior dstn for beta as MVN(0,1000I)) 

#### (large variance => noninformative prior)

```{r}
## multivariate normal prior
# NOTE(review): this reassigns logit.2, overwriting the chain created in
# the convergence-diagnostics chunk above.
logit.2 <- MCMClogit(low~age+as.factor(race)+smoke, b0=0, B0=0.001,data=birthwt)
# b0 is the prior mean of the MVN
# B0 is the prior precision(= 1/prior variance) of the MVN
# This means that the prior variance we used here is 1000
# Since the prior variance is very large, this is a noninformative prior

summary(logit.2)
```

### Visualization

```{r}
# Trace and density plots for the normal-prior fit.
plot(logit.2)
```

\
\
\
\
\
\
\

# Poisson regression for count data

![](/Users/jaeyonglee/Documents/College/Bachelor/4y_2s/bayes/R/images/ss4.png)

\

### Loading data

```{r}
# Epileptic-seizure data from MASS; y is the count response modeled
# with Poisson regression below.
data(epil)
# NOTE(review): attach() exposes the columns as bare names for the chunks below.
attach(epil)
head(epil)
```

\

### Post. dstn of beta

```{r}
## MCMCpack
# Bayesian Poisson regression; lbase*trt expands to both main effects
# plus their interaction.
pois=MCMCpoisson(y~lbase*trt+lage+V4,data = epil)
summary(pois)
```

### Visualization

```{r}
# Trace and density plots of the Poisson-regression coefficients.
dev.new()
plot(pois)
```

\

### Comparison to the Freq. result

```{r}
# Frequentist GLM fit of the same model, for comparison.
summary(glm(y ~ lbase*trt + lage + V4, family = poisson, data = epil), cor = TRUE)
```

#### We can see that the results are very similar.

\
\
\
