Parameter Learning with Simple Bayesian Network; PyMC3 vs. Edward; Edward posteriors not converging around correct parameter values


#1

Hello Edward community!

I have seen a few posts on here about doing inference on bayesian networks with Edward, but none have addressed parameter learning.

To test this out, I am working with a VERY simple BN; a subgraph of the canonical wet grass BN :smirk:. In particular, I’m just looking to estimate cloudy and rain parameters from some data that I simulated. Parameter learning in this case just boils down to sampling the posterior distributions of 3 Bernoulli parameters.

My Primary Issue:
I coded up this situation in both PyMC3 and Edward. The posterior PyMC3 distributions found using the Metropolis sampler for the parameter looked very similar to the sampling distributions for the maximum likelihood estimates (as expected given uniform priors). But in Edward, using the Metropolis Hastings algorithm yields posterior distributions that do not even approximately converge around the correct parameter values. Specifically, Edward keeps coming up with posteriors centered around 0.5 for all parameters, despite the underlying parameters not being 0.5 (see parameter specifications in code below).

I’m not sure why this discrepancy is occurring. If anyone has any ideas, I’d love to hear them! I have a feeling it’s a result of my not fulling understanding how to specify the model with the Empirical class. Please see the code below.

import pandas as pd
import numpy as np
from scipy import stats
import pymc3 as pm
import edward as ed
import tensorflow as tf
from edward import models

### generate data
p_c_real = 0.5 #probability of rain
p_c_0_real = 0.2 #probability of rain given not cloudy
p_c_1_real = 0.8 #probability of rain given cloudy
N = 100 #number of data points

def rain(is_cloudy, p_c_0 = p_c_0_real, p_c_1=p_c_1_real):
    if is_cloudy==0:
        return stats.bernoulli.rvs(p=p_c_0)
    elif is_cloudy==1:
        return stats.bernoulli.rvs(p=p_c_1)

#create cloud observations
cloud_ar = stats.bernoulli.rvs(p=p_c_real, size = N)

#make into dataframe
simple_data = pd.DataFrame(dict(cloudy=cloud_ar))

#add the rain column
simple_data['rain'] = simple_data.cloudy.map(rain)

#generate filters for clouds
cloudy_0 = simple_data.cloudy==0
cloudy_1 = simple_data.cloudy==1

### PyMC3 implementation
with pm.Model() as simple_bn:
    #priors
    p_c = pm.Uniform('p_c', 0, 1)
    p_c_0 = pm.Uniform('p_c_0', 0, 1)
    p_c_1 = pm.Uniform('p_c_1', 0, 1)
    
    #likelihoods
    C = pm.Bernoulli('C', p = p_c, observed = simple_data.cloudy.values)
    R_C_0 = pm.Bernoulli('R_C_0', p = p_c_0, observed = simple_data.loc[cloudy_0, 'rain'].values)
    R_C_1 = pm.Bernoulli('R_C_1', p = p_c_1, observed = simple_data.loc[cloudy_1, 'rain'].values)
    
    #inference!
    stepper = pm.Metropolis(vars = [p_c, p_c_0, p_c_1])
    trace = pm.sample(3000, step = stepper)


### Edward implementation
#set up priors
p_c = models.Uniform(name = 'p_c', low = 0.0, high = 1.0)
p_c_0 = models.Uniform(name = 'p_c_0', low = 0.0, high = 1.0)
p_c_1 = models.Uniform(name = 'p_c_1', low = 0.0, high = 1.0)

#likelihoods
C = models.Bernoulli(name = 'C', probs = p_c, sample_shape = simple_data.shape[0])
R_C_0 = models.Bernoulli(name = 'R_C_0', probs = p_c_0, 
                         sample_shape = simple_data.loc[simple_data.cloudy==0].shape[0])
R_C_1 = models.Bernoulli(name = 'R_C_1', probs  = p_c_1, 
                         sample_shape = simple_data.loc[simple_data.cloudy==1].shape[0])

#proposal distribution for metropolis hastings 
propose_dist_1 = models.Normal(loc = p_c, scale = 0.1)
propose_dist_2 = models.Normal(loc = p_c_0, scale = 0.1)
propose_dist_3 = models.Normal(loc = p_c_1, scale = 0.1)

#set up posterior dists
T = 10000
qp_c = models.Empirical(tf.get_variable('qp_c/params', 
                                        shape = [T], 
                                        initializer=tf.constant_initializer(0.5)))
qp_c_0 = models.Empirical(tf.get_variable('qp_c_0/params', 
                                          shape = [T], 
                                          initializer=tf.constant_initializer(0.5)))
qp_c_1 = models.Empirical(tf.get_variable('qp_c_1/params', 
                                          shape = [T], 
                                          initializer=tf.constant_initializer(0.5)))

#set up inputs for inference
latent_vars = {p_c:qp_c, p_c_0:qp_c_0, p_c_1:qp_c_1}
prop_dist = {p_c:propose_dist_1, 
             p_c_0:propose_dist_3, 
             p_c_1:propose_dist_3}
data = {C:simple_data.cloudy.values, 
        R_C_0:simple_data.loc[cloudy_0, 'rain'].values,
        R_C_1:simple_data.loc[cloudy_1, 'rain'].values}

#Metropolis Hastings
inference = ed.MetropolisHastings(latent_vars, prop_dist, data)
inference.run()

Plotting the posterior distributions from the pymc3 analysis, I get this:

See bottom of this post if you are interested in the code necessary to generate this image.

And here is an analogous picture from Edward.

I cannot figure out why the Edward posteriors for p_c_0 and p_c_1 are not even approximately correct. I did not get this converging to incorrect value issue when I was only learning one parameter.

Thanks in advance for any ideas!

#code to generate plots
import matplotlib.pyplot as plt
plt.style.use('ggplot')

### pymc3 plot

#find maximum likelihood estimates
#p_c
p_c_mle = simple_data.cloudy.sum()/simple_data.shape[0]

#p_r_0
p_c_0_mle = simple_data.loc[cloudy_0, 'rain'].sum()/cloudy_0.sum()

#p_r_1
p_c_1_mle = simple_data.loc[cloudy_1, 'rain'].sum()/cloudy_1.sum()

#get MLE sampling Distribution pdfs
x_s = np.linspace(0, 1, num = 500)
p_c_mle_pdf = stats.norm.pdf(x_s, p_c_mle, np.sqrt((p_c_mle * (1 - p_c_mle))/simple_data.shape[0]))
p_c_0_mle_pdf = stats.norm.pdf(x_s, p_c_0_mle, np.sqrt((p_c_0_mle * (1 - p_c_0_mle))/cloudy_0.sum()))
p_c_1_mle_pdf = stats.norm.pdf(x_s, p_c_1_mle, np.sqrt((p_c_1_mle * (1 - p_c_1_mle))/cloudy_1.sum()))

#plot the parameter estimates
fig, ax = plt.subplots(3, 1, figsize = (14, 14), sharex = True)

#p_c
ax[0].hist(trace['p_c'], bins = 50, alpha = 0.5, normed = True)
ax[0].plot(x_s, p_c_mle_pdf, lw = 1.5, color = 'purple', label = 'MLE Sampling Distr.')
ax[0].axvline(x = p_c_real, linestyle = '--', alpha = 0.5, color = 'black', label = 'True Param.')
ax[0].axvline(x = p_c_mle, linestyle = '--', alpha = 0.5, color = 'blue', label = 'MLE')
ax[0].axvline(x = trace['p_c'].mean(), linestyle = ':', alpha = 0.7, color = 'green', label = 'Posterior Mean')
ax[0].set_title('p_c')

#p_r_0
ax[1].hist(trace['p_c_0'], bins = 50, alpha = 0.5, normed = True)
ax[1].plot(x_s, p_c_0_mle_pdf, lw = 1.5, color = 'purple')
ax[1].axvline(x = p_c_0_real, linestyle = '--', alpha = 0.5, color = 'black')
ax[1].axvline(x = p_c_0_mle, linestyle = '--', alpha = 0.5, color = 'blue')
ax[1].axvline(x = trace['p_c_0'].mean(), linestyle = ':', alpha = 0.7, color = 'green')
ax[1].set_title('p_c_0')

#p_r_1
ax[2].hist(trace['p_c_1'], bins = 50, alpha = 0.5, normed = True)
ax[2].plot(x_s, p_c_1_mle_pdf, lw = 1.5, color = 'purple'
ax[2].axvline(x = p_c_1_real, linestyle = '--', alpha = 0.5, color = 'black')
ax[2].axvline(x = p_c_1_mle, linestyle = '--', alpha = 0.5, color = 'blue')
ax[2].axvline(x = trace['p_c_1'].mean(), linestyle = ':', alpha = 0.7, color = 'green')
ax[2].set_title('p_c_1')

ax[0].legend(loc = 'best')

plt.show()
### Edward plot
fig, ax = plt.subplots(3, 1, figsize = (14, 14), sharex = True)

#p_c
ax[0].hist(qp_c.params.eval()[1000:], bins = 50, alpha = 0.5, normed = True)
ax[0].set_title('p_c')

#p_r_0
ax[1].hist(qp_c_0.params.eval()[1000:], bins = 50, alpha = 0.5, normed = True)
ax[1].set_title('p_c_0')

#p_r_0
ax[2].hist(qp_c_1.params.eval()[1000:], bins = 50, alpha = 0.5, normed = True)
ax[2].set_title('p_c_1')

plt.show();

#2

I noticed a typo in the edward model,
‘p_c_0:propose_dist_3’
should be
’p_c_0:propose_dist_2’


#3

Thank you @rrtucci!!! Fixing that typo appears to have done the trick…


Parameter learning of hidden(latent) nodes for arbitrary bayesian network
#4

Great! Any release of entire code in notebook?