Deep Generative Models in Edward

Hi all,
I am trying to build deep generative models similar to Rezende et al. 2014 (https://arxiv.org/pdf/1401.4082.pdf) using Edward. In the toy problem below, I am trying to estimate P(Y|X) = \int_{H} P(Y,H|X) dH, where X and Y are univariate. Is this specification of the model and the inference in Edward correct?

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import edward as ed
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from edward.models import Empirical, Normal, InverseGamma, Uniform, TransformedDistribution
from edward.models import MultivariateNormalDiag
ds = tf.contrib.distributions
sess = ed.get_session() 
ed.set_seed(42)

def build_toy_dataset():
  D = 1
  N = 100
  x = np.random.randn(N, D)
  x_orig = np.zeros((N, D))
  b = np.array([2.0], dtype=np.float64)
  d = np.array([np.sqrt(1)], dtype=np.float64)
  x_data_gen2 = Normal(loc=b, scale=d)
  x_orig[:, 0] = sess.run(x_data_gen2.sample(N))[:, 0]  # inputs drawn from N(2, 1)
  x[:, 0] = np.square(x_orig[:, 0])                     # nonlinear transform of the inputs
  wTrue = np.array([3.0])
  bTrue = 2.0
  varTrue = 1.0
  y_data_gen = Normal(loc=np.dot(x, wTrue) + np.ones(N) * bTrue,
                      scale=np.ones([N]) * np.sqrt(varTrue))
  y = sess.run(y_data_gen.sample(1))  # shape (1, N)
  return x_orig, y
  


N = 100  # number of data points
D = 1  # number of features
Dth1 = 10
Dth2 = 10

# DATA
w_true = np.ones(D) * 1.0
X_train, y_train = build_toy_dataset()   
y_train = y_train.T

#==============================================================================
#   Model          
#==============================================================================

X = tf.placeholder(tf.float64, [N, D])
ig1 = tf.constant(0.01, dtype=tf.float64)
#### LAYER 1
W1 = tf.ones([D, Dth1], dtype=tf.float64, name="W1")  # D x Dth1
B1 = tf.ones([1, Dth1], dtype=tf.float64, name="B1")  # 1 x Dth1
H1_loc = tf.nn.softplus(tf.matmul(X, W1) + B1)  # N x Dth1
H1_scale_diag = tf.nn.softplus(tf.nn.softplus(tf.matmul(X, W1) + B1))  # N diagonals of length Dth1
H1 = MultivariateNormalDiag(loc=H1_loc, scale_diag=H1_scale_diag, name="H1")  # N x Dth1
#### LAYER 2
W2 = tf.ones([Dth1, Dth2], dtype=tf.float64, name="W2")  # Dth1 x Dth2
B2 = tf.ones([1, Dth2], dtype=tf.float64, name="B2")  # 1 x Dth2
H2_loc = tf.nn.softplus(tf.matmul(H1, W2) + B2)  # N x Dth2
H2_scale_diag = tf.nn.softplus(tf.nn.softplus(tf.matmul(H1, W2) + B2))  # N diagonals of length Dth2
H2 = MultivariateNormalDiag(loc=H2_loc, scale_diag=H2_scale_diag, name="H2")  # N x Dth2

#### OUTPUT
W0 = Normal(loc=tf.ones([Dth2, 1], dtype=tf.float64),
            scale=tf.ones([Dth2, 1], dtype=tf.float64), name="W0")  # prior on the output weights
B0 = Normal(loc=tf.ones(1, dtype=tf.float64),
            scale=tf.ones(1, dtype=tf.float64), name="B0")  # prior on the output bias
y_loc = tf.matmul(H2, W0) + tf.ones([N, 1], dtype=tf.float64) * B0
y_scale_diag = tf.nn.softplus(y_loc)
y = Normal(loc=y_loc, scale=y_scale_diag, name='y')

#==============================================================================
# Inference
#==============================================================================
qW1 = tf.Variable(tf.truncated_normal([D, Dth1], dtype=tf.float64), name='qW1')
qB1 = tf.Variable(tf.truncated_normal([1, Dth1], dtype=tf.float64), name='qB1')
qW2 = tf.Variable(tf.truncated_normal([Dth1, Dth2], dtype=tf.float64), name='qW2')
qB2 = tf.Variable(tf.truncated_normal([1, Dth2], dtype=tf.float64), name='qB2')

qH1 = MultivariateNormalDiag(loc=tf.nn.softplus(tf.matmul(X, qW1) + qB1),
                             scale_diag=tf.nn.softplus(tf.nn.softplus(tf.matmul(X, qW1) + qB1)),
                             name="qH1")  # N x Dth1

qH2 = MultivariateNormalDiag(loc=tf.nn.softplus(tf.matmul(qH1, qW2) + qB2),
                             scale_diag=tf.nn.softplus(tf.nn.softplus(tf.matmul(qH1, qW2) + qB2)),
                             name="qH2")  # N x Dth2

qW0 = Normal(loc=tf.Variable(tf.random_uniform([Dth2, 1], dtype=tf.float64)),
             scale=tf.nn.softplus(tf.Variable(tf.truncated_normal([Dth2, 1], dtype=tf.float64))),
             name="qW0")  # posterior on the output weights
qB0 = Normal(loc=tf.Variable(tf.random_uniform([1], dtype=tf.float64)),
             scale=tf.nn.softplus(tf.Variable(tf.truncated_normal([1], dtype=tf.float64))),
             name="qB0")  # posterior on the output bias

inference = ed.KLpq({H1: qH1, H2: qH2, W0: qW0, B0: qB0}, data={X: X_train, y: y_train})
inference.initialize(n_iter=5000, n_print=100, n_samples=100, logdir='log')
tf.global_variables_initializer().run()


for _ in range(inference.n_iter):
  # Update and print progress of algorithm.  
  info_dict = inference.update()                                                                                       
  inference.print_progress(info_dict)

Your specification is correct, but it is slow. Some tips:

  1. In your build_toy_dataset(), you're building Edward random variables and then immediately evaluating them. It is more efficient to use plain NumPy random calls, or to move the random-variable construction out of the function; otherwise every call adds extraneous nodes to the TensorFlow graph (see the first sketch after this list).
  2. You might try using Normal instead of MultivariateNormalDiag; both worked when I played with the code (see the second sketch below).
  3. Setting n_samples=100 makes both graph construction and each training update very expensive. After reducing it to n_samples=5, the code runs very fast.
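
For tip 1, here is a minimal sketch of the same toy dataset built with plain NumPy; build_toy_dataset_np is just an illustrative name, and the true parameters (inputs from N(2, 1), weight 3, bias 2, unit variance) mirror your original function. Because it returns y with shape (N, 1), the y_train = y_train.T step is no longer needed.

import numpy as np

def build_toy_dataset_np(N=100, D=1):
  x_orig = np.random.normal(loc=2.0, scale=1.0, size=(N, D))  # inputs ~ N(2, 1), as before
  x = np.square(x_orig)                                       # same nonlinear transform
  wTrue = np.array([3.0])
  bTrue = 2.0
  varTrue = 1.0
  y = np.random.normal(loc=np.dot(x, wTrue) + bTrue, scale=np.sqrt(varTrue))  # shape (N,)
  return x_orig, y.reshape(N, 1)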
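
And a sketch of tip 2 for the first layer (the same swap applies to H2 / qH2): since the covariance here is diagonal, an elementwise Normal with the same loc and scale should describe the same factorized distribution; only the batch/event shape bookkeeping differs.

from edward.models import Normal

H1 = Normal(loc=H1_loc, scale=H1_scale_diag, name="H1")  # N x Dth1, one Normal per entry
qH1 = Normal(loc=tf.nn.softplus(tf.matmul(X, qW1) + qB1),
             scale=tf.nn.softplus(tf.nn.softplus(tf.matmul(X, qW1) + qB1)),
             name="qH1")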