Dirichlet process mixture model for multivariate Bernoulli data

Hi, I have been trying to implement a Dirichlet process mixture model for data that follow a mixture of multivariate Bernoulli distributions. I used Independent and Mixture to construct the multivariate Bernoulli components and the mixture over them, respectively, and KLqp for inference.
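
To be concrete, here is a minimal sketch of what I mean by that construction (the probabilities and variable names are made up purely for illustration; the imports are the same as in the full script below). Independent turns a batch of independent Bernoullis into a single multivariate random variable, and Mixture combines several such components under a Categorical:

# Illustrative only: a 2-D Bernoulli via Independent, and a two-component
# mixture of such Bernoullis via Mixture.
mvb = Independent(distribution=Bernoulli(probs=[0.9, 0.1]),
                  reinterpreted_batch_ndims=1)  # event_shape == [2]
toy_mix = Mixture(
    cat=Categorical(probs=[0.5, 0.5]),
    components=[Independent(distribution=Bernoulli(probs=[0.9, 0.9]),
                            reinterpreted_batch_ndims=1),
                Independent(distribution=Bernoulli(probs=[0.1, 0.1]),
                            reinterpreted_batch_ndims=1)])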

The problem is that I always get NaNs after a few iterations. I am using TensorFlow 1.7.0 and Edward 1.3.5. Any suggestions would be very much appreciated.
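
To illustrate the failure mode, here is a hypothetical check (not in the script below, just for illustration) that one could add inside the training loop at the end; info_dict is the dictionary returned by inference.update():

# Hypothetical check, not part of the script below: inspect the loss
# returned by inference.update() for NaNs.
if np.isnan(info_dict['loss']):
  print('loss became nan at iteration', info_dict['t'])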

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import edward as ed
import numpy as np
import tensorflow as tf

from edward.models import (
    Categorical, Mixture, Gamma, Beta,
    Bernoulli, Independent)

def build_toy_dataset2(N):
  """Simulate N two-dimensional binary observations from three groups."""
  probs = np.array([[0.9, 0.9], [0.1, 0.9], [0.1, 0.1]])
  x = np.zeros((N, 2), dtype=np.float32)
  num_group1 = num_group2 = N // 3
  sizes = [num_group1, num_group2, N - num_group1 - num_group2]
  start = 0
  for k, size in enumerate(sizes):
    for d in range(2):
      x[start:start + size, d] = np.random.choice(
          [1, 0], size=size, p=[probs[k, d], 1 - probs[k, d]])
    start += size
  return x

ed.set_seed(42)
N = 500
D = 2
T = K = 5  # truncation level in DP
x_train = build_toy_dataset2(N)

# MODEL
alpha = Gamma(concentration=6.0, rate=1.0, name="alpha")
beta = Beta(concentration1=1.0, concentration0=tf.ones(K)*alpha, name="beta")
# Stick-breaking construction of the mixture weights from the Beta sticks.
pi = tf.concat(
    [tf.reshape(beta[0], [1]),
     tf.reshape(tf.multiply(beta[1:], tf.cumprod(1 - beta[:-1])), [K - 1])],
    axis=0, name="pi")
# Beta prior on the per-dimension Bernoulli probabilities of each component.
alpha0 = tf.Variable(tf.ones(D), dtype=tf.float32, trainable=False)
beta0 = tf.Variable(tf.ones(D), dtype=tf.float32, trainable=False)
prob = Beta(alpha0, beta0, sample_shape=K)  # shape [K, D]
cat = Categorical(probs=pi, sample_shape=N)
comps = [Independent(distribution=Bernoulli(probs=prob[k]),
                     reinterpreted_batch_ndims=1, sample_shape=N)
         for k in range(K)]
x = Mixture(cat=cat, components=comps, sample_shape=N)

# INFERENCE
qalpha = Gamma(
    concentration=tf.Variable(tf.constant(1.0), name="qalpha_concentration"),
    rate=tf.Variable(tf.constant(1.0), name="qalpha_rate"))
qbeta = Beta(tf.nn.softplus(tf.Variable(tf.random_normal([T]))) + 1e-5,
             tf.nn.softplus(tf.Variable(tf.random_normal([T]))) + 1e-5)
qalpha0 = tf.nn.softplus(tf.Variable(tf.random_normal([K,D]))) + 1e-5
qbeta0 = tf.nn.softplus(tf.Variable(tf.random_normal([K,D]))) + 1e-5
qprob = Beta(qalpha0, qbeta0)


inference = ed.KLqp({alpha: qalpha, beta: qbeta, prob: qprob}, data={x: x_train})
learning_rate = 1e-3
optimizer = tf.train.AdamOptimizer(learning_rate)
n_iter=1000
n_samples=10

# n_print must be nonzero because the loop below computes t % inference.n_print.
inference.initialize(n_iter=n_iter, n_print=100, n_samples=n_samples,
                     optimizer=optimizer)

sess = ed.get_session()
init = tf.global_variables_initializer()
init.run()

for _ in range(inference.n_iter):
  info_dict = inference.update()
  inference.print_progress(info_dict)
  t = info_dict['t']
  if t % inference.n_print == 0:
    print(sess.run(qprob.mean()))
    print(sess.run(qbeta.mean()))