Categorical value out of range with KLqp for Dirichlet Process Mixture

I’m getting an out-of-range error when sampling from a Categorical random variable. I discovered that Categorical can generate out-of-range indices when the input probabilities are ill-defined. For example:

z = Categorical(probs=[np.nan, 0, -np.inf], sample_shape=5)
z.eval()
>> array([3, 3, 3, 3, 3])

whereas the expected indices are in the range [0, 3). I tried using the modulus, as in tf.mod(z, K), to map the cluster assignment index back to the valid range [0, K); however, I still get (for K=5):

InvalidArgumentError: Received a label value of 5 which is outside the valid range of [0, 5). Label values: 5 5 5 5 ...
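
My current suspicion is that the NaNs enter through pi itself, and that tf.mod only affects where z feeds into x, while KLqp still evaluates log-probabilities on the raw (possibly out-of-range) samples, which would explain why the error persists. As a debugging aid (a sketch to localize the problem, not a fix), wrapping pi in tf.check_numerics makes the graph fail at the source instead:

# raises InvalidArgumentError as soon as pi contains a NaN or Inf, instead of
# letting Categorical silently emit the out-of-range index K
pi_checked = tf.check_numerics(pi, message="pi is ill-defined")
z = Categorical(probs=tf.nn.softmax(pi_checked), sample_shape=N, name='z')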

Here’s the complete code:

import numpy as np

import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import edward as ed
import tensorflow as tf

from edward.models import Normal, Gamma, Beta
from edward.models import Categorical, InverseGamma

def generate_data(N):
    """Draw N points from a two-component bivariate Gaussian mixture."""
    pi = np.array([0.4, 0.6])        # mixture weights
    mus = [[1, 1], [-1, -1]]         # component means
    stds = [[0.1, 0.1], [0.1, 0.1]]  # diagonal covariance entries
    x = np.zeros((N, 2), dtype=np.float32)  # float32 to match the TF model
    for n in range(N):
        k = np.argmax(np.random.multinomial(1, pi))
        x[n, :] = np.random.multivariate_normal(mus[k], np.diag(stds[k]))
    return x

#ed.set_seed(0)

# generate data
N = 500  # number of data points
x_data = generate_data(N)
N, D = x_data.shape

with tf.name_scope("model"):
    K = 5  #number of components
    alpha = Gamma(concentration=1.0, rate=1.0, name='alpha')
    beta = Beta(concentration1=1.0, concentration0=tf.ones(K)*alpha, name='beta')
    # truncated stick-breaking: pi_k = beta_k * prod_{j<k} (1 - beta_j)
    pi = tf.concat([tf.reshape(beta[0], [1]),
                    tf.reshape(beta[1:] * tf.cumprod(1.0 - beta[:-1]), [K - 1])], 0)

    mu = Normal(loc=tf.zeros(D, name='mu/loc'), scale=tf.ones(D, name='mu/scale'), sample_shape=K, name='mu')
    sigma = InverseGamma(concentration=tf.ones([D]), rate=tf.ones([D]), sample_shape=K, name='sigma')
    z = Categorical(probs=tf.nn.softmax(pi), sample_shape=N, name='z')
    x = Normal(loc=tf.gather(mu, tf.mod(z, K)), scale=tf.gather(sigma, tf.mod(z, K)), name='mixture')
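    # NOTE: pi from truncated stick-breaking is already non-negative and sums to
    # at most 1, so tf.nn.softmax re-weights it rather than normalizing it.
    # Explicit normalization may be closer to the intended model (untested sketch):
    # z = Categorical(probs=pi / tf.reduce_sum(pi), sample_shape=N, name='z')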

with tf.name_scope("posterior"):
    qalpha = Normal(loc=tf.Variable(tf.ones([])),
                    scale=tf.nn.softplus(tf.Variable(tf.ones([]))), name='qalpha')

    qmu = Normal(loc=tf.Variable(tf.random_normal([K,D])), 
                 scale=tf.nn.softplus(tf.Variable(tf.random_normal([K,D]))), name='qmu')

    qsigma = Normal(loc=tf.nn.softplus(tf.Variable(tf.random_normal([K,D]))),
                    scale=tf.nn.softplus(tf.Variable(tf.ones([K,D]))), name='qsigma')

    qz = Categorical(probs=tf.nn.softmax(tf.Variable(tf.ones([K]))), sample_shape=N, name='qz')
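    # NOTE: alpha has a Gamma prior and sigma an InverseGamma prior, so both must
    # be positive, yet qalpha and qsigma are Normals whose samples can go negative;
    # a negative alpha makes concentration0 = tf.ones(K)*alpha invalid and turns
    # beta (and hence pi) into NaN. A positivity-preserving variational family is
    # what I'd try next (untested sketch):
    # qalpha = Gamma(concentration=tf.nn.softplus(tf.Variable(1.0)),
    #                rate=tf.nn.softplus(tf.Variable(1.0)), name='qalpha')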


with tf.name_scope("inference"):

    inference = ed.KLqp({alpha: qalpha, mu: qmu, sigma: qsigma, z: qz}, data={x: x_data})
    inference.initialize()

    sess = ed.get_session()
    tf.global_variables_initializer().run()

    for _ in range(inference.n_iter):
        info_dict = inference.update()
        inference.print_progress(info_dict)

        t = info_dict['t']
        if t == 1 or t % inference.n_print == 0:
            qalpha_mean, qmu_mean = sess.run([qalpha.mean(), qmu.mean()])
            print "\nmean alpha: ", qalpha_mean
            #print "cluster means: ", qmu_mean

print "computing cluster assignments..."
#compute likelihood for each data point averaged over many posterior samples
#x_post has shape (N, 100, K, D)
x_post_num_samples = 1000
mu_sample = qmu.sample(x_post_num_samples)
sigma_sample = qsigma.sample(x_post_num_samples)
x_post = Normal(loc=tf.ones([N, 1, 1, 1]) * mu_sample,
                scale=tf.ones([N, 1, 1, 1]) * sigma_sample)  # the model uses sigma directly as the scale, so no tf.sqrt
x_broadcasted = tf.tile(tf.reshape(x_data, [N, 1, 1, D]), [1, x_post_num_samples, K, 1])

# sum over the data dimensions, then average over posterior samples
# log_liks final shape is (N, K)
log_liks = x_post.log_prob(x_broadcasted)
log_liks = tf.reduce_sum(log_liks, 3)   # joint log-likelihood across the D dimensions
log_liks = tf.reduce_mean(log_liks, 1)  # Monte Carlo average over posterior samples

# choose the cluster with the highest likelihood
assignments = tf.argmax(log_liks, 1).eval()

# posterior predictive check
print("posterior predictive check...")
x_post2 = ed.copy(x, {alpha: qalpha, mu: qmu, sigma: qsigma, z: qz})
x_post2_samples = tf.stack([x_post2.sample() for _ in range(x_post_num_samples)])
x_post2_samples_avg = np.mean(x_post2_samples.eval(), axis=0)

alpha_samples = tf.stack([qalpha.sample() for _ in range(x_post_num_samples)]).eval()
alpha_samples_avg = np.mean(alpha_samples, axis=0)
print("posterior mean alpha: {}".format(alpha_samples_avg))

pi_dist, _ = np.histogram(assignments, bins=range(K+1))
pi_dist = pi_dist/np.sum(pi_dist, dtype=np.float32)

#qmu_mean = qmu.mean().eval()
qmu_mean_samples = tf.stack([qmu.sample() for _ in range(x_post_num_samples)])
qmu_mean_samples_avg = np.mean(qmu_mean_samples.eval(), axis=0)
print "posterior cluster means: "
print qmu_mean_samples_avg

#TODO: how to save / load posterior in Edward
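# (one approach I'm considering: the variational parameters are plain
# tf.Variables, so a standard tf.train.Saver should be able to checkpoint them;
# untested sketch)
# saver = tf.train.Saver()
# saver.save(sess, './checkpoints/dpmm_klqp.ckpt')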

#generate plots
plt.figure()
sns.distplot(alpha_samples, hist=True, kde=True, label='alpha')
plt.title('concentration parameter')
plt.legend()
plt.savefig('./figures/dpmm_klqp_alpha_hist.png')

plt.figure()
plt.scatter(x_data[:,0], x_data[:,1], c=assignments, cmap=cm.bwr)
plt.axis([-3, 3, -3, 3])
plt.title("DP cluster assignments")
plt.xlabel("X1")
plt.ylabel("X2")
plt.grid()
plt.savefig('./figures/dpmm_klqp_clusters.png')

plt.figure()
plt.scatter(x_post2_samples_avg[:,0], x_post2_samples_avg[:,1])
plt.scatter(qmu_mean_samples_avg[:,0], qmu_mean_samples_avg[:,1], s=50, marker='x', color='r')
plt.axis([-3, 3, -3, 3])
plt.title("Posterior Predictive Check")
plt.xlabel("X1")
plt.ylabel("X2")
plt.grid()
plt.savefig('./figures/dpmm_klqp_ppc.png')

plt.figure()
plt.bar(range(K), pi_dist, color='r', label='pi_k')
plt.title('mixture proportions')
plt.xlabel('clusters')
plt.ylabel('proportions')
plt.legend()
plt.savefig('./figures/dpmm_klqp_mixture_proportions.png')

I'd appreciate any comments on how to resolve the out-of-range error, or suggestions for improving the code. Thanks!