Inference in mixture models containing transformations

Dear Edward users, I need to infer the parameters of a Gaussian mixture that includes a transformation at the end of the model.

I’ve rewritten the mixture model from the tutorial section (Unsupervised learning by clustering data points), adding a simple multiplication at the end. My intention is to recover the true parameters that were used to create the dataset using KLqp inference, but the loss values look weird…

Any comment or help with this kind of model would be welcome.

Cheers

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import edward as ed
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib.cm as cm
import numpy as np
import six
import tensorflow as tf

from edward.models import (
   Categorical, Dirichlet, Empirical, InverseGamma,
   MultivariateNormalDiag, Normal, ParamMixture, Mixture)

plt.style.use('ggplot')

pi = np.array([0.5, 0.5])
mus = [[1, 1], [-1, -1]]
stds = [[0.1, 0.1], [0.1, 0.1]]

def build_toy_dataset(N, pi=pi, mus=mus, stds=stds):
  x = np.zeros((N, 2), dtype=np.float32)
  for n in range(N):
    k = np.argmax(np.random.multinomial(1, pi))
    x[n, :] = np.random.multivariate_normal(mus[k], np.diag(stds[k]))
  return x

N = 500  # number of data points
K = 2  # number of components
D = 2  # dimensionality of data
ed.set_seed(42)

x_train = build_toy_dataset(N)
x2_train = 5.0*x_train

# plt.scatter(x_train[:, 0], x_train[:, 1])
# plt.title("Simulated dataset")
# plt.show()
# The collapsed version marginalizes out the mixture assignments.
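# Priors: Dirichlet over the mixture weights, Normal over the component means,
# and InverseGamma over the per-dimension variances.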

pi = Dirichlet(tf.ones(K))
mu = Normal(tf.zeros(D), tf.ones(D), sample_shape=K)
sigmasq = InverseGamma(tf.ones(D), tf.ones(D), sample_shape=K)
cat = Categorical(probs=pi, sample_shape=N)

components = [
   MultivariateNormalDiag(mu[k], sigmasq[k], sample_shape=N)
   for k in range(K)]
x = Mixture(cat=cat, components=components, sample_shape=N)
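# Note: 5.0 * x below is a plain tf.Tensor, not an Edward RandomVariable,
# so it carries no log-density of its own.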
x2 = 5.0*x

## KLqp

qmu = Normal(loc=tf.Variable(tf.random_normal([K, D])),
             scale=tf.nn.softplus(tf.Variable(tf.random_normal([K, D]))))

qsigmasq = InverseGamma(
    concentration=tf.nn.softplus(tf.Variable(tf.zeros([K, D]))),
    rate=tf.nn.softplus(tf.Variable(tf.zeros([K, D]))))

inference = ed.KLqp({mu: qmu, sigmasq: qsigmasq}, data={x2: x2_train})

n_iter = 10000
n_print = 500
n_samples = 30

inference.initialize(n_iter=n_iter, n_print=n_print, n_samples=n_samples)
sess = ed.get_session()
init = tf.global_variables_initializer()
init.run()

learning_curve = []
for t in range(inference.n_iter):
    info_dict = inference.update()
    if t % 1000 == 0:
        print(info_dict)
        print(qmu.loc.eval())
        print(qmu.scale.eval())
    learning_curve.append(info_dict['loss'])

plt.semilogy(learning_curve)
plt.show()

Hi,

I haven’t worked with this stuff for a while, but looking back, the inverse-gamma didn’t work well with KLqp.
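For example (a rough, untested sketch reusing the variables from the script above), a log-normal variational family for the variances, i.e. the exponential of a Normal, tends to be easier for KLqp since it keeps the support positive via a smooth transformation:

from edward.models import NormalWithSoftplusScale
from tensorflow.contrib.distributions.python.ops import bijectors as bijector

# Log-normal variational family for sigmasq: exp of a Normal keeps the
# support positive without using the InverseGamma family.
qsigmasq = ed.models.TransformedDistribution(
    distribution=NormalWithSoftplusScale(
        tf.Variable(tf.zeros([K, D])), tf.Variable(tf.zeros([K, D]))),
    bijector=bijector.Exp())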

Hi, and many thanks @ecosang!

Apologies for the huge delay in my response, but I needed to understand better what I was doing. Your answer helped me a lot, and I’ve reworked it using the TransformedDistribution function. Now the code runs very well.

Again, many thanks!

Here is the new code with transformations:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import matplotlib.pyplot as plt
import matplotlib.cm as cm

import numpy as np
import six

import edward as ed
import tensorflow as tf

from tensorflow.contrib.distributions.python.ops import bijectors as bijector

from edward.models import (
    Categorical, Dirichlet, Empirical, InverseGamma,
    MultivariateNormalDiag, Normal, ParamMixture, Mixture,
    NormalWithSoftplusScale)

true_pi = np.array([0.2, 0.8])
true_mus = [[1., 1.], [-1., -1.]]
true_stds = [[0.1, 0.1], [0.1, 0.1]]

def build_toy_dataset(N, pi=true_pi, mus=true_mus, stds=true_stds):
  x = np.zeros((N, 2), dtype=np.float32)
  for n in range(N):
    k = np.argmax(np.random.multinomial(1, pi))
    x[n, :] = np.random.multivariate_normal(mus[k], np.diag(stds[k]))
  return x

N = 500  # number of data points
K = 2  # number of components
D = 2  # dimensionality of data
ed.set_seed(42)

x_train = build_toy_dataset(N)
transformedx_train = 5.0*x_train + 3.0

# Mixture model

pi = Dirichlet(tf.ones(K))
mu = Normal(tf.zeros(D), tf.ones(D), sample_shape=K)
sigmasq = InverseGamma(tf.ones(D), tf.ones(D), sample_shape=K)

cat = Categorical(probs=pi, sample_shape=N)
components = [
    MultivariateNormalDiag(mu[k], sigmasq[k], sample_shape=N)
    for k in range(K)]

x = Mixture(cat=cat, components=components, sample_shape=N)

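# The observations are y = 5*x + 3, so wrap the mixture in a
# TransformedDistribution with an Affine bijector; the transformed variable
# then has a proper log-density for inference.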
transformedx = ed.models.TransformedDistribution(
  distribution=x,
  bijector=tf.contrib.distributions.bijectors.Affine(
    shift=3.,
    scale_identity_multiplier=5.,
    event_ndims=0),
  name="transformedx", sample_shape = N)

# KLqp inference
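# Variational families: a Normal for mu and, for sigmasq, the exponential of a
# Normal (a log-normal), which sidesteps the InverseGamma family that gave
# KLqp trouble above.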
qmu = Normal(loc=tf.Variable(tf.random_normal([K, D])),
             scale=tf.nn.softplus(tf.Variable(tf.random_normal([K, D]))))
qsigmasq = ed.models.TransformedDistribution(
    distribution=NormalWithSoftplusScale(
        tf.Variable(tf.zeros([K, D])), tf.Variable(tf.zeros([K, D]))),
    bijector=bijector.Exp())

inference = ed.KLqp({mu:qmu, sigmasq:qsigmasq}, data={transformedx: transformedx_train})

n_iter = 1000
n_print = 500
n_samples = 10

inference.initialize(n_iter=n_iter, n_print=n_print, n_samples=n_samples)
sess = ed.get_session()
init = tf.global_variables_initializer()
init.run()

learning_curve = []
for t in range(inference.n_iter):
    info_dict = inference.update()
    if t % 100 == 0:
        print(qmu.loc.eval())
        print(info_dict)
        learning_curve.append(info_dict['loss'])

# plt.semilogy(learning_curve)
# plt.show()

# Assignment clusters

M = 100
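# Draw M samples of the component parameters from the variational posterior.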
mu_sample = qmu.sample(M)
sigmasq_sample = qsigmasq.sample(M)
x_post = Normal(loc=tf.ones([N, 1, 1, 1]) * mu_sample,
                scale=tf.ones([N, 1, 1, 1]) * tf.sqrt(sigmasq_sample))
x_broadcasted = tf.tile(tf.reshape(x_train, [N, 1, 1, D]), [1, M, K, 1])

# Sum over latent dimension, then average over posterior samples.
# ``log_liks`` ends up with shape (N, K).
log_liks = x_post.log_prob(x_broadcasted)
log_liks = tf.reduce_sum(log_liks, 3)
log_liks = tf.reduce_mean(log_liks, 1)


clusters = tf.argmax(log_liks, 1).eval()
plt.scatter(transformedx_train[:, 0], transformedx_train[:, 1], c=clusters, cmap=cm.bwr)
postmu = qmu.loc.eval()
plt.scatter(5.0*postmu[:, 0]+3.0, 5.0*postmu[:, 1]+3.0, marker='x')

plt.show()
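As a quick sanity check (a rough sketch reusing the session above; the variable names here are just illustrative), the recovered means can be mapped through the same affine transformation and compared with the transformed true means. Keep in mind the component order may be swapped (label switching):

# Compare recovered means, mapped through y = 5*x + 3, with the true ones.
recovered_mus = 5.0 * qmu.loc.eval() + 3.0
true_transformed_mus = 5.0 * np.array(true_mus) + 3.0
print("recovered (transformed space):\n", recovered_mus)
print("true (transformed space):\n", true_transformed_mus)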