Hi again @dustin — I wonder if it is possible to use the inverse-Wishart distribution in Edward in order to do Gibbs sampling for high-dimensional GMM problems. The code below uses KLqp, but I get an error concerning WishartCholesky:

```
InvalidArgumentError (see above for traceback): Input matrix is not invertible.
[[Node: inference/sample_9/qsigma/log_prob/OperatorPDCholesky/sqrt_solve/MatrixTriangularSolve = MatrixTriangularSolve[T=DT_FLOAT, adjoint=false, lower=true, _device="/job:localhost/replica:0/task:0/cpu:0"](inference/sample_9/LinearOperatorTriL/MatrixBandPart, inference/sample_9/qsigma/log_prob/Reshape)]]
```

The code is

```
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import edward as ed
import tensorflow as tf
from tensorflow.contrib.linalg import LinearOperatorTriL
ds = tf.contrib.distributions
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib.cm as cm
import six
import time
plt.style.use('ggplot')
from edward.models import Gamma, Beta, Categorical, WishartCholesky, \
MultivariateNormalTriL, Mixture, MultivariateNormalDiag
# -- Dataset --
# Ground-truth parameters for a 3-component, 2-D Gaussian mixture used to
# generate the synthetic training data.
N = 5000 # number of data points
true_K = 3 # number of components
D = 2 # dimensionality of data
true_pi = np.array([0.3, 0.6, 0.1])  # true mixture weights (sum to 1)
true_mus = [[10, 1], [-10, -10], [-1, 20]]  # per-component mean vectors
# Per-component 2x2 covariance matrices (symmetric positive-definite).
true_stds = [[[3, -0.1], [-0.1, 2]],
[[4, 0.0], [0.0, 1]], [[3, 0.2], [0.2, 4]]]
def build_toy_dataset(N, pi, mus, stds):
    """Draw N samples from a finite Gaussian mixture.

    Args:
        N: number of data points to generate.
        pi: length-K array of mixture weights (must sum to 1).
        mus: list of K mean vectors, each of length D.
        stds: list of K DxD covariance matrices.

    Returns:
        Tuple ``(x, label)`` where ``x`` is an (N, D) float32 array of
        samples and ``label`` is a length-N float32 array holding the index
        of the component each sample was drawn from.
    """
    # Infer the dimensionality from the first mean vector instead of
    # hard-coding 2, so the helper works for any D.
    dim = len(mus[0])
    x = np.zeros((N, dim), dtype=np.float32)
    label = np.zeros((N,), dtype=np.float32)
    for n in range(N):
        # Choose a component according to pi, then sample its Gaussian.
        k = np.argmax(np.random.multinomial(1, pi))
        x[n, :] = np.random.multivariate_normal(mus[k], stds[k])
        label[n] = k
    return x, label
# Generate the synthetic training set from the true mixture.
xn_data, xn_label = build_toy_dataset(N,true_pi, true_mus, true_stds)
K = 6 # number of components to test
#with tf.name_scope("model"): # GMM model
# sess = ed.get_session()
# tf.global_variables_initializer().run()
# Stick-break process
# Truncated stick-breaking construction of the mixture weights:
# beta_k ~ Beta(1, alpha); pi_k = beta_k * prod_{j<k}(1 - beta_j).
alpha = Gamma(concentration=6.0, rate=1.0, name="alpha")
beta = Beta(concentration1=1.0, concentration0=tf.ones(K)*alpha, name="beta")
pi = tf.concat([tf.reshape(beta[0],[1]), tf.reshape(tf.multiply(beta[1:],tf.cumprod(1 - beta[:-1])), [K-1])], 0, name="pi")
z = Categorical(probs=pi,sample_shape=N) # Assignments
# Hyperparameters
nu0 = tf.Variable(D, dtype=tf.float32, trainable=False)   # Wishart degrees of freedom
psi0 = tf.Variable(np.eye(D), dtype=tf.float32, trainable=False)  # Wishart scale matrix
mu0 = tf.Variable(np.zeros(D), dtype=tf.float32, trainable=False)  # prior mean for mu
k0 = tf.Variable(1., dtype=tf.float32, trainable=False)   # prior mean-precision scaling
# Parameters
# With cholesky_input_output_matrices=True, samples of sigma are Cholesky
# factors (lower-triangular), with shape [K, D, D] due to sample_shape=K.
sigma = WishartCholesky(df=nu0, scale=psi0, cholesky_input_output_matrices=True, sample_shape = K, name="sigma_distribution")
# NOTE(review): the scale_tril here is k0*sigma; for a normal-inverse-Wishart
# prior the conventional scaling is sigma/sqrt(k0) — verify this is intended.
mu = MultivariateNormalTriL(mu0, k0*sigma, name="mu_distribution")
# One multivariate normal per component, sharing the sampled mu/sigma slices.
components = [MultivariateNormalTriL(mu[k], sigma[k], sample_shape=N) for k in range(K)]
# Observed data: mixture over the K components with assignments z.
xn = Mixture(cat=z,components=components,sample_shape=N)
#with tf.name_scope("posterior"):
# Variational family: mean-field-style approximations for each latent.
qalpha = Gamma(concentration=tf.Variable(tf.constant(1.0),name="qalpha_concentration"), rate=tf.Variable(tf.constant(1.0), name="qalpha_rate"))
qbeta = Beta(concentration1=1.0, concentration0=tf.Variable(tf.ones([K]),name="qconcetrarion0"))
# Unconstrained [K, D, D] variables mapped to lower-triangular matrices with
# softplus-positive diagonals, used as the Wishart scale Cholesky factors.
qpsi0 = tf.Variable(tf.random_normal([K, D, D], dtype=tf.float32))
Ltril = LinearOperatorTriL(ds.matrix_diag_transform(qpsi0, transform=tf.nn.softplus)).to_dense()
# NOTE(review): softplus keeps the diagonal positive but not bounded away
# from zero, so Ltril can be numerically singular — a plausible cause of the
# "Input matrix is not invertible" error raised inside qsigma.log_prob.
qsigma = WishartCholesky(df=tf.ones([K])*100, scale=Ltril, cholesky_input_output_matrices=True, name="qsigma")
qmu0 = tf.Variable(tf.random_normal([K, D], dtype=tf.float32))
qR = tf.nn.softplus(tf.Variable(tf.zeros([K, D], dtype=tf.float32)))  # positive diag stddevs
qmu = MultivariateNormalDiag(qmu0, qR)
# Bind latents to their variational approximations; z is marginalized by Mixture.
inference = ed.KLqp({alpha:qalpha, beta:qbeta, mu: qmu, sigma: qsigma}, data={xn: xn_data})
learning_rate = 1e-3
optimizer = tf.train.AdamOptimizer(learning_rate)
n_iter=100000
n_samples=10  # Monte Carlo samples per gradient estimate
inference.initialize(n_iter=n_iter, n_print=0, n_samples=n_samples, optimizer = optimizer)
# Initialize all TF variables in Edward's shared session, then run the
# stochastic optimization loop, recording the loss for later plotting.
sess = ed.get_session()
init = tf.global_variables_initializer()
init.run()
learning_curve = []
for step in range(inference.n_iter):
    # One KLqp gradient step; info_dict contains the iteration's loss.
    info_dict = inference.update()
    if step % 10000 == 0:
        print(info_dict)
    learning_curve.append(info_dict['loss'])
# Summarize the posterior over mixture weights: sample beta from the
# variational posterior and map each draw through the stick-breaking formula.
post_beta = qbeta.sample(500)
post_pi = tf.map_fn(lambda beta: tf.concat([tf.reshape(beta[0],[1]), tf.reshape(tf.multiply(beta[1:],tf.cumprod(1 - beta[:-1])), [K-1])], 0), post_beta)
post_pi_mean = post_pi.eval().mean(axis=0)  # Monte Carlo mean of the weights
# Two subplots, unpack the axes array immediately
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))
plot_w = np.arange(K) + 1
# Left: posterior expected weight per component; right: loss on a log scale.
ax1.bar(plot_w - 0.5, post_pi_mean, width=1., lw=0);
ax1.set_xlim(0.5, K);
ax1.set_xlabel('Component');
ax1.set_ylabel('Posterior expected mixture weight');
ax2.semilogy(learning_curve)
# Save with a timestamped filename encoding the hyperparameters of this run.
fig.savefig(time.strftime("%Y_%m_%d-%H_%M_%S")+'-var_K='+np.str(K)+'-lr='+np.str(learning_rate)+'-n_iter='+np.str(n_iter)+'-n_samples='+np.str(n_samples)+'.pdf')
```