# Parameters not getting updated

Hi all,
Below is my attempt to use the black-box variational approach in Edward (KLqp) for a three-node Bayesian network. Each node (X, Y, Z) is represented by a Gaussian distribution. I have observed data for two of the nodes (X, Y) but not for the third one (Z). I assume that the conditional distribution is P(Y|X,Z) = N(w·X + wz·Z + b, sig) and that P(Z) = N(muz, sigz). I am trying to estimate the distributions of the parameters w, wz, b, muz and sigz from the data.

• I am able to perfectly obtain the parameters for (w,wz,b) when (muz, sigz) are fixed to their values used for data generation.

• When I assume muz to be unknown (code below) along with w, wz and b, the value of muz is not updated, and consequently the estimates obtained for the parameters are wrong.

Does anyone have suggestions about the reason for this behavior, whether there are any errors in the model definition, or an alternative approach to handle this in Edward?

Thank you!

``````        def build_toy_dataset(N, b, w, noise_std=1):
D = len(w)
x = np.random.randn(N, D)
x[:,0] = np.random.normal(loc=2, scale=1, size=N)
x[:,1] = np.random.normal(loc=4, scale=1, size=N)
x[:,2] = np.random.normal(loc=6, scale=1, size=N)
y = np.dot(x, w) + np.ones(N)*b +np.random.normal(0, noise_std, size= N)
return x[:,0:2], y

#ed.set_seed(42)

# Problem size: N observations, Do observed features and Dh hidden feature.
N = 1000  # number of data points
Do = 2  # number of features obs
Dh = 1  # number of features hidden
D = Do+Dh

# DATA
# Every one of the D generating coefficients is 5.0; the intercept argument is 1.
w_true = np.ones(D) * 5.0
X_train, y_train = build_toy_dataset(N,1, w_true)
X_test, y_test = build_toy_dataset(N,1, w_true)

# MODEL
# X is a placeholder; it is fed with X_train in the ed.KLqp call below.
X = tf.placeholder(tf.float32, [N, Do])
# Standard-normal priors on the observed-feature weights and on the intercept.
w = Normal(loc=tf.zeros(Do), scale=tf.ones(Do))
b = Normal(loc=tf.zeros(1), scale=tf.ones(1))

# ig is an InverseGamma with fixed (non-trainable) parameters 0.5 and 0.7;
# tf.sqrt(ig) is used as the scale of y below.
alpha = tf.Variable(0.5, trainable=False)
beta = tf.Variable(0.7, trainable=False)
ig = InverseGamma(alpha, beta)

# Standard-normal prior on the weight of the hidden feature.
wz = Normal(loc=tf.zeros(Dh), scale=tf.ones(Dh))
#        alpha_z = tf.Variable(0.5, trainable=False)
#        beta_z = tf.Variable(0.7, trainable=False)
#        ig_z = InverseGamma(alpha_z, beta_z)
# mu_z is a standard-normal prior on the mean of Z; Z has scale tf.sqrt(1.0).
# NOTE(review): mu_z and Z are built from shape-[Dh] parameters, so presumably
# one Z value is shared by all N rows of y - confirm this is intended.
mu_z = Normal(loc=tf.zeros(Dh), scale=tf.ones(Dh))
Z = Normal(loc=mu_z,scale=tf.sqrt(1.0))

# Likelihood: mean is ed.dot(X, w) + Z*wz + b, scale is sqrt(ig) for each of the N rows.
y = Normal(loc=ed.dot(X, w) + Z*wz + b,  scale=tf.ones([N]) * tf.sqrt(ig))

# INFERENCE
# Normal approximations with a free loc variable and a softplus-transformed
# scale variable (softplus keeps the scale positive).
qw = Normal(loc=tf.Variable(tf.random_normal([Do])),
scale=tf.nn.softplus(tf.Variable(tf.random_normal([Do]))))
qb = Normal(loc=tf.Variable(tf.random_normal([1])),
scale=tf.nn.softplus(tf.Variable(tf.random_normal([1]))))

# InverseGamma approximation for ig; both parameters are softplus-transformed variables.
qig = InverseGamma(tf.nn.softplus(tf.Variable(tf.random_normal([]))),
tf.nn.softplus(tf.Variable(tf.random_normal([]))))

qwz = Normal(loc=tf.Variable(tf.random_normal([Dh])),
scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))

qmuz = Normal(tf.Variable(tf.random_normal([Dh])),
tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))

# Each latent variable is paired with its approximation. Z itself is not in
# this dict, so no approximation is declared for it here.
inference = ed.KLqp({w: qw, b: qb, ig:qig, wz:qwz, mu_z:qmuz}, data={X: X_train, y: y_train})
inference.run(n_iter=5000, n_print=100, n_samples=100)

## CRITICISM
# Fetch point summaries of the approximations from the Edward session.
sess = ed.get_session()
# NOTE(review): the last fetch is mu_z.loc. mu_z is the prior defined under
# MODEL with loc=tf.zeros(Dh); the approximation bound to it is qmuz, so the
# mean_muz read here is the fixed prior location, not an inferred value.
mean_qw, mean_qb, qig_conc,qig_rate,mean_qwz,mean_muz = sess.run([qw.loc,qb.loc,qig.concentration,qig.rate,qwz.loc,mu_z.loc])
print("Inferred posterior mean:")
# qig_rate/(qig_conc+1) is presumably meant as a point summary of qig; it looks
# like the inverse-gamma mode rather than its mean - confirm.
print(mean_qw, mean_qb,qig_rate/(qig_conc+1),mean_qwz,mean_muz)
# Predicted y at the average X_train row, using the point summaries above.
print(np.dot(np.mean(X_train,axis=0),mean_qw)+mean_qb+mean_muz*mean_qwz)
# Empirical moments of the training data, for comparison.
print("y mean:data",np.mean(y_train),"y variance:data",np.var(y_train))
print("x mean:data",np.mean(X_train,axis=0),"x variance:data",np.var(X_train,axis=0))
``````

This line

``````mean_qw, mean_qb, qig_conc,qig_rate,mean_qwz,mean_muz = sess.run([qw.loc,qb.loc,qig.concentration,qig.rate,qwz.loc,mu_z.loc])
# The last fetched tensor, mu_z.loc, belongs to the prior mu_z (loc=tf.zeros(Dh) in the model), not to the approximation qmuz.
``````

You're asking for the location parameter of `mu_z`. `mu_z` is the prior and not the inferred posterior. As per its definition in your code, this parameter is fixed as a vector of 0s. Is this intended?

Thanks for pointing it out, Dustin. I have now defined priors for mu_z and ig_z and included Z as a latent variable in the inference, and it is working as expected (code below). However, the mean of y obtained from the posterior of the parameters [mean(X_train)·w + Z.loc·wz + b] is not consistent with the mean of the training data [y_train]. I suspect this is due to the optimization algorithm and the settings of the variational approach. I am new to variational Bayesian techniques, so I would like to ask whether you have any suggestions on the parameter settings or on a variational inference engine that could improve the results.

``````
# Fragment: X, w, b, ig, N, Dh and qw/qb/qig/qwz are not defined here;
# presumably they are reused from the earlier listing - confirm.
# Priors are now centred at 5 (wz) and 6 (mu_z), each with unit scale.
wz = Normal(loc=tf.zeros(Dh)+5, scale=tf.ones(Dh))
# ig_z is an InverseGamma with fixed parameters 0.5 and 0.7; tf.sqrt(ig_z) is the scale of Z.
alpha_z = tf.Variable(0.5, trainable=False)
beta_z = tf.Variable(0.7, trainable=False)
ig_z = InverseGamma(alpha_z, beta_z)
mu_z = Normal(loc=tf.zeros(Dh)+6, scale=tf.ones(Dh))
Z = Normal(loc=mu_z,scale=tf.sqrt(ig_z))

y = Normal(loc=ed.dot(X, w) + Z*wz + b,  scale=tf.ones([N]) * tf.sqrt(ig))

# INFERENCE
# Normal approximation for Z with a softplus-transformed scale variable.
qz = Normal(tf.Variable(tf.random_normal([Dh])),
tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))

# Z is now bound to qz; mu_z and ig_z are not in the dict, so no approximation
# is declared for them. This call uses ed.KLpq rather than ed.KLqp.
inference = ed.KLpq({w: qw, b: qb, ig:qig, wz:qwz, Z:qz}, data={X: X_train, y: y_train})
inference.run(n_iter=5000, n_print=100, n_samples=10)
``````

Hi Dustin,
I have a follow-up question on replacing an observed random variable with a latent one. For example, in my case:

``````        X = tf.placeholder(tf.float32, [N, Do])
# Fully observed variant: Z is a placeholder and is fed with z_train in the
# data dict of the ed.KLqp call below, alongside X and y.
Z = tf.placeholder(tf.float32, [N, Dh])
# Standard-normal priors on the weights w, the intercept b and (below) wz.
w = Normal(loc=tf.zeros(Do), scale=tf.ones(Do))
b = Normal(loc=tf.zeros(1), scale=tf.ones(1))

# ig has fixed (non-trainable) parameters; tf.sqrt(ig) is the scale of y.
alpha = tf.Variable(0.5, trainable=False)
beta = tf.Variable(0.7, trainable=False)
ig = InverseGamma(alpha, beta)

wz = Normal(loc=tf.zeros(Dh), scale=tf.ones(Dh))
y = Normal(loc=ed.dot(X, w) + ed.dot(Z,wz) + b,  scale=tf.ones([N]) * tf.sqrt(ig))
# qw, qb, qig and qwz are not defined in this fragment; presumably reused from the earlier listing.
inference = ed.KLqp({w: qw, b: qb, ig:qig, wz:qwz}, data={X: X_train, y: y_train,Z:z_train})

``````

If I want to treat Z as a latent variable that replaces the data, how do I do it in Edward? Is the following definition correct?

``````        X = tf.placeholder(tf.float32, [N, Do])
# Candidate 1: Z is a random variable with one entry per data row ([N, Dh]).
w = Normal(loc=tf.zeros(Do), scale=tf.ones(Do))
b = Normal(loc=tf.zeros(1), scale=tf.ones(1))

alpha = tf.Variable(0.5, trainable=False)
beta = tf.Variable(0.7, trainable=False)
ig = InverseGamma(alpha, beta)

# Prior on wz centred at 5; mu_z is an [N, Dh] prior centred at 6, so each
# row has its own mean variable. Z has unit scale around mu_z.
wz = Normal(loc=tf.zeros(Dh)+5, scale=tf.ones(Dh))
mu_z = Normal(loc=tf.zeros([N,Dh])+6, scale=tf.ones([N,Dh]))
Z = Normal(loc=mu_z,scale=tf.ones([N,Dh]))

y = Normal(loc=ed.dot(X, w) + ed.dot(Z,wz) + b,  scale=tf.ones([N]) * tf.sqrt(ig))
# Approximation for Z with the same [N, Dh] shape; the scale is softplus-transformed.
qz = Normal(tf.Variable(tf.random_normal([N,Dh])),
tf.nn.softplus(tf.Variable(tf.random_normal([N,Dh]))))

# Z is bound to qz; mu_z is not in the dict, so no approximation is declared for it.
inference = ed.KLqp({w: qw, b: qb, ig:qig, wz:qwz, Z:qz}, data={X: X_train, y: y_train})
``````

or

``````        mu_z = Normal(loc=tf.zeros([1,Dh])+6, scale=tf.ones([1,Dh]))
# Candidate 2: mu_z and Z have shape [1, Dh], i.e. one Z for the whole data set.
Z = Normal(loc=mu_z,scale=tf.ones([1,Dh]))

# NOTE(review): ed.dot(Z, wz) is built from a [1, Dh] Z here, while the other
# terms are built from N rows - confirm that the result broadcasts as intended.
y = Normal(loc=ed.dot(X, w) + ed.dot(Z,wz) + b,  scale=tf.ones([N]) * tf.sqrt(ig))
# Approximation for Z with the matching [1, Dh] shape.
qz = Normal(tf.Variable(tf.random_normal([1,Dh])),
tf.nn.softplus(tf.Variable(tf.random_normal([1,Dh]))))

inference = ed.KLqp({w: qw, b: qb, ig:qig, wz:qwz, Z:qz}, data={X: X_train, y: y_train})
``````

Thank you !

Below is a working version of the code. The KLpq inference method works perfectly for my case. Thanks for the help! However, KLqp with the default loss function does not converge to the parameters used to generate the data.

``````    def build_toy_dataset(N, b, w, noise_std=1):
D = len(w)
x = np.random.randn(N, D)
x[:,0] = np.random.normal(loc=2, scale=1, size=N)
x[:,1] = np.random.normal(loc=4, scale=1, size=N)
x[:,2] = np.random.normal(loc=6, scale=1, size=N)
y = np.dot(x, w) + np.ones(N)*b +np.random.normal(0, noise_std, size= N)
z = np.zeros((N,1))
z[:,0] = x[:,2]
return x[:,0:2], y, z
#ed.set_seed(42)
# Problem size: N observations, Do observed features and Dh hidden feature.
N = 1000  # number of data points
Do = 2  # number of features obs
Dh = 1  # number of features hidden
D = Do+Dh
# DATA
# Every one of the D generating coefficients is 5.0; the intercept argument is 1.
# z_train / z_test are returned but are not fed to the inference call below.
w_true = np.ones(D) * 5.0
X_train, y_train, z_train = build_toy_dataset(N,1, w_true)
X_test, y_test, z_test = build_toy_dataset(N,1, w_true)
# MODEL
X = tf.placeholder(tf.float32, [N, Do])
# Tight priors (scale 0.1). tf.ones(...)+5.0 puts the prior mean of w and wz
# at 6.0, while the generating coefficients are 5.0; b is centred at 1.
w = Normal(loc=tf.ones(Do)+5.0, scale=tf.ones(Do)*0.1)
b = Normal(loc=tf.ones(1), scale=tf.ones(1)*0.1)
# ig has fixed (non-trainable) parameters; tf.sqrt(ig) is the scale of y.
alpha = tf.Variable(0.5, trainable=False)
beta = tf.Variable(0.7, trainable=False)
ig = InverseGamma(alpha, beta)

wz = Normal(loc=tf.ones(Dh)+5.0, scale=tf.ones(Dh)*0.1)
# The parameters of ig_z are themselves random: Uniform(0.2, 1.0) each.
alpha_z = Uniform(0.2,1.0)
beta_z = Uniform(0.2,1.0)
ig_z = InverseGamma(concentration=alpha_z, rate=beta_z)

# Hierarchy for the hidden feature: muz_loc -> mu_z -> Z, with Z of shape [N, Dh].
muz_loc = Normal(loc=6.0,scale=0.1)
mu_z = Normal(loc=tf.ones([Dh])*muz_loc, scale=tf.ones([Dh]))
# NOTE(review): Z's scale is ig_z itself here, whereas y's scale below uses
# tf.sqrt(ig) - confirm whether a square root was intended.
Z = Normal(loc=tf.ones([N,Dh])*mu_z,scale=tf.ones([N,Dh])*ig_z)

y = Normal(loc=ed.dot(X, w) + ed.dot(Z,wz) + b,  scale=tf.ones([N]) * tf.sqrt(ig))

# INFERENCE
# Normal approximations with a free loc variable and a softplus-transformed
# scale variable (softplus keeps the scale positive).
qw = Normal(loc=tf.Variable(tf.random_normal([Do])),
scale=tf.nn.softplus(tf.Variable(tf.random_normal([Do]))))
qb = Normal(loc=tf.Variable(tf.random_normal([1])),
scale=tf.nn.softplus(tf.Variable(tf.random_normal([1]))))

# InverseGamma approximation for ig; both parameters are softplus-transformed variables.
qig = InverseGamma(tf.nn.softplus(tf.Variable(tf.random_normal([]))),
tf.nn.softplus(tf.Variable(tf.random_normal([]))))

qwz = Normal(loc=tf.Variable(tf.random_normal([Dh])),
scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))

qmu_z = Normal(loc=tf.Variable(tf.random_normal([Dh])),
scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))

qig_z = InverseGamma(tf.nn.softplus(tf.Variable(tf.random_normal([]))),
tf.nn.softplus(tf.Variable(tf.random_normal([]))))

# Uses ed.KLpq. Z, muz_loc, alpha_z and beta_z are not in the latent dict, so
# no approximation is declared for them here.
inference = ed.KLpq({w: qw, b: qb, ig:qig, wz:qwz, mu_z:qmu_z,ig_z:qig_z}, data={X: X_train, y: y_train})
inference.run(n_iter=5000, n_print=100, n_samples=100)

## CRITICISM
# Fetch point summaries of the approximations from the Edward session.
sess = ed.get_session()
# The last fetch is qmu_z.loc, i.e. the location of the approximation bound to mu_z.
# qig_conc and qig_rate are fetched but not used in the print below.
mean_qw, mean_qb, qig_conc,qig_rate,mean_qwz,mean_qmuz = sess.run([qw.loc,qb.loc,qig.concentration,qig.rate,qwz.loc,qmu_z.loc])
# Compare the predicted y at the average X_train row with the empirical mean of y_train.
print("posterior mean:",np.dot(np.mean(X_train,axis=0),mean_qw)+mean_qb+mean_qmuz*mean_qwz, "y mean:data",np.mean(y_train))
``````