Hi all,
Below is my attempt to use the black-box variational approach in Edward (KLqp) on a three-node Bayesian network. Each node is a Gaussian random variable (X, Y, Z). I have observed data for two of the nodes (X, Y) but not for the third (Z). I assume the conditional distribution P(Y|X,Z) = N(w*X + wz*Z + b, sig) and the prior P(Z) = N(muz, sigz). I am trying to estimate distributions for the parameters w, wz, b, muz, sigz from the data.
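For concreteness, integrating Z out of the model above (treating a fresh Z per data point, as in my data generator) gives the marginal that the (X, Y) data directly constrain:

$$ Y \mid X \;\sim\; \mathcal{N}\!\left(wX + w_z \mu_z + b,\; \sigma^2 + w_z^2 \sigma_z^2\right) $$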
- I can recover the parameters (w, wz, b) essentially perfectly when (muz, sigz) are fixed to the values used for data generation (see the sketch just after this list).
- When I treat muz as unknown (full code below) along with w, wz, b, the value of muz is not updated during inference, and the resulting parameter estimates are consequently wrong.
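For reference, the working fixed-parameter variant replaced the mu_z/Z lines below with something like this (a minimal sketch; the constants 6.0 and 1.0 mirror the hidden node x[:, 2] ~ N(6, 1) in build_toy_dataset):

# Sketch of the variant with (muz, sigz) pinned to the data-generating values.
Z = Normal(loc=tf.ones(Dh) * 6.0, scale=tf.ones(Dh) * 1.0)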
Does anyone have suggestions on what might cause this behavior, whether there is an error in the model definition, or an alternative way to handle this in Edward?
Thank you!
import numpy as np
import tensorflow as tf
import edward as ed
from edward.models import InverseGamma, Normal


def build_toy_dataset(N, b, w, noise_std=1):
  # Three Gaussian features; x[:, 2] plays the role of the hidden node Z.
  D = len(w)
  x = np.random.randn(N, D)
  x[:, 0] = np.random.normal(loc=2, scale=1, size=N)
  x[:, 1] = np.random.normal(loc=4, scale=1, size=N)
  x[:, 2] = np.random.normal(loc=6, scale=1, size=N)
  y = np.dot(x, w) + b + np.random.normal(0, noise_std, size=N)
  # Only the first two columns are exposed as observed features.
  return x[:, 0:2], y
# ed.set_seed(42)
N = 1000   # number of data points
Do = 2     # number of observed features
Dh = 1     # number of hidden features
D = Do + Dh

# DATA
w_true = np.ones(D) * 5.0
X_train, y_train = build_toy_dataset(N, 1, w_true)
X_test, y_test = build_toy_dataset(N, 1, w_true)
# MODEL
X = tf.placeholder(tf.float32, [N, Do])
# Priors on the observed-feature weights and the bias.
w = Normal(loc=tf.zeros(Do), scale=tf.ones(Do))
b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
# InverseGamma prior on the noise variance sig^2.
alpha = tf.Variable(0.5, trainable=False)
beta = tf.Variable(0.7, trainable=False)
ig = InverseGamma(alpha, beta)
# Prior on the weight of the hidden node.
wz = Normal(loc=tf.zeros(Dh), scale=tf.ones(Dh))
# alpha_z = tf.Variable(0.5, trainable=False)
# beta_z = tf.Variable(0.7, trainable=False)
# ig_z = InverseGamma(alpha_z, beta_z)
# Prior on muz; sigz stays fixed here (the commented ig_z prior is unused).
mu_z = Normal(loc=tf.zeros(Dh), scale=tf.ones(Dh))
# Z has shape [Dh] = [1], a single latent shared across all N rows, with
# sigz fixed to 1 (the generator instead draws a fresh hidden value per row).
Z = Normal(loc=mu_z, scale=tf.ones(Dh))
# Likelihood; ig plays the role of the noise variance sig^2.
y = Normal(loc=ed.dot(X, w) + Z * wz + b, scale=tf.ones([N]) * tf.sqrt(ig))
# INFERENCE
qw = Normal(loc=tf.Variable(tf.random_normal([Do])),
            scale=tf.nn.softplus(tf.Variable(tf.random_normal([Do]))))
qb = Normal(loc=tf.Variable(tf.random_normal([1])),
            scale=tf.nn.softplus(tf.Variable(tf.random_normal([1]))))
qig = InverseGamma(tf.nn.softplus(tf.Variable(tf.random_normal([]))),
                   tf.nn.softplus(tf.Variable(tf.random_normal([]))))
qwz = Normal(loc=tf.Variable(tf.random_normal([Dh])),
             scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))
qmuz = Normal(loc=tf.Variable(tf.random_normal([Dh])),
              scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))
inference = ed.KLqp({w: qw, b: qb, ig: qig, wz: qwz, mu_z: qmuz},
                    data={X: X_train, y: y_train})
inference.run(n_iter=5000, n_print=100, n_samples=100)
# CRITICISM
sess = ed.get_session()
mean_qw, mean_qb, qig_conc, qig_rate, mean_qwz, mean_qmuz = sess.run(
    [qw.loc, qb.loc, qig.concentration, qig.rate, qwz.loc, qmuz.loc])
print("Inferred posterior mean:")
print(mean_qw, mean_qb,qig_rate/(qig_conc+1),mean_qwz,mean_muz)
print(np.dot(np.mean(X_train,axis=0),mean_qw)+mean_qb+mean_muz*mean_qwz)
print("y mean:data",np.mean(y_train),"y variance:data",np.var(y_train))
print("x mean:data",np.mean(X_train,axis=0),"x variance:data",np.var(X_train,axis=0))