Parameters not getting updated

Hi all,
Below is my attempt to use the black-box variational approach in Edward (KLqp) for a three-node Bayesian network. Each node is represented by a Gaussian distribution (X, Y, Z). I have observed data for two of the nodes (X, Y) but not for the third (Z). I assume the conditional distribution P(Y|X,Z) = N(w·X + wz·Z + b, sig) and P(Z) = N(muz, sigz). I am trying to estimate the distributions of the parameters w, wz, b, muz, and sigz from the data.

  • I am able to recover the parameters (w, wz, b) essentially perfectly when (muz, sigz) are fixed to the values used for data generation.

  • When I treat muz as unknown (code below) along with w, wz, and b, the value of muz is not updated and the resulting parameter estimates are consequently wrong.

Does anyone have suggestions on the reason for this behavior, on whether there are errors in the model definition, or on an alternate approach to handle this in Edward?

Thank you!

        import numpy as np
        import tensorflow as tf
        import edward as ed
        from edward.models import InverseGamma, Normal

        def build_toy_dataset(N, b, w, noise_std=1):
          """Generate y = x.w + b + noise; only the first two columns of x are returned as observed."""
          D = len(w)
          x = np.random.randn(N, D)
          x[:, 0] = np.random.normal(loc=2, scale=1, size=N)
          x[:, 1] = np.random.normal(loc=4, scale=1, size=N)
          x[:, 2] = np.random.normal(loc=6, scale=1, size=N)
          y = np.dot(x, w) + np.ones(N) * b + np.random.normal(0, noise_std, size=N)
          return x[:, 0:2], y
        
        #ed.set_seed(42)
        
        N = 1000  # number of data points
        Do = 2  # number of features obs
        Dh = 1  # number of features hidden 
        D = Do+Dh
        
        # DATA
        w_true = np.ones(D) * 5.0
        X_train, y_train = build_toy_dataset(N,1, w_true)
        X_test, y_test = build_toy_dataset(N,1, w_true)
        
        # MODEL
        X = tf.placeholder(tf.float32, [N, Do])       
        w = Normal(loc=tf.zeros(Do), scale=tf.ones(Do))
        b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
        
        alpha = tf.Variable(0.5, trainable=False)
        beta = tf.Variable(0.7, trainable=False)
        ig = InverseGamma(alpha, beta)

        wz = Normal(loc=tf.zeros(Dh), scale=tf.ones(Dh))
#        alpha_z = tf.Variable(0.5, trainable=False)
#        beta_z = tf.Variable(0.7, trainable=False)
#        ig_z = InverseGamma(alpha_z, beta_z)
        mu_z = Normal(loc=tf.zeros(Dh), scale=tf.ones(Dh))        
        Z = Normal(loc=mu_z,scale=tf.sqrt(1.0))
        
        y = Normal(loc=ed.dot(X, w) + Z*wz + b,  scale=tf.ones([N]) * tf.sqrt(ig))
        
        # INFERENCE
        qw = Normal(loc=tf.Variable(tf.random_normal([Do])),
                    scale=tf.nn.softplus(tf.Variable(tf.random_normal([Do]))))
        qb = Normal(loc=tf.Variable(tf.random_normal([1])),
                    scale=tf.nn.softplus(tf.Variable(tf.random_normal([1]))))
                    
        qig = InverseGamma(tf.nn.softplus(tf.Variable(tf.random_normal([]))), 
                           tf.nn.softplus(tf.Variable(tf.random_normal([])))) 
                           
        qwz = Normal(loc=tf.Variable(tf.random_normal([Dh])),
                    scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))

        qmuz = Normal(tf.Variable(tf.random_normal([Dh])), 
                           tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))
      
        inference = ed.KLqp({w: qw, b: qb, ig: qig, wz: qwz, mu_z: qmuz}, data={X: X_train, y: y_train})
        inference.run(n_iter=5000, n_print=100, n_samples=100)
        
        ## CRITICISM
        sess = ed.get_session()
        mean_qw, mean_qb, qig_conc,qig_rate,mean_qwz,mean_muz = sess.run([qw.loc,qb.loc,qig.concentration,qig.rate,qwz.loc,mu_z.loc])
        print("Inferred posterior mean:")
        print(mean_qw, mean_qb,qig_rate/(qig_conc+1),mean_qwz,mean_muz)
        print(np.dot(np.mean(X_train,axis=0),mean_qw)+mean_qb+mean_muz*mean_qwz)
        print("y mean:data",np.mean(y_train),"y variance:data",np.var(y_train))
        print("x mean:data",np.mean(X_train,axis=0),"x variance:data",np.var(X_train,axis=0))  

This line

mean_qw, mean_qb, qig_conc,qig_rate,mean_qwz,mean_muz = sess.run([qw.loc,qb.loc,qig.concentration,qig.rate,qwz.loc,mu_z.loc])

You’re asking for the location parameter of mu_z, but mu_z is the prior, not the inferred posterior. As defined in your code, that location is fixed to a vector of zeros. Is this intended?
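
Reading the inferred posterior mean instead means fetching qmuz (the variational factor defined for mu_z) rather than mu_z itself; a minimal sketch against the code above:

        # Fetch the variational (posterior) factor qmuz, not the prior mu_z.
        mean_qw, mean_qb, qig_conc, qig_rate, mean_qwz, mean_qmuz = sess.run(
            [qw.loc, qb.loc, qig.concentration, qig.rate, qwz.loc, qmuz.loc])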

Thanks for pointing that out, Dustin. I have now defined priors for mu_z and ig_z and included Z as a latent variable in the inference, and it works as expected (code below). However, the mean of y obtained from the posterior means of the parameters, mean(X_train)·w + Z.loc·wz + b, is not consistent with the mean of the training data y_train (a quick check is sketched after the code below). I suspect this comes from the optimization algorithm and the variational-approximation settings. I am new to variational Bayesian techniques, so do you have any suggestions on parameter settings, or on a different variational inference engine, that could improve the results?


        wz = Normal(loc=tf.zeros(Dh)+5, scale=tf.ones(Dh))
        alpha_z = tf.Variable(0.5, trainable=False)
        beta_z = tf.Variable(0.7, trainable=False)
        ig_z = InverseGamma(alpha_z, beta_z) 
        mu_z = Normal(loc=tf.zeros(Dh)+6, scale=tf.ones(Dh))
        Z = Normal(loc=mu_z,scale=tf.sqrt(ig_z))
        
        y = Normal(loc=ed.dot(X, w) + Z*wz + b,  scale=tf.ones([N]) * tf.sqrt(ig))
        
        # INFERENCE
        qz = Normal(loc=tf.Variable(tf.random_normal([Dh])),
                    scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))

        inference = ed.KLpq({w: qw, b: qb, ig: qig, wz: qwz, Z: qz}, data={X: X_train, y: y_train})
        inference.run(n_iter=5000, n_print=100, n_samples=10)
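
The consistency check described above could be written out like this (a sketch using the variational means from this snippet; qz is the factor defined for Z):

        # Rough check: predicted mean of y from the variational means vs. the empirical mean.
        sess = ed.get_session()
        mean_qw, mean_qb, mean_qwz, mean_qz = sess.run([qw.loc, qb.loc, qwz.loc, qz.loc])
        y_pred_mean = np.dot(np.mean(X_train, axis=0), mean_qw) + mean_qb + mean_qz * mean_qwz
        print("predicted y mean:", y_pred_mean, "y mean (data):", np.mean(y_train))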

Hi Dustin,
I have a follow-up question about replacing an observed random variable with a latent one. For example, in my case:

        X = tf.placeholder(tf.float32, [N, Do])    
        Z = tf.placeholder(tf.float32, [N, Dh]) 
        w = Normal(loc=tf.zeros(Do), scale=tf.ones(Do))
        b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
        
        alpha = tf.Variable(0.5, trainable=False)
        beta = tf.Variable(0.7, trainable=False)
        ig = InverseGamma(alpha, beta)

        wz = Normal(loc=tf.zeros(Dh), scale=tf.ones(Dh))
        y = Normal(loc=ed.dot(X, w) + ed.dot(Z,wz) + b,  scale=tf.ones([N]) * tf.sqrt(ig))
        inference = ed.KLqp({w: qw, b: qb, ig: qig, wz: qwz}, data={X: X_train, y: y_train, Z: z_train})

If I instead want to treat Z as a latent variable that replaces the data, how do I do it in Edward? Is the following definition correct:

        X = tf.placeholder(tf.float32, [N, Do])       
        w = Normal(loc=tf.zeros(Do), scale=tf.ones(Do))
        b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
        
        alpha = tf.Variable(0.5, trainable=False)
        beta = tf.Variable(0.7, trainable=False)
        ig = InverseGamma(alpha, beta)

        wz = Normal(loc=tf.zeros(Dh)+5, scale=tf.ones(Dh))
        mu_z = Normal(loc=tf.zeros([N,Dh])+6, scale=tf.ones([N,Dh]))
        Z = Normal(loc=mu_z,scale=tf.ones([N,Dh]))
        
        y = Normal(loc=ed.dot(X, w) + ed.dot(Z,wz) + b,  scale=tf.ones([N]) * tf.sqrt(ig))
        qz = Normal(tf.Variable(tf.random_normal([N,Dh])), 
                           tf.nn.softplus(tf.Variable(tf.random_normal([N,Dh]))))

        inference = ed.KLqp({w: qw, b: qb, ig: qig, wz: qwz, Z: qz}, data={X: X_train, y: y_train})

or

        mu_z = Normal(loc=tf.zeros([1,Dh])+6, scale=tf.ones([1,Dh]))
        Z = Normal(loc=mu_z,scale=tf.ones([1,Dh]))
        
        y = Normal(loc=ed.dot(X, w) + ed.dot(Z,wz) + b,  scale=tf.ones([N]) * tf.sqrt(ig))
        qz = Normal(tf.Variable(tf.random_normal([1,Dh])), 
                           tf.nn.softplus(tf.Variable(tf.random_normal([1,Dh]))))

        inference = ed.KLqp({w: qw, b: qb, ig: qig, wz: qwz, Z: qz}, data={X: X_train, y: y_train})

Thank you!

Below is a working version of the code: the KLpq inference method works perfectly for my case. Thanks for the help! However, KLqp with the default loss function does not converge to the parameters used to generate the data.

    import numpy as np
    import tensorflow as tf
    import edward as ed
    from edward.models import InverseGamma, Normal, Uniform

    def build_toy_dataset(N, b, w, noise_std=1):
      """Generate y = x.w + b + noise; return the first two columns of x as observed and the third as z."""
      D = len(w)
      x = np.random.randn(N, D)
      x[:, 0] = np.random.normal(loc=2, scale=1, size=N)
      x[:, 1] = np.random.normal(loc=4, scale=1, size=N)
      x[:, 2] = np.random.normal(loc=6, scale=1, size=N)
      y = np.dot(x, w) + np.ones(N) * b + np.random.normal(0, noise_std, size=N)
      z = np.zeros((N, 1))
      z[:, 0] = x[:, 2]
      return x[:, 0:2], y, z
    #ed.set_seed(42)    
    N = 1000  # number of data points
    Do = 2  # number of features obs
    Dh = 1  # number of features hidden 
    D = Do+Dh
    # DATA
    w_true = np.ones(D) * 5.0
    X_train, y_train, z_train = build_toy_dataset(N,1, w_true)
    X_test, y_test, z_test = build_toy_dataset(N,1, w_true)
    # MODEL
    X = tf.placeholder(tf.float32, [N, Do])    
    w = Normal(loc=tf.ones(Do)+5.0, scale=tf.ones(Do)*0.1)
    b = Normal(loc=tf.ones(1), scale=tf.ones(1)*0.1)
    alpha = tf.Variable(0.5, trainable=False)
    beta = tf.Variable(0.7, trainable=False)
    ig = InverseGamma(alpha, beta)

    wz = Normal(loc=tf.ones(Dh)+5.0, scale=tf.ones(Dh)*0.1)
    alpha_z = Uniform(0.2,1.0)
    beta_z = Uniform(0.2,1.0)
    ig_z = InverseGamma(concentration=alpha_z, rate=beta_z)
    
    muz_loc = Normal(loc=6.0,scale=0.1)      
    mu_z = Normal(loc=tf.ones([Dh])*muz_loc, scale=tf.ones([Dh]))
    Z = Normal(loc=tf.ones([N,Dh])*mu_z,scale=tf.ones([N,Dh])*ig_z)
    
    y = Normal(loc=ed.dot(X, w) + ed.dot(Z,wz) + b,  scale=tf.ones([N]) * tf.sqrt(ig))
    
    # INFERENCE
    qw = Normal(loc=tf.Variable(tf.random_normal([Do])),
                scale=tf.nn.softplus(tf.Variable(tf.random_normal([Do]))))
    qb = Normal(loc=tf.Variable(tf.random_normal([1])),
                scale=tf.nn.softplus(tf.Variable(tf.random_normal([1]))))
                
    qig = InverseGamma(tf.nn.softplus(tf.Variable(tf.random_normal([]))), 
                       tf.nn.softplus(tf.Variable(tf.random_normal([])))) 
                       
    qwz = Normal(loc=tf.Variable(tf.random_normal([Dh])),
                scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))

    qmu_z = Normal(loc=tf.Variable(tf.random_normal([Dh])),
                scale=tf.nn.softplus(tf.Variable(tf.random_normal([Dh]))))
                
    qig_z = InverseGamma(tf.nn.softplus(tf.Variable(tf.random_normal([]))), 
                       tf.nn.softplus(tf.Variable(tf.random_normal([]))))                
                       
    inference = ed.KLpq({w: qw, b: qb, ig: qig, wz: qwz, mu_z: qmu_z, ig_z: qig_z}, data={X: X_train, y: y_train})
    inference.run(n_iter=5000, n_print=100, n_samples=100)
    
    ## CRITICISM
    sess = ed.get_session()
    mean_qw, mean_qb, qig_conc,qig_rate,mean_qwz,mean_qmuz = sess.run([qw.loc,qb.loc,qig.concentration,qig.rate,qwz.loc,qmu_z.loc])
    print("posterior mean:",np.dot(np.mean(X_train,axis=0),mean_qw)+mean_qb+mean_qmuz*mean_qwz, "y mean:data",np.mean(y_train))