Edward: Memory leak with iterations


#1

Hi All:)

I’m asking about a memory leak that occurs when running Edward inference inside a loop. Any suggestions are very much appreciated!

Problem description: plain TensorFlow does not appear to leak in the same loop (the printed memory usage stays constant), but when Edward is used inside it, the memory usage keeps increasing. The inference I ran is no more complicated than sample code, and I just cannot figure out which part is causing the leak, since I reset the default graph on every iteration.

Output:
'Iteration ', 0, ' rss: ', 170033152L
'Iteration ', 1, ' rss: ', 182091776L
'Iteration ', 2, ' rss: ', 194420736L
'Iteration ', 3, ' rss: ', 206897152L

Code snapshot:

# Minimal reproduction of the reported leak: the Edward model, variational
# approximation, and KLqp inference are all rebuilt inside every loop
# iteration, and the process RSS printed after each iteration keeps growing
# even though the default graph is reset each time.
import tensorflow as tf
import numpy as np
import gc
import resource,os
import edward as ed
from edward.models import Normal
from tqdm import tqdm
import psutil
# Toy 1-D regression data: y = cos(x) + N(0, 0.1) noise, reshaped to the
# (50, 1) float32 column vectors expected by the matmuls below.
x_train = np.linspace(-3, 3, num=50)
y_train = np.cos(x_train) + np.random.normal(0, 0.1, size=50)
x_train = x_train.astype(np.float32).reshape((50, 1))
y_train = y_train.astype(np.float32).reshape((50, 1))

reps = 5
py = psutil.Process(os.getpid())  # handle on this process, used to read RSS
for i in range(reps):
    # A fresh graph is created every repetition, yet RSS still grows (see the
    # printed output) -- so something is presumably retained outside the TF
    # graph itself; TODO confirm what holds the references.
    tf.reset_default_graph()
    # Priors: weights and biases of a 1-2-1 network, all standard normal.
    W_0 = Normal(loc=tf.zeros([1, 2]), scale=tf.ones([1, 2]))
    W_1 = Normal(loc=tf.zeros([2, 1]), scale=tf.ones([2, 1]))
    b_0 = Normal(loc=tf.zeros(2), scale=tf.ones(2))
    b_1 = Normal(loc=tf.zeros(1), scale=tf.ones(1))
    # tf.constant(np.random.random((1000, 1000)))
    # Variational approximation: mean-field normals with trainable loc/scale;
    # softplus keeps each scale strictly positive.
    qW_0 = Normal(loc=tf.get_variable("qW_0/loc", [1, 2]),
                  scale=tf.nn.softplus(tf.get_variable("qW_0/scale", [1, 2])))
    qW_1 = Normal(loc=tf.get_variable("qW_1/loc", [2, 1]),
                  scale=tf.nn.softplus(tf.get_variable("qW_1/scale", [2, 1])))
    qb_0 = Normal(loc=tf.get_variable("qb_0/loc", [2]),
                  scale=tf.nn.softplus(tf.get_variable("qb_0/scale", [2])))
    qb_1 = Normal(loc=tf.get_variable("qb_1/loc", [1]),
                  scale=tf.nn.softplus(tf.get_variable("qb_1/scale", [1])))
    x = x_train
    # Likelihood: a one-hidden-layer tanh network defines the mean of y.
    y_mean = tf.matmul(tf.tanh(tf.matmul(x, W_0) + b_0), W_1) + b_1
    y = Normal(loc=y_mean, scale=0.1)
    sess = tf.Session()
    with sess.as_default():
        # tf.constant(np.random.random((1000, 1000)))
        inference = ed.KLqp({W_0: qW_0, b_0: qb_0, W_1: qW_1, b_1: qb_1}, data={y: y_train})
        inference.initialize(n_print=0)  # n_print=0: suppress progress output
        tf.global_variables_initializer().run()
        # 999 updates, then one final update whose info dict is kept.
        for _ in tqdm(range(1000 - 1)):
            inference.update()
        info_dict = inference.update()
        inference.finalize()
        # inference.run(n_iter=1000,use_coordinator=True)
    sess.close()
    gc.collect()  # explicit collection does not stop the RSS growth
    # print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
    print('Iteration ', i, ' rss: ', py.memory_info().rss)

#2

I faced the same problem, and I confirmed that it doesn’t occur when the declaration of the model and the inference is moved outside the loop.

In your case, it would look like this:

# Workaround: build the model, the variational approximation, and the KLqp
# inference graph ONCE, outside the loop.  Each repetition then only opens a
# fresh session, re-initializes the variables, and re-runs the updates, so no
# new graph nodes accumulate across repetitions.
import tensorflow as tf
import numpy as np
import gc
import resource,os
import edward as ed
from edward.models import Normal
from tqdm import tqdm
import psutil
# Toy 1-D regression data: y = cos(x) + N(0, 0.1) noise, shaped (50, 1) float32.
x_train = np.linspace(-3, 3, num=50)
y_train = np.cos(x_train) + np.random.normal(0, 0.1, size=50)
x_train = x_train.astype(np.float32).reshape((50, 1))
y_train = y_train.astype(np.float32).reshape((50, 1))

# Priors: weights and biases of a 1-2-1 tanh network, all standard normal.
W_0 = Normal(loc=tf.zeros([1, 2]), scale=tf.ones([1, 2]))
W_1 = Normal(loc=tf.zeros([2, 1]), scale=tf.ones([2, 1]))
b_0 = Normal(loc=tf.zeros(2), scale=tf.ones(2))
b_1 = Normal(loc=tf.zeros(1), scale=tf.ones(1))

# Variational approximation: mean-field normals with trainable parameters;
# softplus keeps each scale strictly positive.  (Straight quotes here -- the
# original post used curly quotes, which are a Python syntax error.)
qW_0 = Normal(loc=tf.get_variable("qW_0/loc", [1, 2]),
              scale=tf.nn.softplus(tf.get_variable("qW_0/scale", [1, 2])))
qW_1 = Normal(loc=tf.get_variable("qW_1/loc", [2, 1]),
              scale=tf.nn.softplus(tf.get_variable("qW_1/scale", [2, 1])))
qb_0 = Normal(loc=tf.get_variable("qb_0/loc", [2]),
              scale=tf.nn.softplus(tf.get_variable("qb_0/scale", [2])))
qb_1 = Normal(loc=tf.get_variable("qb_1/loc", [1]),
              scale=tf.nn.softplus(tf.get_variable("qb_1/scale", [1])))
x = x_train
# Likelihood: a one-hidden-layer tanh network defines the mean of y.
y_mean = tf.matmul(tf.tanh(tf.matmul(x, W_0) + b_0), W_1) + b_1
y = Normal(loc=y_mean, scale=0.1)

inference = ed.KLqp({W_0: qW_0, b_0: qb_0, W_1: qW_1, b_1: qb_1}, data={y: y_train})
inference.initialize(n_print=0)  # n_print=0: suppress progress output

# Build the initializer op once as well; creating it inside the loop would add
# a new op to the graph on every repetition.
init = tf.global_variables_initializer()

reps = 5
py = psutil.Process(os.getpid())  # handle on this process, used to read RSS
for i in range(reps):
    # Do NOT call tf.reset_default_graph() here: the inference ops built above
    # live in the default graph, so resetting it would orphan them and make
    # every session run below fail.
    sess = tf.Session()
    with sess.as_default():
        # tf.constant(np.random.random((1000, 1000)))
        init.run()
        for _ in tqdm(range(1000 - 1)):
            inference.update()
        info_dict = inference.update()
        inference.finalize()
        # inference.run(n_iter=1000,use_coordinator=True)
    sess.close()
    gc.collect()
    # print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
    print('Iteration ', i, ' rss: ', py.memory_info().rss)

However, I haven’t confirmed that this program works correctly, and I can’t understand why this happens.