Tensorflow NaN loss during training: trying to reshape logits and labels


#1

Hello All,

I have just started to use tensorflow. I am trying to feed some neurons with raw images (944,944) that I later reshape for BINARY classification. Full code is here:

import tensorflow as tf
import numpy as np
import os
# import cv2
from scipy import ndimage
import PIL

tf.logging.set_verbosity(tf.logging.INFO)
file_writer =tf.summary.FileWriter('./log',tf.Session().graph)

def define_model(features, labels, mode):
"""Model function for CNN."""
# Input Layer
input_layer = tf.reshape(features["x"], [-1, 512, 512, 1])

# Convolutional Layer #1
conv1 = tf.layers.conv2d(
  inputs=input_layer,
  filters=32,
  kernel_size=[16, 16],
  padding="same",
  activation=tf.nn.relu)

# Pooling Layer #1
pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

# Convolutional Layer #2 and Pooling Layer #2
conv2 = tf.layers.conv2d(
    inputs=pool1,
    filters=64,
    kernel_size=[16, 16],
    padding="same",
    activation=tf.nn.relu)
pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

# Dense Layer
# pool2_shape = tf.shape(pool2)
# pool2_flat = tf.reshape (pool2, 
[-1,pool2_shape[1]*pool2_shape[2]*pool2_shape[3]])
pool2_flat=tf.layers.flatten(pool2)
dense = tf.layers.dense(inputs=pool2_flat, units=1024, 
activation=tf.nn.relu)
dropout = tf.layers.dropout(
    inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)

# Logits Layer - raw predictions
logits = tf.layers.dense(inputs=dropout, units=10)

predictions = {
    # Generate predictions (for PREDICT and EVAL mode)
    "classes": tf.argmax(input=logits, axis=1),
    # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
    # `logging_hook`.
    "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
}

if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

# Calculate Loss (for both TRAIN and EVAL modes)
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
# loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=labels, 
logits=tf.reshape(logits,[10,10]))

# Configure the Training Op (for TRAIN mode)
if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(
        loss=loss,
        global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, 
train_op=train_op)

# Add evaluation metrics (for EVAL mode)
eval_metric_ops = {
    "accuracy": tf.metrics.accuracy(
        labels=labels, predictions=predictions["classes"])}
return tf.estimator.EstimatorSpec(
    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)


def load_images(path):
list_of_imgs = []
list_of_classes = []
for (dirpath1, dirnames1, filenames1) in os.walk(path):
    for dir1 in dirnames1:
        path1 = os.path.join(dirpath1, dir1)
        for (dirpath2, dirnames2, filenames2) in os.walk(path1):
            for dir2 in dirnames2:
                path2 = os.path.join(dirpath2, dir2)
                # for (dirpath3, dirnames3, filenames3) in 
os.walk(dirpath2):
                # for directorypath, directoryname in 
zip(dirpath3,dirnames3):
                for directoryname in os.listdir(path2):
                    if directoryname == "Mascara_Frames_Aislados":
                        directorypath = os.path.join(path2, directoryname)
                        directorypath = os.path.join(directorypath, 
"crudas")
                        for img in os.listdir(directorypath):
                            img = os.path.join(directorypath, img)
                            if not img.endswith(".bmp"):
                                continue
                            a = ndimage.imread(img)
                            if a is None:
                                print ("Unable to read image: ", img)
                                continue
                            a = np.resize(a, [512, 512])
                            list_of_imgs.append(a.flatten())
                            list_of_classes.append(1)
                    elif directoryname == "FalsaAlarma":
                        directorypath = os.path.join(path2, directoryname)
                        directorypath = os.path.join(directorypath, 
"crudas")
                        for img in os.listdir(directorypath):
                            img = os.path.join(directorypath, img)
                            if not img.endswith(".bmp"):
                                continue
                            a = ndimage.imread(img)
                            if a is None:
                                print ("Unable to read image: ", img)
                                continue
                            a = np.resize(a,[512,512])
                            list_of_imgs.append(a.flatten())
                            list_of_classes.append(0)


images = np.array(list_of_imgs, dtype="float16")
labels = np.array(list_of_classes, dtype="int32")
return images,labels

if __name__ == '__main__':
# Load training and eval data
# mnist = tf.contrib.learn.datasets.load_dataset("mnist")
# train_data = mnist.train.images  # Returns np.array
# train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
train_data, train_labels = 
load_images("C:\\Users\\Heads\\Desktop\\BDManchas_Semi")

eval_data = train_data.copy()
eval_labels = train_labels.copy()

# Create the Estimator
classifier = tf.estimator.Estimator(
    model_fn=define_model, model_dir="/tmp/convnet_model")

# Set up logging for predictions
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log, every_n_iter=50)

# Train the model
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": train_data},
    y=train_labels,
    batch_size=10,
    num_epochs=None,
    shuffle=True)
classifier.train(
    input_fn=train_input_fn,
    steps=100, #TODO estaba a 20000
    hooks=[logging_hook])

# Evaluate the model and print results
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": eval_data},
    y=eval_labels,
    num_epochs=1,
    shuffle=False)
eval_results = classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)

The problem is that I am getting the error: NaN loss during training. Like following:

C:\Users\Heads\AppData\Local\Programs\Python\Python35\python.exe 
C:/Users/Heads/Desktop/TensorflowTests/test.py
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_every_n_hours': 10000, 
'_task_id': 0, '_cluster_spec': 
<tensorflow.python.training.server_lib.ClusterSpec object at 
0x0000000007A56E48>, '_save_summary_steps': 100, '_train_distribute': None, 
'_num_worker_replicas': 1, '_task_type': 'worker', '_keep_checkpoint_max': 5, 
'_save_checkpoints_secs': 600, '_service': None, '_is_chief': True, 
'_model_dir': '/tmp/convnet_model', '_global_id_in_cluster': 0, 
'_log_step_count_steps': 100, '_tf_random_seed': None, 
'_save_checkpoints_steps': None, '_evaluation_master': '', '_master': '', 
'_num_ps_replicas': 0, '_session_config': None}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/convnet_model\model.ckpt-4
2018-05-23 16:51:08.096240: W 
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101] 
Allocation of 2147483648 exceeds 10% of system memory.
2018-05-23 16:51:11.786245: W 
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101] 
Allocation of 2147483648 exceeds 10% of system memory.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
2018-05-23 16:52:51.429447: W 
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101] 
Allocation of 2147483648 exceeds 10% of system memory.
2018-05-23 16:55:48.539695: W 
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101] 
Allocation of 1073741824 exceeds 10% of system memory.
2018-05-23 16:55:48.539695: W 
T:\src\github\tensorflow\tensorflow\core\framework\allocator.cc:101] 
Allocation of 1073741824 exceeds 10% of system memory.
INFO:tensorflow:Saving checkpoints for 5 into /tmp/convnet_model\model.ckpt.
INFO:tensorflow:probabilities = [[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]
[nan nan nan nan nan nan nan nan nan nan]]
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
 File "C:/Users/Heads/Desktop/TensorflowTests/test.py", line 155, in 
<module>
hooks=[logging_hook])
...
File "C:\Users\Heads\AppData\Local\Programs\Python\Python35\lib\site- 
packages\tensorflow\python\training\monitored_session.py", line 1199, in run
run_metadata=run_metadata))
File "C:\Users\Heads\AppData\Local\Programs\Python\Python35\lib\site- 
packages\tensorflow\python\training\basic_session_run_hooks.py", line 623, in after_run
raise NanLossDuringTrainingError


 tensorflow.python.training.basic_session_run_hooks. NanLossDuringTrainingError: NaN loss during training.

Process finished with exit code 1

I think the problem is coming from the labels, that they are not coming in the same shape as the inputs (batch=10). I have unsuccessfully tried to reshape the labels, and the logits and also I decreased the learning rate. Also I have tried to use another way of computing the loss that avoids NaN results as posted How to choose cross-entropy loss in tensorflow?

Thanks