I am running two machines with two GPUs each. I set TF_CONFIG on both machines and copied the training code to each, as described in the [MultiWorkerMirroredStrategy documentation](https://www.tensorflow.org/api_docs/python/tf/distribute/MultiWorkerMirroredStrategy).
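For reference, TF_CONFIG is set on each machine before the strategy is created, roughly like this (a minimal sketch; the hostnames and ports match the cluster_spec in the log below, and the second machine uses `"index": 1`):

<code>import json
import os

# Worker 0 (this machine, razer). On the second machine (alienware),
# set "index": 1 instead.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["razer:12345", "alienware:45282"]},
    "task": {"type": "worker", "index": 0},
})
</code>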
`tf.distribute.MultiWorkerMirroredStrategy` reports that both workers' GPUs are available, but training then fails with the following error:
<code>WARNING:absl:Enabling collective ops after program startup may cause error when accessing previously created tensors.
INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:GPU:0', '/job:worker/replica:0/task:0/device:GPU:1', '/job:worker/replica:0/task:1/device:CPU:0', '/job:worker/replica:0/task:1/device:GPU:0', '/job:worker/replica:0/task:1/device:GPU:1']
INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:GPU:0', '/job:worker/replica:0/task:0/device:GPU:1', '/job:worker/replica:0/task:1/device:CPU:0', '/job:worker/replica:0/task:1/device:GPU:0', '/job:worker/replica:0/task:1/device:GPU:1']
INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0/device:GPU:0', '/job:worker/task:0/device:GPU:1')
INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0/device:GPU:0', '/job:worker/task:0/device:GPU:1')
INFO:tensorflow:Check health not enabled.
INFO:tensorflow:Check health not enabled.
INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['razer:12345', 'alienware:45282']}, task_type = 'worker', task_id = 0, num_workers = 2, local_devices = ('/job:worker/task:0/device:GPU:0', '/job:worker/task:0/device:GPU:1'), communication = CommunicationImplementation.AUTO
INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['razer:12345', 'alienware:45282']}, task_type = 'worker', task_id = 0, num_workers = 2, local_devices = ('/job:worker/task:0/device:GPU:0', '/job:worker/task:0/device:GPU:1'), communication = CommunicationImplementation.AUTO
Epoch 1/3
INFO:tensorflow:Error reported to Coordinator: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/training/coordinator.py", line 293, in stop_on_exception
    yield
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/distribute/mirrored_run.py", line 387, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1809, in mean_reduce_helper
    numer = math_ops.reduce_sum(v, axis=axes)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/ops/weak_tensor_ops.py", line 88, in wrapper
    return op(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/framework/ops.py", line 1037, in _create_c_op
    raise ValueError(e.message)
ValueError: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
INFO:tensorflow:Error reported to Coordinator: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/training/coordinator.py", line 293, in stop_on_exception
    yield
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/distribute/mirrored_run.py", line 387, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1809, in mean_reduce_helper
    numer = math_ops.reduce_sum(v, axis=axes)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/ops/weak_tensor_ops.py", line 88, in wrapper
    return op(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/framework/ops.py", line 1037, in _create_c_op
    raise ValueError(e.message)
ValueError: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 83
     79     print(">>")
     82 if __name__ == "__main__":
---> 83     main(sys.argv)

Cell In[11], line 72, in main(args)
     68     with strategy.scope():
     69         # Model building/compiling need to be within `strategy.scope()`.
     70         multi_worker_model = build_and_compile_cnn_model()
---> 72     multi_worker_model.fit(multi_worker_dataset_with_shrd, epochs=3, steps_per_epoch=70)
     74     elapsed_time = time.time() - start_time
     75     str_elapsed_time = time.strftime("%H : %M : %S", time.gmtime(elapsed_time))

File /opt/anaconda3/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    119     filtered_tb = _process_traceback_frames(e.__traceback__)
    120     # To get the full stack trace, call:
    121     # `keras.config.disable_traceback_filtering()`
--> 122     raise e.with_traceback(filtered_tb) from None
    123 finally:
    124     del filtered_tb

File /opt/anaconda3/lib/python3.12/site-packages/optree/ops.py:747, in tree_map(func, tree, is_leaf, none_is_leaf, namespace, *rests)
    745 leaves, treespec = _C.flatten(tree, is_leaf, none_is_leaf, namespace)
    746 flat_args = [leaves] + [treespec.flatten_up_to(r) for r in rests]
--> 747 return treespec.unflatten(map(func, *flat_args))

ValueError: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
</code>
The code I ran, adapted from the tutorial, is the following:
<code>import json
import os
import sys
import time

import numpy as np
import tensorflow as tf


def mnist_dataset(batch_size):
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    # The `x` arrays are in uint8 and have values in the range [0, 255].
    # You need to convert them to float32 with values in the range [0, 1].
    x_train = x_train / np.float32(255)
    y_train = y_train.astype(np.int64)
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .shuffle(60000)
        .repeat()
        .batch(batch_size)
    )
    return train_dataset


def build_and_compile_cnn_model():
    model = tf.keras.Sequential(
        [
            tf.keras.Input(shape=(28, 28)),
            tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
            tf.keras.layers.Conv2D(32, 3, activation="relu"),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10),
        ]
    )
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
        metrics=["accuracy"],
    )
    return model


def main(args):
    start_time = time.time()

    # tf_config = json.loads(os.environ["TF_CONFIG"])
    # print(">>")
    # print(">> Running the prototype...")
    # print(">> TF_CONFIG: {}".format(tf_config))
    # print(">>")

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    # per_worker_batch_size = 64
    # num_workers = len(tf_config["cluster"]["worker"])
    # global_batch_size = per_worker_batch_size * num_workers
    # multi_worker_dataset = mnist_dataset(global_batch_size)

    # Turn on sharding: split each global batch across the workers.
    global_batch_size = 64
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    multi_worker_dataset = mnist_dataset(global_batch_size)
    multi_worker_dataset_with_shrd = multi_worker_dataset.with_options(options)

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_and_compile_cnn_model()

    multi_worker_model.fit(multi_worker_dataset_with_shrd, epochs=3, steps_per_epoch=70)

    elapsed_time = time.time() - start_time
    str_elapsed_time = time.strftime("%H : %M : %S", time.gmtime(elapsed_time))
    print(">>")
    print(">> Prototype run: finished. Duration: {}.".format(str_elapsed_time))
    print(">>")


if __name__ == "__main__":
    main(sys.argv)
</code>