I am running two machines with two GPUs each. I set TF_CONFIG on both machines and copied the training code to each, as described in the [MultiWorkerMirroredStrategy documentation](https://www.tensorflow.org/api_docs/python/tf/distribute/MultiWorkerMirroredStrategy).
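For reference, TF_CONFIG is set on each machine before the strategy is created, roughly like this (a minimal sketch; the hostnames and ports match the cluster_spec in the log below, and the second machine uses `"index": 1`):

<code>import json
import os

# Worker 0 (this machine, razer). On the second machine (alienware),
# set "index": 1 instead.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["razer:12345", "alienware:45282"]},
    "task": {"type": "worker", "index": 0},
})
</code>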
`tf.distribute.MultiWorkerMirroredStrategy` reports that both workers' GPUs are available, but training then fails with the following error:
<code>WARNING:absl:Enabling collective ops after program startup may cause error when accessing previously created tensors.
INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:GPU:0', '/job:worker/replica:0/task:0/device:GPU:1', '/job:worker/replica:0/task:1/device:CPU:0', '/job:worker/replica:0/task:1/device:GPU:0', '/job:worker/replica:0/task:1/device:GPU:1']
INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:GPU:0', '/job:worker/replica:0/task:0/device:GPU:1', '/job:worker/replica:0/task:1/device:CPU:0', '/job:worker/replica:0/task:1/device:GPU:0', '/job:worker/replica:0/task:1/device:GPU:1']
INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0/device:GPU:0', '/job:worker/task:0/device:GPU:1')
INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0/device:GPU:0', '/job:worker/task:0/device:GPU:1')
INFO:tensorflow:Check health not enabled.
INFO:tensorflow:Check health not enabled.
INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['razer:12345', 'alienware:45282']}, task_type = 'worker', task_id = 0, num_workers = 2, local_devices = ('/job:worker/task:0/device:GPU:0', '/job:worker/task:0/device:GPU:1'), communication = CommunicationImplementation.AUTO
INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['razer:12345', 'alienware:45282']}, task_type = 'worker', task_id = 0, num_workers = 2, local_devices = ('/job:worker/task:0/device:GPU:0', '/job:worker/task:0/device:GPU:1'), communication = CommunicationImplementation.AUTO
Epoch 1/3
INFO:tensorflow:Error reported to Coordinator: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/training/coordinator.py", line 293, in stop_on_exception
    yield
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/distribute/mirrored_run.py", line 387, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1809, in mean_reduce_helper
    numer = math_ops.reduce_sum(v, axis=axes)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/ops/weak_tensor_ops.py", line 88, in wrapper
    return op(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/framework/ops.py", line 1037, in _create_c_op
    raise ValueError(e.message)
ValueError: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
INFO:tensorflow:Error reported to Coordinator: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/training/coordinator.py", line 293, in stop_on_exception
    yield
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/distribute/mirrored_run.py", line 387, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1809, in mean_reduce_helper
    numer = math_ops.reduce_sum(v, axis=axes)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/ops/weak_tensor_ops.py", line 88, in wrapper
    return op(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/opt/anaconda3/lib/python3.12/site-packages/tensorflow/python/framework/ops.py", line 1037, in _create_c_op
    raise ValueError(e.message)
ValueError: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 83
     79     print(">>")
     82 if __name__ == "__main__":
---> 83     main(sys.argv)

Cell In[11], line 72, in main(args)
     68     with strategy.scope():
     69         # Model building/compiling need to be within `strategy.scope()`.
     70         multi_worker_model = build_and_compile_cnn_model()
---> 72     multi_worker_model.fit(multi_worker_dataset_with_shrd, epochs=3, steps_per_epoch=70)
     74     elapsed_time = time.time() - start_time
     75     str_elapsed_time = time.strftime("%H : %M : %S", time.gmtime(elapsed_time))

File /opt/anaconda3/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    119     filtered_tb = _process_traceback_frames(e.__traceback__)
    120     # To get the full stack trace, call:
    121     # `keras.config.disable_traceback_filtering()`
--> 122     raise e.with_traceback(filtered_tb) from None
    123 finally:
    124     del filtered_tb

File /opt/anaconda3/lib/python3.12/site-packages/optree/ops.py:747, in tree_map(func, tree, is_leaf, none_is_leaf, namespace, *rests)
    745 leaves, treespec = _C.flatten(tree, is_leaf, none_is_leaf, namespace)
    746 flat_args = [leaves] + [treespec.flatten_up_to(r) for r in rests]
--> 747 return treespec.unflatten(map(func, *flat_args))

ValueError: Invalid reduction dimension 0 for input with 0 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](StatefulPartitionedCall, Sum/reduction_indices)' with input shapes: [], [] and with computed input tensors: input[1] = <0>.
</code>
The code I ran, adapted from the tutorial, is the following:
<code>import json
import os
import sys
import time

import numpy as np
import tensorflow as tf


def mnist_dataset(batch_size):
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    # The `x` arrays are in uint8 and have values in the range [0, 255].
    # You need to convert them to float32 with values in the range [0, 1].
    x_train = x_train / np.float32(255)
    y_train = y_train.astype(np.int64)
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .shuffle(60000)
        .repeat()
        .batch(batch_size)
    )
    return train_dataset


def build_and_compile_cnn_model():
    model = tf.keras.Sequential(
        [
            tf.keras.Input(shape=(28, 28)),
            tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
            tf.keras.layers.Conv2D(32, 3, activation="relu"),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10),
        ]
    )
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
        metrics=["accuracy"],
    )
    return model


def main(args):
    start_time = time.time()

    # tf_config = json.loads(os.environ["TF_CONFIG"])
    # print(">>")
    # print(">> Running the prototype...")
    # print(">> TF_CONFIG: {}".format(tf_config))
    # print(">>")

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    # per_worker_batch_size = 64
    # num_workers = len(tf_config["cluster"]["worker"])
    # global_batch_size = per_worker_batch_size * num_workers
    # multi_worker_dataset = mnist_dataset(global_batch_size)

    # Turn on sharding: split each global batch across the workers.
    global_batch_size = 64
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    multi_worker_dataset = mnist_dataset(global_batch_size)
    multi_worker_dataset_with_shrd = multi_worker_dataset.with_options(options)

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_and_compile_cnn_model()

    multi_worker_model.fit(multi_worker_dataset_with_shrd, epochs=3, steps_per_epoch=70)

    elapsed_time = time.time() - start_time
    str_elapsed_time = time.strftime("%H : %M : %S", time.gmtime(elapsed_time))
    print(">>")
    print(">> Prototype run: finished. Duration: {}.".format(str_elapsed_time))
    print(">>")


if __name__ == "__main__":
    main(sys.argv)
</code>