diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index b7224ba4..ca893d3b 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -70,19 +70,6 @@ def __init__(self, cfg): self.args = ConfigArguments.get_instance() LoadConfig(self.args, cfg) - print(f"[DEBUG DLIOBenchmark.__init__] After LoadConfig:") - print(f" storage_type = {self.args.storage_type!r}") - print(f" storage_root = {self.args.storage_root!r}") - print(f" storage_options= {self.args.storage_options!r}") - print(f" data_folder = {self.args.data_folder!r}") - print(f" framework = {self.args.framework!r}") - print(f" num_files_train= {self.args.num_files_train!r}") - print(f" record_length = {self.args.record_length!r}") - print(f" generate_data = {self.args.generate_data!r}") - print(f" do_train = {self.args.do_train!r}") - print(f" do_checkpoint = {self.args.do_checkpoint!r}") - print(f" epochs = {self.args.epochs!r}") - print(f" batch_size = {self.args.batch_size!r}") self.storage = StorageFactory().get_storage(self.args.storage_type, self.args.storage_root, self.args.framework) @@ -107,6 +94,22 @@ def __init__(self, cfg): # Configure the logging library self.args.configure_dlio_logging(is_child=False) self.logger = DLIOLogger.get_instance() + + if self.my_rank == 0: + self.logger.output(f"[DEBUG DLIOBenchmark.__init__] After LoadConfig:") + self.logger.output(f" storage_type = {self.args.storage_type!r}") + self.logger.output(f" storage_root = {self.args.storage_root!r}") + self.logger.output(f" storage_options= {self.args.storage_options!r}") + self.logger.output(f" data_folder = {self.args.data_folder!r}") + self.logger.output(f" framework = {self.args.framework!r}") + self.logger.output(f" num_files_train= {self.args.num_files_train!r}") + self.logger.output(f" record_length = {self.args.record_length!r}") + self.logger.output(f" generate_data = {self.args.generate_data!r}") + self.logger.output(f" do_train = {self.args.do_train!r}") + self.logger.output(f" do_checkpoint = {self.args.do_checkpoint!r}") + self.logger.output(f" epochs = {self.args.epochs!r}") + self.logger.output(f" batch_size = {self.args.batch_size!r}") + if dftracer_initialize: dftracer = self.args.configure_dftracer(is_child=False, use_pid=False) with Profile(name=f"{self.__init__.__qualname__}", cat=MODULE_DLIO_BENCHMARK): diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index 2f8bb8fb..a88e9242 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -498,27 +498,32 @@ def derive_configurations(self, file_list_train=None, file_list_eval=None): if self.generate_data or self.do_checkpoint: from dlio_benchmark.utils.utility import HAS_DGEN method = self.data_gen_method.lower() - if method == 'numpy': - # Only reachable via explicit DLIO_DATA_GEN=numpy — warn loudly. - self.logger.output(f"{'='*80}") - self.logger.output(f"WARNING: Data Generation Method: NUMPY (Slow Legacy Path)") - self.logger.output(f" Using NumPy random generation — 155x SLOWER than dgen-py") - self.logger.output(f" This path is for explicit comparison benchmarks ONLY.") - self.logger.output(f" Remove DLIO_DATA_GEN=numpy to restore dgen-py (default).") - self.logger.output(f"{'='*80}") - elif not HAS_DGEN: - # dgen is the default but dgen-py is not installed — warn and fall back. - self.logger.warning( - "dgen-py is not installed — falling back to NumPy for data generation " - "(~155x slower). Install dgen-py>=0.2.0 (requires Python>=3.11) for " - "full performance, or set DLIO_DATA_GEN=numpy to suppress this warning." - ) + + if method != 'numpy' and not HAS_DGEN: self.data_gen_method = 'numpy' - else: - self.logger.output(f"{'='*80}") - self.logger.output(f"Data Generation Method: DGEN (default)") - self.logger.output(f" dgen-py zero-copy BytesView — 155x faster than NumPy, 0 MiB overhead") - self.logger.output(f"{'='*80}") + + if DLIOMPI.get_instance().rank() == 0: + if method == 'numpy': + # Only reachable via explicit DLIO_DATA_GEN=numpy — warn loudly. + self.logger.output(f"{'='*80}") + self.logger.output(f"WARNING: Data Generation Method: NUMPY (Slow Legacy Path)") + self.logger.output(f" Using NumPy random generation — 155x SLOWER than dgen-py") + self.logger.output(f" This path is for explicit comparison benchmarks ONLY.") + self.logger.output(f" Remove DLIO_DATA_GEN=numpy to restore dgen-py (default).") + self.logger.output(f"{'='*80}") + elif not HAS_DGEN: + # dgen is the default but dgen-py is not installed — warn and fall back. + self.logger.warning( + "dgen-py is not installed — falling back to NumPy for data generation " + "(~155x slower). Install dgen-py>=0.2.0 (requires Python>=3.11) for " + "full performance, or set DLIO_DATA_GEN=numpy to suppress this warning." + ) + + else: + self.logger.output(f"{'='*80}") + self.logger.output(f"Data Generation Method: DGEN (default)") + self.logger.output(f" dgen-py zero-copy BytesView — 155x faster than NumPy, 0 MiB overhead") + self.logger.output(f"{'='*80}") if self.checkpoint_mechanism == CheckpointMechanismType.NONE: if self.framework == FrameworkType.TENSORFLOW: