From 203a883552e58a7e1a055e6a92653ae3def22ff7 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 1 Apr 2026 06:06:01 +0000 Subject: [PATCH 1/5] Fix the num_workers usage. --- superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 37d27bf1a..9d34ab9eb 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -651,13 +651,16 @@ def _generate_dataset(self): if self._args.dataset_url: self._raw_data_path = str(Path(self._args.data_home) / 'data.json') download_file(self._args.dataset_url, self._raw_data_path) + command = ( 'python3 ' f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' f'--input {self._raw_data_path} ' f'--tokenizer-type {self._args.tokenizer_type} ' f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' - f'--workers {str(self._args.num_workers)} ' + # num_workers=0 is valid for DataLoader (main process loads data), + # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. + f'--workers {max(1, self._args.num_workers)} ' f'--vocab-file {self._vocab_path} ' f'--merge-file {self._merges_path}' ) From c6d205e4d1e9903470d66fb299547c04b950e253 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Mon, 27 Apr 2026 21:59:47 +0000 Subject: [PATCH 2/5] Address PR review: derive --output-prefix from data_prefix, add tests - Megatron's preprocess_data.py appends '_text_document' to the --output-prefix when producing the .bin/.idx files. Derive the output-prefix from --data_prefix (stripping a trailing '_text_document' suffix when present) so that the generated files match the existence checks for any custom data_prefix value, instead of being hardcoded to '/dataset'. - Add unit test test_megatron_gpt_dataset_generate_command covering: num_workers=0 clamps to '--workers 1' with default data_prefix ('dataset_text_document' -> '/dataset'), num_workers=4 with custom 'custom_text_document' -> '/custom', and a data_prefix without the suffix used as-is. --- .../model_benchmarks/megatron_gpt3.py | 12 +++- .../model_benchmarks/test_megatron_gpt.py | 66 +++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 9d34ab9eb..1143fc2ff 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -652,12 +652,22 @@ def _generate_dataset(self): self._raw_data_path = str(Path(self._args.data_home) / 'data.json') download_file(self._args.dataset_url, self._raw_data_path) + # Megatron's preprocess_data.py appends '_text_document' to --output-prefix + # when producing the .bin/.idx files. Derive the output-prefix from + # data_prefix (stripping the '_text_document' suffix when present) so that + # the generated files match the existence checks above for any custom + # data_prefix value. + output_prefix_basename = self._args.data_prefix + if output_prefix_basename.endswith('_text_document'): + output_prefix_basename = output_prefix_basename[:-len('_text_document')] + output_prefix = os.path.join(self._args.data_home, output_prefix_basename) + command = ( 'python3 ' f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' f'--input {self._raw_data_path} ' f'--tokenizer-type {self._args.tokenizer_type} ' - f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' + f'--output-prefix {output_prefix} ' # num_workers=0 is valid for DataLoader (main process loads data), # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. f'--workers {max(1, self._args.num_workers)} ' diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index b7c588677..b0efd3483 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -174,6 +174,72 @@ def test_megatron_gpt_dataset(self): ret = benchmark._generate_dataset() assert (ret is True) + @mock.patch('superbench.benchmarks.model_benchmarks.megatron_gpt3.run_command') + @mock.patch('superbench.benchmarks.model_benchmarks.megatron_gpt3.download_file') + def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_run_command): + """Verify _generate_dataset clamps --workers to >=1 and derives --output-prefix from data_prefix.""" + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) + assert (benchmark_cls) + os.environ['OMPI_COMM_WORLD_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_RANK'] = '0' + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + + # Case 1: num_workers=0 with default data_prefix should produce + # '--workers 1' (clamped) and '--output-prefix /dataset' + # (default data_prefix='dataset_text_document' with the suffix stripped). + benchmark = benchmark_cls( + self.benchmark_name, + parameters=( + f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' + f'--batch_size 2048 --num_workers 0 ' + f'--dataset_url http://example.com/data.json' + ), + ) + benchmark._preprocess() + ret = benchmark._generate_dataset() + # Dataset generation will fail because the mocked run_command does not actually + # produce .bin/.idx files; we only care about the constructed command. + assert ret is False + assert mock_run_command.call_count >= 1 + cmd = mock_run_command.call_args_list[0].args[0] + assert '--workers 1' in cmd, cmd + assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd + + # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should + # produce '--workers 4' and '--output-prefix /custom'. + mock_run_command.reset_mock() + benchmark = benchmark_cls( + self.benchmark_name, + parameters=( + f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' + f'--batch_size 2048 --num_workers 4 --data_prefix custom_text_document ' + f'--dataset_url http://example.com/data.json' + ), + ) + benchmark._preprocess() + benchmark._generate_dataset() + cmd = mock_run_command.call_args_list[0].args[0] + assert '--workers 4' in cmd, cmd + assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd + + # Case 3: data_prefix without the '_text_document' suffix is used as-is. + mock_run_command.reset_mock() + benchmark = benchmark_cls( + self.benchmark_name, + parameters=( + f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' + f'--batch_size 2048 --num_workers 2 --data_prefix mydata ' + f'--dataset_url http://example.com/data.json' + ), + ) + benchmark._preprocess() + benchmark._generate_dataset() + cmd = mock_run_command.call_args_list[0].args[0] + assert '--workers 2' in cmd, cmd + assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd + @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_gpt_command(self, mock_generate_dataset): """Test command generation.""" From ee2839e9d72fec7ee799fd8a1a71deaf90cf9c51 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Mon, 27 Apr 2026 22:03:02 +0000 Subject: [PATCH 3/5] Tighten test substring matches with trailing space Avoid '--workers 1' matching '--workers 10' etc. --- tests/benchmarks/model_benchmarks/test_megatron_gpt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index b0efd3483..45a6b570e 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -204,7 +204,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru assert ret is False assert mock_run_command.call_count >= 1 cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 1' in cmd, cmd + assert '--workers 1 ' in cmd, cmd assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should @@ -221,7 +221,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru benchmark._preprocess() benchmark._generate_dataset() cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 4' in cmd, cmd + assert '--workers 4 ' in cmd, cmd assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd # Case 3: data_prefix without the '_text_document' suffix is used as-is. @@ -237,7 +237,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru benchmark._preprocess() benchmark._generate_dataset() cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 2' in cmd, cmd + assert '--workers 2 ' in cmd, cmd assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') From b90dbf964c83bda263c3f7007a0bcc447a59a3b6 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Sat, 2 May 2026 00:19:08 +0000 Subject: [PATCH 4/5] Address PR review: use normalize_command and valid code_base in dataset generate test - Replace brittle whitespace substring assertions ('--workers 1 ', '--output-prefix ... ') with normalize_command()-based parsed CLI unit checks, so the test validates semantics rather than formatting. - Use --code_base {self._tmp_dir} together with createMockFiles(['pretrain_gpt.py']) to avoid the unrealistic /root/Megatron-DeepSpeed path. The mocked run_command now creates the expected .bin/.idx files via side_effect so _preprocess() succeeds end-to-end and is asserted to be True. --- .../model_benchmarks/test_megatron_gpt.py | 104 ++++++++++-------- 1 file changed, 56 insertions(+), 48 deletions(-) diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index 45a6b570e..7e0b7a6bb 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -186,59 +186,67 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12345' - # Case 1: num_workers=0 with default data_prefix should produce - # '--workers 1' (clamped) and '--output-prefix /dataset' - # (default data_prefix='dataset_text_document' with the suffix stripped). - benchmark = benchmark_cls( - self.benchmark_name, - parameters=( - f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' - f'--batch_size 2048 --num_workers 0 ' - f'--dataset_url http://example.com/data.json' - ), + # Use a real, valid code_base so _preprocess() can validate it (avoid hardcoded /root path). + self.createMockFiles(['pretrain_gpt.py']) + + # Helper: make run_command's side_effect create the expected .bin/.idx files + # so _generate_dataset() (invoked from within _preprocess()) succeeds. + created_files = [] + + def _make_dataset_files(prefix): + def _side_effect(*_args, **_kwargs): + for ext in ('.bin', '.idx'): + p = Path(self._tmp_dir) / f'{prefix}{ext}' + p.touch() + created_files.append(p) + return _side_effect + + self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()]) + + def _run_case(extra_params, expected_workers, expected_prefix_basename, expected_data_prefix): + mock_run_command.reset_mock() + mock_run_command.side_effect = _make_dataset_files(expected_data_prefix) + benchmark = benchmark_cls( + self.benchmark_name, + parameters=( + f'--code_base {self._tmp_dir} --data_home {self._tmp_dir} ' + f'--batch_size 2048 --dataset_url http://example.com/data.json ' + f'{extra_params}' + ), + ) + assert benchmark._preprocess() is True + assert mock_run_command.call_count >= 1 + cmd = mock_run_command.call_args_list[0].args[0] + units = normalize_command(cmd) + assert f'--workers {expected_workers}' in units, units + expected_output_prefix = os.path.join(self._tmp_dir, expected_prefix_basename) + assert f'--output-prefix {expected_output_prefix}' in units, units + + # Case 1: num_workers=0 with default data_prefix should produce '--workers 1' (clamped) + # and '--output-prefix /dataset' (default 'dataset_text_document' suffix stripped). + _run_case( + extra_params='--num_workers 0', + expected_workers=1, + expected_prefix_basename='dataset', + expected_data_prefix='dataset_text_document', ) - benchmark._preprocess() - ret = benchmark._generate_dataset() - # Dataset generation will fail because the mocked run_command does not actually - # produce .bin/.idx files; we only care about the constructed command. - assert ret is False - assert mock_run_command.call_count >= 1 - cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 1 ' in cmd, cmd - assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd - - # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should - # produce '--workers 4' and '--output-prefix /custom'. - mock_run_command.reset_mock() - benchmark = benchmark_cls( - self.benchmark_name, - parameters=( - f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' - f'--batch_size 2048 --num_workers 4 --data_prefix custom_text_document ' - f'--dataset_url http://example.com/data.json' - ), + + # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should produce + # '--workers 4' and '--output-prefix /custom'. + _run_case( + extra_params='--num_workers 4 --data_prefix custom_text_document', + expected_workers=4, + expected_prefix_basename='custom', + expected_data_prefix='custom_text_document', ) - benchmark._preprocess() - benchmark._generate_dataset() - cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 4 ' in cmd, cmd - assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd # Case 3: data_prefix without the '_text_document' suffix is used as-is. - mock_run_command.reset_mock() - benchmark = benchmark_cls( - self.benchmark_name, - parameters=( - f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' - f'--batch_size 2048 --num_workers 2 --data_prefix mydata ' - f'--dataset_url http://example.com/data.json' - ), + _run_case( + extra_params='--num_workers 2 --data_prefix mydata', + expected_workers=2, + expected_prefix_basename='mydata', + expected_data_prefix='mydata', ) - benchmark._preprocess() - benchmark._generate_dataset() - cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 2 ' in cmd, cmd - assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_gpt_command(self, mock_generate_dataset): From 687ac81f995a3fb2c5d8bfeb8b1afa2415c46931 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Sun, 3 May 2026 05:04:57 +0000 Subject: [PATCH 5/5] Address review feedback for num_workers / data_prefix handling - Warn when num_workers is silently clamped from 0 to 1 for the preprocess subprocess so the user sees the override in the log. The DataLoader still receives the original num_workers value. - Guard the '_text_document' suffix-strip against the edge case where data_prefix == '_text_document' (which would otherwise produce a malformed --output-prefix '/' with an empty basename). Fall back to the original data_prefix value in that case. - Extend test_megatron_gpt_dataset_generate_command with a 4th case asserting the empty-basename fallback. --- .../model_benchmarks/megatron_gpt3.py | 22 +++++++++++++++---- .../model_benchmarks/test_megatron_gpt.py | 10 +++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 1143fc2ff..7541aee4b 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -659,18 +659,32 @@ def _generate_dataset(self): # data_prefix value. output_prefix_basename = self._args.data_prefix if output_prefix_basename.endswith('_text_document'): - output_prefix_basename = output_prefix_basename[:-len('_text_document')] + stripped = output_prefix_basename[:-len('_text_document')] + # Guard against data_prefix == '_text_document' which would + # leave an empty basename and produce a malformed --output-prefix + # ending in '/'. Fall back to the original value in that case. + output_prefix_basename = stripped or output_prefix_basename output_prefix = os.path.join(self._args.data_home, output_prefix_basename) + # num_workers=0 is valid for DataLoader (main process loads data), + # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. + preprocess_workers = max(1, self._args.num_workers) + if preprocess_workers != self._args.num_workers: + logger.warning( + 'preprocess_data.py requires --workers >= 1; ' + 'overriding num_workers={} to {} for dataset preprocessing only ' + '(DataLoader still uses num_workers={}).'.format( + self._args.num_workers, preprocess_workers, self._args.num_workers + ) + ) + command = ( 'python3 ' f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' f'--input {self._raw_data_path} ' f'--tokenizer-type {self._args.tokenizer_type} ' f'--output-prefix {output_prefix} ' - # num_workers=0 is valid for DataLoader (main process loads data), - # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. - f'--workers {max(1, self._args.num_workers)} ' + f'--workers {preprocess_workers} ' f'--vocab-file {self._vocab_path} ' f'--merge-file {self._merges_path}' ) diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index 7e0b7a6bb..e856a8010 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -248,6 +248,16 @@ def _run_case(extra_params, expected_workers, expected_prefix_basename, expected expected_data_prefix='mydata', ) + # Case 4: edge case - data_prefix == '_text_document' should NOT strip down + # to an empty basename (which would produce '--output-prefix /'). + # Fall back to using '_text_document' as the basename. + _run_case( + extra_params='--num_workers 1 --data_prefix _text_document', + expected_workers=1, + expected_prefix_basename='_text_document', + expected_data_prefix='_text_document', + ) + @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_gpt_command(self, mock_generate_dataset): """Test command generation."""