From 367cf624f27e90d23c31f55b36a5e5c9e4f34d38 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang
Date: Thu, 26 Mar 2026 22:16:10 +0000
Subject: [PATCH 1/3] Fix DeepSpeed BF16 config validation error by removing
 FP16-only loss scaling parameters

---
 .../model_benchmarks/megatron_gpt3.py | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 37d27bf1a..42f4459c0 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -307,15 +307,23 @@ def __prepare_deespeed_config(self, precision_megatron):
         """Prepare deepspeed configs."""
         self._config_json_path = os.path.join(self._args.data_home, 'ds_config_gpt.json')
         # Load deepspeed config template json file
-        precision_template = {
-            'enabled': True,
-            'loss_scale': 0,
-            'loss_scale_window': 500,
-            'min_loss_scale': 1,
-            'initial_scale_power': 11
-        }
-        if self._args.hysteresis is not None:
-            precision_template['hysteresis'] = self._args.hysteresis
+        # FP16 supports loss scaling parameters; BF16 does not (sufficient dynamic range).
+        if precision_megatron == 'fp16':
+            precision_template = {
+                'enabled': True,
+                'loss_scale': 0,
+                'loss_scale_window': 500,
+                'min_loss_scale': 1,
+                'initial_scale_power': 11
+            }
+            if self._args.hysteresis is not None:
+                precision_template['hysteresis'] = self._args.hysteresis
+        elif precision_megatron == 'bf16':
+            precision_template = {
+                'enabled': True,
+            }
+        else:
+            precision_template = None

         ds_config_template = {
             'train_batch_size': self._args.batch_size,
@@ -328,7 +336,7 @@ def __prepare_deespeed_config(self, precision_megatron):
             'prescale_gradients': self._args.prescale_grad,
         }

-        if len(precision_megatron) > 0:
+        if precision_template is not None:
             ds_config_template[precision_megatron] = precision_template

         # Write to config json file

From d3baec915066a7e33590bfeab1b4615bd14028ab Mon Sep 17 00:00:00 2001
From: Hongtao Zhang
Date: Mon, 27 Apr 2026 21:52:17 +0000
Subject: [PATCH 2/3] Address PR review: clarify deepspeed config template
 comment

Replace misleading 'Load deepspeed config template json file' comment with
'Build deepspeed config template in memory' since the template is constructed
inline rather than loaded from a file.
---
 superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 42f4459c0..1b07c0f9d 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -306,7 +306,7 @@ def _parse_log(self, output):
     def __prepare_deespeed_config(self, precision_megatron):
         """Prepare deepspeed configs."""
         self._config_json_path = os.path.join(self._args.data_home, 'ds_config_gpt.json')
-        # Load deepspeed config template json file
+        # Build deepspeed config template in memory.
         # FP16 supports loss scaling parameters; BF16 does not (sufficient dynamic range).
         if precision_megatron == 'fp16':
             precision_template = {
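
Note: a sketch of the ds_config_gpt.json these two patches produce. Values
such as train_batch_size come from the benchmark arguments (2048 matches the
test in patch 3), hysteresis is only attached when --hysteresis is passed,
and unrelated keys (steps_per_print, gradient_clipping, prescale_gradients)
are elided. For an fp16 run the precision section keeps the loss-scaling
knobs:

    {
        "train_batch_size": 2048,
        "fp16": {
            "enabled": true,
            "loss_scale": 0,
            "loss_scale_window": 500,
            "min_loss_scale": 1,
            "initial_scale_power": 11
        }
    }

while a bf16 run writes only the enabled flag:

    {
        "train_batch_size": 2048,
        "bf16": { "enabled": true }
    }

For the fp32 path (empty precision string) precision_template is None, so the
`if precision_template is not None` guard attaches no precision section at all.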
From 158733527fdc217c53c611f562ea6af8eba1b Mon Sep 17 00:00:00 2001
From: Hongtao Zhang
Date: Sun, 3 May 2026 05:03:43 +0000
Subject: [PATCH 3/3] test: add deepspeed config schema regression test for
 fp16/bf16/fp32

Asserts that __prepare_deespeed_config writes the expected JSON schema:

- bf16 -> {'enabled': True} only (no loss_scale / loss_scale_window /
  min_loss_scale / initial_scale_power / hysteresis), matching DeepSpeed's
  BF16 config validator that triggered the original failure.
- fp16 -> retains the loss-scale fields.
- '' (empty precision, e.g. fp32 path) -> no precision section attached.

The test cleans up the mock pretrain_gpt.py at test end so it does not leak
into the negative path of test_megatron_gpt_preprocess.
---
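Note (below the three-dash line, so it stays out of the commit message): the
test reaches the private helpers via CPython name mangling. Inside a class
body, an identifier with two leading underscores such as
__prepare_deespeed_config is rewritten to
_MegatronGPT__prepare_deespeed_config, which is the spelling external callers
must use; the same applies to BenchmarkRegistry.__select_benchmark. A minimal
sketch of the mechanism, with illustrative names only:

    class Owner:
        def __helper(self):
            # two leading underscores: compiled to _Owner__helper
            return 'ok'

    o = Owner()
    assert o._Owner__helper() == 'ok'  # external callers use the mangled name
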
 .../model_benchmarks/test_megatron_gpt.py | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index b7c588677..39e4558fe 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -500,6 +500,69 @@ def test_deepseek_v2_command(self):

         self.assertEqual(actual_units, expected_units)

+    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
+    def test_megatron_gpt_deepspeed_config(self, mock_generate_dataset):
+        """Lock in the deepspeed JSON schema written for fp16 / bf16 / fp32.
+
+        BF16 must NOT include loss_scale / loss_scale_window / min_loss_scale /
+        initial_scale_power / hysteresis (DeepSpeed's BF16 config schema rejects
+        them); FP16 must keep them; FP32 must omit any precision section.
+        """
+        import json
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
+        assert (benchmark_cls)
+        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
+        os.environ['MASTER_ADDR'] = 'localhost'
+        os.environ['MASTER_PORT'] = '12345'
+        self.createMockFiles(['pretrain_gpt.py'])
+        # createMockFiles only cleans up at tearDownClass; remove at test end so
+        # we don't leak the file into other tests (e.g. test_megatron_gpt_preprocess
+        # which exercises the "no code_base" negative path).
+        pretrain_path = Path(self._tmp_dir) / 'pretrain_gpt.py'
+        self.addCleanup(lambda: pretrain_path.is_file() and pretrain_path.unlink())
+
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=(
+                f'--code_base {self._tmp_dir} --data_home {self._tmp_dir} '
+                f'--batch_size 2048 --deepspeed'
+            ),
+        )
+        mock_generate_dataset.return_value = True
+        assert benchmark._preprocess() is True
+
+        # bf16: only {'enabled': True}, no loss-scale fields, no fp16 section
+        benchmark._MegatronGPT__prepare_deespeed_config('bf16')
+        with open(benchmark._config_json_path) as f:
+            cfg = json.load(f)
+        self.assertEqual(cfg.get('bf16'), {'enabled': True})
+        self.assertNotIn('fp16', cfg)
+        for forbidden in ('loss_scale', 'loss_scale_window', 'min_loss_scale', 'initial_scale_power', 'hysteresis'):
+            self.assertNotIn(forbidden, cfg['bf16'])
+
+        # fp16: keeps loss-scale fields
+        benchmark._MegatronGPT__prepare_deespeed_config('fp16')
+        with open(benchmark._config_json_path) as f:
+            cfg = json.load(f)
+        self.assertIn('fp16', cfg)
+        self.assertNotIn('bf16', cfg)
+        self.assertTrue(cfg['fp16']['enabled'])
+        self.assertEqual(cfg['fp16']['loss_scale'], 0)
+        self.assertEqual(cfg['fp16']['loss_scale_window'], 500)
+        self.assertEqual(cfg['fp16']['min_loss_scale'], 1)
+        self.assertEqual(cfg['fp16']['initial_scale_power'], 11)
+
+        # empty precision (e.g., fp32 path which calls __prepare with ''):
+        # no precision section attached.
+        benchmark._MegatronGPT__prepare_deespeed_config('')
+        with open(benchmark._config_json_path) as f:
+            cfg = json.load(f)
+        self.assertNotIn('fp16', cfg)
+        self.assertNotIn('bf16', cfg)
+        self.assertNotIn('', cfg)
+
     @decorator.load_data('tests/data/megatron_deepspeed.log')
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
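
Note: a sketch of how to run only this regression test locally (assuming the
suite is driven by pytest, as the tests/ layout suggests; substitute your
runner otherwise):

    python -m pytest tests/benchmarks/model_benchmarks/test_megatron_gpt.py \
        -k test_megatron_gpt_deepspeed_config -x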