From 367cf624f27e90d23c31f55b36a5e5c9e4f34d38 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang
Date: Thu, 26 Mar 2026 22:16:10 +0000
Subject: [PATCH 1/3] Fix DeepSpeed BF16 config validation error by removing
 FP16-only loss scaling parameters

---
 .../model_benchmarks/megatron_gpt3.py | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 37d27bf1a..42f4459c0 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -307,15 +307,23 @@ def __prepare_deespeed_config(self, precision_megatron):
         """Prepare deepspeed configs."""
         self._config_json_path = os.path.join(self._args.data_home, 'ds_config_gpt.json')
         # Load deepspeed config template json file
-        precision_template = {
-            'enabled': True,
-            'loss_scale': 0,
-            'loss_scale_window': 500,
-            'min_loss_scale': 1,
-            'initial_scale_power': 11
-        }
-        if self._args.hysteresis is not None:
-            precision_template['hysteresis'] = self._args.hysteresis
+        # FP16 supports loss scaling parameters; BF16 does not (sufficient dynamic range).
+        if precision_megatron == 'fp16':
+            precision_template = {
+                'enabled': True,
+                'loss_scale': 0,
+                'loss_scale_window': 500,
+                'min_loss_scale': 1,
+                'initial_scale_power': 11
+            }
+            if self._args.hysteresis is not None:
+                precision_template['hysteresis'] = self._args.hysteresis
+        elif precision_megatron == 'bf16':
+            precision_template = {
+                'enabled': True,
+            }
+        else:
+            precision_template = None

         ds_config_template = {
             'train_batch_size': self._args.batch_size,
@@ -328,7 +336,7 @@ def __prepare_deespeed_config(self, precision_megatron):
             'prescale_gradients': self._args.prescale_grad,
         }

-        if len(precision_megatron) > 0:
+        if precision_template is not None:
             ds_config_template[precision_megatron] = precision_template

         # Write to config json file

From d3baec915066a7e33590bfeab1b4615bd14028ab Mon Sep 17 00:00:00 2001
From: Hongtao Zhang
Date: Mon, 27 Apr 2026 21:52:17 +0000
Subject: [PATCH 2/3] Address PR review: clarify deepspeed config template
 comment

Replace misleading 'Load deepspeed config template json file' comment with
'Build deepspeed config template in memory' since the template is constructed
inline rather than loaded from a file.
---
 superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 42f4459c0..1b07c0f9d 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -306,7 +306,7 @@ def _parse_log(self, output):
     def __prepare_deespeed_config(self, precision_megatron):
         """Prepare deepspeed configs."""
         self._config_json_path = os.path.join(self._args.data_home, 'ds_config_gpt.json')
-        # Load deepspeed config template json file
+        # Build deepspeed config template in memory.
         # FP16 supports loss scaling parameters; BF16 does not (sufficient dynamic range).
         if precision_megatron == 'fp16':
             precision_template = {
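
Note: a sketch of the ds_config_gpt.json these two patches produce. Values
such as train_batch_size come from the benchmark arguments (2048 matches the
test in patch 3), hysteresis is only attached when --hysteresis is passed,
and unrelated keys (steps_per_print, gradient_clipping, prescale_gradients)
are elided. For an fp16 run the precision section keeps the loss-scaling
knobs:

    {
        "train_batch_size": 2048,
        "fp16": {
            "enabled": true,
            "loss_scale": 0,
            "loss_scale_window": 500,
            "min_loss_scale": 1,
            "initial_scale_power": 11
        }
    }

while a bf16 run writes only the enabled flag:

    {
        "train_batch_size": 2048,
        "bf16": { "enabled": true }
    }

For the fp32 path (empty precision string) precision_template is None, so the
`if precision_template is not None` guard attaches no precision section at all.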
From 158733527fdc217c53c611f562ea6af8eba1b Mon Sep 17 00:00:00 2001
From: Hongtao Zhang
Date: Sun, 3 May 2026 05:03:43 +0000
Subject: [PATCH 3/3] test: add deepspeed config schema regression test for
 fp16/bf16/fp32

Asserts that __prepare_deespeed_config writes the expected JSON schema:

- bf16 -> {'enabled': True} only (no loss_scale / loss_scale_window /
  min_loss_scale / initial_scale_power / hysteresis), matching DeepSpeed's
  BF16 config validator that triggered the original failure.
- fp16 -> retains the loss-scale fields.
- '' (empty precision, e.g. fp32 path) -> no precision section attached.

The test cleans up the mock pretrain_gpt.py at test end so it does not leak
into the negative path of test_megatron_gpt_preprocess.
---
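Note (below the three-dash line, so it stays out of the commit message): the
test reaches the private helpers via CPython name mangling. Inside a class
body, an identifier with two leading underscores such as
__prepare_deespeed_config is rewritten to
_MegatronGPT__prepare_deespeed_config, which is the spelling external callers
must use; the same applies to BenchmarkRegistry.__select_benchmark. A minimal
sketch of the mechanism, with illustrative names only:

    class Owner:
        def __helper(self):
            # two leading underscores: compiled to _Owner__helper
            return 'ok'

    o = Owner()
    assert o._Owner__helper() == 'ok'  # external callers use the mangled name
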
 .../model_benchmarks/test_megatron_gpt.py | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index b7c588677..39e4558fe 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -500,6 +500,69 @@ def test_deepseek_v2_command(self):

         self.assertEqual(actual_units, expected_units)

+    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
+    def test_megatron_gpt_deepspeed_config(self, mock_generate_dataset):
+        """Lock in the deepspeed JSON schema written for fp16 / bf16 / fp32.
+
+        BF16 must NOT include loss_scale / loss_scale_window / min_loss_scale /
+        initial_scale_power / hysteresis (DeepSpeed's BF16 config schema rejects
+        them); FP16 must keep them; FP32 must omit any precision section.
+        """
+        import json
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
+        assert (benchmark_cls)
+        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
+        os.environ['MASTER_ADDR'] = 'localhost'
+        os.environ['MASTER_PORT'] = '12345'
+        self.createMockFiles(['pretrain_gpt.py'])
+        # createMockFiles only cleans up at tearDownClass; remove at test end so
+        # we don't leak the file into other tests (e.g. test_megatron_gpt_preprocess
+        # which exercises the "no code_base" negative path).
+        pretrain_path = Path(self._tmp_dir) / 'pretrain_gpt.py'
+        self.addCleanup(lambda: pretrain_path.is_file() and pretrain_path.unlink())
+
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=(
+                f'--code_base {self._tmp_dir} --data_home {self._tmp_dir} '
+                f'--batch_size 2048 --deepspeed'
+            ),
+        )
+        mock_generate_dataset.return_value = True
+        assert benchmark._preprocess() is True
+
+        # bf16: only {'enabled': True}, no loss-scale fields, no fp16 section
+        benchmark._MegatronGPT__prepare_deespeed_config('bf16')
+        with open(benchmark._config_json_path) as f:
+            cfg = json.load(f)
+        self.assertEqual(cfg.get('bf16'), {'enabled': True})
+        self.assertNotIn('fp16', cfg)
+        for forbidden in ('loss_scale', 'loss_scale_window', 'min_loss_scale', 'initial_scale_power', 'hysteresis'):
+            self.assertNotIn(forbidden, cfg['bf16'])
+
+        # fp16: keeps loss-scale fields
+        benchmark._MegatronGPT__prepare_deespeed_config('fp16')
+        with open(benchmark._config_json_path) as f:
+            cfg = json.load(f)
+        self.assertIn('fp16', cfg)
+        self.assertNotIn('bf16', cfg)
+        self.assertTrue(cfg['fp16']['enabled'])
+        self.assertEqual(cfg['fp16']['loss_scale'], 0)
+        self.assertEqual(cfg['fp16']['loss_scale_window'], 500)
+        self.assertEqual(cfg['fp16']['min_loss_scale'], 1)
+        self.assertEqual(cfg['fp16']['initial_scale_power'], 11)
+
+        # empty precision (e.g., fp32 path which calls __prepare with ''):
+        # no precision section attached.
+        benchmark._MegatronGPT__prepare_deespeed_config('')
+        with open(benchmark._config_json_path) as f:
+            cfg = json.load(f)
+        self.assertNotIn('fp16', cfg)
+        self.assertNotIn('bf16', cfg)
+        self.assertNotIn('', cfg)
+
     @decorator.load_data('tests/data/megatron_deepspeed.log')
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
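
Note: a sketch of how to run only this regression test locally (assuming the
suite is driven by pytest, as the tests/ layout suggests; substitute your
runner otherwise):

    python -m pytest tests/benchmarks/model_benchmarks/test_megatron_gpt.py \
        -k test_megatron_gpt_deepspeed_config -x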