
Commit 35086ac

[core] support device type device_maps to work with offloading. (#12811)
* support device type device_maps to work with offloading.
* add tests.
* fix tests
* skip tests where it's not supported.
* empty
* up
* up
* fix allegro.
1 parent: e390646

10 files changed: 83 additions, 18 deletions


src/diffusers/pipelines/pipeline_utils.py

19 additions, 5 deletions

```diff
@@ -112,7 +112,7 @@
 for library in LOADABLE_CLASSES:
     LIBRARIES.append(library)
 
-SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device()]
+SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device(), "cpu"]
 
 logger = logging.get_logger(__name__)
 
@@ -468,8 +468,7 @@ def module_is_offloaded(module):
         pipeline_is_sequentially_offloaded = any(
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )
-
-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
@@ -1188,7 +1187,7 @@ def enable_model_cpu_offload(self, gpu_id: int | None = None, device: torch.devi
         """
         self._maybe_raise_error_if_group_offload_active(raise_error=True)
 
-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
@@ -1312,7 +1311,7 @@ def enable_sequential_cpu_offload(self, gpu_id: int | None = None, device: torch
             raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
         self.remove_all_hooks()
 
-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
@@ -2228,6 +2227,21 @@ def _maybe_raise_error_if_group_offload_active(
                 return True
         return False
 
+    def _is_pipeline_device_mapped(self):
+        # We support passing `device_map="cuda"`, for example. This is helpful in case
+        # users want to pass `device_map="cpu"` when initializing a pipeline. This explicit declaration is desirable
+        # in limited VRAM environments because quantized models often initialize directly on the accelerator.
+        device_map = self.hf_device_map
+        is_device_type_map = False
+        if isinstance(device_map, str):
+            try:
+                torch.device(device_map)
+                is_device_type_map = True
+            except RuntimeError:
+                pass
+
+        return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1
 
 
 class StableDiffusionMixin:
     r"""
```

tests/models/testing_utils/quantization.py

15 additions, 0 deletions

```diff
@@ -628,6 +628,21 @@ def test_bnb_training(self):
         """Test that quantized models can be used for training with adapters."""
         self._test_quantization_training(BitsAndBytesConfigMixin.BNB_CONFIGS["4bit_nf4"])
 
+    @pytest.mark.parametrize(
+        "config_name",
+        list(BitsAndBytesConfigMixin.BNB_CONFIGS.keys()),
+        ids=list(BitsAndBytesConfigMixin.BNB_CONFIGS.keys()),
+    )
+    def test_cpu_device_map(self, config_name):
+        config_kwargs = BitsAndBytesConfigMixin.BNB_CONFIGS[config_name]
+        model_quantized = self._create_quantized_model(config_kwargs, device_map="cpu")
+
+        assert hasattr(model_quantized, "hf_device_map"), "Model should have hf_device_map attribute"
+        assert model_quantized.hf_device_map is not None, "hf_device_map should not be None"
+        assert model_quantized.device == torch.device("cpu"), (
+            f"Model should be on CPU, but is on {model_quantized.device}"
+        )
 
 
 @is_quantization
 @is_quanto
```
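
At the usage level, the new test corresponds roughly to this hedged sketch (the model ID, the subfolder, and a bitsandbytes build that supports CPU placement are assumptions, not confirmed by the diff):

```python
import torch
from diffusers import BitsAndBytesConfig, SD3Transformer2DModel

# Quantized models often initialize directly on the accelerator, which can
# OOM in limited-VRAM environments. An explicit device_map="cpu" keeps the
# quantized weights on CPU while still populating hf_device_map.
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
model = SD3Transformer2DModel.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # illustrative model ID
    subfolder="transformer",
    quantization_config=quant_config,
    device_map="cpu",
)

assert model.hf_device_map is not None
assert model.device == torch.device("cpu")
```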

tests/pipelines/allegro/test_allegro.py

4 additions, 0 deletions

```diff
@@ -158,6 +158,10 @@ def test_save_load_local(self):
     def test_save_load_optional_components(self):
         pass
 
+    @unittest.skip("Decoding without tiling is not yet implemented")
+    def test_pipeline_with_accelerator_device_map(self):
+        pass
+
     def test_inference(self):
         device = "cpu"
```

tests/pipelines/kandinsky/test_kandinsky_combined.py

13 additions, 3 deletions

```diff
@@ -34,9 +34,7 @@
 
 class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyCombinedPipeline
-    params = [
-        "prompt",
-    ]
+    params = ["prompt"]
     batch_params = ["prompt", "negative_prompt"]
     required_optional_params = [
         "generator",
@@ -148,6 +146,10 @@ def test_float16_inference(self):
     def test_dict_tuple_outputs_equivalent(self):
         super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
 
+    @unittest.skip("Test not supported.")
+    def test_pipeline_with_accelerator_device_map(self):
+        pass
+
 
 class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyImg2ImgCombinedPipeline
@@ -264,6 +266,10 @@ def test_dict_tuple_outputs_equivalent(self):
     def test_save_load_optional_components(self):
         super().test_save_load_optional_components(expected_max_difference=5e-4)
 
+    @unittest.skip("Test not supported.")
+    def test_pipeline_with_accelerator_device_map(self):
+        pass
+
 
 class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyInpaintCombinedPipeline
@@ -384,3 +390,7 @@ def test_save_load_optional_components(self):
 
     def test_save_load_local(self):
         super().test_save_load_local(expected_max_difference=5e-3)
+
+    @unittest.skip("Test not supported.")
+    def test_pipeline_with_accelerator_device_map(self):
+        pass
```

tests/pipelines/kandinsky2_2/test_kandinsky_combined.py

18 additions, 9 deletions

```diff
@@ -36,9 +36,7 @@
 
 class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyV22CombinedPipeline
-    params = [
-        "prompt",
-    ]
+    params = ["prompt"]
     batch_params = ["prompt", "negative_prompt"]
     required_optional_params = [
         "generator",
@@ -70,12 +68,7 @@ def get_dummy_components(self):
     def get_dummy_inputs(self, device, seed=0):
         prior_dummy = PriorDummies()
         inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
-        inputs.update(
-            {
-                "height": 64,
-                "width": 64,
-            }
-        )
+        inputs.update({"height": 64, "width": 64})
         return inputs
 
     def test_kandinsky(self):
@@ -155,12 +148,18 @@ def test_save_load_local(self):
     def test_save_load_optional_components(self):
         super().test_save_load_optional_components(expected_max_difference=5e-3)
 
+    @unittest.skip("Test not supported.")
     def test_callback_inputs(self):
         pass
 
+    @unittest.skip("Test not supported.")
     def test_callback_cfg(self):
         pass
 
+    @unittest.skip("Test not supported.")
+    def test_pipeline_with_accelerator_device_map(self):
+        pass
+
 
 class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyV22Img2ImgCombinedPipeline
@@ -279,12 +278,18 @@ def test_save_load_optional_components(self):
     def save_load_local(self):
         super().test_save_load_local(expected_max_difference=5e-3)
 
+    @unittest.skip("Test not supported.")
     def test_callback_inputs(self):
         pass
 
+    @unittest.skip("Test not supported.")
     def test_callback_cfg(self):
         pass
 
+    @unittest.skip("Test not supported.")
+    def test_pipeline_with_accelerator_device_map(self):
+        pass
+
 
 class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyV22InpaintCombinedPipeline
@@ -411,3 +416,7 @@ def test_callback_inputs(self):
 
     def test_callback_cfg(self):
         pass
+
+    @unittest.skip("`device_map` is not yet supported for connected pipelines.")
+    def test_pipeline_with_accelerator_device_map(self):
+        pass
```

tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py

3 additions, 0 deletions

```diff
@@ -296,6 +296,9 @@ def callback_inputs_test(pipe, i, t, callback_kwargs):
         output = pipe(**inputs)[0]
         assert output.abs().sum() == 0
 
+    def test_pipeline_with_accelerator_device_map(self):
+        super().test_pipeline_with_accelerator_device_map(expected_max_difference=5e-3)
+
 
 @slow
 @require_torch_accelerator
```

tests/pipelines/kandinsky3/test_kandinsky3_img2img.py

3 additions, 0 deletions

```diff
@@ -194,6 +194,9 @@ def test_inference_batch_single_identical(self):
     def test_save_load_dduf(self):
         super().test_save_load_dduf(atol=1e-3, rtol=1e-3)
 
+    def test_pipeline_with_accelerator_device_map(self):
+        super().test_pipeline_with_accelerator_device_map(expected_max_difference=5e-3)
+
 
 @slow
 @require_torch_accelerator
```

tests/pipelines/test_pipelines_common.py

0 additions, 1 deletion

```diff
@@ -2355,7 +2355,6 @@ def test_torch_dtype_dict(self):
             f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
         )
 
-    @require_torch_accelerator
     def test_pipeline_with_accelerator_device_map(self, expected_max_difference=1e-4):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
```
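
The test body continues past the diff context shown above; dropping `@require_torch_accelerator` is consistent with `device_map` now accepting "cpu". A hedged sketch of the comparison such a test plausibly performs (everything past the three context lines, including `torch_device` and the save/reload round trip, is an assumption):

```python
import tempfile

import numpy as np

def test_pipeline_with_accelerator_device_map(self, expected_max_difference=1e-4):
    components = self.get_dummy_components()
    pipe = self.pipeline_class(**components)
    pipe.to(torch_device)

    # Baseline: outputs from a pipeline placed with `.to()`.
    output = pipe(**self.get_dummy_inputs(torch_device))[0]

    # Reload the same weights with a device-type device map and compare.
    with tempfile.TemporaryDirectory() as tmpdir:
        pipe.save_pretrained(tmpdir)
        pipe_mapped = self.pipeline_class.from_pretrained(tmpdir, device_map=torch_device)

    output_mapped = pipe_mapped(**self.get_dummy_inputs(torch_device))[0]
    max_diff = np.abs(np.asarray(output) - np.asarray(output_mapped)).max()
    assert max_diff < expected_max_difference
```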

tests/pipelines/visualcloze/test_pipeline_visualcloze_combined.py

4 additions, 0 deletions

```diff
@@ -342,3 +342,7 @@ def test_save_load_float16(self, expected_max_diff=1e-2):
         self.assertLess(
             max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
         )
+
+    @unittest.skip("Test not supported.")
+    def test_pipeline_with_accelerator_device_map(self):
+        pass
```

tests/pipelines/visualcloze/test_pipeline_visualcloze_generation.py

4 additions, 0 deletions

```diff
@@ -310,3 +310,7 @@ def test_save_load_float16(self, expected_max_diff=1e-2):
     @unittest.skip("Skipped due to missing layout_prompt. Needs further investigation.")
     def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=0.0001, rtol=0.0001):
         pass
+
+    @unittest.skip("Needs to be revisited later.")
+    def test_pipeline_with_accelerator_device_map(self, expected_max_difference=0.0001):
+        pass
```
