Skip to content

[bug]: quantized qwen image edit 2511 memory management #9147

@keturn

Description

@keturn

Is there an existing issue for this problem?

  • I have searched the existing issues

Install method

Docker

Operating system

Linux

GPU vendor

Nvidia (CUDA)

GPU model

RTX 3060

GPU VRAM

12 GB

Version number

6.13.0rc2 (docker sha-25bbf32-cuda)

Browser

Firefox 150.0.2

System Information

[About…] { "version": "6.12.0.post1", "dependencies": { "absl-py" : "2.3.1", "accelerate" : "1.8.1", "annotated-types" : "0.7.0", "anyio" : "4.9.0", "attrs" : "25.3.0", "bcrypt" : "3.2.2", "bidict" : "0.23.1", "bitsandbytes" : "0.46.1", "blake3" : "1.0.5", "certifi" : "2025.6.15", "cffi" : "1.17.1", "charset-normalizer" : "3.4.2", "cityhash" : "0.4.7", "click" : "8.2.1", "coloredlogs" : "15.0.1", "compel" : "2.1.1", "contourpy" : "1.3.2", "cryptography" : "45.0.5", "CUDA" : "12.8", "cycler" : "0.12.1", "Deprecated" : "1.2.18", "diffusers" : "0.37.0", "dnspython" : "2.7.0", "dynamicprompts" : "0.31.0", "ecdsa" : "0.19.1", "einops" : "0.8.1", "email-validator" : "2.3.0", "fastapi" : "0.118.3", "fastapi-events" : "0.12.2", "filelock" : "3.18.0", "flatbuffers" : "25.2.10", "fonttools" : "4.58.5", "fsspec" : "2025.5.1", "gguf" : "0.17.1", "h11" : "0.16.0", "hf-xet" : "1.1.5", "httpcore" : "1.0.9", "httptools" : "0.6.4", "httpx" : "0.28.1", "huggingface-hub" : "0.34.4", "humanfriendly" : "10.0", "idna" : "3.10", "importlib_metadata" : "8.7.0", "InvokeAI" : "6.12.0.post1", "jax" : "0.6.2", "jaxlib" : "0.6.2", "Jinja2" : "3.1.6", "kiwisolver" : "1.4.8", "kornia" : "0.8.1", "kornia_rs" : "0.1.9", "MarkupSafe" : "3.0.1", "matplotlib" : "3.10.3", "mediapipe" : "0.10.14", "ml_dtypes" : "0.5.1", "mpmath" : "1.3.0", "networkx" : "3.5", "numpy" : "1.26.4", "nvidia-cublas-cu12" : "12.8.3.14", "nvidia-cuda-cupti-cu12" : "12.8.57", "nvidia-cuda-nvrtc-cu12" : "12.8.61", "nvidia-cuda-runtime-cu12": "12.8.57", "nvidia-cudnn-cu12" : "9.7.1.26", "nvidia-cufft-cu12" : "11.3.3.41", "nvidia-cufile-cu12" : "1.13.0.11", "nvidia-curand-cu12" : "10.3.9.55", "nvidia-cusolver-cu12" : "11.7.2.55", "nvidia-cusparse-cu12" : "12.5.7.53", "nvidia-cusparselt-cu12" : "0.6.3", "nvidia-nccl-cu12" : "2.26.2", "nvidia-nvjitlink-cu12" : "12.8.61", "nvidia-nvtx-cu12" : "12.8.55", "onnx" : "1.16.1", "onnxruntime" : "1.19.2", "opencv-contrib-python" : "4.11.0.86", "opt_einsum" : "3.4.0", "packaging" : "25.0", 
"passlib" : "1.7.4", "picklescan" : "0.0.26", "pillow" : "11.3.0", "prompt_toolkit" : "3.0.51", "protobuf" : "4.25.8", "psutil" : "7.0.0", "pyasn1" : "0.6.1", "pycparser" : "2.22", "pydantic" : "2.11.7", "pydantic-settings" : "2.10.1", "pydantic_core" : "2.33.2", "pymorton" : "1.0.5", "pyparsing" : "3.2.3", "PyPatchMatch" : "1.0.2", "python-dateutil" : "2.9.0.post0", "python-dotenv" : "1.1.1", "python-engineio" : "4.12.2", "python-jose" : "3.5.0", "python-multipart" : "0.0.20", "python-socketio" : "5.13.0", "PyWavelets" : "1.8.0", "PyYAML" : "6.0.2", "regex" : "2024.11.6", "requests" : "2.32.4", "rsa" : "4.9.1", "safetensors" : "0.5.3", "scipy" : "1.16.0", "semver" : "3.0.4", "sentencepiece" : "0.2.0", "setuptools" : "80.9.0", "simple-websocket" : "1.1.0", "six" : "1.17.0", "sniffio" : "1.3.1", "sounddevice" : "0.5.2", "spandrel" : "0.4.2", "starlette" : "0.46.2", "sympy" : "1.14.0", "tokenizers" : "0.22.0", "torch" : "2.7.1+cu128", "torchsde" : "0.2.6", "torchvision" : "0.22.1+cu128", "tqdm" : "4.67.1", "trampoline" : "0.1.2", "transformers" : "4.56.0", "triton" : "3.3.1", "typing-inspection" : "0.4.1", "typing_extensions" : "4.14.0", "urllib3" : "2.5.0", "uvicorn" : "0.35.0", "uvloop" : "0.21.0", "watchfiles" : "1.1.0", "wcwidth" : "0.2.13", "websockets" : "15.0.1", "wrapt" : "1.17.2", "wsproto" : "1.2.0", "zipp" : "3.23.0" }, "config": { "schema_version": "4.0.3", "legacy_models_yaml_path": null, "host": "", "port": 9090, "allow_origins": [], "allow_credentials": true, "allow_methods": ["*"], "allow_headers": ["*"], "log_tokenization": true, "patchmatch": true, "models_dir": "models", "convert_cache_dir": "models/.convert_cache", "download_cache_dir": "models/.download_cache", "legacy_conf_dir": "configs", "db_dir": "databases", "outputs_dir": "outputs", "image_subfolder_strategy": "flat", "custom_nodes_dir": "nodes", "style_presets_dir": "style_presets", "workflow_thumbnails_dir": "workflow_thumbnails", "log_handlers": ["console"], "log_format": "color", 
"log_level": "info", "log_sql": false, "log_level_network": "warning", "use_memory_db": false, "dev_reload": false, "profile_graphs": false, "profile_prefix": null, "profiles_dir": "profiles", "max_cache_ram_gb": null, "max_cache_vram_gb": null, "log_memory_usage": false, "model_cache_keep_alive_min": 0, "device_working_mem_gb": 3, "enable_partial_loading": true, "keep_ram_copy_of_weights": true, "ram": null, "vram": null, "lazy_offload": true, "pytorch_cuda_alloc_conf": "backend:cudaMallocAsync", "device": "cuda", "precision": "float16", "sequential_guidance": false, "attention_type": "auto", "attention_slice_size": "auto", "force_tiled_decode": false, "pil_compress_level": 1, "max_queue_size": 10000, "clear_queue_on_startup": false, "max_queue_history": null, "allow_nodes": null, "deny_nodes": null, "node_cache_size": 512, "hashing_algorithm": "blake3_single", "remote_api_tokens": [], "scan_models_on_startup": false, "unsafe_disable_picklescan": false, "allow_unknown_models": true, "multiuser": true, "strict_password_checking": false, "external_alibabacloud_api_key": null, "external_alibabacloud_base_url": null, "external_gemini_api_key": null, "external_openai_api_key": null, "external_gemini_base_url": null, "external_openai_base_url": null, "external_seedream_api_key": null, "external_seedream_base_url": null }, "set_config_fields": [ "pytorch_cuda_alloc_conf", "precision", "device", "port", "log_tokenization", "ssl_certfile", "legacy_models_yaml_path", "host", "ssl_keyfile", "multiuser", "remote_api_tokens" ] }

What happened

Initially I ran Qwen Image Edit 2511 Q4.
Then installed the Q2 quant of the same model (from the Starter Models list), and switched to that.

Re-ran the same seed/prompt/etc on the Q2 model and got assorted problems:

  • torch.OOM during decoding step on two out of a batch of six generations (the other four completed)
  • WARNING --> Loading 0.0 MB into VRAM, but only -1447.625 MB were requested.
  • Widely varying reported values for QwenImageTransformer2DModel model size in VRAM, from 99.5% to 60%

invoke-q2.log

What you expected to happen

  • the 7 GB Q2 QwenImageTransformer2DModel should probably fit entirely in VRAM (or is it truly 0.5% too big?)
  • the 242 MB autoencoder definitely should

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions