[bug]: quantized qwen image edit 2511 memory management

### Is there an existing issue for this problem?

- [x] I have searched the existing issues

### Install method

Docker

### Operating system

Linux

### GPU vendor

Nvidia (CUDA)

### GPU model

RTX 3060

### GPU VRAM

12 GB

### Version number

6.13.0rc2 (docker [sha-25bbf32-cuda](https://github.com/invoke-ai/InvokeAI/pkgs/container/invokeai/849339986?tag=sha-25bbf32-cuda))

### Browser

Firefox 150.0.2

### System Information

<details>
<summary>[About…]</summary>
{
    "version": "6.12.0.post1",
    "dependencies": {
        "absl-py"                 : "2.3.1",
        "accelerate"              : "1.8.1",
        "annotated-types"         : "0.7.0",
        "anyio"                   : "4.9.0",
        "attrs"                   : "25.3.0",
        "bcrypt"                  : "3.2.2",
        "bidict"                  : "0.23.1",
        "bitsandbytes"            : "0.46.1",
        "blake3"                  : "1.0.5",
        "certifi"                 : "2025.6.15",
        "cffi"                    : "1.17.1",
        "charset-normalizer"      : "3.4.2",
        "cityhash"                : "0.4.7",
        "click"                   : "8.2.1",
        "coloredlogs"             : "15.0.1",
        "compel"                  : "2.1.1",
        "contourpy"               : "1.3.2",
        "cryptography"            : "45.0.5",
        "CUDA"                    : "12.8",
        "cycler"                  : "0.12.1",
        "Deprecated"              : "1.2.18",
        "diffusers"               : "0.37.0",
        "dnspython"               : "2.7.0",
        "dynamicprompts"          : "0.31.0",
        "ecdsa"                   : "0.19.1",
        "einops"                  : "0.8.1",
        "email-validator"         : "2.3.0",
        "fastapi"                 : "0.118.3",
        "fastapi-events"          : "0.12.2",
        "filelock"                : "3.18.0",
        "flatbuffers"             : "25.2.10",
        "fonttools"               : "4.58.5",
        "fsspec"                  : "2025.5.1",
        "gguf"                    : "0.17.1",
        "h11"                     : "0.16.0",
        "hf-xet"                  : "1.1.5",
        "httpcore"                : "1.0.9",
        "httptools"               : "0.6.4",
        "httpx"                   : "0.28.1",
        "huggingface-hub"         : "0.34.4",
        "humanfriendly"           : "10.0",
        "idna"                    : "3.10",
        "importlib_metadata"      : "8.7.0",
        "InvokeAI"                : "6.12.0.post1",
        "jax"                     : "0.6.2",
        "jaxlib"                  : "0.6.2",
        "Jinja2"                  : "3.1.6",
        "kiwisolver"              : "1.4.8",
        "kornia"                  : "0.8.1",
        "kornia_rs"               : "0.1.9",
        "MarkupSafe"              : "3.0.1",
        "matplotlib"              : "3.10.3",
        "mediapipe"               : "0.10.14",
        "ml_dtypes"               : "0.5.1",
        "mpmath"                  : "1.3.0",
        "networkx"                : "3.5",
        "numpy"                   : "1.26.4",
        "nvidia-cublas-cu12"      : "12.8.3.14",
        "nvidia-cuda-cupti-cu12"  : "12.8.57",
        "nvidia-cuda-nvrtc-cu12"  : "12.8.61",
        "nvidia-cuda-runtime-cu12": "12.8.57",
        "nvidia-cudnn-cu12"       : "9.7.1.26",
        "nvidia-cufft-cu12"       : "11.3.3.41",
        "nvidia-cufile-cu12"      : "1.13.0.11",
        "nvidia-curand-cu12"      : "10.3.9.55",
        "nvidia-cusolver-cu12"    : "11.7.2.55",
        "nvidia-cusparse-cu12"    : "12.5.7.53",
        "nvidia-cusparselt-cu12"  : "0.6.3",
        "nvidia-nccl-cu12"        : "2.26.2",
        "nvidia-nvjitlink-cu12"   : "12.8.61",
        "nvidia-nvtx-cu12"        : "12.8.55",
        "onnx"                    : "1.16.1",
        "onnxruntime"             : "1.19.2",
        "opencv-contrib-python"   : "4.11.0.86",
        "opt_einsum"              : "3.4.0",
        "packaging"               : "25.0",
        "passlib"                 : "1.7.4",
        "picklescan"              : "0.0.26",
        "pillow"                  : "11.3.0",
        "prompt_toolkit"          : "3.0.51",
        "protobuf"                : "4.25.8",
        "psutil"                  : "7.0.0",
        "pyasn1"                  : "0.6.1",
        "pycparser"               : "2.22",
        "pydantic"                : "2.11.7",
        "pydantic-settings"       : "2.10.1",
        "pydantic_core"           : "2.33.2",
        "pymorton"                : "1.0.5",
        "pyparsing"               : "3.2.3",
        "PyPatchMatch"            : "1.0.2",
        "python-dateutil"         : "2.9.0.post0",
        "python-dotenv"           : "1.1.1",
        "python-engineio"         : "4.12.2",
        "python-jose"             : "3.5.0",
        "python-multipart"        : "0.0.20",
        "python-socketio"         : "5.13.0",
        "PyWavelets"              : "1.8.0",
        "PyYAML"                  : "6.0.2",
        "regex"                   : "2024.11.6",
        "requests"                : "2.32.4",
        "rsa"                     : "4.9.1",
        "safetensors"             : "0.5.3",
        "scipy"                   : "1.16.0",
        "semver"                  : "3.0.4",
        "sentencepiece"           : "0.2.0",
        "setuptools"              : "80.9.0",
        "simple-websocket"        : "1.1.0",
        "six"                     : "1.17.0",
        "sniffio"                 : "1.3.1",
        "sounddevice"             : "0.5.2",
        "spandrel"                : "0.4.2",
        "starlette"               : "0.46.2",
        "sympy"                   : "1.14.0",
        "tokenizers"              : "0.22.0",
        "torch"                   : "2.7.1+cu128",
        "torchsde"                : "0.2.6",
        "torchvision"             : "0.22.1+cu128",
        "tqdm"                    : "4.67.1",
        "trampoline"              : "0.1.2",
        "transformers"            : "4.56.0",
        "triton"                  : "3.3.1",
        "typing-inspection"       : "0.4.1",
        "typing_extensions"       : "4.14.0",
        "urllib3"                 : "2.5.0",
        "uvicorn"                 : "0.35.0",
        "uvloop"                  : "0.21.0",
        "watchfiles"              : "1.1.0",
        "wcwidth"                 : "0.2.13",
        "websockets"              : "15.0.1",
        "wrapt"                   : "1.17.2",
        "wsproto"                 : "1.2.0",
        "zipp"                    : "3.23.0"
    },
    "config": {
        "schema_version": "4.0.3",
        "legacy_models_yaml_path": null,
        "host": "",
        "port": 9090,
        "allow_origins": [],
        "allow_credentials": true,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
        "log_tokenization": true,
        "patchmatch": true,
        "models_dir": "models",
        "convert_cache_dir": "models/.convert_cache",
        "download_cache_dir": "models/.download_cache",
        "legacy_conf_dir": "configs",
        "db_dir": "databases",
        "outputs_dir": "outputs",
        "image_subfolder_strategy": "flat",
        "custom_nodes_dir": "nodes",
        "style_presets_dir": "style_presets",
        "workflow_thumbnails_dir": "workflow_thumbnails",
        "log_handlers": ["console"],
        "log_format": "color",
        "log_level": "info",
        "log_sql": false,
        "log_level_network": "warning",
        "use_memory_db": false,
        "dev_reload": false,
        "profile_graphs": false,
        "profile_prefix": null,
        "profiles_dir": "profiles",
        "max_cache_ram_gb": null,
        "max_cache_vram_gb": null,
        "log_memory_usage": false,
        "model_cache_keep_alive_min": 0,
        "device_working_mem_gb": 3,
        "enable_partial_loading": true,
        "keep_ram_copy_of_weights": true,
        "ram": null,
        "vram": null,
        "lazy_offload": true,
        "pytorch_cuda_alloc_conf": "backend:cudaMallocAsync",
        "device": "cuda",
        "precision": "float16",
        "sequential_guidance": false,
        "attention_type": "auto",
        "attention_slice_size": "auto",
        "force_tiled_decode": false,
        "pil_compress_level": 1,
        "max_queue_size": 10000,
        "clear_queue_on_startup": false,
        "max_queue_history": null,
        "allow_nodes": null,
        "deny_nodes": null,
        "node_cache_size": 512,
        "hashing_algorithm": "blake3_single",
        "remote_api_tokens": [],
        "scan_models_on_startup": false,
        "unsafe_disable_picklescan": false,
        "allow_unknown_models": true,
        "multiuser": true,
        "strict_password_checking": false,
        "external_alibabacloud_api_key": null,
        "external_alibabacloud_base_url": null,
        "external_gemini_api_key": null,
        "external_openai_api_key": null,
        "external_gemini_base_url": null,
        "external_openai_base_url": null,
        "external_seedream_api_key": null,
        "external_seedream_base_url": null
    },
    "set_config_fields": [
        "pytorch_cuda_alloc_conf", "precision",               "device",                  "port",
        "log_tokenization",        "ssl_certfile",            "legacy_models_yaml_path", "host",
        "ssl_keyfile",             "multiuser",               "remote_api_tokens"
    ]
}

</details>

### What happened

Initially I ran Qwen Image Edit 2511 Q4.
Then I installed the Q2 quant of the same model (from the Starter Models list) and switched to that.

Re-ran the same seed/prompt/etc on the Q2 model and got assorted problems:
- torch.OOM during decoding step on two out of a batch of six generations (the other four completed)
- WARNING --> Loading 0.0 MB into VRAM, but only -1447.625 MB were requested.
- Widely varying reported values for QwenImageTransformer2DModel model size in VRAM, from 99.5% to 60%

[invoke-q2.log](https://github.com/user-attachments/files/27560771/invoke-q2.log)

### What you expected to happen

- The 7 GB Q2 QwenImageTransformer2DModel should probably fit entirely in VRAM (or is it truly 0.5% too big?)
- the 242 MB autoencoder definitely should


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[bug]: quantized qwen image edit 2511 memory management #9147

Is there an existing issue for this problem?

Install method

Operating system

GPU vendor

GPU model

GPU VRAM

Version number

Browser

System Information

What happened

What you expected to happen

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

[bug]: quantized qwen image edit 2511 memory management #9147

Description

Is there an existing issue for this problem?

Install method

Operating system

GPU vendor

GPU model

GPU VRAM

Version number

Browser

System Information

What happened

What you expected to happen

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions