diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index b15a6dc824b..0c385cb6f2a 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -251,6 +251,24 @@ def __init__(
         # set attribute from pretrained_config
         for key, value in pretrained_config.items():
             setattr(self, key, value)
+
+        # Check NUM_MAX_DISPATCH_TOKENS_PER_RANK environment variable
+        env_num_max_dispatch = envs.NUM_MAX_DISPATCH_TOKENS_PER_RANK
+        if env_num_max_dispatch is not None:
+            # Only check consistency if model config explicitly sets this value
+            if "num_max_dispatch_tokens_per_rank" in pretrained_config:
+                model_num_max_dispatch = pretrained_config["num_max_dispatch_tokens_per_rank"]
+                if model_num_max_dispatch != env_num_max_dispatch:
+                    raise ValueError(
+                        f"num_max_dispatch_tokens_per_rank mismatch: "
+                        f"environment variable NUM_MAX_DISPATCH_TOKENS_PER_RANK={env_num_max_dispatch}, "
+                        f"but model config has num_max_dispatch_tokens_per_rank={model_num_max_dispatch}. "
+                        f"Please ensure they are consistent."
+                    )
+            else:
+                # Use env value if model config doesn't explicitly set it
+                self.num_max_dispatch_tokens_per_rank = env_num_max_dispatch
+
         # we need set default value when not exist
         for key, value in PRETRAINED_INIT_CONFIGURATION.items():
             if not hasattr(self, key):
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index 0c7ac3e22b1..9b71c165965 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -266,6 +266,13 @@ def _validate_split_kv_size(value: int) -> int:
     "FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST": lambda: bool(
         int(os.getenv("FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST", "1"))
     ),
+    # Number of max dispatch tokens per rank for MoE computation.
+    # If set, it must match the value in model config if present, otherwise an error will be raised.
+    "NUM_MAX_DISPATCH_TOKENS_PER_RANK": lambda: (
+        int(os.getenv("NUM_MAX_DISPATCH_TOKENS_PER_RANK", "0"))
+        if os.getenv("NUM_MAX_DISPATCH_TOKENS_PER_RANK")
+        else None
+    ),
 }
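
For reference (not part of the diff), a minimal standalone sketch of the resolution order the two hunks implement: the env var is `None` when unset, a set value must agree with an explicit model-config entry, and it only becomes the effective value when the model config is silent. The sketch reads `os.environ` directly and uses a plain dict in place of `pretrained_config`; `resolve_num_max_dispatch_tokens_per_rank` is a hypothetical helper name used only for illustration, not a function added by this PR.

```python
import os
from typing import Optional


def resolve_num_max_dispatch_tokens_per_rank(pretrained_config: dict) -> Optional[int]:
    """Sketch of the env-vs-model-config consistency check from the diff above."""
    raw = os.getenv("NUM_MAX_DISPATCH_TOKENS_PER_RANK")
    # Unset or empty string -> None, mirroring the lambda added to envs.py.
    env_value = int(raw) if raw else None

    if env_value is None:
        # Nothing to enforce; keep whatever the model config provides (possibly nothing).
        return pretrained_config.get("num_max_dispatch_tokens_per_rank")

    if "num_max_dispatch_tokens_per_rank" in pretrained_config:
        model_value = pretrained_config["num_max_dispatch_tokens_per_rank"]
        if model_value != env_value:
            # Explicit model-config value that disagrees with the env var is an error.
            raise ValueError(
                f"num_max_dispatch_tokens_per_rank mismatch: "
                f"NUM_MAX_DISPATCH_TOKENS_PER_RANK={env_value}, model config={model_value}"
            )
        return model_value

    # Model config is silent: the environment variable wins.
    return env_value


if __name__ == "__main__":
    os.environ["NUM_MAX_DISPATCH_TOKENS_PER_RANK"] = "128"
    print(resolve_num_max_dispatch_tokens_per_rank({}))                                         # 128 (env fallback)
    print(resolve_num_max_dispatch_tokens_per_rank({"num_max_dispatch_tokens_per_rank": 128}))  # 128 (consistent)
    try:
        resolve_num_max_dispatch_tokens_per_rank({"num_max_dispatch_tokens_per_rank": 256})
    except ValueError as err:
        print(err)                                                                              # mismatch error
```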