From 3fbfe5ace6a8815b9e39297ec76b9a44343409f5 Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Thu, 23 Apr 2026 14:59:16 +0200 Subject: [PATCH 1/9] =?UTF-8?q?LCORE-836=20spike:=20unified=20mode=20PoC?= =?UTF-8?q?=20=E2=80=94=20schema,=20synthesizer,=20migration=20tool?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a unified `llama_stack.config` sub-section to `lightspeed-stack.yaml` that lets operators express the Llama Stack operational configuration in one place, eliminating the need for a separately maintained `run.yaml`. Legacy mode (`llama_stack.library_client_config_path` + external run.yaml) is preserved and mutually exclusive with the new path. New Pydantic classes `UnifiedLlamaStackConfig`, `UnifiedInferenceSection`, and `UnifiedInferenceProvider` define the unified schema; a new `synthesize_configuration` pipeline applies profile (or baseline) → existing BYOK RAG / Solr OKP enrichment → high-level sections → `native_override` (deep-merge, list-replacement). A `baseline: default | empty` field enables strict lossless round-trip for the migration tool. Library-mode wiring in `src/client.py` detects the unified form and writes the synthesized file to disk for `AsyncLlamaStackAsLibraryClient` (which the PoC confirmed requires a file path, not a dict). Legacy enrichment path is unchanged. A `--migrate-config` flag on the `lightspeed-stack` CLI produces a unified single-file config from a legacy (run.yaml, lightspeed-stack.yaml) pair (dumb lift-and-shift: content goes under `native_override` with `baseline: empty`, and `library_client_config_path` is removed). The LS container's `llama_stack_configuration.py` CLI now auto-detects unified vs legacy based on the presence of `llama_stack.config`; the entrypoint script requires no functional change (comment clarified). `test.containerfile` copies `src/data/` into the container so the shipped default baseline resolves at runtime. 
Tests: 22 new unit tests covering merge semantics, high-level inference expansion, the full synthesize pipeline, profile loading, precedence (profile < high-level < native_override), and migrate-then-synthesize round-trip lossless equality. 3 new schema tests cover unified/legacy mutual exclusion. 5 existing dump-configuration expectations updated for the new `config: None` field; 1 client error-message regex updated. Full `uv run make verify` passes (black, pylint 10/10, ruff, docstyle, mypy). `uv run pytest tests/unit/` — 2098 passed, 1 skipped, 0 failed. --- scripts/llama-stack-entrypoint.sh | 12 +- src/client.py | 65 ++- src/data/default_run.yaml | 155 ++++++++ src/lightspeed_stack.py | 47 ++- src/llama_stack_configuration.py | 269 ++++++++++++- src/models/config.py | 148 +++++-- test.containerfile | 7 +- .../models/config/test_dump_configuration.py | 5 + .../config/test_llama_stack_configuration.py | 59 ++- tests/unit/test_client.py | 2 +- tests/unit/test_llama_stack_synthesize.py | 371 ++++++++++++++++++ 11 files changed, 1088 insertions(+), 52 deletions(-) create mode 100644 src/data/default_run.yaml create mode 100644 tests/unit/test_llama_stack_synthesize.py diff --git a/scripts/llama-stack-entrypoint.sh b/scripts/llama-stack-entrypoint.sh index a7eeb797b..6917017c5 100755 --- a/scripts/llama-stack-entrypoint.sh +++ b/scripts/llama-stack-entrypoint.sh @@ -1,6 +1,12 @@ #!/bin/bash # Entrypoint for llama-stack container. -# Enriches config with lightspeed dynamic values, then starts llama-stack. +# Produces the run.yaml from lightspeed-stack.yaml then starts llama-stack. +# +# Two modes, auto-detected by the Python CLI (llama_stack_configuration.py): +# - Unified (LCORE-836): `llama_stack.config` present in lightspeed-stack.yaml. +# The full run.yaml is SYNTHESIZED from the unified block; -i is ignored. +# - Legacy: `run.yaml` is mounted separately and ENRICHED with BYOK RAG / Solr / +# Azure Entra ID values from lightspeed-stack.yaml. 
set -e @@ -9,9 +15,9 @@ ENRICHED_CONFIG="/opt/app-root/run.yaml" LIGHTSPEED_CONFIG="${LIGHTSPEED_CONFIG:-/opt/app-root/lightspeed-stack.yaml}" ENV_FILE="/opt/app-root/.env" -# Enrich config if lightspeed config exists +# Run the config producer if lightspeed config exists if [ -f "$LIGHTSPEED_CONFIG" ]; then - echo "Enriching llama-stack config..." + echo "Preparing llama-stack config from $LIGHTSPEED_CONFIG ..." ENRICHMENT_FAILED=0 python3 /opt/app-root/llama_stack_configuration.py \ -c "$LIGHTSPEED_CONFIG" \ diff --git a/src/client.py b/src/client.py index 0c77c2d49..f42544a13 100644 --- a/src/client.py +++ b/src/client.py @@ -3,6 +3,7 @@ import json import os import tempfile +from pathlib import Path from typing import Optional import yaml @@ -11,7 +12,12 @@ from llama_stack_client import APIConnectionError, AsyncLlamaStackClient from configuration import configuration -from llama_stack_configuration import YamlDumper, enrich_byok_rag, enrich_solr +from llama_stack_configuration import ( + YamlDumper, + enrich_byok_rag, + enrich_solr, + synthesize_configuration, +) from log import get_logger from models.config import LlamaStackConfiguration from models.responses import ServiceUnavailableResponse @@ -44,22 +50,65 @@ async def load(self, llama_stack_config: LlamaStackConfiguration) -> None: async def _load_library_client(self, config: LlamaStackConfiguration) -> None: """Initialize client in library mode. + Two paths: + - Unified mode (`config.config` set): synthesize full run.yaml from the + lightspeed-stack config and write to a deterministic path. + - Legacy mode (`config.library_client_config_path` set): read the + external run.yaml and apply in-place enrichment. + Stores the final config path for use in reload. 
""" - if config.library_client_config_path is None: + if config.config is not None: + logger.info("Using Llama stack as library client (unified mode)") + self._config_path = self._synthesize_library_config() + elif config.library_client_config_path is not None: + logger.info("Using Llama stack as library client (legacy mode)") + self._config_path = self._enrich_library_config( + config.library_client_config_path + ) + else: raise ValueError( - "Configuration problem: library_client_config_path is not set" + "Configuration problem: neither `llama_stack.config` (unified) " + "nor `llama_stack.library_client_config_path` (legacy) is set" ) - logger.info("Using Llama stack as library client") - - self._config_path = self._enrich_library_config( - config.library_client_config_path - ) client = AsyncLlamaStackAsLibraryClient(self._config_path) await client.initialize() self._lsc = client + def _synthesize_library_config(self) -> str: + """Synthesize the full Llama Stack run.yaml from unified-mode config. + + Library-client-friendly: writes to a file since the Llama Stack library + client only accepts a file path (not a dict). Returns the path to the + synthesized file. + + The synthesizer preserves env-var references (`${env.FOO}`) verbatim; + secrets are not resolved into the file on disk. + + Returns: + str: Path to the synthesized run.yaml. 
+ """ + lcs_config_dict = configuration.configuration.model_dump( + exclude_none=True, mode="python" + ) + config_file_dir: Optional[Path] = None + env_path = os.environ.get("LIGHTSPEED_STACK_CONFIG_PATH") + if env_path: + config_file_dir = Path(env_path).resolve().parent + + ls_config = synthesize_configuration( + lcs_config_dict, config_file_dir=config_file_dir + ) + + synthesized_path = os.path.join( + tempfile.gettempdir(), "llama_stack_synthesized_config.yaml" + ) + with open(synthesized_path, "w", encoding="utf-8") as f: + yaml.dump(ls_config, f, Dumper=YamlDumper, default_flow_style=False) + logger.info("Wrote synthesized Llama Stack config to %s", synthesized_path) + return synthesized_path + def _load_service_client(self, config: LlamaStackConfiguration) -> None: """Initialize client in service mode (remote HTTP).""" logger.info("Using Llama stack running as a service") diff --git a/src/data/default_run.yaml b/src/data/default_run.yaml new file mode 100644 index 000000000..7a4a78efa --- /dev/null +++ b/src/data/default_run.yaml @@ -0,0 +1,155 @@ +version: 2 + +apis: +- agents +- batches +- datasetio +- eval +- files +- inference +- safety +- scoring +- tool_runtime +- vector_io + +benchmarks: [] +datasets: [] +image_name: starter +external_providers_dir: ${env.EXTERNAL_PROVIDERS_DIR} + +providers: + inference: + - provider_id: openai # This ID is a reference to 'providers.inference' + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} + allowed_models: ["${env.E2E_OPENAI_MODEL:=gpt-4o-mini}"] + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + files: + - config: + metadata_store: + table_name: files_metadata + backend: sql_default + storage_dir: ~/.llama/storage/files + provider_id: meta-reference-files + provider_type: inline::localfs + safety: + - config: + excluded_categories: [] + provider_id: llama-guard + provider_type: inline::llama-guard + scoring: + - provider_id: basic + provider_type: 
inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: '********' + tool_runtime: + - config: {} # Enable the RAG tool + provider_id: rag-runtime + provider_type: inline::rag-runtime + vector_io: + - config: + persistence: + namespace: vector_io::faiss + backend: kv_default + provider_id: faiss + provider_type: inline::faiss + agents: + - config: + persistence: + agent_state: + namespace: agents_state + backend: kv_default + responses: + table_name: agents_responses + backend: sql_default + provider_id: meta-reference + provider_type: inline::meta-reference + batches: + - config: + kvstore: + namespace: batches_store + backend: kv_default + provider_id: reference + provider_type: inline::reference + datasetio: + - config: + kvstore: + namespace: huggingface_datasetio + backend: kv_default + provider_id: huggingface + provider_type: remote::huggingface + - config: + kvstore: + namespace: localfs_datasetio + backend: kv_default + provider_id: localfs + provider_type: inline::localfs + eval: + - config: + kvstore: + namespace: eval_store + backend: kv_default + provider_id: meta-reference + provider_type: inline::meta-reference +scoring_fns: [] +server: + port: 8321 +storage: + backends: + kv_default: # Define the storage backend type for RAG, in this case registry and RAG are unified i.e. information on registered resources (e.g. 
models, vector_stores) are saved together with the RAG chunks + type: kv_sqlite + db_path: ${env.KV_STORE_PATH:=~/.llama/storage/rag/kv_store.db} + sql_default: + type: sql_sqlite + db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db} + stores: + metadata: + namespace: registry + backend: kv_default + inference: + table_name: inference_store + backend: sql_default + max_write_queue_size: 10000 + num_writers: 4 + conversations: + table_name: openai_conversations + backend: sql_default + prompts: + namespace: prompts + backend: kv_default +registered_resources: + models: [] + shields: + - shield_id: llama-guard + provider_id: llama-guard + provider_shield_id: openai/gpt-4o-mini + vector_stores: [] + datasets: [] + scoring_fns: [] + benchmarks: [] + tool_groups: + - toolgroup_id: builtin::rag # Register the RAG tool + provider_id: rag-runtime +# REQUIRED: This section is necessary for file_search tool calls to work. +# Without it, llama-stack's rag-runtime silently fails all file_search operations +# with no error logged. +vector_stores: + # LCORE-1498: Disables Llama Stack RAG annotation generation + # causing unwanted citation/file markers in model output. 
+ annotation_prompt_params: + enable_annotations: false + default_provider_id: faiss + default_embedding_model: # Define the default embedding model for RAG + provider_id: sentence-transformers + model_id: nomic-ai/nomic-embed-text-v1.5 +safety: + default_shield_id: llama-guard + diff --git a/src/lightspeed_stack.py b/src/lightspeed_stack.py index 858799c36..261092063 100644 --- a/src/lightspeed_stack.py +++ b/src/lightspeed_stack.py @@ -48,10 +48,11 @@ def create_argument_parser() -> ArgumentParser: - -d / --dump-configuration: dump the loaded configuration to JSON and exit - -s / --dump-schema: dump the configuration schema to OpenAPI JSON and exit - -c / --config: path to the configuration file (default "lightspeed-stack.yaml") - - -g / --generate-llama-stack-configuration: generate a Llama Stack - configuration from the service configuration - - -i / --input-config-file: Llama Stack input configuration filename (default "run.yaml") - - -o / --output-config-file: Llama Stack output configuration filename (default "run_.yaml") + - --migrate-config: migrate a legacy (run.yaml + lightspeed-stack.yaml) + setup into a unified single-file config and exit + - --run-yaml: input run.yaml for --migrate-config (default "run.yaml") + - --migrate-output: output path for --migrate-config + (default "lightspeed-stack-unified.yaml") Returns: Configured ArgumentParser for parsing the service CLI options. 
@@ -88,6 +89,27 @@ def create_argument_parser() -> ArgumentParser: help="path to configuration file (default: lightspeed-stack.yaml)", default="lightspeed-stack.yaml", ) + parser.add_argument( + "--migrate-config", + dest="migrate_config", + help="migrate legacy (run.yaml + lightspeed-stack.yaml) into a unified " + "single-file configuration and exit", + action="store_true", + default=False, + ) + parser.add_argument( + "--run-yaml", + dest="run_yaml", + help="path to legacy run.yaml for --migrate-config (default: run.yaml)", + default="run.yaml", + ) + parser.add_argument( + "--migrate-output", + dest="migrate_output", + help="output path for --migrate-config " + "(default: lightspeed-stack-unified.yaml)", + default="lightspeed-stack-unified.yaml", + ) return parser @@ -125,6 +147,23 @@ def main() -> None: if isinstance(existing_logger, logging.Logger): existing_logger.setLevel(logging.DEBUG) + # --migrate-config runs standalone; does not load config into the singleton, + # since the input may be in legacy form and we are producing its successor. + if args.migrate_config: + # pylint: disable=import-outside-toplevel + from llama_stack_configuration import migrate_config_dumb + + try: + migrate_config_dumb(args.run_yaml, args.config_file, args.migrate_output) + logger.info( + "Migration complete. 
Wrote unified config to %s", + args.migrate_output, + ) + except Exception as e: + logger.error("Migration failed: %s", e) + raise SystemExit(1) from e + return + configuration.load_configuration(args.config_file) logger.info("Configuration: %s", configuration.configuration) logger.info( diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py index 3ac4a8ec4..3776eefbf 100644 --- a/src/llama_stack_configuration.py +++ b/src/llama_stack_configuration.py @@ -589,6 +589,246 @@ def enrich_solr( # pylint: disable=too-many-locals logger.info("Added OKP embedding model to registered_resources.models") +# ============================================================================= +# Synthesis for Unified Mode (LCORE-836) +# ============================================================================= + + +DEFAULT_BASELINE_RESOURCE = "default_run.yaml" + +PROVIDER_TYPE_MAP: dict[str, str] = { + "openai": "remote::openai", + "sentence_transformers": "inline::sentence-transformers", + "azure": "remote::azure", + "vertexai": "remote::vertexai", + "watsonx": "remote::watsonx", + "vllm_rhaiis": "remote::vllm", + "vllm_rhel_ai": "remote::vllm", +} + + +def load_default_baseline() -> dict[str, Any]: + """Load LCORE's built-in default Llama Stack baseline config. + + Returns: + dict[str, Any]: The default baseline run.yaml parsed as a dict. + """ + # importlib.resources-style load; `src/data/default_run.yaml` is shipped + # with the package. + baseline_path = Path(__file__).parent / "data" / DEFAULT_BASELINE_RESOURCE + logger.info("Loading built-in default baseline from %s", baseline_path) + with open(baseline_path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) + + +def deep_merge_list_replace( + base: dict[str, Any], overlay: dict[str, Any] +) -> dict[str, Any]: + """Deep-merge `overlay` onto `base`. + + Maps are merged recursively. Lists and scalars in `overlay` replace the + corresponding entry in `base` (no append semantics). 
Result is a new dict; + neither argument is mutated. + + Parameters: + base: The base mapping. + overlay: The mapping whose values take precedence. + + Returns: + dict[str, Any]: A new mapping with overlay applied on top of base. + """ + import copy # pylint: disable=import-outside-toplevel + + result: dict[str, Any] = copy.deepcopy(base) + for key, value in overlay.items(): + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = deep_merge_list_replace(result[key], value) + else: + result[key] = copy.deepcopy(value) + return result + + +def apply_high_level_inference( + ls_config: dict[str, Any], inference: dict[str, Any] +) -> None: + """Apply a high-level `inference` block into `ls_config['providers']['inference']`. + + Replaces the inference provider list entirely. Use `native_override` for + additive tweaks. + + Parameters: + ls_config: Llama Stack config dict (modified in place). + inference: High-level inference section as a dict (with 'providers' list). 
+ """ + providers_out: list[dict[str, Any]] = [] + for provider in inference.get("providers", []): + p_type = provider["type"] + entry: dict[str, Any] = { + "provider_id": p_type, + "provider_type": PROVIDER_TYPE_MAP[p_type], + } + cfg: dict[str, Any] = {} + if provider.get("api_key_env"): + cfg["api_key"] = f"${{env.{provider['api_key_env']}}}" + if provider.get("allowed_models"): + cfg["allowed_models"] = provider["allowed_models"] + if provider.get("extra"): + cfg.update(provider["extra"]) + if cfg: + entry["config"] = cfg + providers_out.append(entry) + + if "providers" not in ls_config: + ls_config["providers"] = {} + ls_config["providers"]["inference"] = providers_out + logger.info( + "Applied high-level inference section: %s provider entries", + len(providers_out), + ) + + +def synthesize_configuration( + lcs_config: dict[str, Any], + config_file_dir: Optional[Path] = None, + default_baseline: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + """Synthesize a full Llama Stack run.yaml from a unified-mode LCORE config. + + Pipeline: + 1. Baseline = profile file (if set) else `default_baseline` (if provided) + else LCORE's built-in default. + 2. Apply existing top-level enrichment (BYOK RAG, Solr/OKP). + Azure Entra ID is intentionally not run here (side-effect on .env). + 3. Apply high-level sections (inference, and later storage/safety/...). + 4. Deep-merge (list-replace) `native_override`. + + Precedence: profile < high-level sections < native_override. + + Parameters: + lcs_config: Full lightspeed-stack.yaml content as a dict (env-expanded). + config_file_dir: Directory containing the lightspeed-stack.yaml, used + to resolve relative `profile:` paths. If None, relative paths are + resolved against the current working directory. + default_baseline: Override for the baseline when `profile:` is unset + (primarily for tests). If None, LCORE's built-in baseline is used. + + Returns: + dict[str, Any]: The synthesized Llama Stack run.yaml as a dict. 
+ + Raises: + ValueError: If llama_stack.config is not present in `lcs_config`. + """ + unified = (lcs_config.get("llama_stack") or {}).get("config") + if unified is None: + raise ValueError( + "synthesize_configuration called without llama_stack.config set" + ) + + # 1. Baseline + profile = unified.get("profile") + baseline_kind = unified.get("baseline", "default") + if profile: + profile_path = Path(profile) + if not profile_path.is_absolute() and config_file_dir is not None: + profile_path = config_file_dir / profile_path + logger.info("Loading unified-mode profile baseline from %s", profile_path) + with open(profile_path, "r", encoding="utf-8") as f: + ls_config: dict[str, Any] = yaml.safe_load(f) + elif baseline_kind == "empty": + logger.info("Unified mode: starting from empty baseline") + ls_config = {} + elif default_baseline is not None: + import copy # pylint: disable=import-outside-toplevel + + ls_config = copy.deepcopy(default_baseline) + else: + ls_config = load_default_baseline() + + dedupe_providers_vector_io(ls_config) + + # 2. Existing enrichment (BYOK RAG, Solr/OKP) — Azure stays out (file side-effect). + enrich_byok_rag(ls_config, lcs_config.get("byok_rag", [])) + enrich_solr(ls_config, lcs_config.get("rag", {}), lcs_config.get("okp", {})) + + # 3. High-level sections + inference = unified.get("inference") + if inference is not None: + apply_high_level_inference(ls_config, inference) + + # 4. native_override — deep-merge (list-replace) + native_override = unified.get("native_override") or {} + if native_override: + ls_config = deep_merge_list_replace(ls_config, native_override) + + dedupe_providers_vector_io(ls_config) + return ls_config + + +def migrate_config_dumb( + run_yaml_path: str, + lightspeed_yaml_path: str, + output_path: str, +) -> None: + """Lossless lift-and-shift migration: fold run.yaml into lightspeed-stack.yaml. 
+ + Reads the legacy two-file configuration (run.yaml + lightspeed-stack.yaml) + and writes a unified single-file configuration where the entire run.yaml + content is placed under `llama_stack.config.native_override`. Removes any + `llama_stack.library_client_config_path` that referenced the old run.yaml. + + This is the "dumb" migration mode — preserves 100% of the existing + Llama Stack schema content. A future `--smart` mode (out of scope for this + PoC) would factor portions into high-level sections. + + Parameters: + run_yaml_path: Path to the existing Llama Stack run.yaml. + lightspeed_yaml_path: Path to the existing lightspeed-stack.yaml. + output_path: Path to write the unified lightspeed-stack.yaml. + """ + logger.info("Reading %s and %s for migration", lightspeed_yaml_path, run_yaml_path) + + with open(run_yaml_path, "r", encoding="utf-8") as f: + run_yaml_content: dict[str, Any] = yaml.safe_load(f) + + with open(lightspeed_yaml_path, "r", encoding="utf-8") as f: + lcs_yaml: dict[str, Any] = yaml.safe_load(f) + + llama_stack_section = lcs_yaml.setdefault("llama_stack", {}) + llama_stack_section.pop("library_client_config_path", None) + # `baseline: empty` is required for true lossless round-trip: default baseline + # would add extra keys not present in the source run.yaml. + llama_stack_section["config"] = { + "baseline": "empty", + "native_override": run_yaml_content, + } + + logger.info("Writing unified configuration to %s", output_path) + with open(output_path, "w", encoding="utf-8") as f: + yaml.dump(lcs_yaml, f, Dumper=YamlDumper, default_flow_style=False) + + +def synthesize_to_file( + lcs_config: dict[str, Any], + output_file: str, + config_file_dir: Optional[Path] = None, +) -> None: + """Synthesize unified-mode Llama Stack config and write it to disk. + + Secrets are never resolved — env-var references like `${env.FOO}` are + preserved verbatim in the output. + + Parameters: + lcs_config: lightspeed-stack.yaml as a dict. 
+ output_file: Path to write the synthesized run.yaml. + config_file_dir: Directory for resolving relative profile paths. + """ + ls_config = synthesize_configuration(lcs_config, config_file_dir=config_file_dir) + logger.info("Writing synthesized Llama Stack configuration to %s", output_file) + Path(output_file).parent.mkdir(parents=True, exist_ok=True) + with open(output_file, "w", encoding="utf-8") as f: + yaml.dump(ls_config, f, Dumper=YamlDumper, default_flow_style=False) + + # ============================================================================= # Main Generation Function (service/container mode only) # ============================================================================= @@ -638,9 +878,15 @@ def generate_configuration( def main() -> None: - """CLI entry point.""" + """CLI entry point. + + Auto-detects the mode: + - Unified mode: `llama_stack.config` present in the lightspeed config file. + Synthesizes the full run.yaml (no `-i/--input` needed); writes to `-o`. + - Legacy mode: requires `-i/--input` run.yaml; enriches it and writes to `-o`. 
+ """ parser = ArgumentParser( - description="Enrich Llama Stack config with Lightspeed values", + description="Produce Llama Stack run.yaml from Lightspeed config.", ) parser.add_argument( "-c", @@ -652,13 +898,14 @@ def main() -> None: "-i", "--input", default="run.yaml", - help="Input Llama Stack config (default: run.yaml)", + help="Input Llama Stack config for legacy-mode enrichment " + "(default: run.yaml; ignored in unified mode)", ) parser.add_argument( "-o", "--output", default="run_.yaml", - help="Output enriched config (default: run_.yaml)", + help="Output run.yaml path (default: run_.yaml)", ) parser.add_argument( "-e", @@ -672,7 +919,19 @@ def main() -> None: config = yaml.safe_load(f) config = replace_env_vars(config) - generate_configuration(args.input, args.output, config, args.env_file) + unified_present = (config.get("llama_stack") or {}).get("config") is not None + if unified_present: + logger.info("Unified mode detected (llama_stack.config present)") + # Azure Entra ID side-effect (writes .env) stays part of boot — still run it. + setup_azure_entra_id_token(config.get("azure_entra_id"), args.env_file) + synthesize_to_file( + config, + args.output, + config_file_dir=Path(args.config).resolve().parent, + ) + else: + logger.info("Legacy mode detected (no llama_stack.config)") + generate_configuration(args.input, args.output, config, args.env_file) if __name__ == "__main__": diff --git a/src/models/config.py b/src/models/config.py index 95bfc4782..9252ccdba 100644 --- a/src/models/config.py +++ b/src/models/config.py @@ -582,6 +582,97 @@ def resolve_auth_headers(self) -> Self: return self +class UnifiedInferenceProvider(ConfigurationBase): + """High-level inference provider entry (unified-mode schema). + + Expanded by the synthesizer into a Llama Stack `providers.inference` entry. + Unknown provider types must be expressed via `native_override` instead. 
+ """ + + type: Literal[ + "openai", + "sentence_transformers", + "azure", + "vertexai", + "watsonx", + "vllm_rhaiis", + "vllm_rhel_ai", + ] = Field( + ..., + description="High-level provider type. Mapped to Llama Stack provider_type.", + ) + + api_key_env: Optional[str] = Field( + None, + description="Environment variable name from which the api_key will be read " + "at Llama Stack start time (as `${env.}`). Kept as a reference; " + "secrets are never resolved into the synthesized file on disk.", + ) + + allowed_models: Optional[list[str]] = Field( + None, + description="Optional list of model ids allowed for this provider.", + ) + + extra: dict[str, Any] = Field( + default_factory=dict, + description="Extra per-provider-type config fields merged into the emitted " + "`config` map (escape hatch for per-type oddities).", + ) + + +class UnifiedInferenceSection(ConfigurationBase): + """High-level inference section (unified-mode schema).""" + + providers: list[UnifiedInferenceProvider] = Field( + default_factory=list, + description="High-level list of inference providers; replaces " + "`providers.inference` in the synthesized run.yaml.", + ) + + +class UnifiedLlamaStackConfig(ConfigurationBase): + """Operational Llama Stack config synthesized by LCORE at runtime. + + When present (unified mode), LCORE produces a full Llama Stack run.yaml + from this block. Precedence (lowest to highest): + + baseline (default / empty / profile) < high-level sections < native_override + + This section is mutually exclusive with + `llama_stack.library_client_config_path` (legacy mode). + """ + + baseline: Literal["default", "empty"] = Field( + "default", + description="Starting point before profile / high-level / native_override " + "are applied. 'default' uses LCORE's built-in baseline run.yaml; 'empty' " + "starts from an empty dict (useful when `native_override` specifies the " + "entire Llama Stack schema, as produced by dumb-mode migration). 
Ignored " + "when `profile` is set.", + ) + + profile: Optional[str] = Field( + None, + description="Path to a profile YAML file (absolute or relative to the " + "lightspeed-stack.yaml location). Loaded as the baseline if set; " + "overrides the `baseline` field.", + ) + + inference: Optional[UnifiedInferenceSection] = Field( + None, + description="High-level inference section. Additional high-level sections " + "(storage, safety, tools, ...) may be added in future versions.", + ) + + native_override: dict[str, Any] = Field( + default_factory=dict, + description="Raw Llama Stack schema fragment, deep-merged onto the result " + "of profile + high-level expansion. Lists are replaced (not appended). " + "Escape hatch for anything not expressible via high-level keys.", + ) + + class LlamaStackConfiguration(ConfigurationBase): """Llama stack configuration. @@ -620,7 +711,16 @@ class LlamaStackConfiguration(ConfigurationBase): library_client_config_path: Optional[str] = Field( None, title="Llama Stack configuration path", - description="Path to configuration file used when Llama Stack is run in library mode", + description="Path to configuration file used when Llama Stack is run in library " + "mode (legacy mode). Mutually exclusive with `config`.", + ) + + config: Optional[UnifiedLlamaStackConfig] = Field( + None, + title="Unified Llama Stack configuration", + description="Operational Llama Stack config synthesized by LCORE at runtime " + "(unified mode). When present, LCORE produces run.yaml from this block. " + "Mutually exclusive with `library_client_config_path`.", ) timeout: PositiveInt = Field( @@ -635,21 +735,26 @@ def check_llama_stack_model(self) -> Self: """ Validate the Llama Stack configuration and enforce mode-specific requirements. - If no URL is provided, requires explicit library-client mode selection. 
- When library-client mode is enabled, requires a non-empty - `library_client_config_path` that points to a regular, readable YAML - file (checked via checks.file_check). Also normalizes a None - `use_as_library_client` to False. + Unified mode (`config` set) and legacy mode (`library_client_config_path` + set) are mutually exclusive. If no URL is provided, requires explicit + library-client mode selection. When library-client mode is enabled, + requires either `config` (unified) or `library_client_config_path` + (legacy) to be set. Legacy paths are validated via checks.file_check. Returns: Self: The validated LlamaStackConfiguration instance. Raises: - ValueError: If the configuration is invalid, e.g. no - URL and library-client mode is unspecified or - disabled, or library-client mode is enabled but - `library_client_config_path` is not provided. + ValueError: If the configuration is invalid. """ + if self.config is not None and self.library_client_config_path is not None: + raise ValueError( + "llama_stack.config (unified mode) and " + "llama_stack.library_client_config_path (legacy mode) are mutually " + "exclusive. 
Migrate legacy configurations with: " + "lightspeed-stack --migrate-config" + ) + if self.url is None: # when URL is not set, it is supposed that Llama Stack should be run in library mode # it means that use_as_library_client attribute must be set to True @@ -667,20 +772,19 @@ def check_llama_stack_model(self) -> Self: self.use_as_library_client = False if self.use_as_library_client: - # when use_as_library_client is set to true, Llama Stack will be run in library mode - # it means that: - # - Llama Stack URL should not be set, and - # - library_client_config_path attribute must be set and must point to - # a regular readable YAML file - if self.library_client_config_path is None: - # pylint: disable=line-too-long + # library mode requires either unified config or legacy config path + if self.library_client_config_path is None and self.config is None: raise ValueError( - "Llama stack library client mode is enabled but a configuration file path is not specified" + "Llama stack library client mode is enabled but neither " + "`config` (unified) nor `library_client_config_path` (legacy) " + "is specified" + ) + if self.library_client_config_path is not None: + # legacy: the configuration file must exist and be a regular readable file + checks.file_check( + Path(self.library_client_config_path), + "Llama Stack configuration file", ) - # the configuration file must exists and be regular readable file - checks.file_check( - Path(self.library_client_config_path), "Llama Stack configuration file" - ) return self diff --git a/test.containerfile b/test.containerfile index 884fd8525..dff715b9f 100644 --- a/test.containerfile +++ b/test.containerfile @@ -36,11 +36,14 @@ RUN mkdir -p /opt/app-root/src/.llama/storage \ chown -R 1001:0 /opt/app-root && \ chmod -R 775 /opt/app-root -# Copy enrichment scripts for runtime config enrichment +# Copy enrichment / unified-mode synthesis scripts for runtime config production COPY src/llama_stack_configuration.py 
/opt/app-root/llama_stack_configuration.py +COPY src/data /opt/app-root/data COPY scripts/llama-stack-entrypoint.sh /opt/app-root/enrich-entrypoint.sh RUN chmod +x /opt/app-root/enrich-entrypoint.sh && \ - chown 1001:0 /opt/app-root/enrich-entrypoint.sh /opt/app-root/llama_stack_configuration.py + chown -R 1001:0 /opt/app-root/enrich-entrypoint.sh \ + /opt/app-root/llama_stack_configuration.py \ + /opt/app-root/data # Switch back to the original user USER 1001 diff --git a/tests/unit/models/config/test_dump_configuration.py b/tests/unit/models/config/test_dump_configuration.py index 06a3ef08c..c5954a478 100644 --- a/tests/unit/models/config/test_dump_configuration.py +++ b/tests/unit/models/config/test_dump_configuration.py @@ -144,6 +144,7 @@ def test_dump_configuration(tmp_path: Path) -> None: "use_as_library_client": True, "api_key": "**********", "library_client_config_path": "tests/configuration/run.yaml", + "config": None, "timeout": 180, }, "user_data_collection": { @@ -486,6 +487,7 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None: "use_as_library_client": True, "api_key": "**********", "library_client_config_path": "tests/configuration/run.yaml", + "config": None, "timeout": 180, }, "user_data_collection": { @@ -720,6 +722,7 @@ def test_dump_configuration_with_quota_limiters_different_values( "use_as_library_client": True, "api_key": "**********", "library_client_config_path": "tests/configuration/run.yaml", + "config": None, "timeout": 180, }, "user_data_collection": { @@ -934,6 +937,7 @@ def test_dump_configuration_byok(tmp_path: Path) -> None: "use_as_library_client": True, "api_key": "**********", "library_client_config_path": "tests/configuration/run.yaml", + "config": None, "timeout": 180, }, "user_data_collection": { @@ -1138,6 +1142,7 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> None: "use_as_library_client": True, "api_key": "**********", "library_client_config_path": "tests/configuration/run.yaml", + 
"config": None, "timeout": 180, }, "user_data_collection": { diff --git a/tests/unit/models/config/test_llama_stack_configuration.py b/tests/unit/models/config/test_llama_stack_configuration.py index deec7e765..57ba66861 100644 --- a/tests/unit/models/config/test_llama_stack_configuration.py +++ b/tests/unit/models/config/test_llama_stack_configuration.py @@ -86,20 +86,65 @@ def test_llama_stack_wrong_configuration_no_config_file() -> None: """Test the LlamaStackConfiguration constructor. Verify that enabling library-client mode without providing a configuration - file path raises a ValueError. - - Asserts that constructing LlamaStackConfiguration with - use_as_library_client=True and no library_client_config_path raises a - ValueError whose message is "Llama stack library client mode is enabled but - a configuration file path is not specified". + file path or unified config raises a ValueError. """ - m = "Llama stack library client mode is enabled but a configuration file path is not specified" + m = ( + "Llama stack library client mode is enabled but neither `config` " + "\\(unified\\) nor `library_client_config_path` \\(legacy\\) is specified" + ) with pytest.raises(ValueError, match=m): LlamaStackConfiguration( use_as_library_client=True ) # pyright: ignore[reportCallIssue] +# ============================================================================= +# Unified mode (LCORE-836) +# ============================================================================= + + +def test_llama_stack_unified_mode_library_client() -> None: + """Unified mode in library mode: `config` set, no library_client_config_path.""" + # pylint: disable=import-outside-toplevel + from models.config import UnifiedLlamaStackConfig + + cfg = LlamaStackConfiguration( + use_as_library_client=True, + config=UnifiedLlamaStackConfig(), + ) # pyright: ignore[reportCallIssue] + assert cfg.config is not None + assert cfg.library_client_config_path is None + + +def 
test_llama_stack_unified_and_legacy_are_mutually_exclusive() -> None: + """Setting both `config` and `library_client_config_path` is rejected.""" + # pylint: disable=import-outside-toplevel + from models.config import UnifiedLlamaStackConfig + + with pytest.raises( + ValueError, + match="mutually exclusive", + ): + LlamaStackConfiguration( + use_as_library_client=True, + library_client_config_path="tests/configuration/run.yaml", + config=UnifiedLlamaStackConfig(), + ) # pyright: ignore[reportCallIssue] + + +def test_llama_stack_unified_mode_with_remote_url() -> None: + """Unified config is also allowed when connecting to a remote Llama Stack.""" + # pylint: disable=import-outside-toplevel + from models.config import UnifiedLlamaStackConfig + + cfg = LlamaStackConfiguration( + url="http://remote-ls:8321", + config=UnifiedLlamaStackConfig(), + ) # pyright: ignore[reportCallIssue] + assert cfg.config is not None + assert str(cfg.url) == "http://remote-ls:8321/" + + def test_llama_stack_configuration_valid_http_url() -> None: """Test that valid HTTP URLs are accepted.""" config = LlamaStackConfiguration( diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index bcd5ca0d6..999abf7f6 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -84,7 +84,7 @@ async def test_get_async_llama_stack_wrong_configuration() -> None: cfg.library_client_config_path = None with pytest.raises( ValueError, - match="Configuration problem: library_client_config_path is not set", + match="neither .*unified.* nor .*legacy.* is set", ): client = AsyncLlamaStackClientHolder() await client.load(cfg) diff --git a/tests/unit/test_llama_stack_synthesize.py b/tests/unit/test_llama_stack_synthesize.py new file mode 100644 index 000000000..9e86bcaa0 --- /dev/null +++ b/tests/unit/test_llama_stack_synthesize.py @@ -0,0 +1,371 @@ +"""Unit tests for unified-mode synthesizer and migration tool (LCORE-836).""" + +from pathlib import Path +from typing import Any + +import 
pytest +import yaml + +from llama_stack_configuration import ( + PROVIDER_TYPE_MAP, + apply_high_level_inference, + deep_merge_list_replace, + load_default_baseline, + migrate_config_dumb, + synthesize_configuration, +) + +# ============================================================================= +# deep_merge_list_replace +# ============================================================================= + + +def test_deep_merge_scalar_replace() -> None: + """Overlay scalar replaces base scalar.""" + result = deep_merge_list_replace({"a": 1}, {"a": 2}) + assert result == {"a": 2} + + +def test_deep_merge_adds_new_keys() -> None: + """Overlay keys not in base are added.""" + result = deep_merge_list_replace({"a": 1}, {"b": 2}) + assert result == {"a": 1, "b": 2} + + +def test_deep_merge_nested_map_merges() -> None: + """Nested maps merge recursively.""" + base = {"a": {"x": 1, "y": 2}} + overlay = {"a": {"y": 20, "z": 30}} + result = deep_merge_list_replace(base, overlay) + assert result == {"a": {"x": 1, "y": 20, "z": 30}} + + +def test_deep_merge_list_replaces() -> None: + """Lists are replaced, not appended.""" + base = {"items": [1, 2, 3]} + overlay = {"items": [9]} + result = deep_merge_list_replace(base, overlay) + assert result == {"items": [9]} + + +def test_deep_merge_does_not_mutate_inputs() -> None: + """Neither base nor overlay are mutated.""" + base = {"a": {"x": 1}} + overlay = {"a": {"x": 2}} + result = deep_merge_list_replace(base, overlay) + assert base == {"a": {"x": 1}} + assert overlay == {"a": {"x": 2}} + assert result == {"a": {"x": 2}} + + +def test_deep_merge_type_mismatch_replaces() -> None: + """If overlay type != base type at same key, overlay wins.""" + # base is map, overlay is scalar + result = deep_merge_list_replace({"a": {"x": 1}}, {"a": "replaced"}) + assert result == {"a": "replaced"} + + +# ============================================================================= +# apply_high_level_inference +# 
============================================================================= + + +def test_apply_high_level_inference_single_provider() -> None: + """Single provider with api_key_env and allowed_models.""" + ls_config: dict[str, Any] = {} + inference = { + "providers": [ + { + "type": "openai", + "api_key_env": "OPENAI_API_KEY", + "allowed_models": ["gpt-4o-mini"], + } + ] + } + apply_high_level_inference(ls_config, inference) + assert ls_config["providers"]["inference"] == [ + { + "provider_id": "openai", + "provider_type": "remote::openai", + "config": { + "api_key": "${env.OPENAI_API_KEY}", + "allowed_models": ["gpt-4o-mini"], + }, + } + ] + + +def test_apply_high_level_inference_replaces_existing() -> None: + """Providers list is replaced entirely, not merged.""" + ls_config = {"providers": {"inference": [{"provider_id": "stale"}]}} + apply_high_level_inference( + ls_config, {"providers": [{"type": "sentence_transformers"}]} + ) + assert ls_config["providers"]["inference"] == [ + { + "provider_id": "sentence_transformers", + "provider_type": "inline::sentence-transformers", + } + ] + + +def test_apply_high_level_inference_extra_merged() -> None: + """`extra` dict fields merge into emitted config.""" + ls_config: dict[str, Any] = {} + inference = { + "providers": [ + { + "type": "vertexai", + "extra": {"project_id": "my-project", "location": "us-central1"}, + } + ] + } + apply_high_level_inference(ls_config, inference) + assert ls_config["providers"]["inference"][0]["config"] == { + "project_id": "my-project", + "location": "us-central1", + } + + +def test_provider_type_map_covers_all_literals() -> None: + """Every Literal value declared on UnifiedInferenceProvider.type has a mapping.""" + # pylint: disable=import-outside-toplevel + from models.config import UnifiedInferenceProvider + + literal_values = ( + UnifiedInferenceProvider.model_fields[ # pylint: disable=unsubscriptable-object + "type" + ].annotation.__args__ + ) + for value in literal_values: + assert 
value in PROVIDER_TYPE_MAP + + +# ============================================================================= +# synthesize_configuration +# ============================================================================= + + +MINIMAL_BASELINE: dict[str, Any] = { + "version": 2, + "apis": ["inference"], + "providers": { + "inference": [ + {"provider_id": "stock", "provider_type": "remote::stock", "config": {}} + ] + }, + "safety": {"default_shield_id": "llama-guard"}, +} + + +def test_synthesize_errors_without_config() -> None: + """Without llama_stack.config present, synthesize raises ValueError.""" + with pytest.raises(ValueError, match="llama_stack.config"): + synthesize_configuration({"llama_stack": {}}) + + +def test_synthesize_uses_default_baseline_when_no_profile() -> None: + """With neither profile nor native_override, result is the baseline (through enrichment).""" + lcs_config: dict[str, Any] = {"llama_stack": {"config": {}}} + result = synthesize_configuration(lcs_config, default_baseline=MINIMAL_BASELINE) + # Baseline preserved (enrichment is a no-op without byok_rag/rag/okp) + assert result["safety"] == {"default_shield_id": "llama-guard"} + assert result["providers"]["inference"] == [ + {"provider_id": "stock", "provider_type": "remote::stock", "config": {}} + ] + + +def test_synthesize_loads_profile_from_path(tmp_path: Path) -> None: + """Profile path is loaded as the baseline.""" + profile_data = { + "version": 2, + "apis": ["inference"], + "providers": {"inference": [{"provider_id": "profile_p"}]}, + } + profile_path = tmp_path / "profile.yaml" + profile_path.write_text(yaml.dump(profile_data)) + + lcs_config: dict[str, Any] = { + "llama_stack": {"config": {"profile": str(profile_path)}} + } + result = synthesize_configuration(lcs_config) + assert result["providers"]["inference"] == [{"provider_id": "profile_p"}] + + +def test_synthesize_profile_relative_path(tmp_path: Path) -> None: + """Relative profile path resolves against config_file_dir.""" + 
profile_data = {"version": 2} + (tmp_path / "p.yaml").write_text(yaml.dump(profile_data)) + lcs_config: dict[str, Any] = {"llama_stack": {"config": {"profile": "p.yaml"}}} + result = synthesize_configuration(lcs_config, config_file_dir=tmp_path) + assert result == {"version": 2} + + +def test_synthesize_applies_high_level_inference() -> None: + """High-level inference section expands into native providers list.""" + lcs_config: dict[str, Any] = { + "llama_stack": { + "config": { + "inference": { + "providers": [{"type": "openai", "api_key_env": "OPENAI_API_KEY"}] + } + } + } + } + result = synthesize_configuration(lcs_config, default_baseline=MINIMAL_BASELINE) + assert result["providers"]["inference"] == [ + { + "provider_id": "openai", + "provider_type": "remote::openai", + "config": {"api_key": "${env.OPENAI_API_KEY}"}, + } + ] + + +def test_synthesize_native_override_deep_merges() -> None: + """native_override deep-merges on top (scalar path).""" + lcs_config: dict[str, Any] = { + "llama_stack": { + "config": { + "native_override": { + "safety": {"default_shield_id": "overridden"}, + } + } + } + } + result = synthesize_configuration(lcs_config, default_baseline=MINIMAL_BASELINE) + assert result["safety"]["default_shield_id"] == "overridden" + + +def test_synthesize_native_override_list_replaces() -> None: + """native_override replaces lists, not appends.""" + lcs_config: dict[str, Any] = { + "llama_stack": { + "config": { + "native_override": { + "providers": { + "inference": [{"provider_id": "override-only"}], + } + } + } + } + } + result = synthesize_configuration(lcs_config, default_baseline=MINIMAL_BASELINE) + assert result["providers"]["inference"] == [{"provider_id": "override-only"}] + + +def test_synthesize_precedence_override_beats_high_level() -> None: + """When high-level and native_override both touch the same path, override wins.""" + lcs_config: dict[str, Any] = { + "llama_stack": { + "config": { + "inference": {"providers": [{"type": "openai"}]}, 
+ "native_override": { + "providers": { + "inference": [{"provider_id": "override-wins"}], + } + }, + } + } + } + result = synthesize_configuration(lcs_config, default_baseline=MINIMAL_BASELINE) + assert result["providers"]["inference"] == [{"provider_id": "override-wins"}] + + +def test_synthesize_preserves_env_var_refs_verbatim() -> None: + """Secrets stay as ${env.FOO} references; never resolved into the output.""" + lcs_config: dict[str, Any] = { + "llama_stack": { + "config": { + "inference": { + "providers": [{"type": "openai", "api_key_env": "OPENAI_API_KEY"}] + } + } + } + } + result = synthesize_configuration(lcs_config, default_baseline=MINIMAL_BASELINE) + api_key_value = result["providers"]["inference"][0]["config"]["api_key"] + assert api_key_value == "${env.OPENAI_API_KEY}" + + +# ============================================================================= +# Built-in default baseline loader +# ============================================================================= + + +def test_load_default_baseline_returns_dict() -> None: + """The shipped default baseline loads as a dict with expected keys.""" + baseline = load_default_baseline() + assert isinstance(baseline, dict) + assert baseline.get("version") == 2 + assert "providers" in baseline + + +# ============================================================================= +# migrate_config_dumb +# ============================================================================= + + +def test_migrate_dumb_lossless_roundtrip(tmp_path: Path) -> None: + """Dumb migration places full run.yaml under config.native_override.""" + run_yaml_content = { + "version": 2, + "apis": ["inference"], + "providers": {"inference": [{"provider_id": "opa"}]}, + } + lcs_yaml_content = { + "name": "LCS", + "llama_stack": { + "use_as_library_client": True, + "library_client_config_path": str(tmp_path / "run.yaml"), + }, + } + + run_yaml_path = tmp_path / "run.yaml" + run_yaml_path.write_text(yaml.dump(run_yaml_content)) + 
lcs_yaml_path = tmp_path / "lightspeed-stack.yaml" + lcs_yaml_path.write_text(yaml.dump(lcs_yaml_content)) + output_path = tmp_path / "unified.yaml" + + migrate_config_dumb(str(run_yaml_path), str(lcs_yaml_path), str(output_path)) + + result = yaml.safe_load(output_path.read_text()) + + # Legacy path is gone + assert "library_client_config_path" not in result["llama_stack"] + # Unified config has full run.yaml under native_override + assert result["llama_stack"]["config"]["native_override"] == run_yaml_content + # Other fields preserved + assert result["llama_stack"]["use_as_library_client"] is True + assert result["name"] == "LCS" + + +def test_migrate_then_synthesize_reproduces_run_yaml(tmp_path: Path) -> None: + """End-to-end round trip: run.yaml → migrate → synthesize → original content.""" + run_yaml_content = { + "version": 2, + "apis": ["inference", "vector_io"], + "providers": { + "inference": [{"provider_id": "rt", "provider_type": "remote::rt"}] + }, + "safety": {"default_shield_id": "guard"}, + } + lcs_yaml_content = { + "name": "LCS", + "llama_stack": { + "use_as_library_client": True, + "library_client_config_path": str(tmp_path / "run.yaml"), + }, + } + run_yaml_path = tmp_path / "run.yaml" + run_yaml_path.write_text(yaml.dump(run_yaml_content)) + lcs_yaml_path = tmp_path / "lightspeed-stack.yaml" + lcs_yaml_path.write_text(yaml.dump(lcs_yaml_content)) + output_path = tmp_path / "unified.yaml" + migrate_config_dumb(str(run_yaml_path), str(lcs_yaml_path), str(output_path)) + + unified = yaml.safe_load(output_path.read_text()) + synthesized = synthesize_configuration(unified) + + # Synthesized == original run.yaml (lossless round trip in dumb mode) + assert synthesized == run_yaml_content From 06ef987a861360ab1584af8a1ed12934ee67eb38 Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Thu, 23 Apr 2026 14:59:40 +0200 Subject: [PATCH 2/9] LCORE-836 spike: design docs and PoC evidence Add the spike doc (decisions up front, background below, 7 proposed 
JIRAs) and the spec doc (requirements R1..R11, architecture, implementation guide, migration worked example) under `docs/design/llama-stack-config-merge/`. Key decisions captured for reviewer confirmation: - Overall shape: Option C (high-level + native_override) with Option E (profile feature, no shipped profiles) as an optional layer. - Deprecation: calendar-based (e.g., "legacy path removed no sooner than 6 months after WARN begins"); concrete timing deferred to PM review. - Override precedence: deep-merge with list replacement at leaf level. - Secrets handling: env-var references preserved verbatim in synthesized files; never resolved to disk. - Format detection: shape-based, with an optional `config_format_version` field that, if present, must agree with the shape. - Migration tool shape: `--migrate-config` flag (no CLI refactor); dumb lift-and-shift mode only in v1; smart mode deferred. - Profile distribution: feature only, LCORE ships no profiles of its own beyond reference examples under `examples/profiles/`. - LS process supervision and hot-reload: out of scope (LCORE-777, LCORE-778, LCORE-781 territory). The spike's PoC validated library-mode end-to-end: a `lightspeed-stack.yaml` containing only `llama_stack.config` (no external run.yaml) boots LCORE, serves /v1/query with a real model response, and a `native_override` value demonstrably takes effect in the synthesized run.yaml. Server-mode end-to-end through docker-compose was skipped because the LS container image rebuild (~2 GB, UBI + llama-stack llslibdev dependency sync) was impractical for the spike timeline; the same synthesis code path is exercised by the unit tests, including the lossless migrate-then-synthesize round-trip. PoC evidence is under `poc-evidence/library-mode/` as reference material for reviewers, and per the spike howto it is intended to be removed from the branch prior to merge. The spike doc and spec doc remain permanent. 
--- .../llama-stack-config-merge-spike.md | 717 ++++++++++++++++++ .../llama-stack-config-merge.md | 502 ++++++++++++ .../poc-evidence/library-mode/README.md | 26 + .../library-mode/query-response.json | 1 + .../library-mode/synthesized-run.yaml | 148 ++++ .../lightspeed-stack-unified-library.yaml | 33 + 6 files changed, 1427 insertions(+) create mode 100644 docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md create mode 100644 docs/design/llama-stack-config-merge/llama-stack-config-merge.md create mode 100644 docs/design/llama-stack-config-merge/poc-evidence/library-mode/README.md create mode 100644 docs/design/llama-stack-config-merge/poc-evidence/library-mode/query-response.json create mode 100644 docs/design/llama-stack-config-merge/poc-evidence/library-mode/synthesized-run.yaml create mode 100644 docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md new file mode 100644 index 000000000..c8db06ff4 --- /dev/null +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md @@ -0,0 +1,717 @@ +# Spike: Llama Stack config merge (unified `lightspeed-stack.yaml`) + +## Overview + +**The problem**: Operators today must maintain two configuration files — +`lightspeed-stack.yaml` (LCORE settings) and `run.yaml` (Llama Stack +operational config: providers, storage, APIs, safety, registered resources). +This split increases the chance of misconfiguration, makes downstream +deployment templates larger, and forces every Lightspeed team to understand +Llama Stack's internal schema. LCORE-836 asks for a single source of truth. + +**The recommendation**: A layered approach — "Option C + Option E layer": + +- **High-level keys** in `lightspeed-stack.yaml` under a new `llama_stack.config` + section (inference, later storage/safety/...). 
Most downstream teams write + only these. +- **`native_override`** escape hatch under the same section — raw Llama Stack + schema, deep-merged last. Covers anything the high-level schema doesn't + express. +- **`profile`** field that points to a YAML file used as the baseline — the + "profiles" feature is mechanism-only; LCORE ships no profiles of its own + beyond one or two reference examples under `examples/profiles/`. +- **`baseline: default | empty`** selects whether the synthesis starts from + LCORE's built-in baseline or a blank slate. +- **Legacy mode preserved**: existing `llama_stack.library_client_config_path` + works unchanged through a deprecation window. Mutual exclusion with the new + `llama_stack.config` block is enforced at load time. +- **Migration tool**: `lightspeed-stack --migrate-config` produces a unified + single-file config from an existing (`run.yaml` + `lightspeed-stack.yaml`) + pair, lossless round-trip. + +**PoC validation**: A Level 3' PoC (per the spike howto) proves the mechanism +end-to-end in library mode. A unified `lightspeed-stack.yaml` containing only +`llama_stack.config` (no external `run.yaml`) successfully drives LCORE: +liveness/readiness green, `/v1/query` returns a real model response, +`native_override` demonstrably takes effect. Full unit-test suite passes +(2098 tests), including a lossless migrate-then-synthesize round-trip. +Server-mode end-to-end was not re-run through docker-compose — the container +rebuild time was impractical and unrelated to PoC quality (the container image +is ~2 GB of LS dependencies); the same synthesis code path is exercised by the +library-mode PoC and unit tests. + +--- + +## Strategic decisions — for @sbunciak (PM) and @tisnik + +These set scope, approach, and rollout shape. Each has a recommendation — +please confirm or override. 
+ +### Decision S1: Overall shape (Option C + optional Option E) + +See [Design alternatives considered](#design-alternatives-considered) for the +full option set and scoring. + +| Option | Summary | +|---|---| +| A (Embedded native only) | `lightspeed-stack.yaml.llama_stack.config` is raw Llama Stack schema | +| **B + C (High-level + native override)** | High-level keys cover the common path, `native_override` as escape hatch | +| E (Profiles) | Named or path-based pre-built config bundles, layered on top of A/B/C | +| G (Kustomize-style patches) | Ship a default baseline, operator writes JSON-Patch-like overlays | + +**Recommendation**: **C** (high-level + native_override) with **E** (profile +feature, no shipped profiles) as an optional layer. Best balance of UX, +escape-hatch power, validation rigor, and dynamic-reconfig fit for the +broader feature roadmap (LCORE-777/781). + +### Decision S2: Deprecation timeline for the legacy path + +Legacy mode (`llama_stack.library_client_config_path` + external `run.yaml`) +must coexist with unified mode through a deprecation window to avoid breaking +downstream teams. Three candidate cadences: + +| Cadence | Timing | +|---|---| +| N+2 releases | Opt-in → warning → removed over two releases after landing | +| N+3 releases | Opt-in → warning (N+1) → removed at N+3 | +| **Calendar-based** | e.g., "removed no sooner than 6 months after warning starts" | + +**Recommendation**: **calendar-based**, because the right number depends on +LCORE's release cadence and downstream consumers' update latency — both of +which the spike author does not own. @sbunciak to set the actual numbers. + +### Decision S3: Downstream implications we may not have seen + +The spike author has direct evidence of Konflux/Tekton usage (`.tekton/` dir) +and RHOAI testing (`tests/e2e-prow/rhoai/`). Other downstream consumers — +RHOAI operator CRs, Helm charts, Kustomize overlays, any other products — +are not visible from this repo alone. 
+ +**Ask**: Reviewers from downstream teams to confirm whether their deployment +setup treats `run.yaml` as a separate artifact (ConfigMap, templated file, +build-time asset) that this design would need to accommodate. + +### Decision S4: Scope of this spike — what is deliberately left out + +The following related work streams are **not** included in this spike and +should be tracked as separate future JIRAs: + +- **Llama Stack process supervision** from LCORE (restart-on-crash, signal + propagation, merged logs). Orthogonal to config merging; covered by + LCORE-777 / LCORE-778. +- **Hot-reload / dynamic reconfig** (e.g., live `POST /v1/rag` that adds a + BYOK RAG without restart). Llama Stack does not natively support + hot-reload; achieving it would require supervision + restart flows. + Covered by LCORE-781. + +**Recommendation**: confirm this scope split. If reviewers want any of the +above pulled in, this spike's JIRAs grow accordingly. + +--- + +## Technical decisions — for @tisnik and team leads + +Architecture-level and implementation-level. Each has a recommendation +grounded in the PoC. + +### Decision T1: Format detection (shape vs version field vs both) + +How does LCORE tell unified-mode configs from legacy-mode configs? + +| Option | Works by | +|---|---| +| Shape only | Presence of `llama_stack.config` → unified; else legacy | +| Version field only | Explicit `config_format_version: 2` required | +| **Both (soft-coupled)** | Shape decides; version field optional but must agree when present | + +**Recommendation**: **both, soft-coupled**. Gives a cheap upgrade path for +future real schema bumps without forcing every existing user to add a +version field today. Confidence: 75%. + +### Decision T2: Override precedence (inside Option C) + +When `llama_stack.config.native_override` overlaps with a high-level key, +what semantics? 
+ +| Strategy | Example: `safety: {excluded_categories: [a, b]}` vs override `{excluded_categories: [c]}` | +|---|---| +| Deep-merge, append lists | result: `[a, b, c]` | +| **Deep-merge, replace lists** | result: `[c]`; other keys in `safety` preserved | +| Whole-key override | result: whole `safety` replaced; lose `default_shield_id` unless restated | +| JSON Patch (ops) | explicit — `{op: replace, path: /safety/excluded_categories, value: [c]}` | + +**Recommendation**: **deep-merge with list replacement**. Simple mental model, +no list-merge tarpit, keeps scalar + map overrides minimal. Implemented in +`deep_merge_list_replace()`. Confidence: 70%. + +See [Merge semantics worked examples](#merge-semantics-worked-examples). + +### Decision T3: Secrets in synthesized files + +The synthesized run.yaml lives on disk (library mode: `$TMPDIR`; server mode: +inside the LS container). Option space: + +| Option | On-disk content | +|---|---| +| **Keep env-var refs verbatim** | `api_key: ${env.OPENAI_API_KEY}` (resolved by LS at start) | +| Resolve before writing | `api_key: sk-...` | + +**Recommendation**: **keep env-var refs verbatim**. Security-leaning default; +resolved secrets never touch the disk. Implemented in +`apply_high_level_inference` (emits `${env.<VAR>}` strings). Confidence: 95%. + +### Decision T4: Synthesized file location + +Where the synthesized `run.yaml` goes at runtime: + +| Option | Path | +|---|---| +| Temp file | `$TMPDIR/llama_stack_synthesized_config.yaml` | +| **Persistent known path** | Local: `./.generated/run.yaml` or `~/.local/state/lightspeed-stack/run.yaml`; Container: `/app-root/.generated/run.yaml`. Overwrite on each boot. | + +**Recommendation**: **persistent known path, overwrite on boot**. Debuggable, +no stale-file risk (always overwritten before LS starts). The PoC used +`$TMPDIR` for expediency; production should use the persistent path. CLI flag +`--synthesized-config-output <path>` for debugging. Confidence: 85%. 
+ +### Decision T5: Migration tool invocation + +How operators invoke the migration tool: + +| Option | Example | +|---|---| +| Separate script under `scripts/` | `uv run python scripts/migrate-config.py ...` | +| **Flag on main entry point** | `lightspeed-stack --migrate-config --run-yaml X -c Y --migrate-output Z` | +| Subcommand refactor | `lightspeed-stack migrate-config ...` (BREAKS existing invocations) | + +**Recommendation**: **flag on main entry point**. Parallels the existing +`--dump-configuration` / `--dump-schema` flags; zero breaking change to +existing invocations. Implemented in `src/lightspeed_stack.py` + a +companion `migrate_config_dumb()` function. Confidence: 90%. + +### Decision T6: Profile distribution + +How profiles (Option E layer) reach downstream teams: + +| Option | Details | +|---|---| +| Ship named profiles in `src/profiles/` | LCORE ships a pre-curated set; `profile: openai-remote` resolves | +| **Feature only, no shipped profiles** | `profile: <path>` is the only invocation; teams author their own; LCORE ships 1–2 reference examples under `examples/profiles/` | + +**Recommendation**: **feature only, no shipped profiles**. Avoids +profile-sprawl and the burden of keeping "blessed" profiles in sync with +downstream products. 1–2 reference examples in `examples/profiles/` are +documentation, not shipped runtime assets. Confidence: 85%. + +### Decision T7: The `baseline` field (added during PoC) + +During the PoC, strict lossless round-trip for the migration tool surfaced +a need: when `native_override` contains an entire run.yaml body, the default +baseline's keys still leak into the result via deep-merge. Fix: a +`baseline: "default" | "empty"` field. + +- `baseline: default` (default value) — start from LCORE's built-in baseline +- `baseline: empty` — start from `{}`. Used by the dumb migration tool so + round-trip is exact. + +**Recommendation**: **accept this field**. 
Alternatives (`inherit_defaults: +bool`, `starting_point: ...`) are cosmetic. Confidence: 80%. Reviewers: any +preference on naming before this ships? + +### Decision T8: Konflux / Tekton pipelines + +The `.tekton/` directory exists in this repo. If any Konflux/Tekton pipeline +templates or mounts `run.yaml` separately, unified mode needs that pipeline +to either (a) keep using legacy mode during the deprecation window, or +(b) mount the unified `lightspeed-stack.yaml` and drop the `run.yaml` mount. + +**Ask**: owner of `.tekton/` to confirm current pipeline shape and plan +migration. + +### Decision T9: Library client API (resolved by PoC) + +**Finding from PoC**: `AsyncLlamaStackAsLibraryClient` in `llama-stack` only +accepts a file-path string. It does not accept a dict. This means library +mode must write the synthesized config to disk — no dict-only shortcut +available. Not a decision; a fact to note in the spec doc. + +--- + +## Proposed JIRAs + +Each JIRA's agentic-tool instruction points to the spec doc +(`llama-stack-config-merge.md`), the permanent reference. + + + +### LCORE-???? Unified `llama_stack.config` schema + synthesizer + +**Description**: Implement the unified-mode config schema +(`UnifiedLlamaStackConfig`, `UnifiedInferenceSection`, +`UnifiedInferenceProvider`) and the synthesizer that produces a full Llama +Stack `run.yaml` from it. Wire library mode to the synthesizer. Preserve +legacy mode through mutual-exclusion validation. + +**Scope**: +- New Pydantic classes in `src/models/config.py`. +- New functions in `src/llama_stack_configuration.py`: + `synthesize_configuration`, `deep_merge_list_replace`, + `apply_high_level_inference`, `load_default_baseline`, `synthesize_to_file`. +- A shipped default baseline at `src/data/default_run.yaml`. +- Library-mode wiring in `src/client.py`: detect unified vs legacy, write + synthesized file, pass path to library client. 
+- Cross-field validation: reject both `config` and + `library_client_config_path` set simultaneously. +- Legacy behavior (`llama_stack.library_client_config_path` path) unchanged. + +**Acceptance criteria**: +- Unified `lightspeed-stack.yaml` (no external `run.yaml`) boots LCORE in + library mode and serves `/v1/query`. +- Legacy configs continue to work with no change. +- Mutual-exclusion error message fires cleanly when both forms are set. +- Unit tests for synthesizer, merge semantics, schema validation. + +**Agentic tool instruction**: +```text +Read the "Architecture" and "Implementation Suggestions" sections of +docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Key files to create or modify: + src/models/config.py (new classes; modify LlamaStackConfiguration) + src/llama_stack_configuration.py (synthesize_configuration + helpers) + src/data/default_run.yaml (new) + src/client.py (library-mode wiring) +To verify: run a unified-mode config end-to-end via `uv run lightspeed-stack -c <unified-config.yaml>` and confirm /v1/query succeeds. +``` + + + +### LCORE-???? Migration tool — dumb-mode lift-and-shift + +**Description**: Implement `--migrate-config` on the `lightspeed-stack` CLI +that produces a unified single-file config from an existing +(`run.yaml` + `lightspeed-stack.yaml`) pair. Dumb mode places the entire +`run.yaml` body under `llama_stack.config.native_override` with +`baseline: empty`, removes `library_client_config_path`. + +**Scope**: +- `migrate_config_dumb()` function in `src/llama_stack_configuration.py`. +- `--migrate-config`, `--run-yaml`, `--migrate-output` flags in + `src/lightspeed_stack.py`. +- Round-trip test: migrate → synthesize → byte-identical to original + `run.yaml`. + +**Acceptance criteria**: +- `lightspeed-stack --migrate-config --run-yaml X -c Y --migrate-output Z` + produces a unified config that boots LCORE in library mode to the same + Llama Stack behavior as the original pair. +- Round-trip unit test passes.
+- `--help` describes the flag clearly. + +**Agentic tool instruction**: +```text +Read "Migration tool" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Key files: src/lightspeed_stack.py, src/llama_stack_configuration.py, +tests/unit/test_llama_stack_synthesize.py. +To verify: migrate the repo's root run.yaml + lightspeed-stack.yaml, then +start LCORE with the output; confirm /v1/query works. +``` + + + +### LCORE-???? LS container entrypoint + deployment artifacts for unified mode + +**Description**: Update the Llama Stack container entrypoint and deployment +manifests so server mode works end-to-end from a unified +`lightspeed-stack.yaml`. Rebuild guidance for container images that bundle +the synthesizer script and default baseline. + +**Scope**: +- Update `scripts/llama-stack-entrypoint.sh` — the existing script already + defers to the Python CLI for auto-detection; document that behavior. +- Update `test.containerfile` to copy `src/data/` into the LS container so + `load_default_baseline()` resolves. +- Provide a unified-mode `docker-compose.yaml` (or update the existing one) + that mounts only `lightspeed-stack.yaml` into the LS container. +- Update `.tekton/` pipelines as needed (coordinate with pipeline owner, + see Decision T8). + +**Acceptance criteria**: +- `docker compose up` with a unified `lightspeed-stack.yaml` starts both + containers healthy; `/v1/query` works through LCORE → LS. +- Legacy docker-compose layout (with external `run.yaml` mount) still works. + +**Agentic tool instruction**: +```text +Read "Architecture → Server mode" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Key files: scripts/llama-stack-entrypoint.sh, test.containerfile, +docker-compose.yaml, .tekton/*.yaml. +To verify: docker compose up with the unified config; curl LCORE /v1/query. +``` + + + +### LCORE-???? 
Migrate in-repo e2e / integration test configurations + +**User story**: As a Lightspeed Core maintainer, I want the in-repo e2e and +integration tests to use the unified-mode config format, so that the +reference configuration shapes downstream teams see are the new ones. + +**Description**: Convert `tests/e2e/configs/run-*.yaml` and +`tests/e2e/configuration/**/lightspeed-stack*.yaml` into unified form +(or delete the `run-*.yaml` side and fold the content into the +corresponding `lightspeed-stack*.yaml`). Migrate `tests/e2e-prow/rhoai/` +configs similarly. + +**Scope**: +- Identify every test config that references `run.yaml`. +- Mechanically migrate using the migration tool (dumb mode). +- Re-run the full e2e suite and resolve any differences. + +**Acceptance criteria**: +- No in-repo test config references an external `run.yaml`. +- `uv run make test-e2e` passes. +- Existing test coverage is preserved (no tests deleted solely to make the + migration pass). + +**Agentic tool instruction**: +```text +Read "Migration paths" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Key files: tests/e2e/configs/, tests/e2e/configuration/, tests/e2e-prow/rhoai/. +To verify: `uv run make test-e2e` green. +``` + + + +### LCORE-???? Docs migration to unified mode as primary + +**User story**: As an operator reading Lightspeed Core docs, I want the +single-file unified configuration to be the primary way documented, with +legacy mode clearly marked as a deprecation path. + +**Description**: Update +`docs/deployment_guide.md`, `docs/byok_guide.md`, `docs/okp_guide.md`, +`docs/rag_guide.md`, `docs/providers.md`, `docs/config.md`, `README.md`, +`docs/local-stack-testing.md` to document unified mode as primary. Add a +migration section with the migration tool command. Clean up the stale +`create_argument_parser` docstring in `src/lightspeed_stack.py` that still +mentions the removed `-g/-i/-o` flags. + +**Scope**: +- Each doc file touched. 
+- A new migration section (step-by-step). +- Update the `create_argument_parser` docstring in + `src/lightspeed_stack.py`. + +**Acceptance criteria**: +- Every doc page that showed a two-file setup also shows the unified-mode + equivalent. +- Migration tool invocation documented with a worked example. +- `docs/openapi.md` / `docs/config.html` regenerated. + +**Agentic tool instruction**: +```text +Read "Deprecation timeline" and "Migration paths" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Key files: docs/*.md, docs/*.html, docs/*.json, README.md, src/lightspeed_stack.py docstring. +To verify: rendered docs present the unified mode first; legacy mode is visibly deprecated. +``` + + + +### LCORE-???? Reference profile examples and profile-path doc + +**Description**: Add `examples/profiles/` with two reference profile YAML +files — one remote-provider (OpenAI) and one inline-provider +(sentence-transformers + FAISS) — purely as reference material. Document how operators +write and reference their own profiles via +`llama_stack.config.profile: <path>`. + +**Scope**: +- `examples/profiles/openai-remote.yaml` +- `examples/profiles/inline-faiss.yaml` +- Docs section: how to author a profile, where to place it, how to + reference it from `lightspeed-stack.yaml`. + +**Acceptance criteria**: +- Both examples load cleanly via the synthesizer (sanity test). +- A docs section titled "Profiles" exists and has a worked example. + +**Agentic tool instruction**: +```text +Read "Profiles" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Key files to create: examples/profiles/*.yaml, a "Profiles" section in docs/config.md or docs/deployment_guide.md. +To verify: load the example via `uv run lightspeed-stack -c <config>` referencing the profile; confirm LS boots. +``` + + + +### LCORE-????
Deprecation warning for legacy mode + +**Description**: After the unified-mode feature lands (one release later), +emit a one-line startup WARN when `library_client_config_path` is set. Link +to the migration doc. Legacy mode continues to fully function. + +**Scope**: +- Warning emission point: on load in `LlamaStackConfiguration` + `check_llama_stack_model` validator, or at LCORE startup. +- Log line format includes a stable URL fragment to the migration doc. + +**Acceptance criteria**: +- Legacy configs still load and run. +- A single WARN line appears at startup when legacy fields are used. +- The warning is not emitted in unified mode. + +**Agentic tool instruction**: +```text +Read "Deprecation timeline" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Key files: src/models/config.py (or src/lightspeed_stack.py startup). +To verify: run LCORE with a legacy config; confirm WARN line; run with unified config; confirm no WARN. +``` + +--- + +## PoC results + +### What the PoC does + +The PoC is at Level 3' (per the spike howto): unified config works +end-to-end in library mode, with overrides and a profile. Server-mode +end-to-end validation was skipped — same synthesis code path, container +rebuild time was impractical. + +**Important**: The PoC diverges from the production design in these ways: + +- Uses `$TMPDIR` for the synthesized `run.yaml` instead of the persistent + known path recommended in Decision T4. +- No `--synthesized-config-output` CLI flag yet. +- Migration tool has only the "dumb" mode; "smart" factoring into + high-level keys is out of scope. +- No deprecation warning yet (that's its own JIRA). +- High-level inference's emitted `provider_id` uses the Literal value + directly (`sentence_transformers` with underscore), which differs from + the baseline's `sentence-transformers` (hyphen). 
Acceptable in the PoC + because the validation used `baseline: default` + a `native_override` + path, not high-level inference, to avoid this naming collision. Resolution + before production: align the emitted `provider_id` with the provider IDs + that already exist in common baselines (hyphenated form). + +### Results + +See [poc-evidence/library-mode/](poc-evidence/library-mode/) for the full +evidence bundle: + +- `lightspeed-stack-unified-library.yaml` — the unified-mode config used +- `synthesized-run.yaml` — what LCORE produced (3.7 KB) +- `query-response.json` — a real `/v1/query` round-trip + +Summary of validation: + +| Check | Evidence | +|---|---| +| Liveness 200 | `curl /liveness` → `{"alive":true}` | +| Readiness 200 | `curl /readiness` → `{"ready":true,"reason":"All providers are healthy","providers":[]}` | +| `/v1/query` works | `{"response":"The three primary colors are red, blue, and yellow.",...}` | +| Profile loaded | `profile: /.../tests/e2e/configs/run-ci.yaml` resolved | +| `native_override` took effect | `safety.default_shield_id: llama-guard` in synthesized output | +| No external `run.yaml` needed | No `library_client_config_path` in config | +| Secrets preserved as env refs | `api_key: ${env.OPENAI_API_KEY}` in synthesized file | +| Full unit suite | 2098 passed, 1 skipped, 0 failed | +| Round-trip lossless | `test_migrate_then_synthesize_reproduces_run_yaml` green | + +### Surprises discovered during PoC + +- **`AsyncLlamaStackAsLibraryClient` takes a file path, not a dict** (Decision + T9). The library client reads the file itself. Consequence: library mode + must write a synthesized file to disk. No dict-only shortcut. +- **`profile:` path resolution** uses the directory of the + `lightspeed-stack.yaml`. Relative paths work only when the profile is + co-located with the LCORE config. Absolute paths always work. Spec doc + recommends documenting this clearly. +- **Default baseline requires `EXTERNAL_PROVIDERS_DIR`**.
`src/data/default_run.yaml` + (copied from the repo's `run.yaml`) references `${env.EXTERNAL_PROVIDERS_DIR}` + without a default. Either ship a thinner default baseline, or change the + reference to `${env.EXTERNAL_PROVIDERS_DIR:=~/.llama/providers.d}`. Flagging + for the implementation JIRA. +- **High-level inference naming collision** (described above in "divergence + from production design"). + +--- + +## Background sections + +### Current architecture (before LCORE-836) + +Two files: + +- **`lightspeed-stack.yaml`** — LCORE settings: service host/port, auth, + conversation cache, user data collection, MCP servers, authentication, + authorization, quota, etc. Also contains `llama_stack:` with + connection-to-LS settings (URL/api_key or library-client mode with a path + to an external `run.yaml`). +- **`run.yaml`** — Llama Stack operational config: `apis`, `providers` + (inference, safety, tool_runtime, vector_io, agents, ...), `storage`, + `registered_resources`, `vector_stores`, `safety`. + +**Existing enrichment** (`src/llama_stack_configuration.py`): + +- LCORE already enriches an input `run.yaml` with dynamic values from + `lightspeed-stack.yaml`: Azure Entra ID tokens (side-effect to `.env`), + BYOK RAG entries, Solr/OKP provider/store/model registration. Output is + an enriched `run.yaml`. +- Called in two places: `scripts/llama-stack-entrypoint.sh` at LS container + boot (server mode) and `src/client.py:_enrich_library_config()` (library + mode). +- LCORE-779 made this automatic; LCORE-518 (closed spike) proved (re)generation + feasibility. Both are the groundwork the current spike builds on. + +The new synthesizer *subsumes* the enrichment: it builds the full run.yaml +(baseline + enrichment + high-level + native_override) rather than +incrementally enriching an existing one. 
+ +### Design alternatives considered + +Attributes (★ = high-weight for LCORE-836): + +| Attribute | A | B+C | C+E | E | G | +|---|---|---|---|---|---| +| ★ Operator UX | 2 | 4–5 | **4** | 5 | 3 | +| Abstraction cleanliness | 1 | 4 | 3 | 4 | 2 | +| LS schema resilience | 1 | 4 | 3 | 3 | 2 | +| ★ Escape-hatch power | 5 | 3 | 5 | 5 | 5 | +| Implementation cost | 4 | 2 | 2 | 3 | 3 | +| Maintenance load | 2 | 3 | 3 | 2 | 3 | +| ★ Backward compatibility | 3 | 3 | 3 | 3 | 4 | +| Validation rigor | 2 | 5 | 4 | 3 | 2 | +| ★ Dynamic-reconfig fit | 2 | 5 | 4 | 4 | 2 | +| ★ Library+server parity | 5 | 4 | 4 | 5 | 5 | +| Provider plurality | 5 | 4 | 5 | 4 | 5 | +| Testability | 3 | 4 | 3 | 5 | 3 | + +- **A (Embedded native)** — no abstraction win; same LS schema exposure as today. +- **B (High-level only)** — best UX when everything maps, painful at the edges. +- **C (B + `native_override`)** — recommended; combines B's UX with A's escape hatch. +- **E (Profiles, feature-only)** — optional layer on top of C. +- **G (Kustomize-style patches)** — strong for backward compat, weak on + validation and dynamic reconfig. + +### Merge semantics — worked examples + +Given the baseline: +```yaml +safety: + default_shield_id: llama-guard + excluded_categories: [violence, sexual_content] +providers: + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} +``` + +And `native_override`: +```yaml +safety: + excluded_categories: [spam] +``` + +**Deep-merge-with-list-replacement (chosen)** produces: +```yaml +safety: + default_shield_id: llama-guard # preserved (not in override) + excluded_categories: [spam] # list replaced +providers: # not in override — preserved + inference: + - provider_id: openai + ... +``` + +The recommendation's appeal: to keep `default_shield_id`, the user doesn't +have to restate it. To replace `excluded_categories`, the user provides the +new list — they don't need to know a patch syntax. 
+ +### Process-model recap (no LCORE supervision of LS) + +**Library mode**: LCORE process embeds the Llama Stack library client. LCORE +synthesizes `run.yaml` to a file, calls `AsyncLlamaStackAsLibraryClient(path)`, +initializes, serves. One process. + +**Server mode**: Llama Stack runs as a separate process (container). LCORE +connects to it over HTTP. Under unified mode, the LS container's entrypoint +reads the mounted `lightspeed-stack.yaml`, the Python CLI auto-detects +unified mode, synthesizes `run.yaml`, then `exec llama stack run` with it. +LCORE container reads the same `lightspeed-stack.yaml`, ignores the +`config` sub-block (server mode — only connection fields matter), connects. +Two processes. LCORE does **not** start, monitor, or supervise the LS +process — the orchestrator (docker-compose, systemd, k8s) does. Supervision +is out of scope for this spike (see Decision S4). + +### What must not break during rollout + +See [Backward compatibility scope](#backward-compatibility-scope). The four +must-not-break surfaces: + +1. Existing `lightspeed-stack.yaml` with `library_client_config_path`. +2. Existing `run.yaml` content, including fields LCORE doesn't model. +3. Existing CI/CD templating that treats `run.yaml` as a separate artifact. +4. Existing enrichment behavior (Azure Entra ID, BYOK RAG, Solr/OKP). 
+ +### Backward compatibility scope + +Detection rule at load time: + +| `lightspeed-stack.yaml` shape | Interpretation | +|---|---| +| `llama_stack.library_client_config_path` set, no `llama_stack.config` | **Legacy** — today's behavior | +| `llama_stack.config.*` present | **Unified** — new path | +| Both present | Error at load time — clear message | +| Neither (remote URL only, no config) | Existing remote mode — unchanged | + +Three migration paths operators can choose: + +| Path | Effort | Result | +|---|---|---| +| Do nothing | 0 | Legacy keeps working until deprecation window closes | +| Lift-and-shift (via migration tool) | seconds | Unified single file, zero semantic change | +| Re-express | hours+ | Unified single file, fully adopts the high-level schema | + +--- + +## Appendix A — Files changed in the PoC + +Relative to `upstream/main`: + +| File | Purpose | +|---|---| +| `src/models/config.py` | New classes: `UnifiedInferenceProvider`, `UnifiedInferenceSection`, `UnifiedLlamaStackConfig`; modified `LlamaStackConfiguration` (adds `config` field + mutual-exclusion validator) | +| `src/llama_stack_configuration.py` | New: `synthesize_configuration`, `deep_merge_list_replace`, `apply_high_level_inference`, `load_default_baseline`, `synthesize_to_file`, `migrate_config_dumb`. CLI `main()` auto-detects unified vs legacy. 
| +| `src/data/default_run.yaml` | Built-in default baseline (copied from repo root `run.yaml` for the PoC — implementation JIRA should slim it down; see PoC surprise about `EXTERNAL_PROVIDERS_DIR`) | +| `src/client.py` | Library-mode path picks synthesis for unified configs, enrichment for legacy | +| `src/lightspeed_stack.py` | `--migrate-config`, `--run-yaml`, `--migrate-output` flags | +| `scripts/llama-stack-entrypoint.sh` | Comment updated — script itself needs no change (Python CLI auto-detects) | +| `test.containerfile` | Copies `src/data/` into the LS container | +| `tests/unit/test_llama_stack_synthesize.py` | 22 new tests: merge semantics, high-level inference, synthesize pipeline, migration round-trip | +| `tests/unit/models/config/test_llama_stack_configuration.py` | 3 new tests: unified/legacy mutual exclusion | +| `tests/unit/models/config/test_dump_configuration.py` | 5 expected-dict updates (new `config: None` field appears in dumps) | +| `tests/unit/test_client.py` | Error-message regex updated | +| `docs/design/llama-stack-config-merge/` | Spike doc, spec doc, PoC evidence, proposed JIRAs | + +## Appendix B — Commands to reproduce the library-mode PoC + +```bash +# 1. Start LCORE in library mode with a unified config +export OPENAI_API_KEY=<your-openai-api-key> +export E2E_OPENAI_MODEL=gpt-4o-mini +mkdir -p /tmp/lcore-836-poc +uv run lightspeed-stack \ + -c docs/design/llama-stack-config-merge/poc-evidence/library-mode/lightspeed-stack-unified-library.yaml + +# 2. In another shell — query +curl -s http://localhost:8080/liveness +curl -s http://localhost:8080/readiness +curl -s -X POST http://localhost:8080/v1/query \ + -H 'Content-Type: application/json' \ + -d '{"query": "Name three primary colors. One sentence."}' + +# 3.
Inspect what was synthesized +cat /tmp/llama_stack_synthesized_config.yaml +``` diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge.md new file mode 100644 index 000000000..9847cc2fd --- /dev/null +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge.md @@ -0,0 +1,502 @@ +# Feature design: Llama Stack config merge (unified `lightspeed-stack.yaml`) + +| | | +|--------------------|----------------------------------------------------------------------------------| +| **Date** | 2026-04-23 | +| **Component** | Lightspeed Core Stack (src/models/config.py, src/llama_stack_configuration.py, src/client.py, src/lightspeed_stack.py, scripts/llama-stack-entrypoint.sh) | +| **Authors** | Maxim Svistunov | +| **Feature** | [LCORE-836](https://redhat.atlassian.net/browse/LCORE-836) | +| **Spike** | [llama-stack-config-merge-spike.md](llama-stack-config-merge-spike.md) | +| **Links** | LCORE-509 (Epic), LCORE-777 (Epic), LCORE-518 (prior spike, Closed), LCORE-779 (auto-regen, Closed) | + +## What + +This feature collapses the two Lightspeed Core configuration files — +`lightspeed-stack.yaml` (LCORE settings) and `run.yaml` (Llama Stack +operational config) — into a single `lightspeed-stack.yaml`. At runtime, +LCORE synthesizes a full Llama Stack `run.yaml` from a new +`llama_stack.config` sub-section and hands it to Llama Stack (library +client or subprocess, mode-dependent). + +Key shape: + +- High-level keys under `llama_stack.config` for the common path + (v1: `inference`; future: `storage`, `safety`, `tools`). +- `llama_stack.config.native_override` escape hatch — raw Llama Stack + schema, deep-merged with list replacement. Covers anything the + high-level schema doesn't express. +- `llama_stack.config.profile` — path to a user-authored YAML that serves + as the synthesis baseline. 
+- `llama_stack.config.baseline: default | empty` — pick between LCORE's + built-in baseline and an empty dict (used by the migration tool for + exact round-trip). +- Legacy two-file mode (`llama_stack.library_client_config_path` + + external `run.yaml`) is preserved during a deprecation window; + mutually exclusive with `llama_stack.config`. + +## Why + +Two-file configuration multiplies the surface area for misconfiguration +and forces every downstream Lightspeed team (RHOAI, Konflux pipelines, +any product integrating LCORE) to understand Llama Stack's full internal +schema. A single source of truth: + +- Reduces the number of artifacts deployment tooling must manage + (Helm values, ConfigMaps, Kustomize overlays). +- Lets downstream teams express their intent at a high level (e.g. "use + OpenAI with these allowed models") rather than authoring raw LS + provider entries. +- Preserves an escape hatch so edge cases don't block adoption. + +LCORE-518 (closed) proved a generation PoC in principle; LCORE-779 +(closed) made configuration regeneration automatic at startup. This +feature completes the picture by making `run.yaml` an implementation +detail that LCORE owns, not an operator-facing artifact. + +## Requirements + +- **R1:** `lightspeed-stack.yaml` with a `llama_stack.config` sub-section + and no external `run.yaml` boots LCORE in both library and server modes + and serves `/v1/query` successfully. +- **R2:** Legacy mode (`llama_stack.library_client_config_path` + + external `run.yaml`) works unchanged until the deprecation window + closes. A startup WARN is emitted one release after unified mode lands. +- **R3:** Setting both `llama_stack.config` and + `llama_stack.library_client_config_path` in the same file fails at + configuration load time with a clear error message pointing to the + migration tool. +- **R4:** `lightspeed-stack --migrate-config --run-yaml X -c Y + --migrate-output Z` produces a unified configuration from the legacy + two-file pair. 
Running the migrated file drives Llama Stack to + byte-identical behavior as the original pair (dumb-mode lossless + round-trip). +- **R5:** When `llama_stack.config.native_override` overlaps a key set + by the high-level section or by the baseline, deep-merge semantics + apply with list replacement (maps merge recursively; lists are + replaced wholesale; scalars are replaced). +- **R6:** Secrets are never resolved into the synthesized file on disk. + `${env.FOO}` references appear verbatim in the synthesized `run.yaml`. +- **R7:** Existing enrichment behavior (Azure Entra ID, BYOK RAG, + Solr/OKP) produces the same result in unified mode as in legacy mode + for equivalent inputs. +- **R8:** A profile referenced by a relative `profile:` path resolves + against the directory of the loaded `lightspeed-stack.yaml`. +- **R9:** The unified schema extends current `LlamaStackConfiguration` + pydantic model with a new `config: Optional[UnifiedLlamaStackConfig]` + field; validation enforces mutual exclusion with legacy mode and + rejects unknown fields (`extra="forbid"`). +- **R10:** The synthesized `run.yaml` is written to a persistent known + path (overwritten each boot), logged, and a CLI flag + `--synthesized-config-output` lets operators override the location for + debugging. +- **R11:** Shape detection determines mode (unified vs legacy); an + optional `config_format_version` field is accepted but must agree with + the shape when present. + +## Use Cases + +- **U1:** As an operator setting up LCORE for the first time, I want to + write one config file with high-level provider choices (OpenAI, Azure, + …) so that I don't have to learn Llama Stack's internal schema. +- **U2:** As a downstream team maintainer with an existing heavily + customized `run.yaml`, I want a mechanical one-shot migration so that + I can move to the unified format without re-expressing my edge cases. 
+- **U3:** As an operator whose deployment sits behind a vLLM serving + stack not covered by the high-level schema, I want to drop my custom + configuration into `native_override` and still benefit from the rest + of the unified schema. +- **U4:** As a Lightspeed Core maintainer, I want a single authoritative + place for docs, examples, and test configs so that downstream teams + find the same patterns everywhere. +- **U5:** As a Red Hat release manager, I want legacy configs to keep + working throughout a deprecation window so that downstream products + can migrate on their own cadence. + +## Architecture + +### Overview + +```text +lightspeed-stack.yaml (unified mode) + │ + ▼ + ┌────────────────────────────┐ + │ Configuration load │ Pydantic validation, mutual-exclusion + │ src/configuration.py │ check between `config` and + │ src/models/config.py │ `library_client_config_path`. + └────────────┬───────────────┘ + │ Configuration (typed) + ▼ + ┌────────────────────────────┐ Baseline selection (profile / + │ Synthesizer │ default / empty) + enrichment + │ synthesize_configuration │ (BYOK RAG, Solr/OKP) + high-level + │ (llama_stack_config…) │ sections + native_override deep-merge. + └────────────┬───────────────┘ + │ synthesized run.yaml (dict) + ▼ + Library mode Server mode + ──────────── ─────────── + Write to deterministic path. Written by LS container's entrypoint + AsyncLlamaStackAsLibraryClient script (same synthesizer, same CLI, + reads the path and initializes. auto-detects unified via Python). + `llama stack run ` starts LS. + LCORE connects by URL. +``` + +### Trigger mechanism + +At LCORE startup (library mode): if `llama_stack.config` is set in the +loaded `lightspeed-stack.yaml`, the synthesizer produces a `run.yaml` +dict, writes it to disk, and passes the path to the library client. 
+ +At Llama Stack container startup (server mode): the container's +entrypoint script invokes +`python3 /opt/app-root/llama_stack_configuration.py -c <lightspeed-stack.yaml> +-o /opt/app-root/run.yaml`. The Python CLI auto-detects unified vs legacy +by `llama_stack.config` presence; in unified mode it synthesizes and +writes the output; in legacy mode it performs in-place enrichment as +before. + +### Storage / data model changes + +No persistent storage is added. The synthesized `run.yaml` is written +once per boot to a deterministic path; not a database. +`src/data/default_run.yaml` is a new package-shipped file, the built-in baseline +Llama Stack configuration. + +### Configuration + +New sub-section under the existing `llama_stack` block: + +```yaml +llama_stack: + use_as_library_client: true + # NOTE: library_client_config_path intentionally OMITTED in unified mode. + # Setting both `config` and `library_client_config_path` is a validation error. + config: + # Baseline selection + baseline: default # default | empty; ignored if `profile` is set + profile: ./my-profile.yaml # optional; resolves relative to lightspeed-stack.yaml + + # High-level sections (v1: inference; future: storage, safety, tools, ...)
+ inference: + providers: + - type: openai # mapped to remote::openai + api_key_env: OPENAI_API_KEY + allowed_models: [gpt-4o-mini] + - type: sentence_transformers + + # Escape hatch — raw Llama Stack schema, deep-merged with list replacement + native_override: + safety: + excluded_categories: [spam] +``` + +Pydantic classes (see `src/models/config.py`): + +```python +class UnifiedInferenceProvider(ConfigurationBase): + type: Literal[ + "openai", "sentence_transformers", "azure", "vertexai", + "watsonx", "vllm_rhaiis", "vllm_rhel_ai", + ] + api_key_env: Optional[str] = None + allowed_models: Optional[list[str]] = None + extra: dict[str, Any] = Field(default_factory=dict) + + +class UnifiedInferenceSection(ConfigurationBase): + providers: list[UnifiedInferenceProvider] = Field(default_factory=list) + + +class UnifiedLlamaStackConfig(ConfigurationBase): + baseline: Literal["default", "empty"] = "default" + profile: Optional[str] = None + inference: Optional[UnifiedInferenceSection] = None + native_override: dict[str, Any] = Field(default_factory=dict) + + +class LlamaStackConfiguration(ConfigurationBase): + # existing fields unchanged (url, api_key, use_as_library_client, + # library_client_config_path, timeout) + config: Optional[UnifiedLlamaStackConfig] = None + + @model_validator(mode="after") + def check_llama_stack_model(self) -> Self: + if self.config is not None and self.library_client_config_path is not None: + raise ValueError("... mutually exclusive ... use --migrate-config") + # ...legacy checks preserved... + return self +``` + +### API changes + +None at the REST API surface. Internal API additions in +`src/llama_stack_configuration.py`: + +- `synthesize_configuration(lcs_config, config_file_dir, default_baseline) + -> dict` — the synthesis pipeline. +- `synthesize_to_file(lcs_config, output_file, config_file_dir) -> None` — + synthesis + write. 
+- `migrate_config_dumb(run_yaml_path, lightspeed_yaml_path, output_path) + -> None` — dumb-mode migration (lossless round-trip). +- `deep_merge_list_replace(base, overlay) -> dict` — merge helper. +- `apply_high_level_inference(ls_config, inference)` — high-level expansion. +- `load_default_baseline() -> dict` — loads `src/data/default_run.yaml`. + +CLI additions in `src/lightspeed_stack.py`: + +- `--migrate-config` — invoke the migration tool. +- `--run-yaml <path>` — input for `--migrate-config`. +- `--migrate-output <path>` — output for `--migrate-config`. +- (recommended for R10) `--synthesized-config-output <path>` — override + the default deterministic synthesis location. + +The legacy CLI docstring in `create_argument_parser()` referencing the +removed `-g/-i/-o` flags is cleaned up as part of the docs JIRA. + +### Error handling + +- **Unified + legacy set simultaneously**: raised during + `LlamaStackConfiguration.check_llama_stack_model`. Error message + directs to `--migrate-config`. +- **Library mode with neither `config` nor `library_client_config_path`**: + raised during the same validator. Error identifies the two valid paths. +- **`profile:` path does not exist**: surfaced as `FileNotFoundError` + from `open(profile_path)` during synthesis. The implementation JIRA + should wrap this with context about where the path was resolved. +- **Unknown provider `type` in high-level inference**: rejected by the + Pydantic `Literal` — operator sees a validation error naming the + allowed types. Escape: use `native_override`. +- **Unknown fields in any unified-mode section**: rejected by + `extra="forbid"` on `ConfigurationBase`. +- **Llama Stack rejects the synthesized `run.yaml`**: surfaces as + whatever LS itself raises (ValidationError from LS's own config + parsing). The implementation JIRA should log the synthesized file path + before handing to LS so operators can inspect what failed.
+ +### Security considerations + +- **No secrets written to disk**: `apply_high_level_inference` emits + `${env.<VAR_NAME>}` references, never the resolved secret. The synthesized + `run.yaml` is safe to log path-wise; its contents only contain env + references for secrets. +- **`native_override` is raw YAML**: content is operator-controlled, so + no new injection surface — same trust model as the existing + `run.yaml`. LCORE does no template expansion other than the existing + `replace_env_vars()` step in the load pipeline. +- **Synthesized file location**: persistent known path, world-readable + by default in a container. This is acceptable because the file + contains only env-var references for secrets; operators who want + stricter filesystem permissions should tighten the mount. + +### Migration / backwards compatibility + +Coexistence mechanism: shape detection (see R11). Legacy configs with +`llama_stack.library_client_config_path` continue through the +configured deprecation window. + +Three operator-facing migration paths (choose per deployment): + +| Path | Effort | Result | +|---|---|---| +| Do nothing | 0 | Legacy keeps working until deprecation closes | +| Lift-and-shift | seconds — `lightspeed-stack --migrate-config ...` | Single-file, byte-equivalent LS behavior | +| Re-express | hours+ | Single-file; high-level sections replace `native_override` | + +Deprecation schedule: calendar-based (per Decision S2 in the spike); +concrete numbers set by @sbunciak at release time. Default recommended +shape: unified mode ships as opt-in at release N; legacy-mode WARN +begins one release later; legacy-mode removal no sooner than 6 months +after WARN begins. + +## Implementation Suggestions + +### Key files and insertion points + +| File | What to do | +|---|---| +| `src/models/config.py` | Add `UnifiedInferenceProvider`, `UnifiedInferenceSection`, `UnifiedLlamaStackConfig`.
Modify `LlamaStackConfiguration` — add `config` field, extend the `model_validator` for mutual-exclusion check. | +| `src/llama_stack_configuration.py` | Add `synthesize_configuration`, `deep_merge_list_replace`, `apply_high_level_inference`, `load_default_baseline`, `synthesize_to_file`, `migrate_config_dumb`, `PROVIDER_TYPE_MAP`, `DEFAULT_BASELINE_RESOURCE`. Update `main()` to auto-detect unified vs legacy. | +| `src/data/default_run.yaml` | New file — a thinner baseline than today's repo-root `run.yaml`. Notably do **not** reference `${env.EXTERNAL_PROVIDERS_DIR}` without a default (see PoC surprise in the spike doc). | +| `src/client.py` | In `_load_library_client`: branch on `config.config` presence. Add `_synthesize_library_config()` that calls the synthesizer and writes to the deterministic path (R10). Keep `_enrich_library_config` for legacy. | +| `src/lightspeed_stack.py` | Add `--migrate-config`, `--run-yaml`, `--migrate-output`, `--synthesized-config-output` flags. Add an early-exit branch in `main()` that dispatches to `migrate_config_dumb` when `--migrate-config` is set. Clean up stale docstring. | +| `scripts/llama-stack-entrypoint.sh` | No functional change — the Python CLI already auto-detects. Update the comment to document both modes. | +| `test.containerfile` | Copy `src/data/` into `/opt/app-root/data/` so `load_default_baseline()` resolves inside the LS container. | +| `docker-compose.yaml` | Provide a unified-mode variant (either a new compose file or env-var-switched mount list). Legacy compose continues to work. | + +### Insertion point detail + +**`synthesize_configuration` pipeline** (the core new function): + +1. Retrieve `unified = lcs_config["llama_stack"]["config"]` — raise if absent. +2. Baseline: if `unified.profile` set → load that file. Else if + `unified.baseline == "empty"` → `{}`. Else → `default_baseline` arg or + `load_default_baseline()`. +3. Run `dedupe_providers_vector_io` on the baseline. +4. 
Apply existing enrichment: `enrich_byok_rag`, `enrich_solr` (Azure + Entra ID intentionally stays separate because it's a `.env` + side-effect, not an `ls_config` mutation). +5. If `unified.inference` present → `apply_high_level_inference`. +6. If `unified.native_override` non-empty → + `deep_merge_list_replace(ls_config, native_override)`. +7. `dedupe_providers_vector_io` again for good measure. +8. Return the final dict. + +**`_load_library_client` fork point** (in `src/client.py`): + +```python +if config.config is not None: + self._config_path = self._synthesize_library_config() +elif config.library_client_config_path is not None: + self._config_path = self._enrich_library_config(config.library_client_config_path) +else: + raise ValueError(...) # caught by the validator at load time; belt-and-suspenders here +``` + +### Config pattern + +All new config classes extend `ConfigurationBase` (`extra="forbid"`). +Use `Field()` with defaults, title, and description for every attribute. +Cross-field validation in `UnifiedLlamaStackConfig` is not currently +needed — the precedence is strictly ordered and handled by the +synthesizer, not by the model. + +Example config files live in `examples/profiles/` (two reference +profiles — one remote-provider, one inline-provider) and in +`examples/lightspeed-stack-unified.yaml` as the canonical "unified mode" +reference. + +### Test patterns + +- Framework: pytest + pytest-mock. Unit tests live in + `tests/unit/test_llama_stack_synthesize.py` (synthesizer + migration) + and `tests/unit/models/config/test_llama_stack_configuration.py` + (schema validation). +- Merge semantics: parametric tests over scalar / map / list / + type-mismatch / precedence cases. +- Round-trip test: migrate → synthesize → assert dict equality with the + original `run.yaml`. Pattern already live in + `test_migrate_then_synthesize_reproduces_run_yaml`. 
+- Schema validation tests: mutual exclusion, remote URL + config, + library mode + config without legacy path. +- Feature-specific: provider_type map completeness test asserts every + `Literal` value on `UnifiedInferenceProvider.type` has a + `PROVIDER_TYPE_MAP` entry. +- e2e behave tests: migrate `tests/e2e/configuration/**` configs to + unified form as part of LCORE-???? (test migration JIRA). + +## Open Questions for Future Work + +- **Smart migration mode** (`--migrate-config --smart`): factoring an + existing `run.yaml` into high-level sections rather than dumping to + `native_override`. Valuable ergonomic win; deferred because the + factoring rules require careful design per provider type. +- **Additional high-level sections** beyond `inference` — `storage`, + `safety`, `tools`, `vector_stores`, etc. Add as real demand appears, + not speculatively. +- **User-supplied profile directory**: `profile_dir: /etc/lcore/profiles/` + with name-based lookup. Deferred to v2. +- **LS process supervision** (restart on crash, signal propagation, + merged logs) — covered by LCORE-777 / LCORE-778, not this feature. +- **Dynamic reconfig / hot-reload** (live `POST /v1/rag` that adds a BYOK + RAG without restart) — covered by LCORE-781, not this feature. Llama + Stack's lack of native hot-reload means any implementation requires + supervised restart, which is out of scope here. +- **`config_format_version`** as an explicit schema version, accepted + but not required. Will become load-bearing the first time the unified + schema undergoes a real breaking change. +- **Validation pre-flight against the Llama Stack schema**: today LCORE + only validates its own schema; LS validates its own at startup. + Introducing a pre-flight validator would catch bad synthesis earlier + but creates a heavy dependency on LS internals. 
+ +## Changelog + +| Date | Change | Reason | +|---|---|---| +| 2026-04-23 | Initial version | Spike completion | + +## Appendix A — Worked example: legacy → unified migration + +Given legacy: + +```yaml +# run.yaml +version: 2 +apis: [agents, inference, vector_io, ...] +providers: + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} + allowed_models: ["${env.E2E_OPENAI_MODEL:=gpt-4o-mini}"] +# ... more ... +``` + +```yaml +# lightspeed-stack.yaml +name: LCS +llama_stack: + use_as_library_client: true + library_client_config_path: ./run.yaml +# ... rest ... +``` + +Run: + +```bash +lightspeed-stack --migrate-config \ + --run-yaml run.yaml \ + -c lightspeed-stack.yaml \ + --migrate-output lightspeed-stack-unified.yaml +``` + +Produces: + +```yaml +# lightspeed-stack-unified.yaml +name: LCS +llama_stack: + use_as_library_client: true + # library_client_config_path is REMOVED + config: + baseline: empty + native_override: + version: 2 + apis: [agents, inference, vector_io, ...] + providers: + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} + allowed_models: ["${env.E2E_OPENAI_MODEL:=gpt-4o-mini}"] + # ... rest of run.yaml content under native_override ... +# ... rest of lightspeed-stack.yaml content ... +``` + +Operator uses the unified file directly and can delete the original +`run.yaml`. Subsequent re-expression (moving from `native_override` into +high-level sections) is optional and per-deployment. + +## Appendix B — Reference profile example + +```yaml +# examples/profiles/openai-remote.yaml +# A minimal profile for an OpenAI-backed remote Llama Stack. +# Referenced via `llama_stack.config.profile: examples/profiles/openai-remote.yaml`. 
+version: 2 +apis: [agents, inference, safety, tool_runtime, vector_io] +providers: + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} + allowed_models: ["${env.OPENAI_MODEL:=gpt-4o-mini}"] + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers +# ... the rest is the same shape as a working run.yaml ... +``` diff --git a/docs/design/llama-stack-config-merge/poc-evidence/library-mode/README.md b/docs/design/llama-stack-config-merge/poc-evidence/library-mode/README.md new file mode 100644 index 000000000..6bfa7f5e9 --- /dev/null +++ b/docs/design/llama-stack-config-merge/poc-evidence/library-mode/README.md @@ -0,0 +1,26 @@ +# Library-mode PoC evidence + +Command: +```bash +export OPENAI_API_KEY= +export E2E_OPENAI_MODEL=gpt-4o-mini +uv run lightspeed-stack -c docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml +``` + +## What the unified config does + +- `llama_stack.config.profile: /abs/path/to/tests/e2e/configs/run-ci.yaml` — baseline loaded from the CI profile +- `llama_stack.config.native_override.safety.default_shield_id: llama-guard` — override proves merge works + +## Evidence + +- `synthesized-run.yaml` — the full run.yaml LCORE produced from the unified config +- `query-response.json` — a successful `/v1/query` round-trip + +## Proves + +- `llama_stack.library_client_config_path` was NOT used (no external run.yaml needed) +- `llama_stack.config.profile` was used as the synthesis baseline (path resolution works with absolute paths) +- `llama_stack.config.native_override` was merged onto the baseline +- `AsyncLlamaStackAsLibraryClient` accepts the synthesized file path (answered item #24: file-only, not dict) +- `/v1/query` succeeded end-to-end through the synthesized stack diff --git a/docs/design/llama-stack-config-merge/poc-evidence/library-mode/query-response.json 
b/docs/design/llama-stack-config-merge/poc-evidence/library-mode/query-response.json new file mode 100644 index 000000000..5664cbd00 --- /dev/null +++ b/docs/design/llama-stack-config-merge/poc-evidence/library-mode/query-response.json @@ -0,0 +1 @@ +{"conversation_id":"976ef32527283085ba2f1d0cfb4c16d97071bf64391a8200","response":"The three primary colors are red, blue, and yellow.","rag_chunks":[],"referenced_documents":[],"truncated":false,"input_tokens":24,"output_tokens":12,"available_quotas":{},"tool_calls":[],"tool_results":[]} \ No newline at end of file diff --git a/docs/design/llama-stack-config-merge/poc-evidence/library-mode/synthesized-run.yaml b/docs/design/llama-stack-config-merge/poc-evidence/library-mode/synthesized-run.yaml new file mode 100644 index 000000000..34e3e1fc9 --- /dev/null +++ b/docs/design/llama-stack-config-merge/poc-evidence/library-mode/synthesized-run.yaml @@ -0,0 +1,148 @@ +apis: + - agents + - batches + - datasetio + - eval + - files + - inference + - safety + - scoring + - tool_runtime + - vector_io +benchmarks: [] +datasets: [] +image_name: starter +providers: + agents: + - config: + persistence: + agent_state: + backend: kv_default + namespace: agents_state + responses: + backend: sql_default + table_name: agents_responses + provider_id: meta-reference + provider_type: inline::meta-reference + batches: + - config: + kvstore: + backend: kv_default + namespace: batches_store + provider_id: reference + provider_type: inline::reference + datasetio: + - config: + kvstore: + backend: kv_default + namespace: huggingface_datasetio + provider_id: huggingface + provider_type: remote::huggingface + - config: + kvstore: + backend: kv_default + namespace: localfs_datasetio + provider_id: localfs + provider_type: inline::localfs + eval: + - config: + kvstore: + backend: kv_default + namespace: eval_store + provider_id: meta-reference + provider_type: inline::meta-reference + files: + - config: + metadata_store: + backend: sql_default + 
table_name: files_metadata + storage_dir: ~/.llama/storage/files + provider_id: meta-reference-files + provider_type: inline::localfs + inference: + - config: + allowed_models: + - ${env.E2E_OPENAI_MODEL:=gpt-4o-mini} + api_key: ${env.OPENAI_API_KEY} + provider_id: openai + provider_type: remote::openai + - config: {} + provider_id: sentence-transformers + provider_type: inline::sentence-transformers + safety: + - config: + excluded_categories: [] + provider_id: llama-guard + provider_type: inline::llama-guard + scoring: + - config: {} + provider_id: basic + provider_type: inline::basic + - config: {} + provider_id: llm-as-judge + provider_type: inline::llm-as-judge + - config: + openai_api_key: '********' + provider_id: braintrust + provider_type: inline::braintrust + tool_runtime: + - config: {} + provider_id: rag-runtime + provider_type: inline::rag-runtime + - config: {} + provider_id: model-context-protocol + provider_type: remote::model-context-protocol + vector_io: [] +registered_resources: + benchmarks: [] + datasets: [] + models: + - metadata: + embedding_dimension: 768 + model_id: all-mpnet-base-v2 + model_type: embedding + provider_id: sentence-transformers + provider_model_id: all-mpnet-base-v2 + scoring_fns: [] + shields: + - provider_id: llama-guard + provider_shield_id: openai/gpt-4o-mini + shield_id: llama-guard + tool_groups: + - provider_id: rag-runtime + toolgroup_id: builtin::rag + vector_stores: [] +safety: + default_shield_id: llama-guard +scoring_fns: [] +server: + port: 8321 +storage: + backends: + kv_default: + db_path: ${env.KV_STORE_PATH:=~/.llama/storage/kv_store.db} + type: kv_sqlite + sql_default: + db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db} + type: sql_sqlite + stores: + conversations: + backend: sql_default + table_name: openai_conversations + inference: + backend: sql_default + max_write_queue_size: 10000 + num_writers: 4 + table_name: inference_store + metadata: + backend: kv_default + namespace: registry + 
prompts: + backend: kv_default + namespace: prompts +vector_stores: + default_embedding_model: + model_id: all-mpnet-base-v2 + provider_id: sentence-transformers + default_provider_id: faiss +version: 2 diff --git a/docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml b/docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml new file mode 100644 index 000000000..a75ad5bf6 --- /dev/null +++ b/docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml @@ -0,0 +1,33 @@ +name: Lightspeed Core Service (LCS) - Unified PoC +service: + host: 0.0.0.0 + port: 8080 + base_url: http://localhost:8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +# Unified mode: no `library_client_config_path`. Operational LS config is +# synthesized by LCORE from `llama_stack.config` below. +llama_stack: + use_as_library_client: true + config: + # Use the CI-friendly baseline via `profile` (no EXTERNAL_PROVIDERS_DIR + # env var required). Equivalent to what tests/e2e/configs/run-ci.yaml + # provides; this exercises the `profile:` path of the synthesizer. + profile: /home/msvistun/repos/lightspeed/stack/tests/e2e/configs/run-ci.yaml + # Small native_override: prove overrides take effect end-to-end. 
+ native_override: + safety: + default_shield_id: llama-guard +user_data_collection: + feedback_enabled: false + feedback_storage: "/tmp/lcore-836-poc/feedback" + transcripts_enabled: false + transcripts_storage: "/tmp/lcore-836-poc/transcripts" +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/lcore-836-poc/conversation-cache.db" +authentication: + module: "noop" From 6e0a95bc5a931845950d61275a277322cc3607a4 Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Thu, 23 Apr 2026 15:29:47 +0200 Subject: [PATCH 3/9] LCORE-836 spike: add e2e-kickoff JIRA (feature files first, no implementation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Incorporates reviewer request: the work on this feature kicks off with a Story that authors the behave `.feature` files for unified mode BEFORE the feature is implemented. The intent is to keep test-shape authorship free of implementation bias and to surface any architectural gaps early. Adds two JIRAs to the spike doc's proposed-JIRAs list, bringing the total from 7 to 9: 1. LCORE-???? (Story, inserted first) — E2E feature files for unified mode (no step implementation). Authors Gherkin scenarios against the spec doc's R1..R11 requirements. Explicitly forbids reading the implementation JIRAs or the synthesizer code while authoring. behave marks resulting steps as undefined; test-e2e still green (undefined scenarios are reported, not failed). 2. LCORE-???? (Task, inserted after the migrate-e2e-configs Story) — Implement behave step definitions for the kickoff feature files. Takes the Gherkin as-is (does not water down the tests to fit implementation). Blocked by the kickoff ticket plus the feature- implementation tickets (schema + synthesizer, migration tool, LS container entrypoint). 
Filing both tickets together (rather than filing only the kickoff and "letting the step-def ticket appear later") makes the dependency chain explicit from the start and ensures the step-def work is not forgotten. No other JIRAs change scope. The PR template is updated to reflect the new count and to widen the "Full JIRA list" link range to cover both new sections. --- .../llama-stack-config-merge-spike.md | 125 +++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md index c8db06ff4..83451b4ff 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md @@ -240,7 +240,80 @@ available. Not a decision; a fact to note in the spec doc. ## Proposed JIRAs Each JIRA's agentic-tool instruction points to the spec doc -(`llama-stack-config-merge.md`), the permanent reference. +(`llama-stack-config-merge.md`), the permanent reference. The first JIRA +(authoring e2e feature files) is the intentional kickoff — it happens +before feature implementation so the test shape is not influenced by +implementation choices. + + + +### LCORE-???? E2E feature files for unified mode (no step implementation) + +**User story**: As a Lightspeed Core e2e engineer, I want the behave +feature files for unified-mode scenarios written before the feature +implementation lands, so that the test shape reflects the feature's +intended behavior rather than the chosen implementation, and any +architectural gaps surface early. + +**Description**: Author behave `.feature` files under `tests/e2e/features/` +that describe the behaviors required of unified mode. Step definitions +(Python glue) are explicitly **not** part of this ticket — they are +covered by a later sibling ticket (LCORE-???? — Implement step +definitions). 
The feature files can be submitted for review and land +before implementation of the feature itself begins. + +**Scope**: +- `.feature` files covering, at minimum, these R1–R11 surfaces from the + spec doc: + - Boot LCORE with unified `lightspeed-stack.yaml` (no external + `run.yaml`); `/liveness`, `/readiness`, and `/v1/query` succeed. + - Boot LCORE with legacy config + (`library_client_config_path` + external `run.yaml`); same result. + - Setting both `llama_stack.config` and + `llama_stack.library_client_config_path` fails at config-load time + with a clear error that mentions `--migrate-config`. + - Migration tool: `lightspeed-stack --migrate-config ...` produces a + unified file that drives equivalent Llama Stack behavior. + - `native_override` deep-merges onto the baseline with list + replacement (tested on a scalar key and a list key). + - `profile:` path (absolute and relative-to-config-dir) loads the + referenced baseline. + - Secrets appear as `${env.FOO}` references in the synthesized + `run.yaml` on disk; never resolved to raw values. + - Legacy mode emits a one-line deprecation WARN at startup; unified + mode does not. +- Additions to `tests/e2e/test_list.txt` so behave discovers the new + files. +- Gherkin scenarios authored from the spec doc (`R1..R11`) only; author + must avoid reading the implementation JIRAs' scope sections while + drafting scenarios. + +**Acceptance criteria**: +- behave parses every new `.feature` file without syntax errors. +- behave marks all new scenario steps as `undefined` (step definitions + land in LCORE-????). +- `uv run make test-e2e` remains green (new scenarios are skipped or + reported undefined, not failing). +- Any ambiguity or architectural tension uncovered while authoring is + captured either as a comment in the spec doc or as a new sub-JIRA. + +**Blocks**: LCORE-???? (Implement behave step definitions for unified +mode). 
+ +**Agentic tool instruction**: +```text +Read "Requirements" (R1..R11) and "Use Cases" in +docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Do NOT read the other JIRAs' scope sections or the synthesizer/schema +implementation code while authoring; the point of this ticket is to +produce feature files uncontaminated by implementation detail. +Key files to create: tests/e2e/features/unified-mode-*.feature plus +additions to tests/e2e/test_list.txt. Do NOT create step definitions in +tests/e2e/features/steps/. +To verify: `uv run behave --dry-run tests/e2e/features/unified-mode-*.feature` +parses successfully; `uv run make test-e2e` still green with the new +scenarios reported as undefined. +``` @@ -380,6 +453,56 @@ Key files: tests/e2e/configs/, tests/e2e/configuration/, tests/e2e-prow/rhoai/. To verify: `uv run make test-e2e` green. ``` + + +### LCORE-???? Implement behave step definitions for unified-mode feature files + +**Description**: Implement the Python step definitions +(`@given`/`@when`/`@then` functions) under `tests/e2e/features/steps/` +for the `.feature` files authored in LCORE-???? (E2E feature files +kickoff). After this ticket lands, the scenarios transition from +`undefined` to fully executing. + +The feature files are taken as-is — do not modify the Gherkin to make +implementation easier. If a scenario cannot be implemented faithfully, +raise it against the spec doc (and possibly back to LCORE-???? kickoff) +rather than quietly weakening the test. + +**Scope**: +- Step definitions for every step pattern in the new `.feature` files. +- Fixtures or helpers under `tests/e2e/features/steps/` as needed + (e.g., temp-dir config authoring, subprocess start/stop for LCORE, + HTTP client helpers reusing existing `tests/e2e/` patterns). +- CI wiring so the new scenarios run as part of `uv run make test-e2e`. + +**Acceptance criteria**: +- behave reports zero `undefined` steps across the new `.feature` + files. 
+- `uv run make test-e2e` runs the new scenarios and they pass. +- No Gherkin edit was made to accommodate implementation constraints + (or if any edit was made, it is documented in a PR comment with + explicit rationale). + +**Blocked by**: +- LCORE-???? (E2E feature files for unified mode — the `.feature` + files being implemented against). +- LCORE-???? (Unified schema + synthesizer), LCORE-???? + (Migration tool), LCORE-???? (LS container entrypoint + deployment) + — the feature under test must exist. + +**Agentic tool instruction**: +```text +Read "Architecture" and "Requirements" in +docs/design/llama-stack-config-merge/llama-stack-config-merge.md. +Key files to create: tests/e2e/features/steps/unified-mode*.py (or +extend existing step-definition modules if patterns reuse cleanly). +Do not modify tests/e2e/features/unified-mode-*.feature — take the +Gherkin as-is. If a scenario genuinely cannot be implemented faithfully, +file a sub-ticket rather than changing the Gherkin quietly. +To verify: `uv run make test-e2e` runs every new scenario green and +behave reports zero undefined steps. +``` + ### LCORE-???? Docs migration to unified mode as primary From 0a32029950d6493b40dcf03f8c641eaf73fdbc98 Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Tue, 5 May 2026 14:55:52 +0200 Subject: [PATCH 4/9] =?UTF-8?q?LCORE-836=20spike:=20rename=20poc-evidence/?= =?UTF-8?q?=20=E2=86=92=20poc-results/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per docs/contributing/howto-organize-poc-output.md (line 9), the convention is `poc-results/`. The LCORE-836 PoC bundle was written under `poc-evidence/` by mistake; rename to match the documented convention. Also updates references in the spike doc and the PoC README. 
--- .../llama-stack-config-merge-spike.md | 4 ++-- .../{poc-evidence => poc-results}/library-mode/README.md | 2 +- .../library-mode/query-response.json | 0 .../library-mode/synthesized-run.yaml | 0 .../lightspeed-stack-unified-library.yaml | 0 5 files changed, 3 insertions(+), 3 deletions(-) rename docs/design/llama-stack-config-merge/{poc-evidence => poc-results}/library-mode/README.md (95%) rename docs/design/llama-stack-config-merge/{poc-evidence => poc-results}/library-mode/query-response.json (100%) rename docs/design/llama-stack-config-merge/{poc-evidence => poc-results}/library-mode/synthesized-run.yaml (100%) rename docs/design/llama-stack-config-merge/{poc-evidence => poc-results}/lightspeed-stack-unified-library.yaml (100%) diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md index 83451b4ff..b6cd66715 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md @@ -619,7 +619,7 @@ rebuild time was impractical. ### Results -See [poc-evidence/library-mode/](poc-evidence/library-mode/) for the full +See [poc-results/library-mode/](poc-results/library-mode/) for the full evidence bundle: - `lightspeed-stack-unified-library.yaml` — the unified-mode config used @@ -826,7 +826,7 @@ export OPENAI_API_KEY= export E2E_OPENAI_MODEL=gpt-4o-mini mkdir -p /tmp/lcore-836-poc uv run lightspeed-stack \ - -c docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml + -c docs/design/llama-stack-config-merge/poc-results/lightspeed-stack-unified-library.yaml # 2. 
In another shell — query curl -s http://localhost:8080/liveness diff --git a/docs/design/llama-stack-config-merge/poc-evidence/library-mode/README.md b/docs/design/llama-stack-config-merge/poc-results/library-mode/README.md similarity index 95% rename from docs/design/llama-stack-config-merge/poc-evidence/library-mode/README.md rename to docs/design/llama-stack-config-merge/poc-results/library-mode/README.md index 6bfa7f5e9..c099c2041 100644 --- a/docs/design/llama-stack-config-merge/poc-evidence/library-mode/README.md +++ b/docs/design/llama-stack-config-merge/poc-results/library-mode/README.md @@ -4,7 +4,7 @@ Command: ```bash export OPENAI_API_KEY= export E2E_OPENAI_MODEL=gpt-4o-mini -uv run lightspeed-stack -c docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml +uv run lightspeed-stack -c docs/design/llama-stack-config-merge/poc-results/lightspeed-stack-unified-library.yaml ``` ## What the unified config does diff --git a/docs/design/llama-stack-config-merge/poc-evidence/library-mode/query-response.json b/docs/design/llama-stack-config-merge/poc-results/library-mode/query-response.json similarity index 100% rename from docs/design/llama-stack-config-merge/poc-evidence/library-mode/query-response.json rename to docs/design/llama-stack-config-merge/poc-results/library-mode/query-response.json diff --git a/docs/design/llama-stack-config-merge/poc-evidence/library-mode/synthesized-run.yaml b/docs/design/llama-stack-config-merge/poc-results/library-mode/synthesized-run.yaml similarity index 100% rename from docs/design/llama-stack-config-merge/poc-evidence/library-mode/synthesized-run.yaml rename to docs/design/llama-stack-config-merge/poc-results/library-mode/synthesized-run.yaml diff --git a/docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml b/docs/design/llama-stack-config-merge/poc-results/lightspeed-stack-unified-library.yaml similarity index 100% rename from 
docs/design/llama-stack-config-merge/poc-evidence/lightspeed-stack-unified-library.yaml rename to docs/design/llama-stack-config-merge/poc-results/lightspeed-stack-unified-library.yaml From 579c28d279a2b7fa158fffb6684884266d072305 Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Tue, 19 May 2026 17:04:44 +0200 Subject: [PATCH 5/9] LCORE-836 spike: clarity revisions to spike and spec docs Revisions arising from a re-read of the spike doc. Spike doc: - Add a "Design options A-E" explainer section near the top so readers encounter the option short names + one-line summaries before any decision references them. Drop options D and F (previously dropped without scoring) entirely; renumber the survivors so letters are consecutive: old E (Profiles) -> new D, old G (Kustomize patches) -> new E. References throughout the doc updated accordingly. - Restructure Decision S1: one standalone option per row in the choice table, no more "B+C" / "C+E" composite labels. Intro prose reduced to a link to the scoring section; recommendation spells out C (base shape) + D (optional profile layer). - Replace Decision S2 with a single Q3/Q4 deprecation recommendation. - Simplify Decision S3 to the "anything apart from Konflux?" framing. - Decision T6: refer to profiles as "Option D layer" (was E). - Decision T7: define "round-trip" in plain English on first use; recommend `default` as the field's default value; remove the reviewer-naming question. - Decision T8: drop the Tekton-pipelines framing, Konflux only; name @radofuchs explicitly as the owner. - Delete Decision T9 entirely (it was informational, not a decision). Its content is now an expanded bullet under "Findings discovered during PoC" covering the underlying library-client API constraint, three concrete implementation consequences, and a closing note on why a dict-only path isn't worth pursuing. - PoC results section: - Drop the "Level 3'" reference (the other levels are never defined and the term carries no value here). 
- Rephrase the smart-migration bullet as deferred future work pointing at the spec doc's Open Questions. - Rewrite the provider_id naming-collision text in plain English. - Fix the evidence link block: was pointing at poc-evidence/library-mode/ (wrong directory after the earlier rename) and listed lightspeed-stack-unified-library.yaml as if it lived inside that subdir, whereas it actually sits one level up. Each file now linked at its real location. - Rename "Surprise discovered during PoC" -> "Findings discovered during PoC". - Remove the stale "(Decision T9)" attribution. - Rename "Current architecture (before LCORE-836)" -> "Current architecture" (the parenthetical added no information). - "Design alternatives considered": add intro paragraph that explicitly frames the table as scoring; expand each of the 12 attribute names from a short phrase to a full definition with high/low criteria; table columns renumbered A, B, C, D, E (was A, B+C, C+E, E, G); closing paragraph swaps "E layered on top of C" -> "D layered on top of C". - Overview's recommendation paragraph: link target updated from Design options A-G to A-E; Option E -> Option D. Spec doc: - Realign the deprecation schedule with the new spike S2 recommendation (Q3/Q4 warnings, end-of-Q4 removal). Trim "subject to PM confirmation against the actual release calendar." -> "subject to PM confirmation." - Rename the stale "see PoC surprise in the spike doc" reference to point at the new "Findings discovered during PoC" section. 
--- .../llama-stack-config-merge-spike.md | 293 ++++++++++++------ .../llama-stack-config-merge.md | 10 +- 2 files changed, 198 insertions(+), 105 deletions(-) diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md index b6cd66715..5a6202f8a 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md @@ -9,7 +9,12 @@ This split increases the chance of misconfiguration, makes downstream deployment templates larger, and forces every Lightspeed team to understand Llama Stack's internal schema. LCORE-836 asks for a single source of truth. -**The recommendation**: A layered approach — "Option C + Option E layer": +**The recommendation**: A layered approach — Option C (high-level keys + +`native_override` escape hatch) as the base structure, with Option D +(profiles) enabled as an optional layer on top. See +[Design options A–E](#design-options-ae) for the short names of each +option and [Design alternatives considered](#design-alternatives-considered) +for the scoring. - **High-level keys** in `lightspeed-stack.yaml` under a new `llama_stack.config` section (inference, later storage/safety/...). Most downstream teams write @@ -29,9 +34,9 @@ Llama Stack's internal schema. LCORE-836 asks for a single source of truth. single-file config from an existing (`run.yaml` + `lightspeed-stack.yaml`) pair, lossless round-trip. -**PoC validation**: A Level 3' PoC (per the spike howto) proves the mechanism -end-to-end in library mode. A unified `lightspeed-stack.yaml` containing only -`llama_stack.config` (no external `run.yaml`) successfully drives LCORE: +**PoC validation**: A library-mode PoC proves the mechanism end-to-end. 
+A unified `lightspeed-stack.yaml` containing only `llama_stack.config` +(no external `run.yaml`) successfully drives LCORE: liveness/readiness green, `/v1/query` returns a real model response, `native_override` demonstrably takes effect. Full unit-test suite passes (2098 tests), including a lossless migrate-then-synthesize round-trip. @@ -42,54 +47,68 @@ library-mode PoC and unit tests. --- +## Design options A–E + +- **A (Embedded native)** — `llama_stack.config` is the raw Llama Stack + schema, verbatim. Same surface area downstream teams see today, just + moved into one file. No abstraction win. +- **B (High-level only)** — `llama_stack.config` exposes only LCORE-defined + high-level keys (e.g. `inference.providers`). Best UX when every operator + intent maps cleanly; painful at the edges where the high-level schema + doesn't yet cover a need (no escape hatch). +- **C (B + `native_override`)** — high-level keys for the common path, plus + a raw-LS `native_override` block deep-merged last as an escape hatch. + Combines B's UX with A's flexibility. **Recommended (Decision S1).** +- **D (Profiles)** — a user-authored YAML file pointed to by + `llama_stack.config.profile: <path>`, used as the synthesis baseline + instead of LCORE's built-in default. A composable *layer* on top of + A/B/C, not a standalone shape. LCORE ships the mechanism; downstream + teams (or operators) author the YAML. +- **E (Kustomize-style patches)** — ship a default baseline; the operator + writes JSON-Patch-like overlays against it. Viable alternative to C; + strongest for backward compat with existing `run.yaml` files, weakest + on validation rigor and dynamic-reconfig fit. + +--- + ## Strategic decisions — for @sbunciak (PM) and @tisnik These set scope, approach, and rollout shape. Each has a recommendation — please confirm or override.
-### Decision S1: Overall shape (Option C + optional Option E) +### Decision S1: Overall shape -See [Design alternatives considered](#design-alternatives-considered) for the -full option set and scoring. +See [Design alternatives considered](#design-alternatives-considered) +for the scoring. -| Option | Summary | +| Option | Standalone shape | |---|---| -| A (Embedded native only) | `lightspeed-stack.yaml.llama_stack.config` is raw Llama Stack schema | -| **B + C (High-level + native override)** | High-level keys cover the common path, `native_override` as escape hatch | -| E (Profiles) | Named or path-based pre-built config bundles, layered on top of A/B/C | -| G (Kustomize-style patches) | Ship a default baseline, operator writes JSON-Patch-like overlays | +| A (Embedded native) | `llama_stack.config` is raw LS schema, verbatim | +| B (High-level only) | LCORE-defined high-level keys; no escape hatch | +| **C (B + `native_override`)** | High-level keys + raw-LS escape hatch | +| E (Kustomize-style patches) | Default baseline + JSON-Patch-like overlays | -**Recommendation**: **C** (high-level + native_override) with **E** (profile -feature, no shipped profiles) as an optional layer. Best balance of UX, -escape-hatch power, validation rigor, and dynamic-reconfig fit for the -broader feature roadmap (LCORE-777/781). - -### Decision S2: Deprecation timeline for the legacy path +D is not listed because it's a layer that composes on top of any of +A/B/C/E, not a standalone shape — the decision on whether to enable that +layer is Decision T6. -Legacy mode (`llama_stack.library_client_config_path` + external `run.yaml`) -must coexist with unified mode through a deprecation window to avoid breaking -downstream teams. Three candidate cadences: +**Recommendation**: **C** as the base structure, with **D** enabled as +an optional layer (feature only, no shipped profiles — see Decision T6). 
+Best balance of UX, escape-hatch power, validation rigor, and +dynamic-reconfig fit for the broader feature roadmap (LCORE-777/781). -| Cadence | Timing | -|---|---| -| N+2 releases | Opt-in → warning → removed over two releases after landing | -| N+3 releases | Opt-in → warning (N+1) → removed at N+3 | -| **Calendar-based** | e.g., "removed no sooner than 6 months after warning starts" | +### Decision S2: Deprecation timeline for the legacy path -**Recommendation**: **calendar-based**, because the right number depends on -LCORE's release cadence and downstream consumers' update latency — both of -which the spike author does not own. @sbunciak to set the actual numbers. +**Recommendation**: deprecate the legacy two-file path fully by end of Q4; +emit startup deprecation warnings during Q3 and Q4. @sbunciak to confirm +or override the calendar. ### Decision S3: Downstream implications we may not have seen -The spike author has direct evidence of Konflux/Tekton usage (`.tekton/` dir) -and RHOAI testing (`tests/e2e-prow/rhoai/`). Other downstream consumers — -RHOAI operator CRs, Helm charts, Kustomize overlays, any other products — -are not visible from this repo alone. - -**Ask**: Reviewers from downstream teams to confirm whether their deployment -setup treats `run.yaml` as a separate artifact (ConfigMap, templated file, -build-time asset) that this design would need to accommodate. +**Ask**: do we need to account for anything apart from Konflux? +Reviewers from downstream teams should flag any deployment surface that +treats `run.yaml` as a separate artifact (ConfigMap, templated file, +build-time asset) that the unified design would need to accommodate. ### Decision S4: Scope of this spike — what is deliberately left out @@ -191,7 +210,7 @@ companion `migrate_config_dumb()` function. Confidence: 90%. 
### Decision T6: Profile distribution -How profiles (Option E layer) reach downstream teams: +How profiles (Option D layer) reach downstream teams: | Option | Details | |---|---| @@ -205,36 +224,36 @@ documentation, not shipped runtime assets. Confidence: 85%. ### Decision T7: The `baseline` field (added during PoC) -During the PoC, strict lossless round-trip for the migration tool surfaced -a need: when `native_override` contains an entire run.yaml body, the default -baseline's keys still leak into the result via deep-merge. Fix: a -`baseline: "default" | "empty"` field. - -- `baseline: default` (default value) — start from LCORE's built-in baseline -- `baseline: empty` — start from `{}`. Used by the dumb migration tool so - round-trip is exact. - -**Recommendation**: **accept this field**. Alternatives (`inherit_defaults: -bool`, `starting_point: ...`) are cosmetic. Confidence: 80%. Reviewers: any -preference on naming before this ships? - -### Decision T8: Konflux / Tekton pipelines - -The `.tekton/` directory exists in this repo. If any Konflux/Tekton pipeline -templates or mounts `run.yaml` separately, unified mode needs that pipeline -to either (a) keep using legacy mode during the deprecation window, or -(b) mount the unified `lightspeed-stack.yaml` and drop the `run.yaml` mount. - -**Ask**: owner of `.tekton/` to confirm current pipeline shape and plan +The migration tool must be lossless: migrate an existing `run.yaml` into a +unified config, then synthesize it back to a `run.yaml`, and the result +must match the original byte-for-byte. The PoC surfaced a leak: when +`native_override` contains the entire `run.yaml` body, LCORE's built-in +baseline still deep-merges underneath and adds keys that weren't in the +original. Fix: a `baseline: "default" | "empty"` field that lets the +caller pick the synthesis starting point. + +- `baseline: default` (default value) — start from LCORE's built-in baseline. +- `baseline: empty` — start from `{}`. 
Used by the dumb migration tool, so + that `native_override` is the only thing the synthesizer sees. + +**Recommendation**: **accept this field, with `default` as the default value**. +That preserves the zero-config "fresh user authors `llama_stack.config` and +gets a working LS baseline" UX; the migration tool sets `baseline: empty` +explicitly so the migrate-then-synthesize loop above matches the original +`run.yaml`. Alternatives (`inherit_defaults: bool`, `starting_point: ...`) +are cosmetic. Confidence: 80%. + +### Decision T8: Konflux pipelines — for @radofuchs + +The `.tekton/` directory in this repo holds Konflux build-pipeline +definitions. If any pipeline template mounts `run.yaml` separately, unified +mode needs that pipeline to either (a) keep using legacy mode during the +deprecation window, or (b) mount the unified `lightspeed-stack.yaml` and +drop the `run.yaml` mount. + +**Ask**: @radofuchs to confirm current Konflux pipeline shape and plan migration. -### Decision T9: Library client API (resolved by PoC) - -**Finding from PoC**: `AsyncLlamaStackAsLibraryClient` in `llama-stack` only -accepts a file-path string. It does not accept a dict. This means library -mode must write the synthesized config to disk — no dict-only shortcut -available. Not a decision; a fact to note in the spec doc. - --- ## Proposed JIRAs @@ -596,8 +615,8 @@ To verify: run LCORE with a legacy config; confirm WARN line; run with unified c ### What the PoC does -The PoC is at Level 3' (per the spike howto): unified config works -end-to-end in library mode, with overrides and a profile. Server-mode +The PoC proves the mechanism end-to-end in library mode: a unified config +works with `native_override` and a `profile:` baseline. Server-mode end-to-end validation was skipped — same synthesis code path, container rebuild time was impractical. @@ -606,25 +625,35 @@ rebuild time was impractical. 
- Uses `$TMPDIR` for the synthesized `run.yaml` instead of the persistent known path recommended in Decision T4. - No `--synthesized-config-output` CLI flag yet. -- Migration tool has only the "dumb" mode; "smart" factoring into - high-level keys is out of scope. +- The migration tool ships only the "dumb" mode (lift the whole `run.yaml` + into `native_override`). The "smart" mode that factors an existing + `run.yaml` into high-level keys is deliberately deferred to future work; + it is captured under the spec doc's "Open Questions for Future Work" and + is not part of the proposed implementation JIRAs. - No deprecation warning yet (that's its own JIRA). -- High-level inference's emitted `provider_id` uses the Literal value - directly (`sentence_transformers` with underscore), which differs from - the baseline's `sentence-transformers` (hyphen). Acceptable in the PoC - because the validation used `baseline: default` + a `native_override` - path, not high-level inference, to avoid this naming collision. Resolution - before production: align the emitted `provider_id` with the Literal - values that already exist in common baselines (hyphenated form). +- The high-level inference parser writes `provider_id` straight from the + `type:` Literal value (e.g. `sentence_transformers`, with an underscore). + The shipped baseline `run.yaml` and the wider LS ecosystem refer to that + same provider by the hyphenated name (`sentence-transformers`). When both + are present the two IDs don't match, so baseline references to the + embedder break. The PoC sidestepped the collision by using + `baseline: default` plus a `native_override` block — not high-level + inference — for the validation run. Fix before production: hyphenate the + emitted `provider_id` so it matches the ecosystem convention used in + baselines (or, equivalently, alias the Literal value at emit time). 
### Results -See [poc-results/library-mode/](poc-results/library-mode/) for the full -evidence bundle: +Full evidence bundle for the library-mode PoC (paths relative to this doc): -- `lightspeed-stack-unified-library.yaml` — the unified-mode config used -- `synthesized-run.yaml` — what LCORE produced (3.7 KB) -- `query-response.json` — a real `/v1/query` round-trip +- [`poc-results/lightspeed-stack-unified-library.yaml`](poc-results/lightspeed-stack-unified-library.yaml) + — the unified-mode config used. +- [`poc-results/library-mode/synthesized-run.yaml`](poc-results/library-mode/synthesized-run.yaml) + — what LCORE produced (3.7 KB). +- [`poc-results/library-mode/query-response.json`](poc-results/library-mode/query-response.json) + — a real `/v1/query` round-trip. +- [`poc-results/library-mode/README.md`](poc-results/library-mode/README.md) + — walkthrough. Summary of validation: @@ -640,11 +669,22 @@ Summary of validation: | Full unit suite | 2098 passed, 1 skipped, 0 failed | | Round-trip lossless | `test_migrate_then_synthesize_reproduces_run_yaml` green | -### Surprise discovered during PoC - -- **`AsyncLlamaStackAsLibraryClient` takes a file path, not a dict** (Decision - T9). The library client reads the file itself. Consequence: library mode - must write a synthesized file to disk. No dict-only shortcut. +### Findings discovered during PoC + +- **`AsyncLlamaStackAsLibraryClient` takes a file path, not a dict.** The + initial design assumed we could pass the synthesized configuration to the + library client in memory and avoid touching the filesystem. In practice + `llama_stack.core.library_client.AsyncLlamaStackAsLibraryClient` accepts + only a string path (or, in newer versions, a `StackRunConfig` object that + is itself built from a parsed YAML file). There is no dict-only entry + point in the public API. 
Consequences for the implementation: + - Library mode **must** write the synthesized `run.yaml` to disk before + constructing the client (R10 in the spec doc — persistent known path, + overwritten each boot). + - The disk-write step is the same shape as server mode's, so the two + paths can share `synthesize_to_file()`. + - Any future "dict-only" optimization would require an upstream + Llama Stack API addition; not worth pursuing. - **`profile:` path resolution** uses the directory of the `lightspeed-stack.yaml`. Relative paths work only when the profile is co-located with the LCORE config. Absolute paths always work. Spec doc @@ -661,7 +701,7 @@ Summary of validation: ## Background sections -### Current architecture (before LCORE-836) +### Current architecture Two files: @@ -692,14 +732,69 @@ incrementally enriching an existing one. ### Design alternatives considered -Attributes (★ = high-weight for LCORE-836): - -| Attribute | A | B+C | C+E | E | G | +This section scores the five design alternatives (A, B, C, D, E) +against the attributes that matter for LCORE-836. Each cell is a 1–5 +rating; higher is better for that attribute. Cells marked with **★** in +the attribute name carry more weight in the final choice. The +recommendation that comes out of these scores is C as the base shape +with D enabled as an optional layer (Decision S1 + Decision T6). For the +option short names see [Design options A–E](#design-options-ae). + +Attribute definitions (★ = high-weight for LCORE-836): + +- **★ Operator UX** — how little raw LS schema a typical operator must + read or write to express common intents (one provider, one safety + filter, default storage). High = the high-level keys cover almost + everything; low = operators must hand-author LS provider blocks. +- **Abstraction cleanliness** — how well the LCORE-facing schema hides + internal LS shape. High = LCORE owns a stable surface that survives + LS schema bumps; low = LCORE just relays LS schema verbatim. 
+- **LS schema resilience** — how exposed downstream operators are to + Llama Stack schema churn. High = high-level keys absorb upstream + renames/restructures inside LCORE; low = every LS change is a + breaking change downstream. +- **★ Escape-hatch power** — coverage when the high-level schema + doesn't yet express something the operator needs (e.g. an obscure + provider config). High = the operator can drop in raw LS YAML without + blocking; low = the operator is stuck waiting for LCORE to add + first-class support. +- **Implementation cost** — engineering work to ship the option (one + release scope). High = small change; low = significant new code + + tests + docs. +- **Maintenance load** — ongoing burden after ship (per release). + High = touches one place; low = many surfaces to keep in sync + (high-level keys, baselines, examples, migration tool). +- **★ Backward compatibility** — how cleanly the option lets legacy + two-file configs keep working through a deprecation window without + duplicate code paths. High = legacy path stays intact while unified + path adds on; low = the option forces an early breaking change. +- **Validation rigor** — strength of static + load-time checks LCORE + can run against the operator's config. High = Pydantic + cross-field + validators catch most mistakes; low = errors only surface when LS + itself fails to start. +- **★ Dynamic-reconfig fit** — how well the option composes with the + feature roadmap that wants to change LS config at runtime + (LCORE-777/781, BYOK RAG additions). High = the synthesized config is + a single dict the supervisor can recompute and reload; low = the + shape forces in-place file edits. +- **★ Library+server parity** — whether the same operator-facing + config drives both library-mode and server-mode LS without separate + configurations. High = one file, two modes; low = needs mode-specific + variants. 
+- **Provider plurality** — how many of the LS provider types the option + covers without an escape hatch. High = all common types reachable via + the option's normal surface; low = the option only covers one or two. +- **Testability** — ease of writing automated tests against the + option's surface. High = small, deterministic inputs map to a single + dict output; low = templated/inherited shapes that need integration + tests to exercise. + +| Attribute | A | B | C | D | E | |---|---|---|---|---|---| -| ★ Operator UX | 2 | 4–5 | **4** | 5 | 3 | +| ★ Operator UX | 2 | 5 | **4** | 5 | 3 | | Abstraction cleanliness | 1 | 4 | 3 | 4 | 2 | | LS schema resilience | 1 | 4 | 3 | 3 | 2 | -| ★ Escape-hatch power | 5 | 3 | 5 | 5 | 5 | +| ★ Escape-hatch power | 5 | 1 | 5 | 5 | 5 | | Implementation cost | 4 | 2 | 2 | 3 | 3 | | Maintenance load | 2 | 3 | 3 | 2 | 3 | | ★ Backward compatibility | 3 | 3 | 3 | 3 | 4 | @@ -709,12 +804,12 @@ Attributes (★ = high-weight for LCORE-836): | Provider plurality | 5 | 4 | 5 | 4 | 5 | | Testability | 3 | 4 | 3 | 5 | 3 | -- **A (Embedded native)** — no abstraction win; same LS schema exposure as today. -- **B (High-level only)** — best UX when everything maps, painful at the edges. -- **C (B + `native_override`)** — recommended; combines B's UX with A's escape hatch. -- **E (Profiles, feature-only)** — optional layer on top of C. -- **G (Kustomize-style patches)** — strong for backward compat, weak on - validation and dynamic reconfig. +The recommendation column is **C**: it ties or beats every other +standalone option on the high-weight attributes except Operator UX, +where it costs one point against B (because the escape hatch adds +schema surface area) — a trade we accept to keep escape-hatch power at +5 — and Backward compatibility, where it trails E by one point. D layered on top of C adds testability and a clean path for +deployment-team-authored baselines without changing C's structure.
### Merge semantics — worked examples diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge.md index 9847cc2fd..012f1aea0 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge.md @@ -304,11 +304,9 @@ Three operator-facing migration paths (choose per deployment): | Lift-and-shift | seconds — `lightspeed-stack --migrate-config ...` | Single-file, byte-equivalent LS behavior | | Re-express | hours+ | Single-file; high-level sections replace `native_override` | -Deprecation schedule: calendar-based (per Decision S2 in the spike); -concrete numbers set by @sbunciak at release time. Default recommended -shape: unified mode ships as opt-in at release N; legacy-mode WARN -begins one release later; legacy-mode removal no sooner than 6 months -after WARN begins. +Deprecation schedule (per Decision S2 in the spike): emit startup +deprecation warnings during Q3 and Q4; remove the legacy path fully by +the end of Q4. Subject to PM (@sbunciak) confirmation. ## Implementation Suggestions @@ -318,7 +316,7 @@ after WARN begins. |---|---| | `src/models/config.py` | Add `UnifiedInferenceProvider`, `UnifiedInferenceSection`, `UnifiedLlamaStackConfig`. Modify `LlamaStackConfiguration` — add `config` field, extend the `model_validator` for mutual-exclusion check. | | `src/llama_stack_configuration.py` | Add `synthesize_configuration`, `deep_merge_list_replace`, `apply_high_level_inference`, `load_default_baseline`, `synthesize_to_file`, `migrate_config_dumb`, `PROVIDER_TYPE_MAP`, `DEFAULT_BASELINE_RESOURCE`. Update `main()` to auto-detect unified vs legacy. | -| `src/data/default_run.yaml` | New file — a thinner baseline than today's repo-root `run.yaml`. Notably do **not** reference `${env.EXTERNAL_PROVIDERS_DIR}` without a default (see PoC surprise in the spike doc). 
| +| `src/data/default_run.yaml` | New file — a thinner baseline than today's repo-root `run.yaml`. Notably do **not** reference `${env.EXTERNAL_PROVIDERS_DIR}` without a default (see "Findings discovered during PoC" in the spike doc). | | `src/client.py` | In `_load_library_client`: branch on `config.config` presence. Add `_synthesize_library_config()` that calls the synthesizer and writes to the deterministic path (R10). Keep `_enrich_library_config` for legacy. | | `src/lightspeed_stack.py` | Add `--migrate-config`, `--run-yaml`, `--migrate-output`, `--synthesized-config-output` flags. Add an early-exit branch in `main()` that dispatches to `migrate_config_dumb` when `--migrate-config` is set. Clean up stale docstring. | | `scripts/llama-stack-entrypoint.sh` | No functional change — the Python CLI already auto-detects. Update the comment to document both modes. | From 6bc7a7217c75a75e953e32305de283aa11250930 Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Wed, 20 May 2026 09:30:51 +0200 Subject: [PATCH 6/9] LCORE-836 spike: address CodeRabbit feedback on secrets-on-disk and anchor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spec doc: - R6 reworded: it is true that LCORE itself does not resolve env refs on disk, but `native_override` and the dumb migration tool can carry literal secrets from a legacy `run.yaml`. The previous wording made a stronger claim than the design can honour, so it has been narrowed to "secrets LCORE emits are not resolved on disk". - R10 strengthened: the synthesized file must be created with mode 0600 via an explicit create flag (not umask), so that when `native_override` or a migrated `run.yaml` does carry a literal secret, the file is not world-readable. 
- "Security considerations" section rewritten to (a) drop the inaccurate "no secrets written to disk" blanket statement, (b) acknowledge that `native_override` and dumb-mode migration output may contain literal secrets, (c) mandate 0600 on the synthesized file, (d) recommend the migration tool's help / docs advise operators to replace literal secrets with `${env.}` refs before or after migration, (e) note that LS's own `replace_env_vars()` (in `llama_stack.core.library_client`) is the in-memory resolver at LS startup. Spike doc: - Fix broken in-page anchor on line 166: the target heading is "### Merge semantics — worked examples", and GitHub renders that heading's anchor with a double hyphen (`#merge-semantics--worked-examples`) because the em-dash is stripped and the surrounding spaces both become hyphens. Link display text now matches the heading exactly as well. --- .../llama-stack-config-merge-spike.md | 2 +- .../llama-stack-config-merge.md | 57 +++++++++++++------ 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md index 5a6202f8a..8aed9bc93 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md @@ -163,7 +163,7 @@ what semantics? no list-merge tarpit, keeps scalar + map overrides minimal. Implemented in `deep_merge_list_replace()`. Confidence: 70%. -See [Merge semantics worked examples](#merge-semantics-worked-examples). +See [Merge semantics — worked examples](#merge-semantics--worked-examples). 
### Decision T3: Secrets in synthesized files diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge.md index 012f1aea0..8881c4ffa 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge.md @@ -74,8 +74,13 @@ detail that LCORE owns, not an operator-facing artifact. by the high-level section or by the baseline, deep-merge semantics apply with list replacement (maps merge recursively; lists are replaced wholesale; scalars are replaced). -- **R6:** Secrets are never resolved into the synthesized file on disk. - `${env.FOO}` references appear verbatim in the synthesized `run.yaml`. +- **R6:** Secrets that LCORE itself emits are never resolved on disk: + `apply_high_level_inference` writes `${env.<VAR>}` references + verbatim, and LCORE does not eagerly resolve env refs in the + baseline or in `native_override` before writing. (Operators may + still hand-write literal secrets into `native_override` or into a + legacy `run.yaml` that is migrated through dumb mode — see the + Security considerations section for the on-disk implications.) - **R7:** Existing enrichment behavior (Azure Entra ID, BYOK RAG, Solr/OKP) produces the same result in unified mode as in legacy mode for equivalent inputs. @@ -86,9 +91,12 @@ detail that LCORE owns, not an operator-facing artifact. field; validation enforces mutual exclusion with legacy mode and rejects unknown fields (`extra="forbid"`). - **R10:** The synthesized `run.yaml` is written to a persistent known path (overwritten each boot), logged, and a CLI flag - `--synthesized-config-output` lets operators override the location for - debugging.
+ path (overwritten each boot) with file mode `0600` (owner read/write + only — the file may contain literal secrets when `native_override` or + the dumb migration tool's output carries them; restrictive perms must + be set on create, not left to umask). Path is logged at startup, and + a CLI flag `--synthesized-config-output` lets operators override the + location for debugging. - **R11:** Shape detection determines mode (unified vs legacy); an optional `config_format_version` field is accepted but must agree with the shape when present. @@ -277,18 +285,33 @@ removed `-g/-i/-o` flags is cleaned up as part of the docs JIRA. ### Security considerations -- **No secrets written to disk**: `apply_high_level_inference` emits - `${env.<VAR>}` references, never the resolved secret. The synthesized - `run.yaml` is safe to log path-wise; its contents only contain env - references for secrets. -- **`native_override` is raw YAML**: content is operator-controlled, so - no new injection surface — same trust model as the existing - `run.yaml`. LCORE does no template expansion other than the existing - `replace_env_vars()` step in the load pipeline. -- **Synthesized file location**: persistent known path, world-readable - by default in a container. This is acceptable because the file - contains only env-var references for secrets; operators who want - stricter filesystem permissions should tighten the mount. +- **High-level inference emits env refs, not literal secrets**: + `apply_high_level_inference` writes `${env.<VAR>}` strings, never + the resolved value. An operator authoring `lightspeed-stack.yaml` + from scratch with only high-level keys produces synthesized output + with no literal secrets on disk. LS itself resolves env refs to + values in-memory at startup via `replace_env_vars()` in + `llama_stack.core.library_client`.
+- **`native_override` (and dumb-mode migration output) MAY carry + literal secrets**: `native_override` is whatever raw YAML the + operator drops in, and `migrate_config_dumb()` lifts an existing + `run.yaml` verbatim — which may already contain `api_key: sk-...` + if a downstream team baked the secret into their legacy file. The + synthesized file therefore CANNOT be assumed secret-free. +- **Mandate `0600` on the synthesized file**: `synthesize_to_file()` + must create the file with mode `0600` (owner read/write only) using + an explicit create flag — not relying on umask. This bounds the + blast radius when `native_override` or a migrated `run.yaml` does + contain a literal secret. (See R10.) +- **Document env-refs as the recommended pattern**: the migration + tool's `--help` and the migration doc should advise operators to + replace literal secrets in their legacy `run.yaml` with + `${env.<VAR>}` references either before migrating, or after + migration inside the resulting `native_override` block. +- **`native_override` injection surface**: content is + operator-controlled, so no new surface — same trust model as the + existing `run.yaml`. LCORE does no template expansion other than + LS's own `replace_env_vars()` step at load time. ### Migration / backwards compatibility From 1983f1f372f69222814048bdc4444accd95d1f91 Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Wed, 20 May 2026 09:39:50 +0200 Subject: [PATCH 7/9] LCORE-836 spike: fix MD031 (blanks around fenced code blocks) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit flagged twelve fenced code blocks in the spike doc that were missing blank lines before or after them (markdownlint MD031). Applied via `markdownlint-cli2 --fix`. No content change — purely blank-line insertions around `\`\`\`text` and `\`\`\`yaml` fences in the Proposed JIRAs section and Appendix B. Line numbers of section headings (S1-T8, ## Proposed JIRAs) are unchanged.
## PoC results moved from L614 to L642 because the auto-fix added blank lines inside the JIRAs section that precedes it. --- .../llama-stack-config-merge-spike.md | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md index 8aed9bc93..1b186b9cd 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md @@ -282,6 +282,7 @@ definitions). The feature files can be submitted for review and land before implementation of the feature itself begins. **Scope**: + - `.feature` files covering, at minimum, these R1–R11 surfaces from the spec doc: - Boot LCORE with unified `lightspeed-stack.yaml` (no external @@ -308,6 +309,7 @@ before implementation of the feature itself begins. drafting scenarios. **Acceptance criteria**: + - behave parses every new `.feature` file without syntax errors. - behave marks all new scenario steps as `undefined` (step definitions land in LCORE-????). @@ -320,6 +322,7 @@ before implementation of the feature itself begins. mode). **Agentic tool instruction**: + ```text Read "Requirements" (R1..R11) and "Use Cases" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. @@ -345,6 +348,7 @@ Stack `run.yaml` from it. Wire library mode to the synthesizer. Preserve legacy mode through mutual-exclusion validation. **Scope**: + - New Pydantic classes in `src/models/config.py`. - New functions in `src/llama_stack_configuration.py`: `synthesize_configuration`, `deep_merge_list_replace`, @@ -357,6 +361,7 @@ legacy mode through mutual-exclusion validation. - Legacy behavior (`llama_stack.library_client_config_path` path) unchanged. **Acceptance criteria**: + - Unified `lightspeed-stack.yaml` (no external `run.yaml`) boots LCORE in library mode and serves `/v1/query`. 
- Legacy configs continue to work with no change. @@ -364,6 +369,7 @@ legacy mode through mutual-exclusion validation. - Unit tests for synthesizer, merge semantics, schema validation. **Agentic tool instruction**: + ```text Read the "Architecture" and "Implementation Suggestions" sections of docs/design/llama-stack-config-merge/llama-stack-config-merge.md. @@ -386,6 +392,7 @@ that produces a unified single-file config from an existing `baseline: empty`, removes `library_client_config_path`. **Scope**: + - `migrate_config_dumb()` function in `src/llama_stack_configuration.py`. - `--migrate-config`, `--run-yaml`, `--migrate-output` flags in `src/lightspeed_stack.py`. @@ -393,6 +400,7 @@ that produces a unified single-file config from an existing `run.yaml`. **Acceptance criteria**: + - `lightspeed-stack --migrate-config --run-yaml X -c Y --migrate-output Z` produces a unified config that boots LCORE in library mode to the same Llama Stack behavior as the original pair. @@ -400,6 +408,7 @@ that produces a unified single-file config from an existing - `--help` describes the flag clearly. **Agentic tool instruction**: + ```text Read "Migration tool" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. Key files: src/lightspeed_stack.py, src/llama_stack_configuration.py, @@ -418,6 +427,7 @@ manifests so server mode works end-to-end from a unified the synthesizer script and default baseline. **Scope**: + - Update `scripts/llama-stack-entrypoint.sh` — the existing script already defers to the Python CLI for auto-detection; document that behavior. - Update `test.containerfile` to copy `src/data/` into the LS container so @@ -428,11 +438,13 @@ the synthesizer script and default baseline. see Decision T8). **Acceptance criteria**: + - `docker compose up` with a unified `lightspeed-stack.yaml` starts both containers healthy; `/v1/query` works through LCORE → LS. - Legacy docker-compose layout (with external `run.yaml` mount) still works. 
**Agentic tool instruction**: + ```text Read "Architecture → Server mode" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. Key files: scripts/llama-stack-entrypoint.sh, test.containerfile, @@ -455,17 +467,20 @@ corresponding `lightspeed-stack*.yaml`). Migrate `tests/e2e-prow/rhoai/` configs similarly. **Scope**: + - Identify every test config that references `run.yaml`. - Mechanically migrate using the migration tool (dumb mode). - Re-run the full e2e suite and resolve any differences. **Acceptance criteria**: + - No in-repo test config references an external `run.yaml`. - `uv run make test-e2e` passes. - Existing test coverage is preserved (no tests deleted solely to make the migration pass). **Agentic tool instruction**: + ```text Read "Migration paths" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. Key files: tests/e2e/configs/, tests/e2e/configuration/, tests/e2e-prow/rhoai/. @@ -488,6 +503,7 @@ raise it against the spec doc (and possibly back to LCORE-???? kickoff) rather than quietly weakening the test. **Scope**: + - Step definitions for every step pattern in the new `.feature` files. - Fixtures or helpers under `tests/e2e/features/steps/` as needed (e.g., temp-dir config authoring, subprocess start/stop for LCORE, @@ -495,6 +511,7 @@ rather than quietly weakening the test. - CI wiring so the new scenarios run as part of `uv run make test-e2e`. **Acceptance criteria**: + - behave reports zero `undefined` steps across the new `.feature` files. - `uv run make test-e2e` runs the new scenarios and they pass. @@ -503,6 +520,7 @@ rather than quietly weakening the test. explicit rationale). **Blocked by**: + - LCORE-???? (E2E feature files for unified mode — the `.feature` files being implemented against). - LCORE-???? (Unified schema + synthesizer), LCORE-???? @@ -510,6 +528,7 @@ rather than quietly weakening the test. — the feature under test must exist. 
**Agentic tool instruction**: + ```text Read "Architecture" and "Requirements" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. @@ -539,18 +558,21 @@ migration section with the migration tool command. Clean up the stale mentions the removed `-g/-i/-o` flags. **Scope**: + - Each doc file touched. - A new migration section (step-by-step). - Update the `create_argument_parser` docstring in `src/lightspeed_stack.py`. **Acceptance criteria**: + - Every doc page that showed a two-file setup also shows the unified-mode equivalent. - Migration tool invocation documented with a worked example. - `docs/openapi.md` / `docs/config.html` regenerated. **Agentic tool instruction**: + ```text Read "Deprecation timeline" and "Migration paths" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. Key files: docs/*.md, docs/*.html, docs/*.json, README.md, src/lightspeed_stack.py docstring. @@ -568,16 +590,19 @@ write and reference their own profiles via `llama_stack.config.profile: `. **Scope**: + - `examples/profiles/openai-remote.yaml` - `examples/profiles/inline-faiss.yaml` - Docs section: how to author a profile, where to place it, how to reference it from `lightspeed-stack.yaml`. **Acceptance criteria**: + - Both examples load cleanly via the synthesizer (sanity test). - A docs section titled "Profiles" exists and has a worked example. **Agentic tool instruction**: + ```text Read "Profiles" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. Key files to create: examples/profiles/*.yaml, a "Profiles" section in docs/config.md or docs/deployment_guide.md. @@ -593,16 +618,19 @@ emit a one-line startup WARN when `library_client_config_path` is set. Link to the migration doc. Legacy mode continues to fully function. **Scope**: + - Warning emission point: on load in `LlamaStackConfiguration` `check_llama_stack_model` validator, or at LCORE startup. - Log line format includes a stable URL fragment to the migration doc. 
**Acceptance criteria**: + - Legacy configs still load and run. - A single WARN line appears at startup when legacy fields are used. - The warning is not emitted in unified mode. **Agentic tool instruction**: + ```text Read "Deprecation timeline" in docs/design/llama-stack-config-merge/llama-stack-config-merge.md. Key files: src/models/config.py (or src/lightspeed_stack.py startup). @@ -814,6 +842,7 @@ deployment-team-authored baselines without changing C's structure. ### Merge semantics — worked examples Given the baseline: + ```yaml safety: default_shield_id: llama-guard @@ -827,12 +856,14 @@ providers: ``` And `native_override`: + ```yaml safety: excluded_categories: [spam] ``` **Deep-merge-with-list-replacement (chosen)** produces: + ```yaml safety: default_shield_id: llama-guard # preserved (not in override) From de734e7a5c03ceb013eb91227e3886c007218a7b Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Wed, 20 May 2026 09:52:40 +0200 Subject: [PATCH 8/9] LCORE-836 spike: add Decision S5 (backend-agnostic key location) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In response to @tisnik's review comment about future-proofing the unified config schema against a backend swap, plus the project direction to migrate from Llama Stack to Pydantic AI over time. S5 extends S1 — it does NOT replace it. Option C + optional D remains the recommended overall shape. S5 only decides *where* the backend-agnostic high-level keys (`inference.providers` today; later `rag.providers`) live in the operator-facing YAML hierarchy: at the top level, not under the `llama_stack` subtree. LS-specific knobs (`native_override`, `profile`, `baseline`) stay under `llama_stack.config` unchanged. Confidence is 70%; the recommendation is provisional until a research pass on Pydantic AI's actual config surface lands. A TODO is captured in the decision body so the spike author resolves the per-provider `type:` vocabulary question post-research. 
If S5 is adopted, the existing "Unified llama_stack.config schema + synthesizer" implementation JIRA's scope shifts to ship the top-level shape from day one. No new JIRA is added. --- .../llama-stack-config-merge-spike.md | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md index 1b186b9cd..6bfa771a2 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md @@ -126,6 +126,59 @@ should be tracked as separate future JIRAs: **Recommendation**: confirm this scope split. If reviewers want any of the above pulled in, this spike's JIRAs grow accordingly. +### Decision S5: Where do backend-agnostic high-level keys sit? + +**Context**: S1 places the unified config's high-level keys +(`inference.providers` today; later `rag.providers`, etc.) inside the +LS-specific subtree at `llama_stack.config.inference`. LCORE will migrate +from Llama Stack to Pydantic AI over time. Under S1's layout, that +transition would force every downstream team to relearn the config schema — +the `llama_stack` subtree name becomes a lie, and high-level keys would +have to move. + +**Recommendation**: lift the backend-agnostic keys to the top level of +`lightspeed-stack.yaml` now. Leave LS-specific knobs under +`llama_stack.config`. Extends S1; does not replace it (Option C + optional +D recommendation stands). 
+ +| Today (per S1) | Proposed | +|---|---| +| `llama_stack.config.inference.providers: …` | `inference.providers: …` | +| (future) `llama_stack.config.rag.providers: …` | `rag.providers: …` | +| `llama_stack.config.native_override: …` | unchanged — LS-specific | +| `llama_stack.config.profile: …` | unchanged — LS-specific (points at LS run.yaml shape) | +| `llama_stack.config.baseline: …` | unchanged — LS-specific | + +The synthesizer reads `inference.providers` from the top level and emits LS +provider entries exactly as today — only the input node moves. When the +Pydantic AI transition lands, the same top-level schema is consumed by a +new synthesizer that emits Pydantic AI's shape; downstream operators see no +change. + +**Scope discipline — what stays under `llama_stack.config`**: anything +whose vocabulary is genuinely LS-specific and unlikely to translate across +backends. Today that's `native_override` (raw LS schema by definition), +`profile` (a YAML file in LS run.yaml shape), `baseline`, and any future +`apis` / `registered_resources` / `vector_io` / `safety`-shield knobs if +they ship as high-level keys before the cutover. We do **not** preemptively +lift these. + +**Confidence**: 70%. Reservation: we don't yet know Pydantic AI's provider +vocabulary, so the recommendation will be updated once a research pass on +Pydantic AI's actual config surface lands. Until then, abstract only what +is clearly stable across backends (`inference.providers`: type + creds + +allowed_models; later `rag.providers`); defer broader abstraction. + +> **TODO (spike author, post-research)**: determine whether the +> `inference.providers[].type` Literal vocabulary (`openai`, `azure`, +> `sentence_transformers`, …) maps cleanly to Pydantic AI's provider +> declaration, or whether LCORE should define its own canonical type +> vocabulary that each backend-specific synthesizer translates. 
+ +**Implementation impact**: if adopted, this changes the scope of the +existing **Unified `llama_stack.config` schema + synthesizer** JIRA — it +ships the top-level shape from day one. No new JIRA is needed. + --- ## Technical decisions — for @tisnik and team leads From 8b85c4a12dedd6e87e6ee10da211ec4678e3d0c3 Mon Sep 17 00:00:00 2001 From: Maxim Svistunov Date: Wed, 20 May 2026 10:02:47 +0200 Subject: [PATCH 9/9] LCORE-836 spike: Pydantic AI research; vacuous-shield finding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spike doc: - Adds a new finding to "Findings discovered during PoC" calling out that the PoC's safety-shield validation was vacuous: the `native_override` registered `llama-guard` with `provider_shield_id: openai/gpt-4o-mini` — an OpenAI chat model, not a Llama Guard checkpoint, so the evidence row "`native_override` took effect" only proves the key landed, not that a real shield gated any query. Caught by CodeRabbit on `poc-results/library-mode/synthesized-run.yaml:110`. The implementation JIRA's e2e coverage must exercise a real Llama Guard model (e.g. `meta-llama/Llama-Guard-3-8B`). - Updates Decision S5 with the Pydantic AI research findings from 2026-05-20: - Drops the "future `rag.providers`" lift from the proposed table; research confirms Pydantic AI has no RAG / vector-store abstraction and no public roadmap signal one is coming in 6–12 months. RAG, safety/shields, vector storage all stay under `llama_stack.config`. - Adds a "Pydantic AI research findings" subsection citing the full report (now in `poc-results/pydantic-ai-research.md`) and the researcher's confidence numbers per concept: ~75% for `inference`, ~25% for RAG, ~20% for safety, ~60% for MCP. - Resolves the `inference.providers[].type` vocabulary TODO: keep LCORE's existing Literal vocabulary (`openai`, `azure`, `sentence_transformers`, …); each backend-specific synthesizer translates to its target shape. 
- Raises decision confidence from 70% to 75%; reservation is now research-bounded (Pydantic AI pre-V2 minor-surface churn, very low per their stated policy) rather than information-gap-bounded. - Adds a note that Pydantic AI V2 timing ("April 2026 at the earliest" — has not shipped as of 2026-05-20) likely overlaps LCORE's migration window; re-validate when V2 ships. Adds the full research report under `poc-results/pydantic-ai-research.md` (Llama Stack ↔ Pydantic AI concept mapping, dated 2026-05-20) so the spike doc's link to it resolves and so the evidence travels with the spike. --- .../llama-stack-config-merge-spike.md | 88 +++++-- .../poc-results/pydantic-ai-research.md | 245 ++++++++++++++++++ 2 files changed, 312 insertions(+), 21 deletions(-) create mode 100644 docs/design/llama-stack-config-merge/poc-results/pydantic-ai-research.md diff --git a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md index 6bfa771a2..1c4292285 100644 --- a/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md +++ b/docs/design/llama-stack-config-merge/llama-stack-config-merge-spike.md @@ -144,36 +144,72 @@ D recommendation stands). | Today (per S1) | Proposed | |---|---| | `llama_stack.config.inference.providers: …` | `inference.providers: …` | -| (future) `llama_stack.config.rag.providers: …` | `rag.providers: …` | | `llama_stack.config.native_override: …` | unchanged — LS-specific | | `llama_stack.config.profile: …` | unchanged — LS-specific (points at LS run.yaml shape) | | `llama_stack.config.baseline: …` | unchanged — LS-specific | +| Future RAG / safety / vector_io / shield high-level keys | stay under `llama_stack.config` — Pydantic AI has no equivalent abstraction (see "Pydantic AI research findings" below) | The synthesizer reads `inference.providers` from the top level and emits LS provider entries exactly as today — only the input node moves. 
When the -Pydantic AI transition lands, the same top-level schema is consumed by a -new synthesizer that emits Pydantic AI's shape; downstream operators see no -change. +Pydantic AI transition lands, a new backend-specific synthesizer reads the +same top-level `inference.providers` block and emits Pydantic AI's +per-Agent `Provider(...)` + `<provider>:<model>` shape; downstream operators +see no change to the `inference` surface. **Scope discipline — what stays under `llama_stack.config`**: anything whose vocabulary is genuinely LS-specific and unlikely to translate across -backends. Today that's `native_override` (raw LS schema by definition), -`profile` (a YAML file in LS run.yaml shape), `baseline`, and any future -`apis` / `registered_resources` / `vector_io` / `safety`-shield knobs if -they ship as high-level keys before the cutover. We do **not** preemptively -lift these. - -**Confidence**: 70%. Reservation: we don't yet know Pydantic AI's provider -vocabulary, so the recommendation will be updated once a research pass on -Pydantic AI's actual config surface lands. Until then, abstract only what -is clearly stable across backends (`inference.providers`: type + creds + -allowed_models; later `rag.providers`); defer broader abstraction. - -> **TODO (spike author, post-research)**: determine whether the -> `inference.providers[].type` Literal vocabulary (`openai`, `azure`, -> `sentence_transformers`, …) maps cleanly to Pydantic AI's provider -> declaration, or whether LCORE should define its own canonical type -> vocabulary that each backend-specific synthesizer translates. +backends. Today that's `native_override`, `profile`, `baseline`. The +research pass (see below) confirms that RAG, safety/shields, vector +storage, and the `apis` / `registered_resources` / `storage` blocks should +also stay under `llama_stack.config` whenever they ship as high-level keys +— Pydantic AI has **no equivalent built-in abstraction** for any of these. 
+ +**On the `inference.providers[].type` vocabulary**: keep LCORE's existing +Literal values (`openai`, `azure`, `sentence_transformers`, `vertexai`, +`watsonx`, `vllm_rhaiis`, `vllm_rhel_ai`). They are vendor identifiers +that both Llama Stack (`provider_type: remote::openai`) and Pydantic AI +(model-string prefixes such as `openai:gpt-4o-mini`) recognise. Each +backend-specific synthesizer translates the canonical LCORE vocabulary to +its target shape; we do not adopt either backend's surface verbatim. + +**Pydantic AI research findings** (full report: +[`compass_artifact_…_text_markdown.md`](https://github.com/max-svistunov/lightspeed-stack/blob/lcore-836-spike-llama-stack-config-merge/docs/design/llama-stack-config-merge/poc-results/pydantic-ai-research.md), +pass dated 2026-05-20 against `pydantic-ai 1.98.0`): + +- Pydantic AI's per-Agent `<provider>:<model>` string + `Provider(...)` + constructor maps cleanly onto LCORE's `inference.providers` vocabulary. + Type + env-var name + base_url + `allowed_models` translate without + loss. **Abstract this.** Researcher confidence: ~75%. +- Pydantic AI ships **no built-in RAG or vector-store abstraction**. The + official RAG example wires pgvector via raw `asyncpg` calls and the + OpenAI SDK for embeddings; there is no `pydantic_ai.vector_store` + module and no public roadmap signal one is coming in the next 6–12 + months. **Do not preemptively abstract `rag.*`** — keep any future + high-level RAG keys under `llama_stack.config`. Researcher confidence + it would survive a cutover today: ~25%. +- Pydantic AI ships **no built-in safety / shield abstraction**. + `pydantic-ai` Issue #1197 ("Guardrails") is open with no merge + timeline. Third-party capability packages + (`pydantic-ai-shields`, `pydantic-ai-guardrails`) exist but have + incompatible vocabularies with each other and with Llama Guard. **Do + not preemptively abstract `safety.*`** — keep any future high-level + safety keys under `llama_stack.config`. 
Researcher confidence on + survival: ~20%. +- MCP endpoints are the one tool-runtime concept worth abstracting + later (~60% confidence): both backends support MCP natively and the + URI + auth-token + allowed-tools surface is stable. Out of scope for + this spike; capture as future work when the first high-level + tool-runtime ticket lands. +- Pydantic AI is currently at V1 (API-stable until V2, which is "April + 2026 at the earliest" and has not shipped as of 2026-05-20). V2 + timing likely overlaps the LCORE migration window; re-validate this + decision when V2 ships. + +**Confidence**: **75%**. The 25% reservation is research-driven, not +information-gap-driven: it accounts for Pydantic AI's pre-V2 freedom to +break minor surfaces (very low probability per its stated policy), plus +the inherent risk that the per-Agent model the researcher described +forces LCORE's synthesizer to do more work than expected. Both manageable. **Implementation impact**: if adopted, this changes the scope of the existing **Unified `llama_stack.config` schema + synthesizer** JIRA — it @@ -777,6 +813,16 @@ Summary of validation: for the implementation JIRA. - **High-level inference naming collision** (described above in "divergence from production design"). +- **Vacuous safety-shield validation in the library-mode PoC**. The + `native_override` used during PoC validation registered `llama-guard` + with `provider_shield_id: openai/gpt-4o-mini` — an OpenAI chat model, + not a Llama Guard checkpoint. The "`native_override` took effect" + evidence row above only shows that the key landed in the synthesized + output; it does **not** show that a real safety shield gated any + query. The implementation JIRAs' e2e coverage must exercise a real + Llama Guard model (e.g. `meta-llama/Llama-Guard-3-8B`) end-to-end. + Caught by CodeRabbit on the PoC artifact at + `poc-results/library-mode/synthesized-run.yaml:110`. 
+ --- diff --git a/docs/design/llama-stack-config-merge/poc-results/pydantic-ai-research.md b/docs/design/llama-stack-config-merge/poc-results/pydantic-ai-research.md new file mode 100644 index 000000000..a9a5fa3cb --- /dev/null +++ b/docs/design/llama-stack-config-merge/poc-results/pydantic-ai-research.md @@ -0,0 +1,245 @@ +# Pydantic AI ↔ Llama Stack: Concept Mapping for a Backend-Agnostic YAML Schema + +## 1. Pydantic AI core concepts and configuration surface + +**What is an Agent.** In Pydantic AI an `Agent` is a generic, type-parameterised container that owns: a default model, instructions / system prompts, tools (and toolsets), capabilities (composable behavior units), an output type / output validators, retry budgets, model settings, dependency type, and instrumentation settings. From the official Agents API reference (ai.pydantic.dev/api/agent/): `Agent` is generic in `(AgentDepsT, OutputDataT)` and "by default, if neither generic parameter is customised, agents have type `Agent[None, str]`" (https://ai.pydantic.dev/api/agent/). + +Canonical construction (from the project README / overview at ai.pydantic.dev): +```python +from pydantic_ai import Agent +agent = Agent( + 'anthropic:claude-sonnet-4-6', + instructions='Be concise, reply with one sentence.', +) +result = agent.run_sync('Where does "hello world" come from?') +``` +(https://ai.pydantic.dev/) + +The Agent also owns tools registered via `@agent.tool` / `@agent.tool_plain` or via `tools=[...]`, dependencies via `deps_type=...`, structured outputs via `output_type=...`, and capabilities via `capabilities=[...]` (https://ai.pydantic.dev/api/agent/, https://ai.pydantic.dev/capabilities/). + +**How the model/provider is declared.** Pydantic AI is **model-string + client-object based**, not Llama-Stack-style named provider entries. The simplest form is a string `'<provider>:<model>'`: + +> "When you instantiate an Agent with just a name formatted as `<provider>:<model>`, e.g. 
`openai:gpt-5.2` or `openrouter:google/gemini-3-pro-preview`, Pydantic AI will automatically select the appropriate model class, provider, and profile." +> — https://ai.pydantic.dev/models/overview/ + +For non-default endpoints, auth, or AI-gateway use, you instantiate a `Model` class and pass a `Provider`: +```python +from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.providers.azure import AzureProvider +agent = Agent(OpenAIChatModel('gpt-5.2', provider=AzureProvider(...))) +``` +(https://ai.pydantic.dev/models/overview/, https://ai.pydantic.dev/api/providers/) + +There is **no native concept of named provider entries with type/id/config that get looked up by name at runtime**, the way Llama Stack does. The `Provider` in Pydantic AI is a Python class with constructor args (`api_key`, `base_url`, `openai_client`, `http_client`), not a registry entry referenced from a YAML file by id (https://pydantic.dev/docs/ai/api/pydantic-ai/providers/). + +**Multiple models / providers in one app.** Per-agent: each `Agent` instance owns its own model. Globally there is no "default provider list"; the closest thing is the `gateway/...` prefix (Pydantic AI Gateway) or the `FallbackModel` wrapper that takes multiple models and falls back on failure (https://ai.pydantic.dev/models/overview/). Multi-model applications usually create multiple `Agent` instances and pass them around, optionally via dependency injection (`deps_type` carries a `RunContext` with whatever shared clients/configs you want). + +**File-based config — yes, natively.** Since the introduction of `AgentSpec` and `Agent.from_file` / `Agent.from_spec`, Pydantic AI supports declarative YAML/JSON agent definitions: + +```python +from pydantic_ai import Agent +agent = Agent.from_file('agent.yaml') +``` + +```yaml +# agent.yaml +model: anthropic:claude-opus-4-6 +instructions: "You are a helpful assistant." 
+capabilities: + - WebSearch: {local: duckduckgo} + - Thinking: {effort: high} +``` +(https://ai.pydantic.dev/core-concepts/agent-spec/, https://ai.pydantic.dev/api/agent/) + +`AgentSpec.to_file('agent.yaml')` can also emit a companion `agent_schema.json` for editor autocompletion. The spec is **per-agent**, not a server-wide config — there is no equivalent to Llama Stack's single `run.yaml` describing the whole runtime, APIs, and provider registry. (See section 5 for what this implies for a single-file operator config.) + +**Credentials / API keys at runtime.** Three mechanisms, in order of expressiveness: +1. **Environment variables** — each `Provider` class reads a conventional env var if no explicit `api_key=` is passed (e.g., `OLLAMA_API_KEY`, `OPENAI_API_KEY`, documented in https://pydantic.dev/docs/ai/api/pydantic-ai/providers/). +2. **Explicit provider client construction** — pass `api_key=`, `base_url=`, or a fully-constructed vendor SDK client (e.g., `openai_client=AsyncOpenAI(...)`) to the `Provider`. +3. **AgentSpec/from_file** — the YAML spec does **not** declare credentials; it declares model name and capabilities, and credentials still come from env vars or from Python code that wires the `Provider`. + +There is no built-in `${env.VAR}` interpolation in `AgentSpec` YAML. Template strings (`{{user_name}}`) exist but resolve against `deps`, not environment variables (https://ai.pydantic.dev/core-concepts/agent-spec/). + +## 2. RAG / retrieval / vector stores + +**No built-in RAG abstraction. No vector-store abstraction.** Pydantic AI's official docs are explicit: + +> "The main semantic difference between Pydantic AI Tools and RAG is RAG is synonymous with vector search, while Pydantic AI tools are more general-purpose. For vector search, you can use our embeddings support to generate embeddings across multiple providers." +> — https://ai.pydantic.dev/tools/ + +RAG is implemented as a user-written tool that calls whatever vector DB the user has chosen. 
The official "RAG" example (https://ai.pydantic.dev/examples/rag/, https://github.com/pydantic/pydantic-ai/blob/main/docs/examples/rag.md) uses **PostgreSQL + pgvector** directly via `asyncpg` and the **OpenAI SDK for embeddings** — Pydantic AI itself isn't involved in indexing: + +```python +@rag_agent.tool +async def retrieve(context: RunContext[Deps], search_query: str) -> str: + embedding = await context.deps.openai.embeddings.create( + input=search_query, model='text-embedding-3-small', + ) + rows = await context.deps.pool.fetch( + 'SELECT chunk FROM text_chunks ORDER BY embedding <-> $1 LIMIT 5', + pydantic_core.to_json(embedding.data[0].embedding).decode(), + ) + return '\n\n'.join(f'# Chunk:\n{row["chunk"]}\n' for row in rows) +``` +(verbatim from the official example) + +The docs even note: *"Note building the database doesn't use Pydantic AI right now, instead it uses the OpenAI SDK directly."* (https://github.com/pydantic/pydantic-ai/blob/main/docs/examples/rag.md) + +**Canonical community patterns.** From observed community projects and Pydantic AI's own example: +- **pgvector + OpenAI / Voyage embeddings via raw SDK calls** — the pattern in the official example, and in projects such as github.com/serkanyasr/agentic_rag_project (Pydantic AI + FastAPI + pgvector) and github.com/cskwork/pydantic-rag-ollama (Ollama embeddings + pgvector). +- **MCP server for retrieval** — exposing a vector DB through an MCP server and consuming it via Pydantic AI's `MCP(url=...)` capability (https://ai.pydantic.dev/capabilities/). +- **LlamaIndex / LangChain as a retrieval backend** — used purely as a library inside a Pydantic AI tool; no first-class integration is documented in ai.pydantic.dev. + +There is no `pydantic_ai.vector_store` module, no `VectorStore` protocol, and no roadmap entry that surfaced in my search for adding one (caveat: I did not find a public roadmap document, only the version policy and release notes — see §7). + +## 3. 
Safety / guardrails + +**No built-in safety / guardrails / shield API in Pydantic AI core.** GitHub Issue #1197 ("Guardrails") is the open feature request, with a working-design proposal that mirrors the OpenAI Agents SDK's `@input_guardrail` / `@output_guardrail` decorators (https://github.com/pydantic/pydantic-ai/issues/1197). As of `pydantic-ai-slim 1.97.0` (May 15, 2026, https://pypi.org/project/pydantic-ai-slim/) it has not been merged into core. + +**What Pydantic AI *does* provide is validation-as-correctness, not safety:** +- `output_type=` enforces the response shape via Pydantic validation (https://ai.pydantic.dev/output/). +- `@agent.output_validator` lets you raise `ModelRetry` and force the model to try again — but it operates only on the structured/typed output and is enforced via a per-run retry budget (https://ai.pydantic.dev/output/, https://ai.pydantic.dev/api/agent/). +- Tools can raise `ModelRetry` for argument-level checks (https://ai.pydantic.dev/tools-advanced/). +- Tool-call approval (`requires_approval=`) is a deterministic human-in-the-loop gate on a per-tool basis (https://ai.pydantic.dev/api/tools/, https://ai.pydantic.dev/). + +These are **schema enforcement and retry control**, not content-safety in the Llama Guard / Prompt Guard sense. They don't see the user prompt before the model does, and they don't classify content categories. + +**Typical user patterns for actual safety:** +1. **Custom output validator** that calls an external moderation API (OpenAI moderations, Lakera, LLM Guard) — minimal but only catches output, runs after the model call. +2. **Tool gating** — a `prepare` function on `Tool` that filters tool availability based on `RunContext` (https://ai.pydantic.dev/api/tools/). +3. 
**Third-party capability packages built on the Capabilities API:** + - `pydantic-ai-guardrails` (https://pypi.org/project/pydantic-ai-guardrails/, https://github.com/jagreehal/pydantic-ai-guardrails) — `GuardedAgent` wrapper with input/output guardrails, llm-guard + autoevals + Guardrails Hub integrations, OpenAI Guardrails-UI config loading, parallel execution. + - `pydantic-ai-shields` (https://github.com/vstorm-co/pydantic-ai-shields) — `PromptInjection`, `PiiDetector`, `SecretRedaction`, `BlockedKeywords`, `NoRefusals`, `OutputGuard`, `AsyncGuardrail` capabilities passed via `capabilities=[...]`. +4. **NeMo Guardrails / Guardrails AI / Llama Guard via tool call** — wrap the entire model call in your own pipeline outside Pydantic AI. + +None of these are first-party. None are mentioned in the official Pydantic AI docs as the canonical answer. *I am unsure* whether any will be brought into core before V2. + +## 4. Tools / function calling + +**Declaration / registration** (https://ai.pydantic.dev/tools/, https://ai.pydantic.dev/toolsets/): +- `@agent.tool` — decorator, function receives `RunContext` as first arg. +- `@agent.tool_plain` — decorator, no `RunContext`. +- `tools=[fn1, fn2, Tool(fn3, name=..., description=...)]` on the `Agent` constructor. +- `FunctionToolset(tools=[...])` + `toolsets=[...]` on the constructor — first-class collections of tools, can be combined dynamically with `@agent.toolset`. +- Dynamic registration inside a running tool: `toolset.add_function(...)` / `toolset.add_tool(...)`. + +Schema is **auto-generated** from function signatures and docstrings via griffe (https://ai.pydantic.dev/tools/). Args validated by Pydantic; `ModelRetry` triggers a retry with feedback to the model. 
+ +**Comparison to Llama Stack's `tool_runtime` / `registered_resources.tool_groups`.** Llama Stack treats tool *implementations* as plug-in providers under `providers.tool_runtime` (e.g., `remote::model-context-protocol`, `remote::tavily-search`, `inline::rag-runtime`), and the *named tool groups* the agent can invoke as a separate registered-resources list (`tool_groups:` or `registered_resources.tool_groups:`), each referencing a runtime by `provider_id` (e.g., MCP endpoint URI). The split is intentional: it lets the operator add or remove tool backends without touching application code (https://llamastack.github.io/docs). + +Pydantic AI has no equivalent split. Tools are **Python objects/callables**, not configuration. There is one exception that brings configuration-driven extensibility: **capability packages** (`AbstractCapability` subclasses) can be referenced from `AgentSpec` YAML by class name and registered via `custom_capability_types=` on `Agent.from_spec` / `Agent.from_file` (https://ai.pydantic.dev/capabilities/). MCP is exposed this way: in `agent.yaml` you can write `capabilities: [{MCP: {url: https://mcp.example.com/api}}]` — that's the closest analogue to Llama Stack's `tool_groups` entry for an MCP endpoint, and it's the surface you would use if you want a YAML-only tool declaration. + +There is no general plugin discovery via entry-points documented in ai.pydantic.dev; you must `pip install` the capability package and pass its class to `custom_capability_types`. + +## 5. Mapping table (Llama Stack ↔ Pydantic AI) + +| Llama Stack concept | Pydantic AI equivalent / pattern | +|---|---| +| `providers.inference` (named entry with id+type+config) | No direct equivalent. Pattern: `Agent('openai:gpt-5.2')` model string, or explicit `Provider(api_key=…, base_url=…)` + `Model` instance per agent. | +| `providers.safety` / `shields` | No direct equivalent in core. 
Pattern: `@agent.output_validator` for shape, third-party capability packages (`pydantic-ai-shields`, `pydantic-ai-guardrails`) for content safety. | +| `providers.vector_io` / vector stores | No direct equivalent. Pattern: tool that calls pgvector/Milvus/Qdrant via the user's chosen client; embeddings via Pydantic AI's `embeddings` support or a raw SDK. | +| `providers.tool_runtime` | No direct equivalent as a provider registry. Pattern: tools registered as Python callables; MCP endpoints declared via the `MCP` capability in code or `AgentSpec` YAML. | +| `providers.agents` (Agents API as a provider) | The `Agent` class itself; not a server provider — it's a Python object. No equivalent of swapping the agent runtime via config. | +| `apis: [agents, inference, safety, …]` | No direct equivalent. Pydantic AI has no notion of selectively enabling capability *APIs* — every Agent always supports tools, output validation, etc., as Python APIs. | +| `registered_resources` (models, shields, vector stores) | Partially: model is declared per-Agent in `AgentSpec` (`model:`). Shields/vector stores have no equivalent registry — they're code. | +| `storage` (sqlstore / kvstore) | No equivalent. Pydantic AI itself is stateless per run; durable execution is delegated to **Temporal / DBOS / Prefect** integrations (https://ai.pydantic.dev/api/agent/). Conversation state is the caller's responsibility. | +| `${env.VAR}` env-ref resolution in config | No equivalent in `AgentSpec` YAML. Env vars are read by `Provider` classes at construction time. For YAML-side interpolation you must layer your own loader (e.g., `pydantic-settings` or a manual `os.path.expandvars` pre-pass). | + +## 6. 
Implications for a backend-agnostic operator-facing schema + +Replaying the user's example schema: +```yaml +inference: + providers: + - type: openai + api_key_env: OPENAI_API_KEY + allowed_models: [gpt-4o-mini] +rag: + providers: + - type: faiss + embedding_model: sentence-transformers +safety: + default_shield: llama-guard +``` + +### `inference` block — **STABLE-ish across both backends. ~75% confidence.** + +This concept maps cleanly to Llama Stack today (`providers.inference` with `provider_type: remote::openai`, `api_key: ${env.OPENAI_API_KEY}`, optionally a `registered_resources.models` allow-list) and to Pydantic AI tomorrow (synthesize a Python `OpenAIProvider(api_key=os.environ["OPENAI_API_KEY"])` and use model strings constrained to `allowed_models`). The single-item-list shape (`providers: [...]`) on the Llama Stack side preserves the model that Llama Stack uses today; on the Pydantic AI side a synthesizer picks the first matching entry per agent. Operators describe "what credentials, what endpoints, what models are allowed" — a vocabulary stable in both worlds. + +**Where it breaks:** Llama Stack allows multiple named provider entries of the same API serving different models (e.g., `vllm-inference` and `vllm-safety`); Pydantic AI has no global registry, so naming providers is meaningless until your synthesizer assigns them to specific Agents. Decision: keep the list but treat the entries as available-clients, not as a global registry. Don't expose `provider_id` in the abstract schema unless you also expose agent-to-provider binding. + +### `rag` block — **NOT STABLE. ~25% confidence it survives.** + +Llama Stack has `providers.vector_io` (with `inline::faiss`, `remote::milvus`, `remote::pgvector`, etc.) plus `registered_resources.vector_stores` and a first-class `/v1/vector_stores` API. Pydantic AI has **none of this**. 
The official RAG example wires pgvector directly via `asyncpg` (https://github.com/pydantic/pydantic-ai/blob/main/docs/examples/rag.md). A backend-agnostic synthesizer can take your `rag.providers[].type=faiss` and produce a Llama Stack provider entry, but for Pydantic AI the synthesizer would have to *generate code or instantiate a vector-DB client and a tool that uses it* — a much larger gap. + +Worse, vocabulary diverges: `embedding_model: sentence-transformers` is a model identifier in Llama Stack (registered under `registered_resources.models` with `model_type: embedding`); in Pydantic AI it would be a parameter to `pydantic_ai.embeddings.SentenceTransformersEmbedder` or similar (the `sentence-transformers` package extra is in `pydantic-ai-slim[sentence-transformers]` — https://pypi.org/project/pydantic-ai-slim/). + +**Recommendation:** Until Pydantic AI ships a built-in vector-store abstraction (no public signal it's coming in the next 6–12 months — see §7), keep RAG configuration **under a Llama-Stack-specific subtree**, and on the Pydantic AI side require operators to declare which tool implements retrieval. A minimal portable surface might be `rag.embedding_model:` only — both backends understand "which model to embed with." + +### `safety` block — **NOT STABLE. ~20% confidence it survives.** + +`default_shield: llama-guard` translates 1:1 to Llama Stack — `providers.safety: - provider_type: inline::llama-guard` + `registered_resources.shields: - shield_id: llama-guard, provider_id: llama-guard`, plus per-agent `input_shields` / `output_shields`. On Pydantic AI, there is no shield concept; the closest path is a third-party capability (`pydantic-ai-shields` provides a `PromptInjection` capability you'd add to `agent.yaml`'s `capabilities:` list), but the actual *model used* is hard-coded inside each capability, and Llama Guard specifically is not a first-class Pydantic AI option. 
+ +The vocabulary `default_shield: <shield_id>` makes sense in Llama Stack (where shields are registered resources you can name); it makes no sense in Pydantic AI without inventing a registry layer. + +**Recommendation:** Keep `safety.default_shield` under a Llama-Stack-specific subtree. The portable surface is essentially nothing today — at best `safety.enabled: bool` and `safety.fail_closed: bool`. + +### Minimum YAML surface stable across both backends + +```yaml +# This much survives a Llama Stack → Pydantic AI migration, with caveats: +inference: + providers: + - type: openai # vendor identifier (stable) + api_key_env: OPENAI_API_KEY # env-var name only (synthesizer reads value) + base_url: https://... # optional override + allowed_models: [gpt-4o-mini] +# everything else (rag, safety, storage, tool_runtime, shields, vector_stores) +# goes under a backend-specific block: +backend_specific: + llama_stack: + rag: {...} + safety: {...} + storage: {...} + pydantic_ai: + capabilities: [...] + spec_overrides: {...} +``` + +## 7. Pydantic AI's own roadmap / stability signals + +**Stability today.** Pydantic AI reached **V1.0.0 on September 4, 2025** with an explicit API-stability commitment: *"V1 means we're committing to API stability: we will not break your code for at least 6 months."* (https://pydantic.dev/articles/pydantic-ai-v1). The version policy adds: *"We will not intentionally make breaking changes in minor releases of V1. V2 will be released in April 2026 at the earliest, 6 months after the release of V1 in September 2025."* (https://ai.pydantic.dev/version-policy/). Current PyPI version is **`pydantic-ai 1.98.0` (May 19, 2026)** and **`pydantic-ai-slim 1.97.0` (May 15, 2026)** (https://pypi.org/project/pydantic-ai/, https://pypi.org/project/pydantic-ai-slim/) — production/stable classification. 
+ +As of this report (May 20, 2026), **V2 has not shipped** — the upgrade guide at https://ai.pydantic.dev/changelog/ lists only V1.x breaking changes (which the team explicitly notes were *accidental* leftovers from pre-V1 work, e.g., the Python evaluator removal in #2808 left out of v1.0.0). + +**Release cadence.** Weekly to bi-weekly minor releases since V1 (e.g., v1.90.0 on May 4, 2026; v1.91.0; v1.93.0 on May 9; v1.94.0 on May 12; v1.95.0; v1.97.0 on May 15; v1.98.0 on May 19) — https://github.com/pydantic/pydantic-ai/releases. No breaking changes to agent construction, model strings, or `@agent.tool` since 1.0; recent breaking-change entries in the changelog are pre-V1 (the upgrade guide is filtered to historical pre-1.0 churn). + +**Recent changes touching agent / provider / tool surface in the last 12 months** (https://ai.pydantic.dev/changelog/, https://github.com/pydantic/pydantic-ai/releases): +- `AgentStreamEvent` expanded to a union — backward compatible (#2689). +- `format_as_xml` import path moved (#2446/#1484) — minor. +- Removal of deprecated `Agent.result_validator`, `AgentRunResult.data`, `Agent.last_run_messages` (#2451). +- `TenacityTransport` now requires `RetryConfig` TypedDict (#2670, #2717). +- v1.94.0 (May 12, 2026): "Drop mistralai as dependency from pydantic-ai by @Kludex in #5384"; OpenAI profile flag for multi-system messages (https://github.com/pydantic/pydantic-ai/releases). +- v1.95.0 / v1.97.0 / v1.98.0: incremental fixes; deprecation of `OutlinesModel` / `OutlinesProvider`, `AGUIApp` / `Agent.to_ag_ui()` in favor of `AGUIAdapter`. + +None touched the public Agent constructor signature, the `<provider>:<model>` string convention, the `@agent.tool` decorator, or `AgentSpec`/`from_file`. + +**Roadmap signals for built-in safety / RAG / registered-resources.** +- **Guardrails (Issue #1197)** is open with an OpenAI-Agents-SDK-style proposal; not on a stated milestone. *I am unsure* whether it will land before V2. 
+- No public roadmap document at ai.pydantic.dev mentions a built-in vector-store / RAG abstraction or a Llama-Stack-style provider registry. Thoughtworks Technology Radar **Volume 33** (published November 5, 2025, per PRNewswire) confirms the framework is intentionally narrow: *"Rather than trying to be a Swiss Army knife, PydanticAI offers a lightweight yet powerful approach."* (https://www.thoughtworks.com/radar/languages-and-frameworks/pydantic-ai) — and the same page notes "This blip is not on the current edition of the Radar," indicating the entry was Volume 33 and not Volume 34. +- The big roadmap themes per the v1 launch article (https://pydantic.dev/articles/pydantic-ai-v1) are: durable execution (Temporal/DBOS/Prefect), human-in-the-loop tool approval, MCP/A2A/AG-UI interop, and the Pydantic AI Gateway. **Not** safety shields, not vector stores, not server-side configuration. + +**Conclusion (§7):** API stability is high (V1, ~9 months in production, weekly minors, no agent-surface breakage). The library is **intentionally not growing into Llama Stack's territory** on a 6–12 month horizon. + +## 8. Recommendation + +**Abstract only the inference vocabulary today. Keep RAG, safety, storage, and tool-runtime under backend-specific subtrees. Confidence: ~80%.** Build the single-file schema around an `inference:` block of vendor + endpoint + env-var-name + allow-listed models — that vocabulary maps cleanly to Llama Stack today and to Pydantic AI's Provider + model-string surface tomorrow, and a thin synthesizer per backend covers the gap. Do **not** abstract `rag.*` or `safety.*` into a portable vocabulary right now: Pydantic AI has no built-in vector-store or shield concept, no public roadmap signal that either is coming before V2, and the third-party capability packages that fill the gap (pydantic-ai-shields, pydantic-ai-guardrails) have incompatible vocabularies with each other and with Llama Guard. 
Park them under `backend_specific.llama_stack.{rag,safety,storage}` and a parallel `backend_specific.pydantic_ai.{capabilities,spec_overrides}`. Treat MCP endpoints as the one tool-runtime concept worth abstracting (~60% confidence) — both backends support MCP natively and the URI + auth-token + allowed-tools fields are stable on both sides. Re-evaluate at every Pydantic AI minor release for: (a) a built-in guardrails API merging Issue #1197, (b) any vector-store abstraction, (c) a server-side / multi-agent config concept. If any of those land, the safety or RAG vocabulary becomes worth abstracting; until then, premature abstraction will cost you more than the duplication. + +## Caveats + +- **Llama Stack rebrand.** On **April 28, 2026**, the Llama Stack project rebranded to **OGX**, per the official announcement blog post (https://ogx-ai.github.io/blog/from-llama-stack-to-ogx): *"Llama Stack is now OGX. The name changed, but more importantly, so did the mission."* The repo `github.com/llamastack/llama-stack` and the mirror `github.com/meta-llama/llama-stack` redirect there. Latest release tag observed: **v1.0.2 (May 13, 2026)**. The OGX rebrand post also states that **"The project supports 23 inference providers. You can run GPT-4, Claude, Gemini, Mistral, or any model you want behind OGX."** Some templates still ship `image_name:` (legacy) while newer ones use `distro_name:` (PR #4396). The `registered_resources:` block introduced in PR #4600 is the new canonical home for `models / shields / vector_stores / tool_groups / datasets / benchmarks`; older templates with these as bare top-level keys still load. +- **Llama Stack env-var syntax.** Confirmed forms: `${env.VAR}` (required), `${env.VAR:=default}` (with default), `${env.VAR:+value}` (conditional, e.g., enable a provider only when a key is set). The single-colon form `${env.VAR:default}` seen in some older blog posts is **not** the current canonical syntax. 
+- **Llama Stack source-file paths in the schema reconstruction could not be directly verified** because raw GitHub fetches were blocked during research. The schema sketch is reconstructed from third-party verbatim quotes (Cerebras, Red Hat, Medium) and PR titles. Before locking your schema, fetch `src/llama_stack/core/datatypes.py` (look for `StackRunConfig`) and a current `src/llama_stack/distributions/starter/run.yaml` to confirm. +- **`pydantic-ai-shields` and `pydantic-ai-guardrails` are third-party**, maintained by independent authors (vstorm-co and jagreehal respectively). They are not part of pydantic/pydantic-ai. Treat them as community packages whose APIs may diverge from anything the Pydantic team eventually ships. +- **No Pydantic AI public roadmap doc** was found; conclusions about "not on the roadmap" are inferred from the version-policy doc, the v1 launch post, recent release notes, and the absence of relevant milestones — not from a positive statement that these features are out of scope. Flagged as inference, not fact. +- **Pydantic AI V2 timing.** The version policy says *"V2 will be released in April 2026 at the earliest"*; as of May 20, 2026, V2 has not shipped and the changelog page lists only V1.x entries. The user's stated migration window (2026 or Q1 2027) likely overlaps the V2 release; plan to re-validate this report when V2 ships. \ No newline at end of file