Skip to content

Commit 651ec9e

Browse files
committed
feat: Return structured training task results
Store and return structured results from training tasks, including run info, timing, metrics, parameters, tags, artifacts, and any errors. Previously, only the internal MLflow URL was returned (using the Docker service name for the host), which limited usability both for programmatic access and for users attempting to reach the MLflow UI from outside the Docker network. Signed-off-by: Phoevos Kalemkeris <phoevos.kalemkeris@ucl.ac.uk>
1 parent 6f0eaa7 commit 651ec9e

4 files changed

Lines changed: 132 additions & 31 deletions

File tree

cogstack_model_gateway/common/tracking.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
import os
3+
from datetime import UTC, datetime
34

45
import mlflow
56
import mlflow.models
@@ -74,6 +75,54 @@ def get_exceptions(self):
7475
else [value for key, value in self.data.tags.items() if key.startswith("exception_")]
7576
)
7677

78+
def to_dict(self) -> dict:
79+
"""Convert the tracking task to a structured results dictionary.
80+
81+
Returns a comprehensive dictionary containing run metadata, timing information,
82+
training metrics, parameters, tags, and artifact locations.
83+
84+
Returns:
85+
dict: Structured results with the following top-level keys:
86+
- run: Run identification and status
87+
- timing: Start/end times and duration
88+
- metrics: Training metrics (e.g. accuracy, loss)
89+
- params: Training parameters (e.g. learning_rate, epochs)
90+
- tags: Custom tags and metadata
91+
- artifacts: Artifact URIs
92+
- error: Logged exceptions, if any
93+
"""
94+
start_time_ms, end_time_ms = self.info.start_time, self.info.end_time
95+
96+
started_at = datetime.fromtimestamp(start_time_ms / 1000, tz=UTC).isoformat()
97+
finished_at, duration_seconds = None, None
98+
99+
if end_time_ms:
100+
finished_at = datetime.fromtimestamp(end_time_ms / 1000, tz=UTC).isoformat()
101+
duration_seconds = (end_time_ms - start_time_ms) / 1000
102+
103+
return {
104+
"run": {
105+
"run_id": self.info.run_id,
106+
"run_name": self.info.run_name,
107+
"experiment_id": self.info.experiment_id,
108+
"status": self.status,
109+
"lifecycle_stage": self.info.lifecycle_stage,
110+
"internal_url": self.url,
111+
},
112+
"timing": {
113+
"started_at": started_at,
114+
"finished_at": finished_at,
115+
"duration_seconds": duration_seconds,
116+
},
117+
"metrics": self.data.metrics or {},
118+
"params": self.data.params or {},
119+
"tags": self.data.tags or {},
120+
"artifacts": {
121+
"artifact_uri": self.info.artifact_uri,
122+
},
123+
"error": exceptions if (exceptions := self.get_exceptions()) else None,
124+
}
125+
77126

78127
class TrackingClient:
79128
def __init__(

cogstack_model_gateway/scheduler/scheduler.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import logging
23
import time
34

@@ -140,11 +141,11 @@ def poll_task_status(self, task_uuid: str, tracking_id: str = None) -> dict:
140141
task = self.tracking_client.get_task(tracking_id)
141142
if task is None:
142143
raise ValueError(f"Task '{task_uuid}' not found in tracking server")
143-
res = {"url": task.url, "error": task.get_exceptions()}
144+
144145
if task.is_finished:
145-
return {"status": Status.SUCCEEDED, **res}
146+
return {"status": Status.SUCCEEDED, "results": task.to_dict()}
146147
elif task.is_failed or task.is_killed:
147-
return {"status": Status.FAILED, **res}
148+
return {"status": Status.FAILED, "results": task.to_dict()}
148149
else:
149150
# Task is scheduled or still running
150151
time.sleep(5)
@@ -266,18 +267,19 @@ def _handle_task_success(self, task_uuid: str, response: Response, ack: callable
266267

267268
results = self.poll_task_status(task_uuid, tracking_id)
268269
if results["status"] == Status.FAILED:
269-
log.error(f"Task '{task_uuid}' failed: {results['error']}")
270+
log.error(f"Task '{task_uuid}' failed: {results['results']['error']}")
270271
task = self.task_manager.update_task(
271-
task_uuid, status=Status.FAILED, error_message=str(results["error"])
272+
task_uuid, status=Status.FAILED, error_message=str(results["results"]["error"])
272273
)
273274
tasks_completed_total.labels(
274275
**get_task_labels(task), status=task.status.value
275276
).inc()
276277
return task
277278
else:
278279
log.info(f"Task '{task_uuid}' completed, writing results to object store")
280+
results_json = json.dumps(results["results"], indent=2)
279281
object_key = self.results_object_store_manager.upload_object(
280-
results["url"].encode(), "results.url", prefix=task_uuid
282+
results_json.encode(), "results.json", prefix=task_uuid
281283
)
282284
task = self.task_manager.update_task(
283285
task_uuid, status=Status.SUCCEEDED, result=object_key

tests/integration/test_api.py

Lines changed: 16 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
count_deployed_model_containers,
2424
download_result_object,
2525
get_deployed_model_container,
26-
parse_mlflow_url,
2726
setup_cms,
2827
setup_scheduler,
2928
setup_testcontainers,
@@ -34,6 +33,7 @@
3433
verify_results_match_api_info,
3534
verify_task_payload_in_object_store,
3635
verify_task_submitted_successfully,
36+
verify_training_task_results,
3737
wait_for_task_completion,
3838
)
3939

@@ -116,8 +116,11 @@ def trained_model(client: TestClient, config: Config) -> tuple[str, str]:
116116
tm: TaskManager = config.task_manager
117117
task = wait_for_task_completion(response_json["uuid"], tm, expected_status=Status.SUCCEEDED)
118118

119-
_, parsed = download_result_object(task.result, config.results_object_store_manager, "text")
120-
_, _, run_id = parse_mlflow_url(parsed)
119+
_, parsed = download_result_object(task.result, config.results_object_store_manager)
120+
121+
verify_training_task_results(parsed, task)
122+
123+
run_id = parsed["run"]["run_id"]
121124

122125
tc: TrackingClient = config.tracking_client
123126
model_uri = tc.get_model_uri(run_id)
@@ -570,11 +573,9 @@ def test_train_supervised(client: TestClient, config: Config):
570573

571574
verify_queue_is_empty(config.queue_manager)
572575

573-
res, parsed = download_result_object(task.result, config.results_object_store_manager, "text")
574-
575-
_, _, run_id = parse_mlflow_url(parsed)
576-
assert run_id == task.tracking_id
576+
res, parsed = download_result_object(task.result, config.results_object_store_manager)
577577

578+
verify_training_task_results(parsed, task)
578579
verify_results_match_api_info(client, task, res)
579580

580581

@@ -599,11 +600,9 @@ def test_train_unsupervised(client: TestClient, config: Config):
599600

600601
verify_queue_is_empty(config.queue_manager)
601602

602-
res, parsed = download_result_object(task.result, config.results_object_store_manager, "text")
603-
604-
_, _, run_id = parse_mlflow_url(parsed)
605-
assert run_id == task.tracking_id
603+
res, parsed = download_result_object(task.result, config.results_object_store_manager)
606604

605+
verify_training_task_results(parsed, task)
607606
verify_results_match_api_info(client, task, res)
608607

609608

@@ -622,11 +621,9 @@ def test_train_unsupervised_with_hf_hub_dataset(client: TestClient, config: Conf
622621

623622
verify_queue_is_empty(config.queue_manager)
624623

625-
res, parsed = download_result_object(task.result, config.results_object_store_manager, "text")
626-
627-
_, _, run_id = parse_mlflow_url(parsed)
628-
assert run_id == task.tracking_id
624+
res, parsed = download_result_object(task.result, config.results_object_store_manager)
629625

626+
verify_training_task_results(parsed, task)
630627
verify_results_match_api_info(client, task, res)
631628

632629

@@ -651,11 +648,9 @@ def test_train_metacat(client: TestClient, config: Config):
651648

652649
verify_queue_is_empty(config.queue_manager)
653650

654-
res, parsed = download_result_object(task.result, config.results_object_store_manager, "text")
655-
656-
_, _, run_id = parse_mlflow_url(parsed)
657-
assert run_id == task.tracking_id
651+
res, parsed = download_result_object(task.result, config.results_object_store_manager)
658652

653+
verify_training_task_results(parsed, task)
659654
verify_results_match_api_info(client, task, res)
660655

661656

@@ -679,11 +674,9 @@ def test_evaluate(client: TestClient, config: Config):
679674

680675
verify_queue_is_empty(config.queue_manager)
681676

682-
res, parsed = download_result_object(task.result, config.results_object_store_manager, "text")
683-
684-
_, _, run_id = parse_mlflow_url(parsed)
685-
assert run_id == task.tracking_id
677+
res, parsed = download_result_object(task.result, config.results_object_store_manager)
686678

679+
verify_training_task_results(parsed, task)
687680
verify_results_match_api_info(client, task, res)
688681

689682

tests/integration/utils.py

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from docker.models.containers import Container
1414
from fastapi.testclient import TestClient
1515
from git import Repo
16+
from mlflow.entities import LifecycleStage, RunStatus
1617
from testcontainers.compose import DockerCompose
1718
from testcontainers.core.container import DockerClient, DockerContainer
1819
from testcontainers.minio import MinioContainer
@@ -108,8 +109,6 @@ def setup_scheduler(request: pytest.FixtureRequest):
108109

109110

110111
def setup_cms(request: pytest.FixtureRequest, cleanup_cms: bool) -> dict[str, dict]:
111-
cleanup_deployed_model_containers()
112-
113112
try:
114113
clone_cogstack_model_serve()
115114
except Exception as e:
@@ -440,6 +439,64 @@ def verify_results_match_api_info(client: TestClient, task: Task, result: bytes)
440439
assert download_results.content == result
441440

442441

442+
def verify_training_task_results(parsed_results: dict, task: Task):
    """Assert that structured training results are well-formed and match *task*.

    Checks that every expected top-level section (run, timing, metrics, params,
    tags, artifacts, error) is present and that the key fields agree with the
    task metadata recorded by the tracking server.

    Args:
        parsed_results: The parsed JSON results from the training task
        task: The Task object to verify against
    """
    assert isinstance(parsed_results, dict)
    for section in ("run", "timing", "metrics", "params", "tags", "artifacts", "error"):
        assert section in parsed_results

    run = parsed_results["run"]
    assert isinstance(run, dict)
    for field in (
        "run_id",
        "run_name",
        "experiment_id",
        "status",
        "lifecycle_stage",
        "internal_url",
    ):
        assert field in run
    assert run["run_id"] == task.tracking_id
    assert run["status"] == RunStatus.to_string(RunStatus.FINISHED)
    assert run["lifecycle_stage"] == LifecycleStage.ACTIVE
    assert run["run_name"] is not None
    assert run["experiment_id"] is not None
    assert run["internal_url"] is not None

    # The internal URL should round-trip to the same experiment and run IDs.
    _, url_experiment_id, url_run_id = parse_mlflow_url(run["internal_url"])
    assert run["experiment_id"] == url_experiment_id
    assert run["run_id"] == url_run_id

    timing = parsed_results["timing"]
    assert isinstance(timing, dict)
    for field in ("started_at", "finished_at", "duration_seconds"):
        assert field in timing
        assert timing[field] is not None

    assert isinstance(parsed_results["metrics"], dict)
    assert isinstance(parsed_results["params"], dict)
    assert isinstance(parsed_results["tags"], dict)

    artifacts = parsed_results["artifacts"]
    assert isinstance(artifacts, dict)
    assert artifacts.get("artifact_uri") is not None

    # A successfully finished training run should carry no logged exceptions.
    assert parsed_results["error"] is None
443500
def parse_mlflow_url(url: str) -> tuple:
444501
response = requests.get(url)
445502
assert response.status_code == 200

0 commit comments

Comments
 (0)