diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 2bb73cbd2..2ba48ff81 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -6,6 +6,7 @@ import dataclasses import os import platform +import re import time import uuid from abc import ABC, abstractmethod @@ -447,7 +448,16 @@ def _init_output_methods(self, *, api_key: str = None): self.run_id = uuid.uuid4() if self._save_to_prometheus: - self._output_handlers.append(PrometheusOutput(self._prometheus_url)) + self._output_handlers.append( + PrometheusOutput( + self._prometheus_url, + job_name=re.sub( + r"[^a-zA-Z0-9_-]", + "_", + f"{self._project_name}_{self._experiment_name}", + ), + ) + ) if self._save_to_logfire: self._output_handlers.append(LogfireOutput()) @@ -686,6 +696,10 @@ def stop(self) -> Optional[float]: self.final_emissions_data = emissions_data self.final_emissions = emissions_data.emissions + + for handler in self._output_handlers: + handler.exit() + return emissions_data.emissions def _persist_data( diff --git a/codecarbon/output_methods/base_output.py b/codecarbon/output_methods/base_output.py index 4b152c29b..ff6ea1778 100644 --- a/codecarbon/output_methods/base_output.py +++ b/codecarbon/output_methods/base_output.py @@ -22,3 +22,6 @@ def live_out(self, total: EmissionsData, delta: EmissionsData): def task_out(self, data: List[TaskEmissionsData], experiment_name: str): pass + + def exit(self): + pass diff --git a/codecarbon/output_methods/metrics/metric_docs.py b/codecarbon/output_methods/metrics/metric_docs.py index 864641ee8..83ba0e31e 100644 --- a/codecarbon/output_methods/metrics/metric_docs.py +++ b/codecarbon/output_methods/metrics/metric_docs.py @@ -50,17 +50,22 @@ class MetricDocumentation: ) cpu_energy_doc = MetricDocumentation( "codecarbon_cpu_energy", - description="Energy used per CPU (kWh)", + description="Energy used per CPU since last reading (kWh)", ) gpu_energy_doc = MetricDocumentation( "codecarbon_gpu_energy", - description="Energy used per GPU (kWh)", + description="Energy used per GPU since last reading (kWh)", ) ram_energy_doc = MetricDocumentation( "codecarbon_ram_energy", - description="Energy used per RAM (kWh)", + description="Energy used per RAM since last reading (kWh)", ) energy_consumed_doc = MetricDocumentation( "codecarbon_energy_consumed", - description="Sum of cpu_energy, gpu_energy and ram_energy (kW)", + description="Sum of cpu_energy, gpu_energy and ram_energy (kWh)", +) + +energy_consumed_total_doc = MetricDocumentation( + "codecarbon_energy_total", + description="Accumulated cpu_energy, gpu_energy and ram_energy (kWh) since the start of the run", ) diff --git a/codecarbon/output_methods/metrics/prometheus.py b/codecarbon/output_methods/metrics/prometheus.py index 24318c6b8..0cbd3e046 100644 --- a/codecarbon/output_methods/metrics/prometheus.py +++ b/codecarbon/output_methods/metrics/prometheus.py @@ -1,7 +1,13 @@ import dataclasses import os -from prometheus_client import CollectorRegistry, Gauge, push_to_gateway +from prometheus_client import ( + CollectorRegistry, + Counter, + Gauge, + delete_from_gateway, + push_to_gateway, +) from prometheus_client.exposition import basic_auth_handler from codecarbon.external.logger import logger @@ -15,6 +21,7 @@ emissions_doc, emissions_rate_doc, energy_consumed_doc, + energy_consumed_total_doc, gpu_energy_doc, gpu_power_doc, ram_energy_doc, @@ -60,6 +67,15 @@ def generate_gauge(metric_doc: MetricDocumentation): ) +def generate_counter(metric_doc: MetricDocumentation): + return Counter( + metric_doc.name, + metric_doc.description, + labelnames, + registry=registry, + ) + + duration_gauge = generate_gauge(duration_doc) emissions_gauge = generate_gauge(emissions_doc) emissions_rate_gauge = generate_gauge(emissions_rate_doc) @@ -70,6 +86,7 @@ def generate_gauge(metric_doc: MetricDocumentation): gpu_energy_gauge = generate_gauge(gpu_energy_doc) ram_energy_gauge = generate_gauge(ram_energy_doc) energy_consumed_gauge = generate_gauge(energy_consumed_doc) +energy_consumed_total = generate_counter(energy_consumed_total_doc) class PrometheusOutput(BaseOutput): @@ -77,8 +94,18 @@ class PrometheusOutput(BaseOutput): Send emissions data to prometheus pushgateway """ - def __init__(self, prometheus_url: str): + def __init__(self, prometheus_url: str, job_name: str = "codecarbon"): self.prometheus_url = prometheus_url + self.job_name = job_name + + def exit(self): + # Cleanup metrics from pushgateway on shutdown, prometheus should already have read them + # Otherwise they will persist with their last values + try: + logger.info("Deleting metrics from Prometheus Pushgateway") + delete_from_gateway(self.prometheus_url, job=self.job_name) + except Exception as e: + logger.error(e, exc_info=True) def out(self, total: EmissionsData, delta: EmissionsData): try: @@ -121,10 +148,14 @@ def add_emission(self, carbon_emission: dict): ]: gauge.labels(**labels).set(carbon_emission[emission_name]) + # Update the total energy consumed counter + # This is separate from the total values given to self.out(...) + energy_consumed_total.labels(**labels).inc(carbon_emission["energy_consumed"]) + # Send the new metric values push_to_gateway( self.prometheus_url, - job="codecarbon", + job=self.job_name, registry=registry, handler=self._auth_handler, ) diff --git a/tests/output_methods/metrics/test_prometheus.py b/tests/output_methods/metrics/test_prometheus.py index 5802bcaa4..5ee4bc74d 100644 --- a/tests/output_methods/metrics/test_prometheus.py +++ b/tests/output_methods/metrics/test_prometheus.py @@ -51,6 +51,12 @@ def test_out_method(self, mock_push_to_gateway): output = prometheus.PrometheusOutput("url") output.out(total=EMISSIONS_DATA, delta=EMISSIONS_DATA) + @patch("codecarbon.output_methods.metrics.prometheus.delete_from_gateway") + def test_exit_method(self, mock_delete): + output = prometheus.PrometheusOutput("url", job_name="custom_job") + output.exit() + mock_delete.assert_called_once_with("url", job="custom_job") + @patch( "codecarbon.output_methods.metrics.prometheus.push_to_gateway", side_effect=Exception("Test error"),