Skip to content

Commit d396b94

Browse files
committed
Update client metric API
1 parent ca00275 commit d396b94

22 files changed

Lines changed: 1067 additions & 135 deletions

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66

77
Graphsignal is an inference observability platform that helps developers accelerate and troubleshoot AI systems. With Graphsignal, developers can:
88

9-
* Trace LLM generations, communication, kernel launches, and more.
10-
* Identify top contributors to inference latency.
11-
* Monitor inference performance, CPU/GPU utilization, and errors.
12-
* Track and get alerts on errors and inefficiencies.
9+
* Trace and profile LLM generations, communication, CUDA kernels, batching, and more.
10+
* Monitor inference performance, CPU/GPU utilization, memory usage, and server metrics.
11+
* Track and get alerts on errors and exceptions - with contextual data, stack traces, and triggering conditions.
12+
* Compare performance across models, versions, hardware setups, and optimization configurations.
1313

1414

1515
[![Dashboards](https://graphsignal.com/external/screenshot-dashboard.png)](https://graphsignal.com/)

graphsignal/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def configure(
2525
deployment: Optional[str] = None,
2626
tags: Optional[Dict[str, str]] = None,
2727
auto_instrument: Optional[bool] = None,
28-
profiles_per_min: Optional[int] = None,
28+
samples_per_min: Optional[int] = None,
2929
include_profiles: Optional[list] = None,
3030
debug_mode: Optional[bool] = None
3131
) -> None:
@@ -39,7 +39,7 @@ def configure(
3939
api_url = read_config_param("api_url", str, api_url)
4040
tags = read_config_tags(tags)
4141
auto_instrument = read_config_param("auto_instrument", bool, auto_instrument, default_value=True)
42-
profiles_per_min = read_config_param("profiles_per_min", int, profiles_per_min)
42+
samples_per_min = read_config_param("samples_per_min", int, samples_per_min)
4343
include_profiles = read_config_param("include_profiles", list, include_profiles)
4444
debug_mode = read_config_param("debug_mode", bool, debug_mode, default_value=False)
4545

@@ -52,7 +52,7 @@ def configure(
5252
api_url=api_url,
5353
tags=tags,
5454
auto_instrument=auto_instrument,
55-
profiles_per_min=profiles_per_min,
55+
samples_per_min=samples_per_min,
5656
include_profiles=include_profiles,
5757
debug_mode=debug_mode)
5858
_tracer.setup()

graphsignal/client/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@
4141
from graphsignal.client.models.metric_type import MetricType
4242
from graphsignal.client.models.param import Param
4343
from graphsignal.client.models.profile import Profile
44-
from graphsignal.client.models.rate import Rate
4544
from graphsignal.client.models.span import Span
4645
from graphsignal.client.models.span_query_result import SpanQueryResult
46+
from graphsignal.client.models.summary import Summary
4747
from graphsignal.client.models.tag import Tag
4848
from graphsignal.client.models.validation_error import ValidationError
4949
from graphsignal.client.models.validation_error_loc_inner import ValidationErrorLocInner

graphsignal/client/models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@
2424
from graphsignal.client.models.metric_type import MetricType
2525
from graphsignal.client.models.param import Param
2626
from graphsignal.client.models.profile import Profile
27-
from graphsignal.client.models.rate import Rate
2827
from graphsignal.client.models.span import Span
2928
from graphsignal.client.models.span_query_result import SpanQueryResult
29+
from graphsignal.client.models.summary import Summary
3030
from graphsignal.client.models.tag import Tag
3131
from graphsignal.client.models.validation_error import ValidationError
3232
from graphsignal.client.models.validation_error_loc_inner import ValidationErrorLocInner

graphsignal/client/models/metric.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from typing import Any, ClassVar, Dict, List, Optional, Union
2222
from graphsignal.client.models.histogram import Histogram
2323
from graphsignal.client.models.metric_type import MetricType
24-
from graphsignal.client.models.rate import Rate
24+
from graphsignal.client.models.summary import Summary
2525
from graphsignal.client.models.tag import Tag
2626
from typing import Optional, Set
2727
from typing_extensions import Self
@@ -37,11 +37,11 @@ class Metric(BaseModel):
3737
is_time: Optional[StrictBool] = Field(default=False, description="Indicates if the metric is in nanoseconds.")
3838
is_size: Optional[StrictBool] = Field(default=False, description="Indicates if the metric is in bytes.")
3939
gauge: Optional[Union[StrictFloat, StrictInt]] = Field(default=None, description="The value for gauge type metrics.")
40-
counter: Optional[Union[StrictFloat, StrictInt]] = Field(default=None, description="The value for counter type metrics.")
41-
rate: Optional[Rate] = Field(default=None, description="The value for rate type metrics.")
40+
total: Optional[Union[StrictFloat, StrictInt]] = Field(default=None, description="The value for counter type metrics.")
41+
summary: Optional[Summary] = Field(default=None, description="The value for summary type metrics.")
4242
histogram: Optional[Histogram] = Field(default=None, description="The histogram data for histogram type metrics.")
4343
update_ts: StrictInt = Field(description="Unix timestamp (seconds) when the metric was last updated.")
44-
__properties: ClassVar[List[str]] = ["name", "tags", "type", "unit", "is_time", "is_size", "gauge", "counter", "rate", "histogram", "update_ts"]
44+
__properties: ClassVar[List[str]] = ["name", "tags", "type", "unit", "is_time", "is_size", "gauge", "total", "summary", "histogram", "update_ts"]
4545

4646
model_config = ConfigDict(
4747
populate_by_name=True,
@@ -89,9 +89,9 @@ def to_dict(self) -> Dict[str, Any]:
8989
if _item_tags:
9090
_items.append(_item_tags.to_dict())
9191
_dict['tags'] = _items
92-
# override the default output from pydantic by calling `to_dict()` of rate
93-
if self.rate:
94-
_dict['rate'] = self.rate.to_dict()
92+
# override the default output from pydantic by calling `to_dict()` of summary
93+
if self.summary:
94+
_dict['summary'] = self.summary.to_dict()
9595
# override the default output from pydantic by calling `to_dict()` of histogram
9696
if self.histogram:
9797
_dict['histogram'] = self.histogram.to_dict()
@@ -114,8 +114,8 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
114114
"is_time": obj.get("is_time") if obj.get("is_time") is not None else False,
115115
"is_size": obj.get("is_size") if obj.get("is_size") is not None else False,
116116
"gauge": obj.get("gauge"),
117-
"counter": obj.get("counter"),
118-
"rate": Rate.from_dict(obj["rate"]) if obj.get("rate") is not None else None,
117+
"total": obj.get("total"),
118+
"summary": Summary.from_dict(obj["summary"]) if obj.get("summary") is not None else None,
119119
"histogram": Histogram.from_dict(obj["histogram"]) if obj.get("histogram") is not None else None,
120120
"update_ts": obj.get("update_ts")
121121
})

graphsignal/client/models/metric_type.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class MetricType(str, Enum):
2828
"""
2929
GAUGE = 'gauge'
3030
COUNTER = 'counter'
31-
RATE = 'rate'
31+
SUMMARY = 'summary'
3232
HISTOGRAM = 'histogram'
3333

3434
@classmethod
Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,18 @@
1818
import json
1919

2020
from pydantic import BaseModel, ConfigDict, Field, StrictFloat, StrictInt
21-
from typing import Any, ClassVar, Dict, List, Union
21+
from typing import Any, ClassVar, Dict, List, Optional, Union
2222
from typing import Optional, Set
2323
from typing_extensions import Self
2424

25-
class Rate(BaseModel):
25+
class Summary(BaseModel):
2626
"""
27-
Rate
27+
Summary
2828
""" # noqa: E501
29-
count: Union[StrictFloat, StrictInt] = Field(description="The count value.")
30-
interval: Union[StrictFloat, StrictInt] = Field(description="The the interval value.")
31-
__properties: ClassVar[List[str]] = ["count", "interval"]
29+
count: StrictInt = Field(description="The count value.")
30+
sum: Union[StrictFloat, StrictInt] = Field(description="The sum value.")
31+
sum2: Optional[Union[StrictFloat, StrictInt]] = Field(default=None, description="The sum of squares value.")
32+
__properties: ClassVar[List[str]] = ["count", "sum", "sum2"]
3233

3334
model_config = ConfigDict(
3435
populate_by_name=True,
@@ -48,7 +49,7 @@ def to_json(self) -> str:
4849

4950
@classmethod
5051
def from_json(cls, json_str: str) -> Optional[Self]:
51-
"""Create an instance of Rate from a JSON string"""
52+
"""Create an instance of Summary from a JSON string"""
5253
return cls.from_dict(json.loads(json_str))
5354

5455
def to_dict(self) -> Dict[str, Any]:
@@ -73,7 +74,7 @@ def to_dict(self) -> Dict[str, Any]:
7374

7475
@classmethod
7576
def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
76-
"""Create an instance of Rate from a dict"""
77+
"""Create an instance of Summary from a dict"""
7778
if obj is None:
7879
return None
7980

@@ -82,7 +83,8 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
8283

8384
_obj = cls.model_validate({
8485
"count": obj.get("count"),
85-
"interval": obj.get("interval")
86+
"sum": obj.get("sum"),
87+
"sum2": obj.get("sum2")
8688
})
8789
return _obj
8890

graphsignal/metrics.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -68,40 +68,42 @@ class CounterMetric(BaseMetric):
6868
def __init__(self, *args, **kwargs):
6969
super().__init__(*args, **kwargs)
7070
self.type = 'counter'
71-
self.counter = 0
71+
self.total = 0
7272

7373
def update(self, value, update_ts):
7474
with self._update_lock:
7575
self.touch()
76-
self.counter += value
76+
self.total += value
7777
self.update_ts = update_ts
7878

7979
def export(self):
8080
with self._update_lock:
8181
model = super().export()
82-
model.counter = self.counter
83-
self.counter = 0
82+
model.total = self.total
83+
self.total = 0
8484
return model
8585

8686

87-
class RateMetric(BaseMetric):
87+
class SummaryMetric(BaseMetric):
8888
def __init__(self, *args, **kwargs):
8989
super().__init__(*args, **kwargs)
90-
self.type = 'rate'
90+
self.type = 'summary'
9191
self.count = 0
92-
self.interval = 0
92+
self.sum = 0
93+
self.sum2 = 0
9394

94-
def update(self, count, interval, update_ts):
95+
def update(self, count, sum_val, sum2_val, update_ts):
9596
with self._update_lock:
9697
self.touch()
9798
self.count += count
98-
self.interval += interval
99+
self.sum += sum_val
100+
self.sum2 += sum2_val
99101
self.update_ts = update_ts
100102

101103
def export(self):
102104
with self._update_lock:
103105
model = super().export()
104-
model.rate = client.Rate(count=self.count, interval=self.interval)
106+
model.summary = client.Summary(count=self.count, sum=self.sum, sum2=self.sum2)
105107
return model
106108

107109

@@ -171,21 +173,23 @@ def inc_counter(self, name, tags, value, update_ts, unit=None):
171173
metric.update(value, update_ts)
172174
return metric
173175

174-
def update_rate(self, name, tags, count, interval, update_ts, unit=None):
176+
def update_summary(self, name, tags, count, sum_val, sum2_val, update_ts, unit=None):
175177
if name is None:
176-
raise ValueError('Rate name cannot be None')
178+
raise ValueError('Summary name cannot be None')
177179
if count is None:
178-
raise ValueError('Rate count cannot be None')
179-
if interval is None:
180-
raise ValueError('Rate interval cannot be None')
180+
raise ValueError('Summary count cannot be None')
181+
if sum_val is None:
182+
raise ValueError('Summary sum cannot be None')
183+
if sum2_val is None:
184+
raise ValueError('Summary sum2 cannot be None')
181185

182186
key = self.metric_key(name, tags)
183187
with self._update_lock:
184188
if key not in self._metrics:
185-
metric = self._metrics[key] = RateMetric(name, tags, unit=unit)
189+
metric = self._metrics[key] = SummaryMetric(name, tags, unit=unit)
186190
else:
187191
metric = self._metrics[key]
188-
metric.update(count, interval, update_ts)
192+
metric.update(count, sum_val, sum2_val, update_ts)
189193
return metric
190194

191195
def update_histogram(self, name, tags, value, update_ts, unit=None, is_time=False, is_size=False):

graphsignal/recorders/nvml_recorder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ def on_metric_update(self):
487487
name='gpu.errors.xid', tags=metric_tags,
488488
value=num_xid_errors, update_ts=now)
489489
for xid_error_code in device_usage.last_xid_error_codes:
490-
graphsignal._tracer().report_error(
490+
graphsignal._tracer.report_error(
491491
name='gpu.errors.xid',
492492
tags=metric_tags,
493493
level='error',
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import os
2+
import logging
3+
from opentelemetry import trace
4+
from opentelemetry.sdk.resources import Resource
5+
from opentelemetry.sdk.trace import TracerProvider
6+
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter
7+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
8+
9+
from graphsignal.recorders.base_recorder import BaseRecorder
10+
11+
logger = logging.getLogger('graphsignal')
12+
13+
14+
class LocalSpanExporter(SpanExporter):
    """Span exporter that hands each exported span batch to a local callback.

    Args:
        export_callback: Callable invoked as ``export_callback(spans)`` for
            every batch passed to ``export``. May be ``None`` (the default),
            in which case batches are accepted and dropped.
    """

    def __init__(self, export_callback=None):
        super().__init__()
        # The original class read ``self._export_callback`` without ever
        # defining it, so every export failed with AttributeError.
        self._export_callback = export_callback

    def export(self, spans):
        """Forward ``spans`` to the callback and report success.

        A failing callback is logged and still reported as a successful
        export, so an error in local processing is never surfaced to the
        span processor as an export failure.
        """
        # The export result enum is ``SpanExportResult``; ``SpanExporter``
        # has no ``ResultCode`` attribute. Imported locally so the file's
        # top-level import block does not need to change.
        from opentelemetry.sdk.trace.export import SpanExportResult

        callback = getattr(self, '_export_callback', None)
        if callback is not None:
            try:
                callback(spans)
            except Exception as e:
                logger.error(f"Error during OpenTelemetry export: {e}", exc_info=True)
        return SpanExportResult.SUCCESS
21+
22+
23+
class OTELAdapter():
    """Wires a local span callback into the OpenTelemetry tracer provider.

    Args:
        service_name: Fallback ``service.name`` resource value, used only when
            this adapter creates its own tracer provider and the
            ``OTEL_SERVICE_NAME`` environment variable is not set.
    """

    def __init__(self, service_name='service'):
        self._service_name = service_name
        self._provider = None
        self._exporter = None
        self._local_exporter = None
        self._export_callback = None

    def setup(self, export_callback):
        """Attach a local exporter that forwards spans to ``export_callback``.

        If the global tracer provider is already an SDK ``TracerProvider``,
        only the local exporter is added to it. Otherwise a new provider is
        created with an OTLP exporter plus the local exporter and installed
        as the global provider. Any failure is logged, not raised.
        """
        self._export_callback = export_callback
        try:
            local_exporter = LocalSpanExporter()
            # The exporter reads ``_export_callback`` from itself, so the
            # callback has to be set on the exporter instance; previously it
            # was stored only on the adapter and never reached the exporter.
            local_exporter._export_callback = export_callback
            self._local_exporter = local_exporter

            provider = trace.get_tracer_provider()
            if isinstance(provider, TracerProvider):
                provider.add_span_processor(BatchSpanProcessor(local_exporter))
                # No service name is resolved on this path; logging it here
                # used to raise NameError and report a successful setup as
                # a failure.
                logger.debug("Local span exporter attached to existing OpenTelemetry tracer provider")
            else:
                service_name = os.environ.get("OTEL_SERVICE_NAME", self._service_name)
                resource = Resource.create({"service.name": service_name})
                self._provider = TracerProvider(resource=resource)
                self._exporter = OTLPSpanExporter()
                self._provider.add_span_processor(BatchSpanProcessor(self._exporter))
                self._provider.add_span_processor(BatchSpanProcessor(local_exporter))
                trace.set_tracer_provider(self._provider)

                logger.info(f"OpenTelemetry tracer provider configured for service: {service_name}")

        except Exception as e:
            logger.error(f"Failed to set up OpenTelemetry tracer provider: {e}", exc_info=True)

    def shutdown(self):
        """Flush pending spans on the provider created by ``setup``, if any.

        A provider that already existed before ``setup`` is not touched.
        Errors are logged, not raised.
        """
        try:
            if self._provider:
                self._provider.force_flush()
                logger.debug("OpenTelemetry tracer provider flushed")
        except Exception as e:
            logger.error(f"Error during OpenTelemetry shutdown: {e}", exc_info=True)

0 commit comments

Comments
 (0)