Skip to content

Commit d4e48ee

Browse files
committed
Improve observability
1 parent 07d4abc commit d4e48ee

14 files changed

Lines changed: 1334 additions & 53 deletions

File tree

dashboards/grafana-dashboard.json

Lines changed: 339 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,339 @@
1+
{
2+
"dashboard": {
3+
"title": "DataFlow Operator",
4+
"tags": ["dataflow", "kubernetes", "operator"],
5+
"timezone": "browser",
6+
"schemaVersion": 16,
7+
"version": 1,
8+
"refresh": "30s",
9+
"panels": [
10+
{
11+
"id": 1,
12+
"title": "Сообщения получено по манифестам",
13+
"type": "graph",
14+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
15+
"targets": [
16+
{
17+
"expr": "sum(rate(dataflow_messages_received_total[5m])) by (namespace, name, source_type)",
18+
"legendFormat": "{{namespace}}/{{name}} ({{source_type}})",
19+
"refId": "A"
20+
}
21+
],
22+
"yaxes": [
23+
{"format": "short", "label": "Сообщений/сек"},
24+
{"format": "short"}
25+
],
26+
"xaxis": {"mode": "time", "show": true}
27+
},
28+
{
29+
"id": 2,
30+
"title": "Сообщения отправлено по манифестам",
31+
"type": "graph",
32+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
33+
"targets": [
34+
{
35+
"expr": "sum(rate(dataflow_messages_sent_total[5m])) by (namespace, name, sink_type, route)",
36+
"legendFormat": "{{namespace}}/{{name}} ({{sink_type}}/{{route}})",
37+
"refId": "A"
38+
}
39+
],
40+
"yaxes": [
41+
{"format": "short", "label": "Сообщений/сек"},
42+
{"format": "short"}
43+
],
44+
"xaxis": {"mode": "time", "show": true}
45+
},
46+
{
47+
"id": 3,
48+
"title": "Ошибки коннекторов",
49+
"type": "graph",
50+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
51+
"targets": [
52+
{
53+
"expr": "sum(rate(dataflow_connector_errors_total[5m])) by (namespace, name, connector_type, connector_name, operation, error_type)",
54+
"legendFormat": "{{namespace}}/{{name}} - {{connector_type}}/{{connector_name}} - {{operation}} - {{error_type}}",
55+
"refId": "A"
56+
}
57+
],
58+
"yaxes": [
59+
{"format": "short", "label": "Ошибок/сек"},
60+
{"format": "short"}
61+
],
62+
"xaxis": {"mode": "time", "show": true}
63+
},
64+
{
65+
"id": 4,
66+
"title": "Ошибки трансформеров",
67+
"type": "graph",
68+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
69+
"targets": [
70+
{
71+
"expr": "sum(rate(dataflow_transformer_errors_total[5m])) by (namespace, name, transformer_type, transformer_index, error_type)",
72+
"legendFormat": "{{namespace}}/{{name}} - {{transformer_type}}[{{transformer_index}}] - {{error_type}}",
73+
"refId": "A"
74+
}
75+
],
76+
"yaxes": [
77+
{"format": "short", "label": "Ошибок/сек"},
78+
{"format": "short"}
79+
],
80+
"xaxis": {"mode": "time", "show": true}
81+
},
82+
{
83+
"id": 5,
84+
"title": "Время обработки сообщений",
85+
"type": "graph",
86+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
87+
"targets": [
88+
{
89+
"expr": "histogram_quantile(0.95, sum(rate(dataflow_processing_duration_seconds_bucket[5m])) by (namespace, name, le))",
90+
"legendFormat": "p95 - {{namespace}}/{{name}}",
91+
"refId": "A"
92+
},
93+
{
94+
"expr": "histogram_quantile(0.99, sum(rate(dataflow_processing_duration_seconds_bucket[5m])) by (namespace, name, le))",
95+
"legendFormat": "p99 - {{namespace}}/{{name}}",
96+
"refId": "B"
97+
},
98+
{
99+
"expr": "avg(dataflow_processing_duration_seconds) by (namespace, name)",
100+
"legendFormat": "avg - {{namespace}}/{{name}}",
101+
"refId": "C"
102+
}
103+
],
104+
"yaxes": [
105+
{"format": "s", "label": "Время"},
106+
{"format": "short"}
107+
],
108+
"xaxis": {"mode": "time", "show": true}
109+
},
110+
{
111+
"id": 6,
112+
"title": "Время выполнения трансформеров",
113+
"type": "graph",
114+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
115+
"targets": [
116+
{
117+
"expr": "histogram_quantile(0.95, sum(rate(dataflow_transformer_duration_seconds_bucket[5m])) by (namespace, name, transformer_type, transformer_index, le))",
118+
"legendFormat": "p95 - {{namespace}}/{{name}} - {{transformer_type}}[{{transformer_index}}]",
119+
"refId": "A"
120+
},
121+
{
122+
"expr": "histogram_quantile(0.99, sum(rate(dataflow_transformer_duration_seconds_bucket[5m])) by (namespace, name, transformer_type, transformer_index, le))",
123+
"legendFormat": "p99 - {{namespace}}/{{name}} - {{transformer_type}}[{{transformer_index}}]",
124+
"refId": "B"
125+
}
126+
],
127+
"yaxes": [
128+
{"format": "s", "label": "Время"},
129+
{"format": "short"}
130+
],
131+
"xaxis": {"mode": "time", "show": true}
132+
},
133+
{
134+
"id": 7,
135+
"title": "Статус подключения коннекторов",
136+
"type": "table",
137+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
138+
"targets": [
139+
{
140+
"expr": "dataflow_connector_connection_status",
141+
"format": "table",
142+
"refId": "A"
143+
}
144+
],
145+
"transformations": [
146+
{
147+
"id": "organize",
148+
"options": {
149+
"excludeByName": {
150+
"Time": true,
151+
"__name__": true
152+
},
153+
"indexByName": {
154+
"namespace": 0,
155+
"name": 1,
156+
"connector_type": 2,
157+
"connector_name": 3,
158+
"Value": 4
159+
},
160+
"renameByName": {
161+
"namespace": "Namespace",
162+
"name": "Name",
163+
"connector_type": "Тип",
164+
"connector_name": "Коннектор",
165+
"Value": "Статус"
166+
}
167+
}
168+
}
169+
],
170+
"fieldConfig": {
171+
"overrides": [
172+
{
173+
"matcher": {"id": "byName", "options": "Статус"},
174+
"properties": [
175+
{
176+
"id": "custom.displayMode",
177+
"value": "color-background"
178+
},
179+
{
180+
"id": "thresholds",
181+
"value": {
182+
"mode": "absolute",
183+
"steps": [
184+
{"color": "red", "value": 0},
185+
{"color": "green", "value": 1}
186+
]
187+
}
188+
}
189+
]
190+
}
191+
]
192+
}
193+
},
194+
{
195+
"id": 8,
196+
"title": "Статус DataFlow манифестов",
197+
"type": "table",
198+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
199+
"targets": [
200+
{
201+
"expr": "dataflow_status",
202+
"format": "table",
203+
"refId": "A"
204+
}
205+
],
206+
"transformations": [
207+
{
208+
"id": "organize",
209+
"options": {
210+
"excludeByName": {
211+
"Time": true,
212+
"__name__": true
213+
},
214+
"indexByName": {
215+
"namespace": 0,
216+
"name": 1,
217+
"phase": 2,
218+
"Value": 3
219+
},
220+
"renameByName": {
221+
"namespace": "Namespace",
222+
"name": "Name",
223+
"phase": "Phase",
224+
"Value": "Статус"
225+
}
226+
}
227+
}
228+
],
229+
"fieldConfig": {
230+
"overrides": [
231+
{
232+
"matcher": {"id": "byName", "options": "Статус"},
233+
"properties": [
234+
{
235+
"id": "custom.displayMode",
236+
"value": "color-background"
237+
},
238+
{
239+
"id": "thresholds",
240+
"value": {
241+
"mode": "absolute",
242+
"steps": [
243+
{"color": "red", "value": 0},
244+
{"color": "green", "value": 1}
245+
]
246+
}
247+
}
248+
]
249+
}
250+
]
251+
}
252+
},
253+
{
254+
"id": 9,
255+
"title": "Сообщения в коннекторах (чтение)",
256+
"type": "graph",
257+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 32},
258+
"targets": [
259+
{
260+
"expr": "sum(rate(dataflow_connector_messages_read_total[5m])) by (namespace, name, connector_type, connector_name)",
261+
"legendFormat": "{{namespace}}/{{name}} - {{connector_type}}/{{connector_name}}",
262+
"refId": "A"
263+
}
264+
],
265+
"yaxes": [
266+
{"format": "short", "label": "Сообщений/сек"},
267+
{"format": "short"}
268+
],
269+
"xaxis": {"mode": "time", "show": true}
270+
},
271+
{
272+
"id": 10,
273+
"title": "Сообщения в коннекторах (запись)",
274+
"type": "graph",
275+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 32},
276+
"targets": [
277+
{
278+
"expr": "sum(rate(dataflow_connector_messages_written_total[5m])) by (namespace, name, connector_type, connector_name, route)",
279+
"legendFormat": "{{namespace}}/{{name}} - {{connector_type}}/{{connector_name}} ({{route}})",
280+
"refId": "A"
281+
}
282+
],
283+
"yaxes": [
284+
{"format": "short", "label": "Сообщений/сек"},
285+
{"format": "short"}
286+
],
287+
"xaxis": {"mode": "time", "show": true}
288+
},
289+
{
290+
"id": 11,
291+
"title": "Выполнения трансформеров",
292+
"type": "graph",
293+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 40},
294+
"targets": [
295+
{
296+
"expr": "sum(rate(dataflow_transformer_executions_total[5m])) by (namespace, name, transformer_type, transformer_index)",
297+
"legendFormat": "{{namespace}}/{{name}} - {{transformer_type}}[{{transformer_index}}]",
298+
"refId": "A"
299+
}
300+
],
301+
"yaxes": [
302+
{"format": "short", "label": "Выполнений/сек"},
303+
{"format": "short"}
304+
],
305+
"xaxis": {"mode": "time", "show": true}
306+
},
307+
{
308+
"id": 12,
309+
"title": "Сообщения в/из трансформеров",
310+
"type": "graph",
311+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 40},
312+
"targets": [
313+
{
314+
"expr": "sum(rate(dataflow_transformer_messages_in_total[5m])) by (namespace, name, transformer_type, transformer_index)",
315+
"legendFormat": "Вход - {{namespace}}/{{name}} - {{transformer_type}}[{{transformer_index}}]",
316+
"refId": "A"
317+
},
318+
{
319+
"expr": "sum(rate(dataflow_transformer_messages_out_total[5m])) by (namespace, name, transformer_type, transformer_index)",
320+
"legendFormat": "Выход - {{namespace}}/{{name}} - {{transformer_type}}[{{transformer_index}}]",
321+
"refId": "B"
322+
}
323+
],
324+
"yaxes": [
325+
{"format": "short", "label": "Сообщений/сек"},
326+
{"format": "short"}
327+
],
328+
"xaxis": {"mode": "time", "show": true}
329+
}
330+
],
331+
"time": {
332+
"from": "now-1h",
333+
"to": "now"
334+
},
335+
"timepicker": {
336+
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"]
337+
}
338+
}
339+
}

docs/en/index.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,12 +197,21 @@ Each `DataFlow` resource has a status that includes:
197197
- **ProcessedCount** - number of processed messages
198198
- **ErrorCount** - number of errors
199199

200+
The operator also exports Prometheus metrics for detailed monitoring:
201+
- Number of messages received/sent per manifest
202+
- Errors in connectors and transformers
203+
- Message processing time and transformer execution time
204+
- Connector connection status
205+
206+
See [Metrics](metrics.md) for more details.
207+
200208
## Documentation
201209

202210
- [Getting Started](getting-started.md) - detailed getting started guide
203211
- [Connectors](connectors.md) - detailed description of all connectors
204212
- [Transformations](transformations.md) - detailed transformation descriptions with examples
205213
- [Examples](examples.md) - practical usage examples
214+
- [Metrics](metrics.md) - Prometheus metrics and monitoring
206215
- [Development](development.md) - developer guide
207216

208217
## License

0 commit comments

Comments
 (0)