From 5b38a1f582dc808d1a8196788657c67e44a8d1c1 Mon Sep 17 00:00:00 2001 From: manogna_grandhi Date: Wed, 13 May 2026 10:30:16 +0530 Subject: [PATCH] adding error_counts and dimm_label to edac Signed-off-by: manogna_grandhi --- collector/edac_linux.go | 113 ++++++++++++++------- collector/fixtures/e2e-64k-page-output.txt | 20 ++-- collector/fixtures/e2e-output.txt | 20 ++-- collector/fixtures/sys.ttar | 63 ++++++++++++ 4 files changed, 164 insertions(+), 52 deletions(-) diff --git a/collector/edac_linux.go b/collector/edac_linux.go index e86062fb10..95db6c0830 100644 --- a/collector/edac_linux.go +++ b/collector/edac_linux.go @@ -30,7 +30,7 @@ const ( var ( edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`) - edacMemCsrowRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/csrow([0-9]*)`) + edacMemCsrowRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)/csrow([0-9]*)`) ) type edacCollector struct { @@ -47,24 +47,26 @@ var ( "Total correctable memory errors.", []string{"controller"}, nil, ) + edacUeCount = prometheus.NewDesc( prometheus.BuildFQName(namespace, edacSubsystem, "uncorrectable_errors_total"), "Total uncorrectable memory errors.", []string{"controller"}, nil, ) - edacCsRowCECount = prometheus.NewDesc( - prometheus.BuildFQName(namespace, edacSubsystem, "csrow_correctable_errors_total"), - "Total correctable memory errors for this csrow.", - []string{"controller", "csrow"}, nil, + + edacChannelCECount = prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "channel_correctable_errors_total"), + "Total correctable memory errors for this channel.", + []string{"controller", "csrow", "channel", "dimm_label"}, nil, ) - edacCsRowUECount = prometheus.NewDesc( - prometheus.BuildFQName(namespace, edacSubsystem, "csrow_uncorrectable_errors_total"), - "Total uncorrectable memory errors for this csrow.", - []string{"controller", "csrow"}, nil, + + edacChannelUECount = prometheus.NewDesc( + prometheus.BuildFQName(namespace, edacSubsystem, "channel_uncorrectable_errors_total"), + "Total uncorrectable memory errors for this channel.", + []string{"controller", "csrow", "channel", "dimm_label"}, nil, ) ) -// NewEdacCollector returns a new Collector exposing edac stats. func NewEdacCollector(logger *slog.Logger) (Collector, error) { return &edacCollector{ logger: logger, @@ -76,68 +78,107 @@ func (c *edacCollector) Update(ch chan<- prometheus.Metric) error { if err != nil { return err } + for _, controller := range memControllers { controllerMatch := edacMemControllerRE.FindStringSubmatch(controller) if controllerMatch == nil { return fmt.Errorf("controller string didn't match regexp: %s", controller) } + controllerNumber := controllerMatch[1] + // Controller CE count value, err := readUintFromFile(filepath.Join(controller, "ce_count")) if err != nil { return fmt.Errorf("couldn't get ce_count for controller %s: %w", controllerNumber, err) } - ch <- prometheus.MustNewConstMetric( - edacCeCount, prometheus.CounterValue, float64(value), controllerNumber) - value, err = readUintFromFile(filepath.Join(controller, "ce_noinfo_count")) - if err != nil { - return fmt.Errorf("couldn't get ce_noinfo_count for controller %s: %w", controllerNumber, err) - } ch <- prometheus.MustNewConstMetric( - edacCsRowCECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown") + edacCeCount, + prometheus.CounterValue, + float64(value), + controllerNumber, + ) + // Controller UE count value, err = readUintFromFile(filepath.Join(controller, "ue_count")) if err != nil { return fmt.Errorf("couldn't get ue_count for controller %s: %w", controllerNumber, err) } - ch <- prometheus.MustNewConstMetric( - edacUeCount, prometheus.CounterValue, float64(value), controllerNumber) - value, err = readUintFromFile(filepath.Join(controller, "ue_noinfo_count")) - if err != nil { - return fmt.Errorf("couldn't get ue_noinfo_count for controller %s: %w", controllerNumber, err) - } ch <- prometheus.MustNewConstMetric( - edacCsRowUECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown") + edacUeCount, + prometheus.CounterValue, + float64(value), + controllerNumber, + ) - // For each controller, walk the csrow directories. csrows, err := filepath.Glob(controller + "/csrow[0-9]*") if err != nil { return err } + for _, csrow := range csrows { csrowMatch := edacMemCsrowRE.FindStringSubmatch(csrow) if csrowMatch == nil { return fmt.Errorf("csrow string didn't match regexp: %s", csrow) } - csrowNumber := csrowMatch[1] - value, err = readUintFromFile(filepath.Join(csrow, "ce_count")) + csrowNumber := csrowMatch[2] + + channelFiles, err := filepath.Glob(csrow + "/ch[0-9]*_ce_count") if err != nil { - return fmt.Errorf("couldn't get ce_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err) + return err } - ch <- prometheus.MustNewConstMetric( - edacCsRowCECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber) - value, err = readUintFromFile(filepath.Join(csrow, "ue_count")) - if err != nil { - return fmt.Errorf("couldn't get ue_count for controller/csrow %s/%s: %w", controllerNumber, csrowNumber, err) + for _, chFile := range channelFiles { + match := regexp.MustCompile(`ch([0-9]+)_ce_count`).FindStringSubmatch(filepath.Base(chFile)) + if match == nil { + continue + } + + channelNumber := match[1] + + label := fmt.Sprintf( + "mc%s_csrow%s_channel%s", + controllerNumber, + csrowNumber, + channelNumber, + ) + + value, err := readUintFromFile(chFile) + if err == nil { + ch <- prometheus.MustNewConstMetric( + edacChannelCECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + csrowNumber, + channelNumber, + label, + ) + } + + ueFile := filepath.Join( + csrow, + fmt.Sprintf("ch%s_ue_count", channelNumber), + ) + + value, err = readUintFromFile(ueFile) + if err == nil { + ch <- prometheus.MustNewConstMetric( + edacChannelUECount, + prometheus.CounterValue, + float64(value), + controllerNumber, + csrowNumber, + channelNumber, + label, + ) + } } - ch <- prometheus.MustNewConstMetric( - edacCsRowUECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber) } } - return err + return nil } diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index 3be158f783..d4ad66a8c5 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -1331,17 +1331,21 @@ node_drbd_remote_pending{device="drbd1"} 12346 # HELP node_drbd_remote_unacknowledged Number of requests received by the peer via the network connection, but that have not yet been answered. # TYPE node_drbd_remote_unacknowledged gauge node_drbd_remote_unacknowledged{device="drbd1"} 12347 +# HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel. +# TYPE node_edac_channel_correctable_errors_total counter +node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0 +node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 0 +node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 0 +node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 0 +# HELP node_edac_channel_uncorrectable_errors_total Total uncorrectable memory errors for this channel. +# TYPE node_edac_channel_uncorrectable_errors_total counter +node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 2 +node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2 +node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2 +node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2 # HELP node_edac_correctable_errors_total Total correctable memory errors. # TYPE node_edac_correctable_errors_total counter node_edac_correctable_errors_total{controller="0"} 1 -# HELP node_edac_csrow_correctable_errors_total Total correctable memory errors for this csrow. -# TYPE node_edac_csrow_correctable_errors_total counter -node_edac_csrow_correctable_errors_total{controller="0",csrow="0"} 3 -node_edac_csrow_correctable_errors_total{controller="0",csrow="unknown"} 2 -# HELP node_edac_csrow_uncorrectable_errors_total Total uncorrectable memory errors for this csrow. -# TYPE node_edac_csrow_uncorrectable_errors_total counter -node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="0"} 4 -node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="unknown"} 6 # HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors. # TYPE node_edac_uncorrectable_errors_total counter node_edac_uncorrectable_errors_total{controller="0"} 5 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 380e812c98..a11be91fba 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -1363,17 +1363,21 @@ node_drbd_remote_pending{device="drbd1"} 12346 # HELP node_drbd_remote_unacknowledged Number of requests received by the peer via the network connection, but that have not yet been answered. # TYPE node_drbd_remote_unacknowledged gauge node_drbd_remote_unacknowledged{device="drbd1"} 12347 +# HELP node_edac_channel_correctable_errors_total Total correctable memory errors for this channel. +# TYPE node_edac_channel_correctable_errors_total counter +node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 0 +node_edac_channel_correctable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 0 +node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 0 +node_edac_channel_correctable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 0 +# HELP node_edac_channel_uncorrectable_errors_total Total uncorrectable memory errors for this channel. +# TYPE node_edac_channel_uncorrectable_errors_total counter +node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="0",dimm_label="mc0_csrow0_channel0"} 2 +node_edac_channel_uncorrectable_errors_total{channel="0",controller="0",csrow="1",dimm_label="mc0_csrow1_channel0"} 2 +node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="0",dimm_label="mc0_csrow0_channel1"} 2 +node_edac_channel_uncorrectable_errors_total{channel="1",controller="0",csrow="1",dimm_label="mc0_csrow1_channel1"} 2 # HELP node_edac_correctable_errors_total Total correctable memory errors. # TYPE node_edac_correctable_errors_total counter node_edac_correctable_errors_total{controller="0"} 1 -# HELP node_edac_csrow_correctable_errors_total Total correctable memory errors for this csrow. -# TYPE node_edac_csrow_correctable_errors_total counter -node_edac_csrow_correctable_errors_total{controller="0",csrow="0"} 3 -node_edac_csrow_correctable_errors_total{controller="0",csrow="unknown"} 2 -# HELP node_edac_csrow_uncorrectable_errors_total Total uncorrectable memory errors for this csrow. -# TYPE node_edac_csrow_uncorrectable_errors_total counter -node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="0"} 4 -node_edac_csrow_uncorrectable_errors_total{controller="0",csrow="unknown"} 6 # HELP node_edac_uncorrectable_errors_total Total uncorrectable memory errors. # TYPE node_edac_uncorrectable_errors_total counter node_edac_uncorrectable_errors_total{controller="0"} 5 diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index bc8744cbe7..2b99fd9cbe 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -9174,11 +9174,54 @@ Mode: 644 Directory: sys/devices/system/edac/mc/mc0/csrow0 Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_ce_count +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/devices/system/edac/mc/mc0/csrow1 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_ce_count +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_ce_count +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_ce_count +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/csrow0/ce_count Lines: 1 3 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_ue_count +Lines: 1 +2 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_ue_count +Lines: 1 +2 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_ue_count +Lines: 1 +2 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_ue_count +Lines: 1 +2 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/system/edac/mc/mc0/csrow0/ue_count Lines: 1 4 @@ -9194,6 +9237,26 @@ Lines: 1 6 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ch0_dimm_label +Lines: 1 +mc0csrow0channel0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow0/ch1_dimm_label +Lines: 1 +mc0csrow0channel1 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch0_dimm_label +Lines: 1 +mc0csrow1channel0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/devices/system/edac/mc/mc0/csrow1/ch1_dimm_label +Lines: 1 +mc0csrow1channel1 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/devices/system/node Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -