From b210f696e6828b0b00a0262762cb4a4ff426b524 Mon Sep 17 00:00:00 2001 From: De Clercq Wentzel <10665586+wentzeld@users.noreply.github.com> Date: Sat, 14 Feb 2026 16:04:34 -0800 Subject: [PATCH 1/3] wave 1.5 error improvements --- pkg/capabilities/consensus/ocr3/reporting_plugin.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/capabilities/consensus/ocr3/reporting_plugin.go b/pkg/capabilities/consensus/ocr3/reporting_plugin.go index 94ac98c069..3ea83a267b 100644 --- a/pkg/capabilities/consensus/ocr3/reporting_plugin.go +++ b/pkg/capabilities/consensus/ocr3/reporting_plugin.go @@ -359,7 +359,8 @@ func (r *reportingPlugin) Outcome(ctx context.Context, outctx ocr3types.OutcomeC } if len(obs) < (2*r.config.F + 1) { - lggr.Debugw("insufficient observations for workflow execution id") + lggr.Warnw("Insufficient observations for workflow execution: consensus skipped because fewer than 2f+1 nodes reported observations", + "observationCount", len(obs), "requiredCount", 2*r.config.F+1, "workflowExecutionID", weid.WorkflowExecutionId, "workflowID", weid.WorkflowId) continue } From def477726832621cbcc91f05faf25336adb99076 Mon Sep 17 00:00:00 2001 From: De Clercq Wentzel <10665586+wentzeld@users.noreply.github.com> Date: Sun, 15 Feb 2026 10:53:17 -0800 Subject: [PATCH 2/3] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- pkg/capabilities/consensus/ocr3/reporting_plugin.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/capabilities/consensus/ocr3/reporting_plugin.go b/pkg/capabilities/consensus/ocr3/reporting_plugin.go index 3ea83a267b..d896a69cd8 100644 --- a/pkg/capabilities/consensus/ocr3/reporting_plugin.go +++ b/pkg/capabilities/consensus/ocr3/reporting_plugin.go @@ -360,7 +360,7 @@ func (r *reportingPlugin) Outcome(ctx context.Context, outctx ocr3types.OutcomeC if len(obs) < (2*r.config.F + 1) { lggr.Warnw("Insufficient observations for workflow execution: consensus skipped because fewer than 2f+1 nodes reported observations", - "observationCount", len(obs), "requiredCount", 2*r.config.F+1, "workflowExecutionID", weid.WorkflowExecutionId, "workflowID", weid.WorkflowId) + "observationCount", len(obs), "requiredCount", 2*r.config.F+1) continue } From 2d40ea1c3fdf0b9c58f6d363174242705d52c45c Mon Sep 17 00:00:00 2001 From: De Clercq Wentzel <10665586+wentzeld@users.noreply.github.com> Date: Tue, 17 Feb 2026 22:09:26 -0800 Subject: [PATCH 3/3] fix: deterministic tie-breaking in mode() and identical aggregators Resolve non-deterministic map iteration in mode() (reduce_aggregator.go) and collectHighestCounts() (identical.go) that caused cross-node consensus failures with 'PrepareSignature failed to verify' errors. --- .../consensus/ocr3/aggregators/identical.go | 8 +- .../identical_nondeterminism_test.go | 177 ++++++++++++++ .../aggregators/mode_nondeterminism_test.go | 228 ++++++++++++++++++ .../ocr3/aggregators/reduce_aggregator.go | 15 +- 4 files changed, 420 insertions(+), 8 deletions(-) create mode 100644 pkg/capabilities/consensus/ocr3/aggregators/identical_nondeterminism_test.go create mode 100644 pkg/capabilities/consensus/ocr3/aggregators/mode_nondeterminism_test.go diff --git a/pkg/capabilities/consensus/ocr3/aggregators/identical.go b/pkg/capabilities/consensus/ocr3/aggregators/identical.go index 41002caa3c..da1c0513e9 100644 --- a/pkg/capabilities/consensus/ocr3/aggregators/identical.go +++ b/pkg/capabilities/consensus/ocr3/aggregators/identical.go @@ -1,6 +1,7 @@ package aggregators import ( + "bytes" "crypto/sha256" "fmt" @@ -75,10 +76,13 @@ func (a *identicalAggregator) collectHighestCounts(counters []map[[32]byte]*coun outcome := make(map[string]any) for idx, shaToCounter := range counters { highestCount := 0 + var highestSHA [32]byte var highestObservation values.Value - for _, counter := range shaToCounter { - if counter.count > highestCount { + for sha, counter := range shaToCounter { + if counter.count > highestCount || + (counter.count == highestCount && bytes.Compare(sha[:], highestSHA[:]) < 0) { highestCount = counter.count + highestSHA = sha highestObservation = counter.fullObservation } } diff --git a/pkg/capabilities/consensus/ocr3/aggregators/identical_nondeterminism_test.go b/pkg/capabilities/consensus/ocr3/aggregators/identical_nondeterminism_test.go new file mode 100644 index 0000000000..1fe7ea376b --- /dev/null +++ b/pkg/capabilities/consensus/ocr3/aggregators/identical_nondeterminism_test.go @@ -0,0 +1,177 @@ +package aggregators_test + +import ( + "testing" + + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/proto" + + "github.com/smartcontractkit/libocr/commontypes" + + "github.com/smartcontractkit/chainlink-common/pkg/capabilities/consensus/ocr3/aggregators" + "github.com/smartcontractkit/chainlink-common/pkg/logger" + "github.com/smartcontractkit/chainlink-protos/cre/go/values" +) + +// ============================================================================= +// Bug: identicalAggregator.collectHighestCounts() map iteration tie-breaking +// in identical.go:73-103 +// +// The collectHighestCounts function iterates a map[sha256]*counter with a +// strict ">" comparison (line 80): +// +// for _, counter := range shaToCounter { +// if counter.count > highestCount { // <-- strict greater-than +// highestCount = counter.count +// highestObservation = counter.fullObservation +// } +// } +// +// When two distinct observation values have the SAME count, the first entry +// encountered in map iteration order wins (since "count > highestCount" is +// false for equal counts). Go randomizes map iteration order, so different +// processes (nodes) may pick different winners. +// +// This is exploitable when n >= 2*(2f+1), allowing two groups to each reach +// the 2f+1 quorum threshold. Example: f=2, n=10, 2f+1=5 -- two groups of 5. +// +// In production, nodes disagree on the outcome, causing: +// "PrepareSignature failed to verify. This is commonly caused by +// non-determinism in the ReportingPlugin" +// +// Run: go test -v -run TestIdentical -count=1 +// ============================================================================= + +// TestIdenticalTiedCounts creates two observation groups of equal size, both +// meeting the 2f+1 threshold, and verifies whether the aggregator produces +// consistent results. +func TestIdenticalTiedCounts(t *testing.T) { + config := newIdenticalTestConfig(t, nil) + agg, err := aggregators.NewIdenticalAggregator(*config) + require.NoError(t, err) + + // f=2, 2f+1=5. With 10 nodes: 5 report "alpha", 5 report "beta". + // Both groups meet the quorum threshold. The > comparison means + // whichever map entry is iterated first sets highestCount=5, and the + // other entry (also count=5) fails the > check. The winner depends + // on map iteration order. + observations := map[commontypes.OracleID][]values.Value{ + 0: {values.NewString("alpha")}, + 1: {values.NewString("alpha")}, + 2: {values.NewString("alpha")}, + 3: {values.NewString("alpha")}, + 4: {values.NewString("alpha")}, + 5: {values.NewString("beta")}, + 6: {values.NewString("beta")}, + 7: {values.NewString("beta")}, + 8: {values.NewString("beta")}, + 9: {values.NewString("beta")}, + } + + const iterations = 200 + seen := make(map[string]int) + for i := 0; i < iterations; i++ { + outcome, err := agg.Aggregate(logger.Nop(), nil, observations, 2) // f=2 + require.NoError(t, err) + require.NotNil(t, outcome) + + m, err := values.FromMapValueProto(outcome.EncodableOutcome) + require.NoError(t, err) + + val := m.Underlying["0"] + require.NotNil(t, val) + + b, err := proto.MarshalOptions{Deterministic: true}.Marshal(values.Proto(val)) + require.NoError(t, err) + seen[string(b)]++ + } + + if len(seen) > 1 { + t.Errorf("CONFIRMED: identicalAggregator non-determinism -- produced %d distinct outcomes "+ + "over %d iterations (identical.go:79 map iteration tie-breaking with equal counts)", + len(seen), iterations) + } +} + +// TestIdenticalCrossNodeSimulation simulates multiple nodes running the +// identical aggregator on the same observations. All nodes should agree +// on the same outcome. +func TestIdenticalCrossNodeSimulation(t *testing.T) { + config := newIdenticalTestConfig(t, nil) + + // f=2, 2f+1=5. Two groups of 5 -- both meet quorum. + observations := map[commontypes.OracleID][]values.Value{ + 0: {values.NewString("alpha")}, + 1: {values.NewString("alpha")}, + 2: {values.NewString("alpha")}, + 3: {values.NewString("alpha")}, + 4: {values.NewString("alpha")}, + 5: {values.NewString("beta")}, + 6: {values.NewString("beta")}, + 7: {values.NewString("beta")}, + 8: {values.NewString("beta")}, + 9: {values.NewString("beta")}, + } + + const numNodes = 10 + outcomeBytes := make([][]byte, numNodes) + for i := 0; i < numNodes; i++ { + agg, err := aggregators.NewIdenticalAggregator(*config) + require.NoError(t, err) + + outcome, err := agg.Aggregate(logger.Nop(), nil, observations, 2) // f=2 + require.NoError(t, err) + require.NotNil(t, outcome) + + b, err := proto.MarshalOptions{Deterministic: true}.Marshal(outcome.EncodableOutcome) + require.NoError(t, err) + outcomeBytes[i] = b + } + + allMatch := true + for i := 1; i < numNodes; i++ { + if string(outcomeBytes[i]) != string(outcomeBytes[0]) { + allMatch = false + break + } + } + + if !allMatch { + t.Errorf("CONFIRMED: Cross-node consensus failure -- %d simulated nodes produced "+ + "different outcome bytes. Root cause: identicalAggregator map iteration "+ + "tie-breaking in identical.go:79", numNodes) + } +} + +// TestIdenticalQuorumEnforcement is a sanity check that verifies the aggregator +// correctly rejects observations when neither group reaches the 2f+1 threshold. +func TestIdenticalQuorumEnforcement(t *testing.T) { + config := newIdenticalTestConfig(t, nil) + agg, err := aggregators.NewIdenticalAggregator(*config) + require.NoError(t, err) + + // f=1, 2f+1=3. With 4 nodes: 2 report "A", 2 report "B". + // Neither group meets the quorum of 3. + observations := map[commontypes.OracleID][]values.Value{ + 0: {values.NewString("A")}, + 1: {values.NewString("A")}, + 2: {values.NewString("B")}, + 3: {values.NewString("B")}, + } + + outcome, err := agg.Aggregate(logger.Nop(), nil, observations, 1) // f=1 + require.Error(t, err) + require.Nil(t, outcome) + require.Contains(t, err.Error(), "can't reach consensus") +} + +func newIdenticalTestConfig(t *testing.T, overrideKeys []string) *values.Map { + t.Helper() + unwrappedConfig := map[string]any{ + "expectedObservationsLen": len(overrideKeys), + "keyOverrides": overrideKeys, + } + config, err := values.NewMap(unwrappedConfig) + require.NoError(t, err) + return config +} diff --git a/pkg/capabilities/consensus/ocr3/aggregators/mode_nondeterminism_test.go b/pkg/capabilities/consensus/ocr3/aggregators/mode_nondeterminism_test.go new file mode 100644 index 0000000000..a65dbf6c3d --- /dev/null +++ b/pkg/capabilities/consensus/ocr3/aggregators/mode_nondeterminism_test.go @@ -0,0 +1,228 @@ +package aggregators_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/proto" + + "github.com/smartcontractkit/libocr/commontypes" + + "github.com/smartcontractkit/chainlink-common/pkg/capabilities/consensus/ocr3/aggregators" + "github.com/smartcontractkit/chainlink-common/pkg/logger" + "github.com/smartcontractkit/chainlink-protos/cre/go/values" +) + +// ============================================================================= +// Bug: mode() tie-breaking via map iteration in reduce_aggregator.go:464-480 +// +// The mode() function builds a map[sha256]*counter to count observation +// frequencies, then iterates the map twice: +// 1. Find the maximum count (lines 464-469) +// 2. Collect all values with that count into a slice (lines 471-476) +// +// It returns modes[0] — the first element of the collected slice. Because Go +// map iteration order is randomized, the slice ordering varies between calls, +// making the result non-deterministic when multiple values tie for highest +// frequency. +// +// In production (OCR3 consensus), every node independently runs Outcome() on +// the same observations. If mode() returns different values on different nodes, +// their outcome bytes diverge and PrepareSignature verification fails with: +// "PrepareSignature failed to verify. This is commonly caused by +// non-determinism in the ReportingPlugin" +// +// Run: go test -v -run TestMode -count=1 +// ============================================================================= + +// TestModeTiedFrequencies creates a 2-way tie (equal frequency for two values) +// and verifies that repeated aggregation produces consistent results. +func TestModeTiedFrequencies(t *testing.T) { + fields := []aggregators.AggregationField{ + { + InputKey: "data", + OutputKey: "data", + Method: "mode", + ModeQuorum: "any", // don't enforce f+1 quorum so tie results are returned + }, + } + + config := newModeTestConfig(t, fields) + agg, err := aggregators.NewReduceAggregator(*config) + require.NoError(t, err) + + // 3 nodes report "value_A", 3 nodes report "value_B" -- a perfect tie + mkObs := func(val string) values.Value { + m, err := values.WrapMap(map[string]any{"data": val}) + require.NoError(t, err) + return m + } + + observations := map[commontypes.OracleID][]values.Value{ + 0: {mkObs("value_A")}, + 1: {mkObs("value_A")}, + 2: {mkObs("value_A")}, + 3: {mkObs("value_B")}, + 4: {mkObs("value_B")}, + 5: {mkObs("value_B")}, + } + + // Run once to get a reference result + firstOutcome, err := agg.Aggregate(logger.Nop(), nil, observations, 1) + require.NoError(t, err) + require.NotNil(t, firstOutcome) + + // Run 100 more times -- if mode() is deterministic, all results match. + // If non-deterministic, at least one will differ. + const iterations = 100 + mismatchCount := 0 + for i := 0; i < iterations; i++ { + outcome, err := agg.Aggregate(logger.Nop(), nil, observations, 1) + require.NoError(t, err) + require.NotNil(t, outcome) + + if !proto.Equal(firstOutcome.EncodableOutcome, outcome.EncodableOutcome) { + mismatchCount++ + } + } + + assert.Zerof(t, mismatchCount, + "mode() produced different outcomes in %d/%d iterations with tied frequencies -- "+ + "this confirms non-deterministic tie-breaking via map iteration in reduce_aggregator.go:464-480", + mismatchCount, iterations) +} + +// TestModeThreeWayTie uses a 3-way tie to further increase the probability +// of observing different map iteration orderings. +func TestModeThreeWayTie(t *testing.T) { + fields := []aggregators.AggregationField{ + { + InputKey: "data", + OutputKey: "data", + Method: "mode", + ModeQuorum: "any", + }, + } + + config := newModeTestConfig(t, fields) + agg, err := aggregators.NewReduceAggregator(*config) + require.NoError(t, err) + + mkObs := func(val string) values.Value { + m, err := values.WrapMap(map[string]any{"data": val}) + require.NoError(t, err) + return m + } + + // 3-way tie: 2 nodes each report "X", "Y", "Z" + observations := map[commontypes.OracleID][]values.Value{ + 0: {mkObs("X")}, + 1: {mkObs("X")}, + 2: {mkObs("Y")}, + 3: {mkObs("Y")}, + 4: {mkObs("Z")}, + 5: {mkObs("Z")}, + } + + const iterations = 200 + seen := make(map[string]int) + for i := 0; i < iterations; i++ { + outcome, err := agg.Aggregate(logger.Nop(), nil, observations, 1) + require.NoError(t, err) + require.NotNil(t, outcome) + + m, err := values.FromMapValueProto(outcome.EncodableOutcome) + require.NoError(t, err) + + reports := m.Underlying["Reports"] + require.NotNil(t, reports) + + b, err := proto.MarshalOptions{Deterministic: true}.Marshal(values.Proto(reports)) + require.NoError(t, err) + seen[string(b)]++ + } + + if len(seen) > 1 { + t.Errorf("CONFIRMED: mode() non-determinism -- produced %d distinct outcomes over %d iterations "+ + "(reduce_aggregator.go:464-480 map iteration tie-breaking)", + len(seen), iterations) + } +} + +// TestModeCrossNodeConsensusSimulation simulates an actual OCR3 round: multiple +// "nodes" independently run the same aggregation on the same observations. +// If the aggregation is deterministic all nodes produce identical outcome bytes. +// If non-deterministic, nodes disagree and PrepareSignature fails. +func TestModeCrossNodeConsensusSimulation(t *testing.T) { + fields := []aggregators.AggregationField{ + { + InputKey: "price_source", + OutputKey: "price_source", + Method: "mode", + ModeQuorum: "any", + }, + } + + config := newModeTestConfig(t, fields) + + mkObs := func(val string) values.Value { + m, err := values.WrapMap(map[string]any{"price_source": val}) + require.NoError(t, err) + return m + } + + // Volatile data source: 3 nodes saw "coinbase", 3 saw "binance" -- a tie + observations := map[commontypes.OracleID][]values.Value{ + 0: {mkObs("coinbase")}, + 1: {mkObs("coinbase")}, + 2: {mkObs("coinbase")}, + 3: {mkObs("binance")}, + 4: {mkObs("binance")}, + 5: {mkObs("binance")}, + } + + // Each simulated node creates its own aggregator and runs independently + const numNodes = 10 + outcomeBytes := make([][]byte, numNodes) + for i := 0; i < numNodes; i++ { + agg, err := aggregators.NewReduceAggregator(*config) + require.NoError(t, err) + + outcome, err := agg.Aggregate(logger.Nop(), nil, observations, 1) + require.NoError(t, err) + require.NotNil(t, outcome) + + b, err := proto.MarshalOptions{Deterministic: true}.Marshal(outcome.EncodableOutcome) + require.NoError(t, err) + outcomeBytes[i] = b + } + + // In a real OCR3 round, all nodes must produce identical outcome bytes + allMatch := true + for i := 1; i < numNodes; i++ { + if string(outcomeBytes[i]) != string(outcomeBytes[0]) { + allMatch = false + break + } + } + + if !allMatch { + t.Errorf("CONFIRMED: Cross-node consensus failure -- %d simulated nodes produced "+ + "different outcome bytes for the same observations. In production this causes "+ + "\"PrepareSignature failed to verify\" errors. Root cause: mode() tie-breaking "+ + "via map iteration in reduce_aggregator.go:464-480", numNodes) + } +} + +func newModeTestConfig(t *testing.T, fields []aggregators.AggregationField) *values.Map { + t.Helper() + unwrappedConfig := map[string]any{ + "fields": fields, + "outputFieldName": "Reports", + "reportFormat": "array", + } + config, err := values.NewMap(unwrappedConfig) + require.NoError(t, err) + return config +} diff --git a/pkg/capabilities/consensus/ocr3/aggregators/reduce_aggregator.go b/pkg/capabilities/consensus/ocr3/aggregators/reduce_aggregator.go index f63df075a4..4b355ce2da 100644 --- a/pkg/capabilities/consensus/ocr3/aggregators/reduce_aggregator.go +++ b/pkg/capabilities/consensus/ocr3/aggregators/reduce_aggregator.go @@ -468,16 +468,19 @@ func mode(items []values.Value) (values.Value, int, error) { } } - var modes []values.Value - for _, ctr := range counts { + // Collect all SHA keys that have the max count, then sort them to ensure + // deterministic tie-breaking regardless of map iteration order. + var modeKeys [][32]byte + for sha, ctr := range counts { if ctr.count == maxCount { - modes = append(modes, ctr.fullObservation) + modeKeys = append(modeKeys, sha) } } + sort.Slice(modeKeys, func(i, j int) bool { + return bytes.Compare(modeKeys[i][:], modeKeys[j][:]) < 0 + }) - // If more than one mode found, choose first - - return modes[0], maxCount, nil + return counts[modeKeys[0]].fullObservation, maxCount, nil } func modeHasQuorum(quorumType string, count int, f int) error {