From 65a6f6b9c49a7d735cea8b186b1784cf85415010 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Fri, 9 Jan 2026 23:57:28 +0800 Subject: [PATCH 01/23] enable active active check Signed-off-by: Jianjun Liao --- api/v2/model.go | 23 ++- .../checkpoint_watcher.go | 170 ++++++++++++++++++ .../config.example.toml | 26 +++ .../config.go | 104 +++++++++++ cmd/multi-cluster-consistency-checker/main.go | 76 ++++++++ .../dispatcher/basic_dispatcher.go | 3 +- .../basic_dispatcher_active_active_test.go | 1 + .../dispatcher/basic_dispatcher_info.go | 39 ++-- .../dispatcher/event_dispatcher_test.go | 4 + .../dispatcher/redo_dispatcher_test.go | 1 + .../dispatchermanager/dispatcher_manager.go | 1 + .../dispatcher_manager_test.go | 2 + pkg/common/event/active_active.go | 9 +- pkg/common/event/active_active_test.go | 12 +- pkg/config/changefeed.go | 2 + pkg/config/replica_config.go | 1 + 16 files changed, 438 insertions(+), 36 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/checkpoint_watcher.go create mode 100644 cmd/multi-cluster-consistency-checker/config.example.toml create mode 100644 cmd/multi-cluster-consistency-checker/config.go create mode 100644 cmd/multi-cluster-consistency-checker/main.go diff --git a/api/v2/model.go b/api/v2/model.go index 01927bbc44..c2a949d3c3 100644 --- a/api/v2/model.go +++ b/api/v2/model.go @@ -193,6 +193,7 @@ type ReplicaConfig struct { EnableTableMonitor *bool `json:"enable_table_monitor,omitempty"` BDRMode *bool `json:"bdr_mode,omitempty"` EnableActiveActive *bool `json:"enable_active_active,omitempty"` + EnableActiveActiveCheck *bool `json:"enable_active_active_check,omitempty"` ActiveActiveProgressInterval *JSONDuration `json:"active_active_progress_interval,omitempty"` ActiveActiveSyncStatsInterval *JSONDuration `json:"active_active_sync_stats_interval,omitempty"` @@ -250,6 +251,9 @@ func (c *ReplicaConfig) toInternalReplicaConfigWithOriginConfig( if c.EnableActiveActive != nil { res.EnableActiveActive = 
c.EnableActiveActive } + if c.EnableActiveActiveCheck != nil { + res.EnableActiveActiveCheck = c.EnableActiveActiveCheck + } if c.Filter != nil { efs := make([]*config.EventFilterRule, 0, len(c.Filter.EventFilters)) @@ -631,15 +635,16 @@ func ToAPIReplicaConfig(c *config.ReplicaConfig) *ReplicaConfig { cloned := c.Clone() res := &ReplicaConfig{ - MemoryQuota: cloned.MemoryQuota, - CaseSensitive: cloned.CaseSensitive, - ForceReplicate: cloned.ForceReplicate, - IgnoreIneligibleTable: cloned.IgnoreIneligibleTable, - CheckGCSafePoint: cloned.CheckGCSafePoint, - EnableSyncPoint: cloned.EnableSyncPoint, - EnableTableMonitor: cloned.EnableTableMonitor, - BDRMode: cloned.BDRMode, - EnableActiveActive: cloned.EnableActiveActive, + MemoryQuota: cloned.MemoryQuota, + CaseSensitive: cloned.CaseSensitive, + ForceReplicate: cloned.ForceReplicate, + IgnoreIneligibleTable: cloned.IgnoreIneligibleTable, + CheckGCSafePoint: cloned.CheckGCSafePoint, + EnableSyncPoint: cloned.EnableSyncPoint, + EnableTableMonitor: cloned.EnableTableMonitor, + BDRMode: cloned.BDRMode, + EnableActiveActive: cloned.EnableActiveActive, + EnableActiveActiveCheck: cloned.EnableActiveActiveCheck, } if cloned.SyncPointInterval != nil { diff --git a/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go b/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go new file mode 100644 index 0000000000..165bd24371 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go @@ -0,0 +1,170 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "fmt" + + "github.com/pingcap/log" + "github.com/pingcap/ticdc/pkg/common" + "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/etcd" + "github.com/tikv/client-go/v2/oracle" + pd "github.com/tikv/pd/client" + clientv3 "go.etcd.io/etcd/client/v3" + "go.uber.org/zap" +) + +// CheckpointWatcher watches CDC checkpoint from etcd and records TSO from PD +type CheckpointWatcher struct { + // checkpoint stores the current CDC checkpoint + checkpoint uint64 + + // pdclientUp is the PD client for upstream cluster + pdclientUp pd.Client + + // pdclientDown is the PD client for downstream cluster + pdclientDown pd.Client + + // etcdClientUp is the etcd client for upstream cluster (used to watch checkpoint) + etcdClientUp etcd.CDCEtcdClient + + // p stores the TSO obtained from pdclientDown + p uint64 + + // changefeedID is the ID of the changefeed to watch + changefeedID common.ChangeFeedID +} + +// NewCheckpointWatcher creates a new CheckpointWatcher instance +func NewCheckpointWatcher( + changefeedID common.ChangeFeedID, + pdclientUp pd.Client, + pdclientDown pd.Client, + etcdClientUp etcd.CDCEtcdClient, +) *CheckpointWatcher { + return &CheckpointWatcher{ + changefeedID: changefeedID, + pdclientUp: pdclientUp, + pdclientDown: pdclientDown, + etcdClientUp: etcdClientUp, + } +} + +// WaitForCheckpoint waits for the checkpoint to exceed minCheckpointTs, +// then gets a TSO from pdclientDown and records it to p +func (cw *CheckpointWatcher) WaitForCheckpoint(ctx context.Context, minCheckpointTs uint64) error { + log.Info("Starting to watch checkpoint", + 
zap.String("changefeedID", cw.changefeedID.String()), + zap.Uint64("minCheckpointTs", minCheckpointTs)) + + // First, get the current checkpoint status + status, modRev, err := cw.etcdClientUp.GetChangeFeedStatus(ctx, cw.changefeedID) + if err != nil { + return fmt.Errorf("failed to get changefeed status: %w", err) + } + + cw.checkpoint = status.CheckpointTs + log.Info("Current checkpoint", + zap.Uint64("checkpoint", cw.checkpoint), + zap.Uint64("minCheckpointTs", minCheckpointTs)) + + // Watch for checkpoint updates + clusterID := cw.etcdClientUp.GetClusterID() + statusKey := etcd.GetEtcdKeyJob(clusterID, cw.changefeedID.DisplayName) + + watchCtx, cancel := context.WithCancel(ctx) + defer cancel() + + watchCh := cw.etcdClientUp.GetEtcdClient().Watch( + watchCtx, + statusKey, + "checkpoint-watcher", + clientv3.WithRev(modRev+1), + ) + + log.Info("Watching checkpoint status", + zap.String("statusKey", statusKey), + zap.Int64("startRev", modRev+1)) + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case watchResp := <-watchCh: + if err := watchResp.Err(); err != nil { + return fmt.Errorf("watch error: %w", err) + } + + for _, event := range watchResp.Events { + if event.Type == clientv3.EventTypeDelete { + log.Warn("Changefeed status deleted", + zap.String("changefeedID", cw.changefeedID.String())) + continue + } + + // Parse the updated status + status := &config.ChangeFeedStatus{} + if err := status.Unmarshal(event.Kv.Value); err != nil { + log.Warn("Failed to unmarshal changefeed status", + zap.String("changefeedID", cw.changefeedID.String()), + zap.Error(err)) + continue + } + + cw.checkpoint = status.CheckpointTs + log.Info("Checkpoint updated", + zap.Uint64("checkpoint", cw.checkpoint), + zap.Uint64("minCheckpointTs", minCheckpointTs)) + + // Check if checkpoint exceeds minCheckpointTs + if cw.checkpoint > minCheckpointTs { + log.Info("Checkpoint exceeds minCheckpointTs, getting TSO from downstream") + return cw.getAndRecordTSO(ctx) + } + } + } + } 
+} + +// getAndRecordTSO gets a TSO from pdclientDown and records it to p +func (cw *CheckpointWatcher) getAndRecordTSO(ctx context.Context) error { + // Get TSO from downstream PD client + physical, logical, err := cw.pdclientDown.GetTS(ctx) + if err != nil { + return fmt.Errorf("failed to get TSO from downstream PD: %w", err) + } + + // Compose TSO from physical and logical parts + cw.p = oracle.ComposeTS(physical, logical) + + log.Info("TSO obtained and recorded", + zap.Int64("physical", physical), + zap.Int64("logical", logical), + zap.Uint64("tso", cw.p), + zap.Uint64("checkpoint", cw.checkpoint)) + + return nil +} + +// GetCheckpoint returns the current checkpoint +func (cw *CheckpointWatcher) GetCheckpoint() uint64 { + return cw.checkpoint +} + +// GetRecordedTSO returns the recorded TSO (p) +func (cw *CheckpointWatcher) GetRecordedTSO() uint64 { + return cw.p +} diff --git a/cmd/multi-cluster-consistency-checker/config.example.toml b/cmd/multi-cluster-consistency-checker/config.example.toml new file mode 100644 index 0000000000..888de3ab59 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/config.example.toml @@ -0,0 +1,26 @@ +# Example configuration file for multi-cluster consistency checker + +# Global configuration (reserved for future use) +[global] +# timeout = "30s" +# retry-count = 3 + +# Cluster configurations +[clusters] + # First cluster configuration + [clusters.cluster1] + pd-addr = "127.0.0.1:2379" + cdc-addr = "127.0.0.1:8300" + s3-sink-uri = "s3://bucket-name/cluster1/" + + # Second cluster configuration + [clusters.cluster2] + pd-addr = "127.0.0.1:2479" + cdc-addr = "127.0.0.1:8400" + s3-sink-uri = "s3://bucket-name/cluster2/" + + # Third cluster configuration (optional) + # [clusters.cluster3] + # pd-addr = "127.0.0.1:2579" + # cdc-addr = "127.0.0.1:8500" + # s3-sink-uri = "s3://bucket-name/cluster3/" diff --git a/cmd/multi-cluster-consistency-checker/config.go b/cmd/multi-cluster-consistency-checker/config.go new file mode 100644 
index 0000000000..444c5f49ae --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/config.go @@ -0,0 +1,104 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "os" + + "github.com/BurntSushi/toml" +) + +// Config represents the configuration for multi-cluster consistency checker +type Config struct { + // GlobalConfig contains global settings (reserved for future use) + GlobalConfig GlobalConfig `toml:"global" json:"global"` + + // Clusters contains configurations for multiple clusters + Clusters map[string]ClusterConfig `toml:"clusters" json:"clusters"` +} + +// GlobalConfig contains global configuration settings +// This is reserved for future use +type GlobalConfig struct { + // Add global configuration fields here as needed + // For example: + // Timeout time.Duration `toml:"timeout" json:"timeout"` + // RetryCount int `toml:"retry-count" json:"retry-count"` +} + +// ClusterConfig represents configuration for a single cluster +type ClusterConfig struct { + // PDAddr is the address of the PD (Placement Driver) server + PDAddr string `toml:"pd-addr" json:"pd-addr"` + + // CDCAddr is the address of the CDC server + CDCAddr string `toml:"cdc-addr" json:"cdc-addr"` + + // S3SinkURI is the S3 sink URI for this cluster + S3SinkURI string `toml:"s3-sink-uri" json:"s3-sink-uri"` +} + +// loadConfig loads the configuration from a TOML file +func loadConfig(path string) (*Config, error) { + // Check if file exists + if _, err := os.Stat(path); os.IsNotExist(err) { + 
return nil, fmt.Errorf("config file does not exist: %s", path) + } + + cfg := &Config{ + Clusters: make(map[string]ClusterConfig), + } + + meta, err := toml.DecodeFile(path, cfg) + if err != nil { + return nil, fmt.Errorf("failed to decode config file: %w", err) + } + + // Validate that at least one cluster is configured + if len(cfg.Clusters) == 0 { + return nil, fmt.Errorf("at least one cluster must be configured") + } + + // Validate cluster configurations + for name, cluster := range cfg.Clusters { + if cluster.PDAddr == "" { + return nil, fmt.Errorf("cluster '%s': pd-addr is required", name) + } + if cluster.CDCAddr == "" { + return nil, fmt.Errorf("cluster '%s': cdc-addr is required", name) + } + if cluster.S3SinkURI == "" { + return nil, fmt.Errorf("cluster '%s': s3-sink-uri is required", name) + } + } + + // Check for unknown configuration keys + if undecoded := meta.Undecoded(); len(undecoded) > 0 { + // Filter out keys under [global] and [clusters] sections + var unknownKeys []string + for _, key := range undecoded { + keyStr := key.String() + // Only warn about keys that are not in the expected sections + if keyStr != "global" && keyStr != "clusters" { + unknownKeys = append(unknownKeys, keyStr) + } + } + if len(unknownKeys) > 0 { + fmt.Fprintf(os.Stderr, "Warning: unknown configuration keys found: %v\n", unknownKeys) + } + } + + return cfg, nil +} diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go new file mode 100644 index 0000000000..6e7043dd5e --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -0,0 +1,76 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "os" + + "github.com/spf13/cobra" +) + +var ( + cfgPath string +) + +const ( + ExitCodeExecuteFailed = 1 + ExitCodeInvalidConfig = 2 + ExitCodeDecodeConfigFailed = 3 +) + +const ( + FlagConfig = "config" +) + +func main() { + rootCmd := &cobra.Command{ + Use: "multi-cluster-consistency-checker", + Short: "A tool to check consistency across multiple TiCDC clusters", + Long: "A tool to check consistency across multiple TiCDC clusters by comparing data from different clusters' S3 sink locations", + Run: run, + } + + rootCmd.Flags().StringVarP(&cfgPath, FlagConfig, "c", "", "configuration file path (required)") + rootCmd.MarkFlagRequired(FlagConfig) + + if err := rootCmd.Execute(); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(ExitCodeExecuteFailed) + } +} + +func run(cmd *cobra.Command, args []string) { + if cfgPath == "" { + fmt.Fprintln(os.Stderr, "error: --config flag is required") + os.Exit(ExitCodeInvalidConfig) + } + + cfg, err := loadConfig(cfgPath) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to load config: %v\n", err) + os.Exit(ExitCodeDecodeConfigFailed) + } + + fmt.Printf("Loaded configuration with %d cluster(s)\n", len(cfg.Clusters)) + for name, cluster := range cfg.Clusters { + fmt.Printf(" Cluster: %s\n", name) + fmt.Printf(" PD Address: %s\n", cluster.PDAddr) + fmt.Printf(" CDC Address: %s\n", cluster.CDCAddr) + fmt.Printf(" S3 Sink URI: %s\n", cluster.S3SinkURI) + } + + // TODO: Implement actual consistency checking logic + fmt.Println("\nConsistency checking logic will be implemented here") +} diff --git 
a/downstreamadapter/dispatcher/basic_dispatcher.go b/downstreamadapter/dispatcher/basic_dispatcher.go index eddb932ecc..be604915b0 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher.go +++ b/downstreamadapter/dispatcher/basic_dispatcher.go @@ -249,7 +249,8 @@ func (d *BasicDispatcher) AddDMLEventsToSink(events []*commonEvent.DMLEvent) boo // FilterDMLEvent returns the original event for normal tables and only // allocates a new event when the table needs active-active or soft-delete // processing. Skip is only true when every row in the event should be dropped. - filtered, skip, err := commonEvent.FilterDMLEvent(event, d.sharedInfo.enableActiveActive) + filtered, skip, err := commonEvent.FilterDMLEvent( + event, d.sharedInfo.enableActiveActive, d.sharedInfo.enableActiveActiveCheck) if err != nil { d.HandleError(err) continue diff --git a/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go b/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go index 76db356127..e99d10d1a3 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go +++ b/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go @@ -131,6 +131,7 @@ func newTestBasicDispatcher(t *testing.T, sinkType common.SinkType, enableActive false, enableActiveActive, false, + false, nil, nil, nil, diff --git a/downstreamadapter/dispatcher/basic_dispatcher_info.go b/downstreamadapter/dispatcher/basic_dispatcher_info.go index db8f9de0b8..193bdf40fd 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher_info.go +++ b/downstreamadapter/dispatcher/basic_dispatcher_info.go @@ -29,11 +29,12 @@ import ( // This eliminates the need to pass these parameters individually to each dispatcher. 
type SharedInfo struct { // Basic configuration - changefeedID common.ChangeFeedID - timezone string - bdrMode bool - enableActiveActive bool - outputRawChangeEvent bool + changefeedID common.ChangeFeedID + timezone string + bdrMode bool + enableActiveActive bool + enableActiveActiveCheck bool + outputRawChangeEvent bool // Configuration objects integrityConfig *eventpb.IntegrityConfig @@ -75,6 +76,7 @@ func NewSharedInfo( timezone string, bdrMode bool, enableActiveActive bool, + enableActiveActiveCheck bool, outputRawChangeEvent bool, integrityConfig *eventpb.IntegrityConfig, filterConfig *eventpb.FilterConfig, @@ -86,19 +88,20 @@ func NewSharedInfo( errCh chan error, ) *SharedInfo { sharedInfo := &SharedInfo{ - changefeedID: changefeedID, - timezone: timezone, - bdrMode: bdrMode, - enableActiveActive: enableActiveActive, - outputRawChangeEvent: outputRawChangeEvent, - integrityConfig: integrityConfig, - filterConfig: filterConfig, - syncPointConfig: syncPointConfig, - enableSplittableCheck: enableSplittableCheck, - statusesChan: statusesChan, - blockStatusesChan: blockStatusesChan, - blockExecutor: newBlockEventExecutor(), - errCh: errCh, + changefeedID: changefeedID, + timezone: timezone, + bdrMode: bdrMode, + enableActiveActive: enableActiveActive, + enableActiveActiveCheck: enableActiveActiveCheck, + outputRawChangeEvent: outputRawChangeEvent, + integrityConfig: integrityConfig, + filterConfig: filterConfig, + syncPointConfig: syncPointConfig, + enableSplittableCheck: enableSplittableCheck, + statusesChan: statusesChan, + blockStatusesChan: blockStatusesChan, + blockExecutor: newBlockEventExecutor(), + errCh: errCh, } if txnAtomicity != nil { diff --git a/downstreamadapter/dispatcher/event_dispatcher_test.go b/downstreamadapter/dispatcher/event_dispatcher_test.go index 2265889a49..1de9b9ddee 100644 --- a/downstreamadapter/dispatcher/event_dispatcher_test.go +++ b/downstreamadapter/dispatcher/event_dispatcher_test.go @@ -73,6 +73,7 @@ func 
newDispatcherForTest(sink sink.Sink, tableSpan *heartbeatpb.TableSpan) *Eve false, false, false, + false, nil, nil, &syncpoint.SyncPointConfig{ @@ -812,6 +813,7 @@ func TestDispatcherSplittableCheck(t *testing.T) { false, false, false, + false, nil, nil, &syncpoint.SyncPointConfig{ @@ -922,6 +924,7 @@ func TestDispatcher_SkipDMLAsStartTs_FilterCorrectly(t *testing.T) { false, false, false, + false, nil, nil, &syncpoint.SyncPointConfig{ @@ -1002,6 +1005,7 @@ func TestDispatcher_SkipDMLAsStartTs_Disabled(t *testing.T) { false, false, false, + false, nil, nil, &syncpoint.SyncPointConfig{ diff --git a/downstreamadapter/dispatcher/redo_dispatcher_test.go b/downstreamadapter/dispatcher/redo_dispatcher_test.go index 4c5ff02eb5..c8609ac3b7 100644 --- a/downstreamadapter/dispatcher/redo_dispatcher_test.go +++ b/downstreamadapter/dispatcher/redo_dispatcher_test.go @@ -42,6 +42,7 @@ func newRedoDispatcherForTest(sink sink.Sink, tableSpan *heartbeatpb.TableSpan) false, false, false, + false, nil, nil, nil, // redo dispatcher doesn't need syncPointConfig diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager.go b/downstreamadapter/dispatchermanager/dispatcher_manager.go index d8a1d729b3..4696ff5dc9 100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager.go +++ b/downstreamadapter/dispatchermanager/dispatcher_manager.go @@ -234,6 +234,7 @@ func NewDispatcherManager( manager.config.TimeZone, manager.config.BDRMode, manager.config.EnableActiveActive, + manager.config.EnableActiveActiveCheck, outputRawChangeEvent, integrityCfg, filterCfg, diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager_test.go b/downstreamadapter/dispatchermanager/dispatcher_manager_test.go index 575489752f..a168a78b09 100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager_test.go +++ b/downstreamadapter/dispatchermanager/dispatcher_manager_test.go @@ -53,6 +53,7 @@ func createTestDispatcher(t *testing.T, manager *DispatcherManager, id common.Di false, 
false, false, + false, nil, nil, nil, @@ -111,6 +112,7 @@ func createTestManager(t *testing.T) *DispatcherManager { "system", manager.config.BDRMode, manager.config.EnableActiveActive, + false, false, // outputRawChangeEvent nil, // integrityConfig nil, // filterConfig diff --git a/pkg/common/event/active_active.go b/pkg/common/event/active_active.go index 7c1ce77e4c..0095fd0be9 100644 --- a/pkg/common/event/active_active.go +++ b/pkg/common/event/active_active.go @@ -55,6 +55,7 @@ func EvaluateRowPolicy( tableInfo *common.TableInfo, row *RowChange, enableActiveActive bool, + enableActiveActiveCheck bool, ) (RowPolicyDecision, error) { if tableInfo == nil || row == nil { return RowPolicyKeep, nil @@ -75,6 +76,10 @@ func EvaluateRowPolicy( return RowPolicyKeep, nil } + if enableActiveActiveCheck { + return RowPolicyKeep, nil + } + if row.RowType != common.RowTypeUpdate { return RowPolicyKeep, nil } @@ -152,7 +157,7 @@ func ApplyRowPolicyDecision(row *RowChange, decision RowPolicyDecision) { // // It returns the possibly modified event, whether the event should be skipped entirely, // and an error if evaluation fails. 
-func FilterDMLEvent(event *DMLEvent, enableActiveActive bool) (*DMLEvent, bool, error) { +func FilterDMLEvent(event *DMLEvent, enableActiveActive bool, enableActiveActiveCheck bool) (*DMLEvent, bool, error) { if event == nil { return nil, true, nil } @@ -185,7 +190,7 @@ func FilterDMLEvent(event *DMLEvent, enableActiveActive bool) (*DMLEvent, bool, break } - decision, err := EvaluateRowPolicy(tableInfo, &row, enableActiveActive) + decision, err := EvaluateRowPolicy(tableInfo, &row, enableActiveActive, enableActiveActiveCheck) if err != nil { event.Rewind() return nil, false, err diff --git a/pkg/common/event/active_active_test.go b/pkg/common/event/active_active_test.go index c145840a4b..335096e6b9 100644 --- a/pkg/common/event/active_active_test.go +++ b/pkg/common/event/active_active_test.go @@ -32,7 +32,7 @@ func TestFilterDMLEventNormalTablePassthrough(t *testing.T) { {int64(1)}, }) - filtered, skip, err := FilterDMLEvent(event, false) + filtered, skip, err := FilterDMLEvent(event, false, false) require.NoError(t, err) require.False(t, skip) require.Equal(t, event, filtered) @@ -48,7 +48,7 @@ func TestFilterDMLEventActiveActiveWithEnableDropsDeletes(t *testing.T) { {int64(2), nil}, // insert row }) - filtered, skip, err := FilterDMLEvent(event, true) + filtered, skip, err := FilterDMLEvent(event, true, false) require.NoError(t, err) require.False(t, skip) require.NotEqual(t, event, filtered) @@ -73,7 +73,7 @@ func TestFilterDMLEventSoftDeleteConvertUpdate(t *testing.T) { {int64(1), ts}, // post row with soft delete timestamp }) - filtered, skip, err := FilterDMLEvent(event, false) + filtered, skip, err := FilterDMLEvent(event, false, false) require.NoError(t, err) require.False(t, skip) require.NotEqual(t, event, filtered) @@ -98,7 +98,7 @@ func TestFilterDMLEventActiveActiveConvertWhenDisabled(t *testing.T) { {int64(2), ts}, }) - filtered, skip, err := FilterDMLEvent(event, false) + filtered, skip, err := FilterDMLEvent(event, false, false) require.NoError(t, 
err) require.False(t, skip) require.NotEqual(t, event, filtered) @@ -123,7 +123,7 @@ func TestFilterDMLEventActiveActiveKeepUpdateWhenEnabled(t *testing.T) { {int64(3), ts}, }) - filtered, skip, err := FilterDMLEvent(event, true) + filtered, skip, err := FilterDMLEvent(event, true, false) require.NoError(t, err) require.False(t, skip) require.Equal(t, event, filtered) @@ -146,7 +146,7 @@ func TestFilterDMLEventAllRowsSkipped(t *testing.T) { {int64(1), nil}, }) - filtered, skip, err := FilterDMLEvent(event, false) + filtered, skip, err := FilterDMLEvent(event, false, false) require.NoError(t, err) require.True(t, skip) require.Nil(t, filtered) diff --git a/pkg/config/changefeed.go b/pkg/config/changefeed.go index 4334096bdc..385b8f29e9 100644 --- a/pkg/config/changefeed.go +++ b/pkg/config/changefeed.go @@ -202,6 +202,7 @@ type ChangefeedConfig struct { Epoch uint64 `json:"epoch"` BDRMode bool `json:"bdr_mode" default:"false"` EnableActiveActive bool `json:"enable_active_active" default:"false"` + EnableActiveActiveCheck bool `json:"enable_active_active_check" default:"false"` ActiveActiveProgressInterval time.Duration `json:"active_active_progress_interval" default:"30m"` ActiveActiveSyncStatsInterval time.Duration `json:"active_active_sync_stats_interval" default:"1m"` // redo releated @@ -281,6 +282,7 @@ func (info *ChangeFeedInfo) ToChangefeedConfig() *ChangefeedConfig { Epoch: info.Epoch, BDRMode: util.GetOrZero(info.Config.BDRMode), EnableActiveActive: util.GetOrZero(info.Config.EnableActiveActive), + EnableActiveActiveCheck: util.GetOrZero(info.Config.EnableActiveActiveCheck), ActiveActiveProgressInterval: util.GetOrZero(info.Config.ActiveActiveProgressInterval), ActiveActiveSyncStatsInterval: util.GetOrZero(info.Config.ActiveActiveSyncStatsInterval), TimeZone: GetGlobalServerConfig().TZ, diff --git a/pkg/config/replica_config.go b/pkg/config/replica_config.go index 0a70d749b9..13de1bbcba 100644 --- a/pkg/config/replica_config.go +++ 
b/pkg/config/replica_config.go @@ -174,6 +174,7 @@ type replicaConfig struct { SyncedStatus *SyncedStatusConfig `toml:"synced-status" json:"synced-status,omitempty"` EnableActiveActive *bool `toml:"enable-active-active" json:"enable-active-active,omitempty"` + EnableActiveActiveCheck *bool `toml:"enable-active-active-check" json:"enable-active-active-check,omitempty"` ActiveActiveProgressInterval *time.Duration `toml:"active-active-progress-interval" json:"active-active-progress-interval,omitempty"` // ActiveActiveSyncStatsInterval controls how often MySQL/TiDB sink queries // TiDB session variable @@tidb_cdc_active_active_sync_stats for conflict statistics. From cfa0eec4ba5e3ae09a08212d9b908f6647c61057 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 15 Jan 2026 18:21:25 +0800 Subject: [PATCH 02/23] draft Signed-off-by: Jianjun Liao --- Makefile | 3 + .../checkpoint_watcher.go | 148 ++++--------- .../config.example.toml | 15 +- .../config.go | 33 ++- cmd/multi-cluster-consistency-checker/main.go | 5 +- cmd/multi-cluster-consistency-checker/pd.go | 48 +++++ .../s3_watcher.go | 35 ++++ cmd/multi-cluster-consistency-checker/task.go | 61 ++++++ .../time_window_advancer.go | 196 ++++++++++++++++++ 9 files changed, 431 insertions(+), 113 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/pd.go create mode 100644 cmd/multi-cluster-consistency-checker/s3_watcher.go create mode 100644 cmd/multi-cluster-consistency-checker/task.go create mode 100644 cmd/multi-cluster-consistency-checker/time_window_advancer.go diff --git a/Makefile b/Makefile index 942e7893fa..1fff26d774 100644 --- a/Makefile +++ b/Makefile @@ -174,6 +174,9 @@ filter_helper: config-converter: $(GOBUILD) -ldflags '$(LDFLAGS)' -o bin/cdc_config_converter ./cmd/config-converter/main.go +multi-cluster-consistency-checker: + $(GOBUILD) -ldflags '$(LDFLAGS)' -o bin/multi-cluster-consistency-checker ./cmd/multi-cluster-consistency-checker + fmt: tools/bin/gofumports tools/bin/shfmt 
tools/bin/gci @echo "run gci (format imports)" tools/bin/gci write $(FILES) 2>&1 | $(FAIL_ON_STDOUT) diff --git a/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go b/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go index 165bd24371..a819761d03 100644 --- a/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go +++ b/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go @@ -15,156 +15,100 @@ package main import ( "context" - "fmt" "github.com/pingcap/log" "github.com/pingcap/ticdc/pkg/common" "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/etcd" - "github.com/tikv/client-go/v2/oracle" - pd "github.com/tikv/pd/client" clientv3 "go.etcd.io/etcd/client/v3" "go.uber.org/zap" ) -// CheckpointWatcher watches CDC checkpoint from etcd and records TSO from PD -type CheckpointWatcher struct { - // checkpoint stores the current CDC checkpoint - checkpoint uint64 - - // pdclientUp is the PD client for upstream cluster - pdclientUp pd.Client - - // pdclientDown is the PD client for downstream cluster - pdclientDown pd.Client - - // etcdClientUp is the etcd client for upstream cluster (used to watch checkpoint) - etcdClientUp etcd.CDCEtcdClient - - // p stores the TSO obtained from pdclientDown - p uint64 - - // changefeedID is the ID of the changefeed to watch - changefeedID common.ChangeFeedID +type checkpointWatcher struct { + upstreamClusterID string + downstreamClusterID string + changefeedID common.ChangeFeedID + etcdClient etcd.CDCEtcdClient } -// NewCheckpointWatcher creates a new CheckpointWatcher instance func NewCheckpointWatcher( - changefeedID common.ChangeFeedID, - pdclientUp pd.Client, - pdclientDown pd.Client, - etcdClientUp etcd.CDCEtcdClient, -) *CheckpointWatcher { - return &CheckpointWatcher{ - changefeedID: changefeedID, - pdclientUp: pdclientUp, - pdclientDown: pdclientDown, - etcdClientUp: etcdClientUp, + upstreamClusterID, downstreamClusterID, changefeedID string, + 
etcdClient etcd.CDCEtcdClient, +) *checkpointWatcher { + return &checkpointWatcher{ + upstreamClusterID: upstreamClusterID, + downstreamClusterID: downstreamClusterID, + changefeedID: common.NewChangeFeedIDWithName(changefeedID, "default"), + etcdClient: etcdClient, + } } -// WaitForCheckpoint waits for the checkpoint to exceed minCheckpointTs, -// then gets a TSO from pdclientDown and records it to p -func (cw *CheckpointWatcher) WaitForCheckpoint(ctx context.Context, minCheckpointTs uint64) error { - log.Info("Starting to watch checkpoint", - zap.String("changefeedID", cw.changefeedID.String()), - zap.Uint64("minCheckpointTs", minCheckpointTs)) - - // First, get the current checkpoint status - status, modRev, err := cw.etcdClientUp.GetChangeFeedStatus(ctx, cw.changefeedID) +// advanceCheckpointTs waits for the checkpoint to exceed minCheckpointTs +func (cw *checkpointWatcher) advanceCheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { + // First, get the current checkpoint status from etcd + status, modRev, err := cw.etcdClient.GetChangeFeedStatus(ctx, cw.changefeedID) if err != nil { - return fmt.Errorf("failed to get changefeed status: %w", err) + return 0, errors.Annotate(err, "failed to get changefeed status") } - - cw.checkpoint = status.CheckpointTs - log.Info("Current checkpoint", - zap.Uint64("checkpoint", cw.checkpoint), - zap.Uint64("minCheckpointTs", minCheckpointTs)) - + statusKey := etcd.GetEtcdKeyJob(cw.etcdClient.GetClusterID(), cw.changefeedID.DisplayName) // Watch for checkpoint updates - clusterID := cw.etcdClientUp.GetClusterID() - statusKey := etcd.GetEtcdKeyJob(clusterID, cw.changefeedID.DisplayName) - watchCtx, cancel := context.WithCancel(ctx) defer cancel() + log.Info("Starting to watch checkpoint", + zap.String("changefeedID", cw.changefeedID.String()), + zap.String("statusKey", statusKey), + zap.String("upstreamClusterID", cw.upstreamClusterID), + zap.String("downstreamClusterID", cw.downstreamClusterID), + 
zap.Uint64("checkpoint", status.CheckpointTs), + zap.Int64("startRev", modRev+1), + zap.Uint64("minCheckpointTs", minCheckpointTs)) - watchCh := cw.etcdClientUp.GetEtcdClient().Watch( + watchCh := cw.etcdClient.GetEtcdClient().Watch( watchCtx, statusKey, "checkpoint-watcher", clientv3.WithRev(modRev+1), ) - log.Info("Watching checkpoint status", - zap.String("statusKey", statusKey), - zap.Int64("startRev", modRev+1)) - for { select { case <-ctx.Done(): - return ctx.Err() - case watchResp := <-watchCh: + return 0, errors.Annotate(ctx.Err(), "context canceled") + case watchResp, ok := <-watchCh: + if !ok { + return 0, errors.Errorf("[changefeedID: %s] watch channel closed", cw.changefeedID.String()) + } + if err := watchResp.Err(); err != nil { - return fmt.Errorf("watch error: %w", err) + return 0, errors.Annotatef(err, "[changefeedID: %s] watch error", cw.changefeedID.String()) } for _, event := range watchResp.Events { if event.Type == clientv3.EventTypeDelete { - log.Warn("Changefeed status deleted", - zap.String("changefeedID", cw.changefeedID.String())) - continue + return 0, errors.Errorf("[changefeedID: %s] changefeed status key is deleted", cw.changefeedID.String()) } // Parse the updated status status := &config.ChangeFeedStatus{} if err := status.Unmarshal(event.Kv.Value); err != nil { - log.Warn("Failed to unmarshal changefeed status", - zap.String("changefeedID", cw.changefeedID.String()), - zap.Error(err)) - continue + return 0, errors.Annotatef(err, "[changefeedID: %s] failed to unmarshal changefeed status", cw.changefeedID.String()) } - cw.checkpoint = status.CheckpointTs + checkpointTs := status.CheckpointTs log.Info("Checkpoint updated", - zap.Uint64("checkpoint", cw.checkpoint), + zap.String("changefeedID", cw.changefeedID.String()), + zap.Uint64("checkpoint", checkpointTs), zap.Uint64("minCheckpointTs", minCheckpointTs)) // Check if checkpoint exceeds minCheckpointTs - if cw.checkpoint > minCheckpointTs { - log.Info("Checkpoint exceeds 
minCheckpointTs, getting TSO from downstream") - return cw.getAndRecordTSO(ctx) + if checkpointTs > minCheckpointTs { + log.Info("Checkpoint exceeds minCheckpointTs, getting TSO from downstream", + zap.String("changefeedID", cw.changefeedID.String()), + zap.Uint64("checkpoint", checkpointTs)) + return checkpointTs, nil } } } } } - -// getAndRecordTSO gets a TSO from pdclientDown and records it to p -func (cw *CheckpointWatcher) getAndRecordTSO(ctx context.Context) error { - // Get TSO from downstream PD client - physical, logical, err := cw.pdclientDown.GetTS(ctx) - if err != nil { - return fmt.Errorf("failed to get TSO from downstream PD: %w", err) - } - - // Compose TSO from physical and logical parts - cw.p = oracle.ComposeTS(physical, logical) - - log.Info("TSO obtained and recorded", - zap.Int64("physical", physical), - zap.Int64("logical", logical), - zap.Uint64("tso", cw.p), - zap.Uint64("checkpoint", cw.checkpoint)) - - return nil -} - -// GetCheckpoint returns the current checkpoint -func (cw *CheckpointWatcher) GetCheckpoint() uint64 { - return cw.checkpoint -} - -// GetRecordedTSO returns the recorded TSO (p) -func (cw *CheckpointWatcher) GetRecordedTSO() uint64 { - return cw.p -} diff --git a/cmd/multi-cluster-consistency-checker/config.example.toml b/cmd/multi-cluster-consistency-checker/config.example.toml index 888de3ab59..89206bb13a 100644 --- a/cmd/multi-cluster-consistency-checker/config.example.toml +++ b/cmd/multi-cluster-consistency-checker/config.example.toml @@ -10,17 +10,28 @@ # First cluster configuration [clusters.cluster1] pd-addr = "127.0.0.1:2379" - cdc-addr = "127.0.0.1:8300" s3-sink-uri = "s3://bucket-name/cluster1/" + s3-changefeed-id = "s3-changefeed-id-1" + # security-config = { ca-path = "ca.crt", cert-path = "cert.crt", key-path = "key.crt" } + [clusters.cluster1.downstream-cluster-changefeed-config] + cluster2 = { changefeed-id = "active-active-changefeed-id-from-cluster1-to-cluster2" } # Second cluster configuration 
[clusters.cluster2] pd-addr = "127.0.0.1:2479" - cdc-addr = "127.0.0.1:8400" s3-sink-uri = "s3://bucket-name/cluster2/" + s3-changefeed-id = "s3-changefeed-id-2" + # security-config = { ca-path = "ca.crt", cert-path = "cert.crt", key-path = "key.crt" } + [clusters.cluster2.downstream-cluster-changefeed-config] + cluster1 = { changefeed-id = "active-active-changefeed-id-from-cluster2-to-cluster1" } # Third cluster configuration (optional) # [clusters.cluster3] # pd-addr = "127.0.0.1:2579" # cdc-addr = "127.0.0.1:8500" # s3-sink-uri = "s3://bucket-name/cluster3/" + # s3-changefeed-id = "s3-changefeed-id-3" + # security-config = { ca-path = "ca.crt", cert-path = "cert.crt", key-path = "key.crt" } + # [clusters.cluster3.downstream-cluster-changefeed-config] + # cluster1 = { changefeed-id = "active-active-changefeed-id-from-cluster3-to-cluster1" } + # cluster2 = { changefeed-id = "active-active-changefeed-id-from-cluster3-to-cluster2" } diff --git a/cmd/multi-cluster-consistency-checker/config.go b/cmd/multi-cluster-consistency-checker/config.go index 444c5f49ae..f839dc0c93 100644 --- a/cmd/multi-cluster-consistency-checker/config.go +++ b/cmd/multi-cluster-consistency-checker/config.go @@ -18,6 +18,7 @@ import ( "os" "github.com/BurntSushi/toml" + "github.com/pingcap/ticdc/pkg/security" ) // Config represents the configuration for multi-cluster consistency checker @@ -38,16 +39,28 @@ type GlobalConfig struct { // RetryCount int `toml:"retry-count" json:"retry-count"` } +type DownstreamClusterChangefeedConfig struct { + // ChangefeedID is the changefeed ID for the changefeed + ChangefeedID string `toml:"changefeed-id" json:"changefeed-id"` +} + // ClusterConfig represents configuration for a single cluster type ClusterConfig struct { // PDAddr is the address of the PD (Placement Driver) server PDAddr string `toml:"pd-addr" json:"pd-addr"` - // CDCAddr is the address of the CDC server - CDCAddr string `toml:"cdc-addr" json:"cdc-addr"` - // S3SinkURI is the S3 sink URI 
for this cluster S3SinkURI string `toml:"s3-sink-uri" json:"s3-sink-uri"` + + // S3ChangefeedID is the changefeed ID for the S3 changefeed + S3ChangefeedID string `toml:"s3-changefeed-id" json:"s3-changefeed-id"` + + // SecurityConfig is the security configuration for the cluster + SecurityConfig *security.Credential `toml:"security-config" json:"security-config"` + + // DownstreamClusterChangefeedConfig is the configuration for the changefeed of the downstream cluster + // mapping from downstream cluster ID to the changefeed configuration + DownstreamClusterChangefeedConfig map[string]DownstreamClusterChangefeedConfig `toml:"downstream-cluster-changefeed-config" json:"downstream-cluster-changefeed-config"` } // loadConfig loads the configuration from a TOML file @@ -76,12 +89,20 @@ func loadConfig(path string) (*Config, error) { if cluster.PDAddr == "" { return nil, fmt.Errorf("cluster '%s': pd-addr is required", name) } - if cluster.CDCAddr == "" { - return nil, fmt.Errorf("cluster '%s': cdc-addr is required", name) - } if cluster.S3SinkURI == "" { return nil, fmt.Errorf("cluster '%s': s3-sink-uri is required", name) } + if cluster.S3ChangefeedID == "" { + return nil, fmt.Errorf("cluster '%s': s3-changefeed-id is required", name) + } + if len(cluster.DownstreamClusterChangefeedConfig) != len(cfg.Clusters)-1 { + return nil, fmt.Errorf("cluster '%s': downstream-cluster-changefeed-config is not entirely configured", name) + } + for downstreamClusterID, downstreamClusterChangefeedConfig := range cluster.DownstreamClusterChangefeedConfig { + if downstreamClusterChangefeedConfig.ChangefeedID == "" { + return nil, fmt.Errorf("cluster '%s': downstream-cluster-changefeed-config[%s]: changefeed-id is required", name, downstreamClusterID) + } + } } // Check for unknown configuration keys diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go index 6e7043dd5e..335475ed42 100644 --- 
a/cmd/multi-cluster-consistency-checker/main.go +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -14,6 +14,7 @@ package main import ( + "context" "fmt" "os" @@ -67,10 +68,8 @@ func run(cmd *cobra.Command, args []string) { for name, cluster := range cfg.Clusters { fmt.Printf(" Cluster: %s\n", name) fmt.Printf(" PD Address: %s\n", cluster.PDAddr) - fmt.Printf(" CDC Address: %s\n", cluster.CDCAddr) fmt.Printf(" S3 Sink URI: %s\n", cluster.S3SinkURI) } - // TODO: Implement actual consistency checking logic - fmt.Println("\nConsistency checking logic will be implemented here") + runTask(context.Background(), cfg) } diff --git a/cmd/multi-cluster-consistency-checker/pd.go b/cmd/multi-cluster-consistency-checker/pd.go new file mode 100644 index 0000000000..a304e7c13d --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/pd.go @@ -0,0 +1,48 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + "time" + + "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/etcd" + "github.com/pingcap/ticdc/pkg/security" + pd "github.com/tikv/pd/client" + pdopt "github.com/tikv/pd/client/opt" + "google.golang.org/grpc" +) + +func newClient(ctx context.Context, pdAddr string, securityConfig *security.Credential) (pd.Client, *etcd.CDCEtcdClientImpl, error) { + pdClient, err := pd.NewClientWithContext( + ctx, "consistency-checker", []string{pdAddr}, securityConfig.PDSecurityOption(), + pdopt.WithCustomTimeoutOption(10*time.Second), + ) + if err != nil { + return nil, nil, errors.Trace(err) + } + + etcdCli, err := etcd.CreateRawEtcdClient(securityConfig, grpc.EmptyDialOption{}, pdAddr) + if err != nil { + return nil, nil, errors.Trace(err) + } + + cdcEtcdClient, err := etcd.NewCDCEtcdClient(ctx, etcdCli, "default") + if err != nil { + return nil, nil, errors.Trace(err) + } + + return pdClient, cdcEtcdClient, nil +} diff --git a/cmd/multi-cluster-consistency-checker/s3_watcher.go b/cmd/multi-cluster-consistency-checker/s3_watcher.go new file mode 100644 index 0000000000..0f56d2d10d --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/s3_watcher.go @@ -0,0 +1,35 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + + "github.com/pingcap/ticdc/pkg/errors" +) + +type s3Watcher struct { + checkpointWatcher *checkpointWatcher +} + +func (sw *s3Watcher) advanceS3CheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { + checkpointTs, err := sw.checkpointWatcher.advanceCheckpointTs(ctx, minCheckpointTs) + if err != nil { + return 0, errors.Annotate(err, "advance s3 checkpoint timestamp failed") + } + + // TODO: get the index updated from the s3 + + return checkpointTs, nil +} diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go new file mode 100644 index 0000000000..ec5a9284f7 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -0,0 +1,61 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + + "github.com/pingcap/ticdc/pkg/errors" + pd "github.com/tikv/pd/client" +) + +func runTask(ctx context.Context, cfg *Config) error { + checkpointWatchers, s3Watchers, pdClients, err := initClients(ctx, cfg) + if err != nil { + return errors.Trace(err) + } + + timeWindowAdvancer := NewTimeWindowAdvancer(checkpointWatchers, s3Watchers, pdClients) + for { + err = timeWindowAdvancer.AdvanceTimeWindow(ctx) + if err != nil { + return errors.Trace(err) + } + } +} + +func initClients(ctx context.Context, cfg *Config) (map[string]map[string]*checkpointWatcher, map[string]*s3Watcher, map[string]pd.Client, error) { + checkpointWatchers := make(map[string]map[string]*checkpointWatcher) + s3Watchers := make(map[string]*s3Watcher) + pdClients := make(map[string]pd.Client) + for clusterID, clusterConfig := range cfg.Clusters { + pdClient, etcdClient, err := newClient(ctx, clusterConfig.PDAddr, clusterConfig.SecurityConfig) + if err != nil { + return nil, nil, nil, errors.Trace(err) + } + upstreamCheckpointWatchers := make(map[string]*checkpointWatcher) + for downstreamClusterID, downstreamClusterChangefeedConfig := range clusterConfig.DownstreamClusterChangefeedConfig { + checkpointWatcher := NewCheckpointWatcher(clusterID, downstreamClusterID, downstreamClusterChangefeedConfig.ChangefeedID, etcdClient) + upstreamCheckpointWatchers[downstreamClusterID] = checkpointWatcher + } + checkpointWatchers[clusterID] = upstreamCheckpointWatchers + s3Watcher := &s3Watcher{ + checkpointWatcher: NewCheckpointWatcher(clusterID, "s3", clusterConfig.S3ChangefeedID, etcdClient), + } + s3Watchers[clusterID] = s3Watcher + pdClients[clusterID] = pdClient + } + + return checkpointWatchers, s3Watchers, pdClients, nil +} diff --git a/cmd/multi-cluster-consistency-checker/time_window_advancer.go b/cmd/multi-cluster-consistency-checker/time_window_advancer.go new file mode 100644 index 0000000000..6a6ddf81b9 --- /dev/null +++ 
b/cmd/multi-cluster-consistency-checker/time_window_advancer.go @@ -0,0 +1,196 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "maps" + "sync" + + "github.com/pingcap/log" + "github.com/pingcap/ticdc/pkg/errors" + "github.com/tikv/client-go/v2/oracle" + pd "github.com/tikv/pd/client" + "go.uber.org/zap" + "golang.org/x/sync/errgroup" +) + +// TimeWindow is the time window of the cluster, including the left boundary, right boundary and checkpoint ts +// Assert 1: LeftBoundary < CheckpointTs < RightBoundary +// Assert 2: The checkpoint timestamp of next time window should be larger than the MaxPDTimestampAfterTimeWindow +type TimeWindow struct { + LeftBoundary uint64 + RightBoundary uint64 + // CheckpointTs is the checkpoint timestamp for each changefeed from upstream cluster, + // mapping from downstream cluster ID to the checkpoint timestamp + CheckpointTs map[string]uint64 + // MaxPDTimestampAfterTimeWindow is the max PD timestamp after the time window for each downstream cluster, + // mapping from upstream cluster ID to the max PD timestamp + MaxPDTimestampAfterTimeWindow map[string]uint64 +} + +type TimeWindowAdvancer struct { + // round is the current round of the time window + round uint64 + + // timeWindowTriplet is the triplet of adjacent time windows, mapping from cluster ID to the triplet + timeWindowTriplet map[string][3]TimeWindow + + // checkpointWatcher is the Active-Active checkpoint watcher for each cluster, + // mapping from cluster ID to 
the downstream cluster ID to the checkpoint watcher + checkpointWatcher map[string]map[string]*checkpointWatcher + + // s3checkpointWatcher is the S3 checkpoint watcher for each cluster, mapping from cluster ID to the s3 checkpoint watcher + s3Watcher map[string]*s3Watcher + + // pdClients is the pd clients for each cluster, mapping from cluster ID to the pd client + pdClients map[string]pd.Client +} + +func NewTimeWindowAdvancer( + checkpointWatchers map[string]map[string]*checkpointWatcher, + s3Watchers map[string]*s3Watcher, + pdClients map[string]pd.Client, +) *TimeWindowAdvancer { + return &TimeWindowAdvancer{ + round: 0, + timeWindowTriplet: make(map[string][3]TimeWindow), + checkpointWatcher: checkpointWatchers, + s3Watcher: s3Watchers, + pdClients: pdClients, + } +} + +func (t *TimeWindowAdvancer) AdvanceTimeWindow(pctx context.Context) error { + log.Info("advance time window", zap.Uint64("round", t.round)) + // mapping from upstream cluster ID to the downstream cluster ID to the min checkpoint timestamp + minCheckpointTsMap := make(map[string]map[string]uint64) + maxTimeWindowRightBoundary := uint64(0) + for downstreamClusterID, triplet := range t.timeWindowTriplet { + for upstreamClusterID, maxPDTimestampAfterTimeWindow := range triplet[2].MaxPDTimestampAfterTimeWindow { + if _, ok := minCheckpointTsMap[upstreamClusterID]; !ok { + minCheckpointTsMap[upstreamClusterID] = make(map[string]uint64) + } + minCheckpointTsMap[upstreamClusterID][downstreamClusterID] = max(minCheckpointTsMap[upstreamClusterID][downstreamClusterID], maxPDTimestampAfterTimeWindow) + } + maxTimeWindowRightBoundary = max(maxTimeWindowRightBoundary, triplet[2].RightBoundary) + } + + var lock sync.Mutex + newTimeWindow := make(map[string]TimeWindow) + maxPDTimestampAfterCheckpointTs := make(map[string]uint64) + // for cluster ID, the max checkpoint timestamp is maximum of checkpoint from cluster to other clusters and checkpoint from other clusters to cluster + maxCheckpointTs := 
make(map[string]uint64) + // Advance the checkpoint ts for each cluster + eg, ctx := errgroup.WithContext(pctx) + for upstreamClusterID, downstreamCheckpointWatcherMap := range t.checkpointWatcher { + for downstreamClusterID, checkpointWatcher := range downstreamCheckpointWatcherMap { + mincheckpointTs := max(minCheckpointTsMap[upstreamClusterID][downstreamClusterID], maxTimeWindowRightBoundary) + eg.Go(func() error { + checkpointTs, err := checkpointWatcher.advanceCheckpointTs(ctx, mincheckpointTs) + if err != nil { + return errors.Trace(err) + } + pdtsos, err := t.getPDTsFromOtherClusters(ctx, upstreamClusterID) + if err != nil { + return errors.Trace(err) + } + lock.Lock() + timeWindow := newTimeWindow[upstreamClusterID] + if timeWindow.CheckpointTs == nil { + timeWindow.CheckpointTs = make(map[string]uint64) + } + timeWindow.CheckpointTs[downstreamClusterID] = checkpointTs + newTimeWindow[upstreamClusterID] = timeWindow + for otherClusterID, pdtso := range pdtsos { + maxPDTimestampAfterCheckpointTs[otherClusterID] = max(maxPDTimestampAfterCheckpointTs[otherClusterID], pdtso) + } + maxCheckpointTs[upstreamClusterID] = max(maxCheckpointTs[upstreamClusterID], checkpointTs) + maxCheckpointTs[downstreamClusterID] = max(maxCheckpointTs[downstreamClusterID], checkpointTs) + lock.Unlock() + return nil + }) + } + } + if err := eg.Wait(); err != nil { + return errors.Annotate(err, "advance checkpoint timestamp failed") + } + + // Update the time window for each cluster + eg, ctx = errgroup.WithContext(pctx) + for clusterID := range t.timeWindowTriplet { + minTimeWindowRightBoundary := max(maxCheckpointTs[clusterID], maxPDTimestampAfterCheckpointTs[clusterID]) + s3Watcher := t.s3Watcher[clusterID] + eg.Go(func() error { + s3CheckpointTs, err := s3Watcher.advanceS3CheckpointTs(ctx, minTimeWindowRightBoundary) + if err != nil { + return errors.Trace(err) + } + pdtsos, err := t.getPDTsFromOtherClusters(ctx, clusterID) + if err != nil { + return errors.Trace(err) + } + 
lock.Lock() + timeWindow := newTimeWindow[clusterID] + timeWindow.RightBoundary = s3CheckpointTs + timeWindow.MaxPDTimestampAfterTimeWindow = make(map[string]uint64) + maps.Copy(timeWindow.MaxPDTimestampAfterTimeWindow, pdtsos) + newTimeWindow[clusterID] = timeWindow + lock.Unlock() + return nil + }) + } + if err := eg.Wait(); err != nil { + return errors.Annotate(err, "advance time window failed") + } + t.updateTimeWindow(newTimeWindow) + t.round += 1 + return nil +} + +func (t *TimeWindowAdvancer) updateTimeWindow(newTimeWindow map[string]TimeWindow) { + for clusterID, timeWindow := range newTimeWindow { + triplet := t.timeWindowTriplet[clusterID] + triplet[0] = triplet[1] + triplet[1] = triplet[2] + timeWindow.LeftBoundary = triplet[2].RightBoundary + triplet[2] = timeWindow + t.timeWindowTriplet[clusterID] = triplet + log.Info("update time window", zap.String("clusterID", clusterID), zap.Any("timeWindow", timeWindow)) + } +} + +func (t *TimeWindowAdvancer) getPDTsFromOtherClusters(ctx context.Context, clusterID string) (map[string]uint64, error) { + var lock sync.Mutex + pdtsos := make(map[string]uint64) + eg, ctx := errgroup.WithContext(ctx) + for otherClusterID := range t.pdClients { + if otherClusterID == clusterID { + continue + } + pdClient := t.pdClients[otherClusterID] + eg.Go(func() error { + phyTs, logicTs, err := pdClient.GetTS(ctx) + if err != nil { + return errors.Trace(err) + } + ts := oracle.ComposeTS(phyTs, logicTs) + lock.Lock() + pdtsos[otherClusterID] = ts + lock.Unlock() + return nil + }) + } + return pdtsos, nil +} From e46d2740895378c034762527c84301f9b24e792a Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Fri, 23 Jan 2026 17:38:03 +0800 Subject: [PATCH 03/23] draft Signed-off-by: Jianjun Liao --- .../{ => advancer}/time_window_advancer.go | 95 +++- .../checker/checker.go | 428 ++++++++++++++++++ .../{ => config}/config.example.toml | 0 .../{ => config}/config.go | 6 +- .../consumer/s3_consumer.go | 388 ++++++++++++++++ 
cmd/multi-cluster-consistency-checker/main.go | 3 +- .../parser/decoder.go | 53 +++ .../parser/parser.go | 200 ++++++++ cmd/multi-cluster-consistency-checker/pd.go | 48 -- .../s3_watcher.go | 35 -- cmd/multi-cluster-consistency-checker/task.go | 63 ++- .../utils.go/types.go | 63 +++ .../{ => watcher}/checkpoint_watcher.go | 10 +- .../watcher/s3_watcher.go | 54 +++ pkg/common/table_info.go | 9 + 15 files changed, 1329 insertions(+), 126 deletions(-) rename cmd/multi-cluster-consistency-checker/{ => advancer}/time_window_advancer.go (61%) create mode 100644 cmd/multi-cluster-consistency-checker/checker/checker.go rename cmd/multi-cluster-consistency-checker/{ => config}/config.example.toml (100%) rename cmd/multi-cluster-consistency-checker/{ => config}/config.go (96%) create mode 100644 cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go create mode 100644 cmd/multi-cluster-consistency-checker/parser/decoder.go create mode 100644 cmd/multi-cluster-consistency-checker/parser/parser.go delete mode 100644 cmd/multi-cluster-consistency-checker/pd.go delete mode 100644 cmd/multi-cluster-consistency-checker/s3_watcher.go create mode 100644 cmd/multi-cluster-consistency-checker/utils.go/types.go rename cmd/multi-cluster-consistency-checker/{ => watcher}/checkpoint_watcher.go (95%) create mode 100644 cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go diff --git a/cmd/multi-cluster-consistency-checker/time_window_advancer.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go similarity index 61% rename from cmd/multi-cluster-consistency-checker/time_window_advancer.go rename to cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go index 6a6ddf81b9..c5c2aa7d66 100644 --- a/cmd/multi-cluster-consistency-checker/time_window_advancer.go +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go @@ -11,7 +11,7 @@ // See the License for the specific language governing permissions and // limitations under 
the License. -package main +package advancer import ( "context" @@ -19,7 +19,10 @@ import ( "sync" "github.com/pingcap/log" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/consumer" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" "github.com/tikv/client-go/v2/oracle" pd "github.com/tikv/pd/client" "go.uber.org/zap" @@ -28,16 +31,23 @@ import ( // TimeWindow is the time window of the cluster, including the left boundary, right boundary and checkpoint ts // Assert 1: LeftBoundary < CheckpointTs < RightBoundary -// Assert 2: The checkpoint timestamp of next time window should be larger than the MaxPDTimestampAfterTimeWindow +// Assert 2: The other cluster's checkpoint timestamp of next time window should be larger than the PDTimestampAfterTimeWindow saved in this cluster's time window +// Assert 3: CheckpointTs of this cluster should be larger than other clusters' RightBoundary of previous time window +// Assert 4: RightBoundary of this cluster should be larger than other clusters' CheckpointTs of this time window type TimeWindow struct { LeftBoundary uint64 RightBoundary uint64 // CheckpointTs is the checkpoint timestamp for each changefeed from upstream cluster, // mapping from downstream cluster ID to the checkpoint timestamp CheckpointTs map[string]uint64 - // MaxPDTimestampAfterTimeWindow is the max PD timestamp after the time window for each downstream cluster, + // PDTimestampAfterTimeWindow is the max PD timestamp after the time window for each downstream cluster, // mapping from upstream cluster ID to the max PD timestamp - MaxPDTimestampAfterTimeWindow map[string]uint64 + PDTimestampAfterTimeWindow map[string]uint64 +} + +type TimeWindowData struct { + TimeWindow + Data map[cloudstorage.DmlPathKey]consumer.IncrementalData } type TimeWindowAdvancer struct { @@ -49,40 +59,63 @@ type TimeWindowAdvancer struct { // 
checkpointWatcher is the Active-Active checkpoint watcher for each cluster, // mapping from cluster ID to the downstream cluster ID to the checkpoint watcher - checkpointWatcher map[string]map[string]*checkpointWatcher + checkpointWatcher map[string]map[string]*watcher.CheckpointWatcher // s3checkpointWatcher is the S3 checkpoint watcher for each cluster, mapping from cluster ID to the s3 checkpoint watcher - s3Watcher map[string]*s3Watcher + s3Watcher map[string]*watcher.S3Watcher // pdClients is the pd clients for each cluster, mapping from cluster ID to the pd client pdClients map[string]pd.Client } func NewTimeWindowAdvancer( - checkpointWatchers map[string]map[string]*checkpointWatcher, - s3Watchers map[string]*s3Watcher, + checkpointWatchers map[string]map[string]*watcher.CheckpointWatcher, + s3Watchers map[string]*watcher.S3Watcher, pdClients map[string]pd.Client, ) *TimeWindowAdvancer { + timeWindowTriplet := make(map[string][3]TimeWindow) + for clusterID := range pdClients { + timeWindowTriplet[clusterID] = [3]TimeWindow{} + } return &TimeWindowAdvancer{ round: 0, - timeWindowTriplet: make(map[string][3]TimeWindow), + timeWindowTriplet: timeWindowTriplet, checkpointWatcher: checkpointWatchers, s3Watcher: s3Watchers, pdClients: pdClients, } } -func (t *TimeWindowAdvancer) AdvanceTimeWindow(pctx context.Context) error { +// AdvanceTimeWindow advances the time window for each cluster. Here is the steps: +// 1. Advance the checkpoint ts for each upstream-downstream cluster changefeed. +// +// For any upstream-downstream cluster changefeed, the checkpoint ts should be advanced to +// the maximum of pd timestamp after previouds time window of downstream advanced and +// the right boundary of previouds time window of every clusters. +// +// 2. Advance the right boundary for each cluster. 
+// +// For any cluster, the right boundary should be advanced to the maximum of pd timestamp of +// the cluster after the checkpoint ts of its upstream cluster advanced and the previous +// timewindow's checkpoint ts of changefeed where the cluster is the upstream cluster or +// the downstream cluster. +// +// 3. Update the time window for each cluster. +// +// For any cluster, the time window should be updated to the new time window. +func (t *TimeWindowAdvancer) AdvanceTimeWindow( + pctx context.Context, +) (map[string]TimeWindowData, error) { log.Info("advance time window", zap.Uint64("round", t.round)) // mapping from upstream cluster ID to the downstream cluster ID to the min checkpoint timestamp minCheckpointTsMap := make(map[string]map[string]uint64) maxTimeWindowRightBoundary := uint64(0) for downstreamClusterID, triplet := range t.timeWindowTriplet { - for upstreamClusterID, maxPDTimestampAfterTimeWindow := range triplet[2].MaxPDTimestampAfterTimeWindow { + for upstreamClusterID, pdTimestampAfterTimeWindow := range triplet[2].PDTimestampAfterTimeWindow { if _, ok := minCheckpointTsMap[upstreamClusterID]; !ok { minCheckpointTsMap[upstreamClusterID] = make(map[string]uint64) } - minCheckpointTsMap[upstreamClusterID][downstreamClusterID] = max(minCheckpointTsMap[upstreamClusterID][downstreamClusterID], maxPDTimestampAfterTimeWindow) + minCheckpointTsMap[upstreamClusterID][downstreamClusterID] = max(minCheckpointTsMap[upstreamClusterID][downstreamClusterID], pdTimestampAfterTimeWindow) } maxTimeWindowRightBoundary = max(maxTimeWindowRightBoundary, triplet[2].RightBoundary) } @@ -98,7 +131,7 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow(pctx context.Context) error { for downstreamClusterID, checkpointWatcher := range downstreamCheckpointWatcherMap { mincheckpointTs := max(minCheckpointTsMap[upstreamClusterID][downstreamClusterID], maxTimeWindowRightBoundary) eg.Go(func() error { - checkpointTs, err := checkpointWatcher.advanceCheckpointTs(ctx, 
mincheckpointTs) + checkpointTs, err := checkpointWatcher.AdvanceCheckpointTs(ctx, mincheckpointTs) if err != nil { return errors.Trace(err) } @@ -124,16 +157,17 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow(pctx context.Context) error { } } if err := eg.Wait(); err != nil { - return errors.Annotate(err, "advance checkpoint timestamp failed") + return nil, errors.Annotate(err, "advance checkpoint timestamp failed") } // Update the time window for each cluster + newDataMap := make(map[string]map[cloudstorage.DmlPathKey]consumer.IncrementalData) eg, ctx = errgroup.WithContext(pctx) - for clusterID := range t.timeWindowTriplet { + for clusterID, triplet := range t.timeWindowTriplet { minTimeWindowRightBoundary := max(maxCheckpointTs[clusterID], maxPDTimestampAfterCheckpointTs[clusterID]) s3Watcher := t.s3Watcher[clusterID] eg.Go(func() error { - s3CheckpointTs, err := s3Watcher.advanceS3CheckpointTs(ctx, minTimeWindowRightBoundary) + s3CheckpointTs, newData, err := s3Watcher.AdvanceS3CheckpointTs(ctx, minTimeWindowRightBoundary) if err != nil { return errors.Trace(err) } @@ -142,21 +176,23 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow(pctx context.Context) error { return errors.Trace(err) } lock.Lock() + newDataMap[clusterID] = newData timeWindow := newTimeWindow[clusterID] + timeWindow.LeftBoundary = triplet[2].RightBoundary timeWindow.RightBoundary = s3CheckpointTs - timeWindow.MaxPDTimestampAfterTimeWindow = make(map[string]uint64) - maps.Copy(timeWindow.MaxPDTimestampAfterTimeWindow, pdtsos) + timeWindow.PDTimestampAfterTimeWindow = make(map[string]uint64) + maps.Copy(timeWindow.PDTimestampAfterTimeWindow, pdtsos) newTimeWindow[clusterID] = timeWindow lock.Unlock() return nil }) } if err := eg.Wait(); err != nil { - return errors.Annotate(err, "advance time window failed") + return nil, errors.Annotate(err, "advance time window failed") } t.updateTimeWindow(newTimeWindow) t.round += 1 - return nil + return newTimeWindowData(newTimeWindow, newDataMap), nil 
} func (t *TimeWindowAdvancer) updateTimeWindow(newTimeWindow map[string]TimeWindow) { @@ -164,17 +200,16 @@ func (t *TimeWindowAdvancer) updateTimeWindow(newTimeWindow map[string]TimeWindo triplet := t.timeWindowTriplet[clusterID] triplet[0] = triplet[1] triplet[1] = triplet[2] - timeWindow.LeftBoundary = triplet[2].RightBoundary triplet[2] = timeWindow t.timeWindowTriplet[clusterID] = triplet log.Info("update time window", zap.String("clusterID", clusterID), zap.Any("timeWindow", timeWindow)) } } -func (t *TimeWindowAdvancer) getPDTsFromOtherClusters(ctx context.Context, clusterID string) (map[string]uint64, error) { +func (t *TimeWindowAdvancer) getPDTsFromOtherClusters(pctx context.Context, clusterID string) (map[string]uint64, error) { var lock sync.Mutex pdtsos := make(map[string]uint64) - eg, ctx := errgroup.WithContext(ctx) + eg, ctx := errgroup.WithContext(pctx) for otherClusterID := range t.pdClients { if otherClusterID == clusterID { continue @@ -192,5 +227,19 @@ func (t *TimeWindowAdvancer) getPDTsFromOtherClusters(ctx context.Context, clust return nil }) } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } return pdtsos, nil } + +func newTimeWindowData(newTimeWindow map[string]TimeWindow, newDataMap map[string]map[cloudstorage.DmlPathKey]consumer.IncrementalData) map[string]TimeWindowData { + timeWindowDatas := make(map[string]TimeWindowData) + for clusterID, timeWindow := range newTimeWindow { + timeWindowDatas[clusterID] = TimeWindowData{ + TimeWindow: timeWindow, + Data: newDataMap[clusterID], + } + } + return timeWindowDatas +} diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go new file mode 100644 index 0000000000..1f6d626732 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -0,0 +1,428 @@ +// Copyright 2026 PingCAP, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package checker + +import ( + "context" + "sort" + + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/parser" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils.go" + "github.com/pingcap/ticdc/pkg/errors" + "go.uber.org/zap" +) + +type versionCacheEntry struct { + previous int + cdcVersion utils.CdcVersion +} + +type clusterViolationChecker struct { + clusterID string + twoPreviousTimeWindowKeyVersionCache map[utils.PkType]versionCacheEntry +} + +func newClusterViolationChecker(clusterID string) *clusterViolationChecker { + return &clusterViolationChecker{ + clusterID: clusterID, + twoPreviousTimeWindowKeyVersionCache: make(map[utils.PkType]versionCacheEntry), + } +} + +func (c *clusterViolationChecker) Check(r *utils.Record) { + entry, exists := c.twoPreviousTimeWindowKeyVersionCache[r.Pk] + if !exists { + c.twoPreviousTimeWindowKeyVersionCache[r.Pk] = versionCacheEntry{ + previous: 0, + cdcVersion: r.CdcVersion, + } + return + } + if entry.cdcVersion.CommitTs >= r.CommitTs { + // duplicated old version, just skip it + return + } + entryCompareTs := entry.cdcVersion.GetCompareTs() + recordCompareTs := r.GetCompareTs() + if entryCompareTs >= recordCompareTs { + // violation detected + log.Error("LWW violation detected", + zap.String("clusterID", c.clusterID), + 
zap.Any("entry", entry), + zap.Any("record", r)) + } + c.twoPreviousTimeWindowKeyVersionCache[r.Pk] = versionCacheEntry{ + previous: 0, + cdcVersion: r.CdcVersion, + } +} + +func (c *clusterViolationChecker) UpdateCache() { + newTwoPreviousTimeWindowKeyVersionCache := make(map[utils.PkType]versionCacheEntry) + for primaryKey, entry := range c.twoPreviousTimeWindowKeyVersionCache { + if entry.previous >= 2 { + continue + } + newTwoPreviousTimeWindowKeyVersionCache[primaryKey] = versionCacheEntry{ + previous: entry.previous + 1, + cdcVersion: entry.cdcVersion, + } + } + c.twoPreviousTimeWindowKeyVersionCache = newTwoPreviousTimeWindowKeyVersionCache +} + +type timeWindowDataCache struct { + // upstreamDataCache is a map of primary key to a map of commit ts to a record + upstreamDataCache map[utils.PkType]map[uint64]*utils.Record + + // downstreamDataCache is a map of primary key to a map of origin ts to a record + downstreamDataCache map[utils.PkType]map[uint64]*utils.Record + + leftBoundary uint64 + rightBoundary uint64 + checkpointTs map[string]uint64 +} + +func newTimeWindowDataCache(leftBoundary, rightBoundary uint64, checkpointTs map[string]uint64) timeWindowDataCache { + return timeWindowDataCache{ + upstreamDataCache: make(map[utils.PkType]map[uint64]*utils.Record), + downstreamDataCache: make(map[utils.PkType]map[uint64]*utils.Record), + leftBoundary: leftBoundary, + rightBoundary: rightBoundary, + checkpointTs: checkpointTs, + } +} + +func (twdc *timeWindowDataCache) newUpstreamRecord(record *utils.Record) { + recordsMap, exists := twdc.upstreamDataCache[record.Pk] + if !exists { + recordsMap = make(map[uint64]*utils.Record) + twdc.upstreamDataCache[record.Pk] = recordsMap + } + recordsMap[record.CommitTs] = record +} + +func (twdc *timeWindowDataCache) newDownstreamRecord(record *utils.Record) { + recordsMap, exists := twdc.downstreamDataCache[record.Pk] + if !exists { + recordsMap = make(map[uint64]*utils.Record) + twdc.downstreamDataCache[record.Pk] = 
recordsMap + } + recordsMap[record.OriginTs] = record +} + +func (twdc *timeWindowDataCache) NewRecord(record *utils.Record) { + if record.CommitTs <= twdc.leftBoundary { + // record is before the left boundary, just skip it + return + } + if record.OriginTs == 0 { + twdc.newUpstreamRecord(record) + } else { + twdc.newDownstreamRecord(record) + } +} + +type clusterDataChecker struct { + clusterID string + + timeWindowDataCaches [3]timeWindowDataCache + + rightBoundary uint64 + + overDataCaches []*utils.Record + + clusterViolationChecker *clusterViolationChecker +} + +func newClusterDataChecker(clusterID string) *clusterDataChecker { + return &clusterDataChecker{ + clusterID: clusterID, + timeWindowDataCaches: [3]timeWindowDataCache{}, + rightBoundary: 0, + overDataCaches: make([]*utils.Record, 0), + clusterViolationChecker: newClusterViolationChecker(clusterID), + } +} + +func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow advancer.TimeWindow) error { + if timeWindow.LeftBoundary != cd.rightBoundary { + return errors.Errorf("time window left boundary(%d) mismatch right boundary ts(%d)", timeWindow.LeftBoundary, cd.rightBoundary) + } + cd.timeWindowDataCaches[0] = cd.timeWindowDataCaches[1] + cd.timeWindowDataCaches[1] = cd.timeWindowDataCaches[2] + newTimeWindowDataCache := newTimeWindowDataCache(timeWindow.LeftBoundary, timeWindow.RightBoundary, timeWindow.CheckpointTs) + cd.rightBoundary = timeWindow.RightBoundary + newOverDataCache := make([]*utils.Record, 0, len(cd.overDataCaches)) + for _, overRecord := range cd.overDataCaches { + if overRecord.CommitTs > timeWindow.RightBoundary { + newOverDataCache = append(newOverDataCache, overRecord) + } else { + newTimeWindowDataCache.NewRecord(overRecord) + } + } + cd.timeWindowDataCaches[2] = newTimeWindowDataCache + cd.overDataCaches = newOverDataCache + return nil +} + +func (cd *clusterDataChecker) NewRecord(record *utils.Record) { + if record.CommitTs > cd.rightBoundary { + cd.overDataCaches = 
append(cd.overDataCaches, record)
+		return
+	}
+	cd.timeWindowDataCaches[2].NewRecord(record)
+}
+
+func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowIdx int, pk utils.PkType, originTs uint64) (*utils.Record, bool) {
+	records, exists := cd.timeWindowDataCaches[timeWindowIdx].downstreamDataCache[pk]
+	if !exists {
+		return nil, false
+	}
+	if record, exists := records[originTs]; exists {
+		return record, false
+	}
+	for _, record := range records {
+		if record.GetCompareTs() >= originTs {
+			return nil, true
+		}
+	}
+	return nil, false
+}
+
+func (cd *clusterDataChecker) findClusterUpstreamDataInTimeWindow(timeWindowIdx int, pk utils.PkType, commitTs uint64) bool {
+	records, exists := cd.timeWindowDataCaches[timeWindowIdx].upstreamDataCache[pk]
+	if !exists {
+		return false
+	}
+	_, exists = records[commitTs]
+	return exists
+}
+
+// dataLossDetection iterates through the upstream data caches [1] and [2] and selects the records
+// whose commit ts falls within the (checkpoint[1], checkpoint[2]]. The record must be present
+// in the downstream data cache [1] or [2] or another new record is present in the downstream data
+// cache [1] or [2].
+func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { + for _, upstreamDataCache := range cd.timeWindowDataCaches[1].upstreamDataCache { + for _, record := range upstreamDataCache { + for downstreamClusterID, checkpointTs := range cd.timeWindowDataCaches[1].checkpointTs { + if record.CommitTs <= checkpointTs { + continue + } + downstreamRecord, skipped := checker.FindClusterDownstreamData(downstreamClusterID, record.Pk, record.CommitTs) + if skipped { + continue + } + if downstreamRecord == nil { + // data loss detected + log.Error("data loss detected", + zap.String("upstreamClusterID", cd.clusterID), + zap.String("downstreamClusterID", downstreamClusterID), + zap.Any("record", record)) + } else if !record.EqualDownstreamRecord(downstreamRecord) { + // data inconsistent detected + log.Error("data inconsistent detected", + zap.String("upstreamClusterID", cd.clusterID), + zap.String("downstreamClusterID", downstreamClusterID), + zap.Any("record", record)) + } + } + } + } + for _, upstreamDataCache := range cd.timeWindowDataCaches[2].upstreamDataCache { + for _, record := range upstreamDataCache { + for downstreamClusterID, checkpointTs := range cd.timeWindowDataCaches[2].checkpointTs { + if record.CommitTs > checkpointTs { + continue + } + downstreamRecord, skipped := checker.FindClusterDownstreamData(downstreamClusterID, record.Pk, record.CommitTs) + if skipped { + continue + } + if downstreamRecord == nil { + // data loss detected + log.Error("data loss detected", + zap.String("upstreamClusterID", cd.clusterID), + zap.String("downstreamClusterID", downstreamClusterID), + zap.Any("record", record)) + } else if !record.EqualDownstreamRecord(downstreamRecord) { + // data inconsistent detected + log.Error("data inconsistent detected", + zap.String("upstreamClusterID", cd.clusterID), + zap.String("downstreamClusterID", downstreamClusterID), + zap.Any("record", record)) + } + } + } + } +} + +// dataRedundantDetection iterates through the downstream 
data cache [2]. The record must be present
+// in one of the other clusters' three retained upstream time-window data caches.
+func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) {
+	for _, downstreamDataCache := range cd.timeWindowDataCaches[2].downstreamDataCache {
+		for _, record := range downstreamDataCache {
+			// For downstream records, OriginTs is the upstream commit ts
+			if !checker.FindClusterUpstreamData(cd.clusterID, record.Pk, record.OriginTs) {
+				// data redundant detected
+				log.Error("data redundant detected",
+					zap.String("downstreamClusterID", cd.clusterID),
+					zap.Any("record", record))
+			}
+		}
+	}
+}
+
+// lwwViolationDetection checks the ordering of the records
+func (cd *clusterDataChecker) lwwViolationDetection() {
+	for pk, upstreamRecords := range cd.timeWindowDataCaches[2].upstreamDataCache {
+		downstreamRecords := cd.timeWindowDataCaches[2].downstreamDataCache[pk]
+		pkRecords := make([]*utils.Record, 0, len(upstreamRecords)+len(downstreamRecords))
+		for _, upstreamRecord := range upstreamRecords {
+			pkRecords = append(pkRecords, upstreamRecord)
+		}
+		for _, downstreamRecord := range downstreamRecords {
+			pkRecords = append(pkRecords, downstreamRecord)
+		}
+		sort.Slice(pkRecords, func(i, j int) bool {
+			return pkRecords[i].CommitTs < pkRecords[j].CommitTs
+		})
+		for _, record := range pkRecords {
+			cd.clusterViolationChecker.Check(record)
+		}
+	}
+	for pk, downstreamRecords := range cd.timeWindowDataCaches[2].downstreamDataCache {
+		if _, exists := cd.timeWindowDataCaches[2].upstreamDataCache[pk]; exists {
+			continue
+		}
+		pkRecords := make([]*utils.Record, 0, len(downstreamRecords))
+		for _, downstreamRecord := range downstreamRecords {
+			pkRecords = append(pkRecords, downstreamRecord)
+		}
+		sort.Slice(pkRecords, func(i, j int) bool {
+			return pkRecords[i].CommitTs < pkRecords[j].CommitTs
+		})
+		for _, record := range pkRecords {
+			cd.clusterViolationChecker.Check(record)
+		}
+	}
+
+	cd.clusterViolationChecker.UpdateCache()
+}
+
+func (cd
*clusterDataChecker) Check(checker *DataChecker) {
+	// CHECK 1 - Data Loss Detection
+	cd.dataLossDetection(checker)
+	// CHECK 2 - Data Redundant Detection
+	cd.dataRedundantDetection(checker)
+	// CHECK 3 - LWW Violation Detection
+	cd.lwwViolationDetection()
+}
+
+type DataChecker struct {
+	clusterDataCheckers map[string]*clusterDataChecker
+}
+
+func NewDataChecker(clusterConfig map[string]config.ClusterConfig) *DataChecker {
+	clusterDataChecker := make(map[string]*clusterDataChecker)
+	for clusterID := range clusterConfig {
+		clusterDataChecker[clusterID] = newClusterDataChecker(clusterID)
+	}
+	return &DataChecker{
+		clusterDataCheckers: clusterDataChecker,
+	}
+}
+
+// FindClusterDownstreamData checks whether the record is present in the downstream data
+// cache [1] or [2] or another new record is present in the downstream data cache [1] or [2].
+func (c *DataChecker) FindClusterDownstreamData(clusterID string, pk utils.PkType, originTs uint64) (*utils.Record, bool) {
+	clusterDataChecker, exists := c.clusterDataCheckers[clusterID]
+	if !exists {
+		return nil, false
+	}
+	record, skipped := clusterDataChecker.findClusterDownstreamDataInTimeWindow(1, pk, originTs)
+	if skipped || record != nil {
+		return record, skipped
+	}
+	return clusterDataChecker.findClusterDownstreamDataInTimeWindow(2, pk, originTs)
+}
+
+func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, pk utils.PkType, commitTs uint64) bool {
+	for _, clusterDataChecker := range c.clusterDataCheckers {
+		if clusterDataChecker.clusterID == downstreamClusterID {
+			continue
+		}
+		// timeWindowDataCaches is a [3]-element array; valid window indices are 0, 1 and 2
+		// (index 3 would panic with index-out-of-range at runtime).
+		if clusterDataChecker.findClusterUpstreamDataInTimeWindow(0, pk, commitTs) {
+			return true
+		}
+		if clusterDataChecker.findClusterUpstreamDataInTimeWindow(1, pk, commitTs) {
+			return true
+		}
+		if clusterDataChecker.findClusterUpstreamDataInTimeWindow(2, pk, commitTs) {
+			return true
+		}
+	}
+	return false
+}
+
+func (c *DataChecker) CheckInNextTimeWindow(ctx context.Context, newTimeWindowData
map[string]advancer.TimeWindowData) error { + if err := c.decodeNewTimeWindowData(ctx, newTimeWindowData); err != nil { + log.Error("failed to decode new time window data", zap.Error(err)) + return errors.Annotate(err, "failed to decode new time window data") + } + for _, clusterDataChecker := range c.clusterDataCheckers { + clusterDataChecker.Check(c) + } + return nil +} + +func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindowData map[string]advancer.TimeWindowData) error { + if len(newTimeWindowData) != len(c.clusterDataCheckers) { + return errors.Errorf("number of clusters mismatch, expected %d, got %d", len(c.clusterDataCheckers), len(newTimeWindowData)) + } + for clusterID, timeWindowData := range newTimeWindowData { + clusterDataChecker, exists := c.clusterDataCheckers[clusterID] + if !exists { + return errors.Errorf("cluster %s not found", clusterID) + } + if err := clusterDataChecker.PrepareNextTimeWindowData(timeWindowData.TimeWindow); err != nil { + return errors.Trace(err) + } + for dmlPathKey, incrementalData := range timeWindowData.Data { + tableParser, err := parser.NewTableParser(dmlPathKey.GetKey(), incrementalData.SchemaContent) + if err != nil { + return errors.Trace(err) + } + + // Parse CSV data from all file slices + for _, contents := range incrementalData.DataContentSlices { + for _, content := range contents { + records, err := tableParser.DecodeFiles(ctx, content) + if err != nil { + return errors.Trace(err) + } + for _, record := range records { + clusterDataChecker.NewRecord(record) + } + } + } + } + } + + return nil +} diff --git a/cmd/multi-cluster-consistency-checker/config.example.toml b/cmd/multi-cluster-consistency-checker/config/config.example.toml similarity index 100% rename from cmd/multi-cluster-consistency-checker/config.example.toml rename to cmd/multi-cluster-consistency-checker/config/config.example.toml diff --git a/cmd/multi-cluster-consistency-checker/config.go 
b/cmd/multi-cluster-consistency-checker/config/config.go similarity index 96% rename from cmd/multi-cluster-consistency-checker/config.go rename to cmd/multi-cluster-consistency-checker/config/config.go index f839dc0c93..a726e3c1d2 100644 --- a/cmd/multi-cluster-consistency-checker/config.go +++ b/cmd/multi-cluster-consistency-checker/config/config.go @@ -11,7 +11,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package main +package config import ( "fmt" @@ -56,7 +56,7 @@ type ClusterConfig struct { S3ChangefeedID string `toml:"s3-changefeed-id" json:"s3-changefeed-id"` // SecurityConfig is the security configuration for the cluster - SecurityConfig *security.Credential `toml:"security-config" json:"security-config"` + SecurityConfig security.Credential `toml:"security-config" json:"security-config"` // DownstreamClusterChangefeedConfig is the configuration for the changefeed of the downstream cluster // mapping from downstream cluster ID to the changefeed configuration @@ -64,7 +64,7 @@ type ClusterConfig struct { } // loadConfig loads the configuration from a TOML file -func loadConfig(path string) (*Config, error) { +func LoadConfig(path string) (*Config, error) { // Check if file exists if _, err := os.Stat(path); os.IsNotExist(err) { return nil, fmt.Errorf("config file does not exist: %s", path) diff --git a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go new file mode 100644 index 0000000000..103333134c --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go @@ -0,0 +1,388 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package consumer + +import ( + "context" + "maps" + "sort" + "strings" + "sync" + + "github.com/pingcap/log" + "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + "github.com/pingcap/tidb/br/pkg/storage" + "go.uber.org/zap" + "golang.org/x/sync/errgroup" +) + +type ( + fileIndexRange map[cloudstorage.FileIndexKey]indexRange + fileIndexKeyMap map[cloudstorage.FileIndexKey]uint64 +) + +// indexRange defines a range of files. eg. CDC000002.csv ~ CDC000005.csv +type indexRange struct { + start uint64 + end uint64 +} + +type schemaFilePathWithChecksum struct { + filepath string + checksum uint32 +} + +type S3Consumer struct { + s3Storage storage.ExternalStorage + fileExtension string + dateSeparator string + tableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap + schemaChecksumMap map[cloudstorage.SchemaPathKey]schemaFilePathWithChecksum + fileIndexWidth int +} + +func NewS3Consumer(s3Storage storage.ExternalStorage) *S3Consumer { + // Use default values for file extension and date separator + // File extension will be detected from files, date separator defaults to day + return &S3Consumer{ + s3Storage: s3Storage, + fileExtension: ".csv", + dateSeparator: config.DateSeparatorDay.String(), + tableDMLIdxMap: make(map[cloudstorage.DmlPathKey]fileIndexKeyMap), + schemaChecksumMap: make(map[cloudstorage.SchemaPathKey]schemaFilePathWithChecksum), + fileIndexWidth: config.DefaultFileIndexWidth, + } +} + +// diffDMLMaps returns the difference between two DML index maps (map1 - map2) +func diffDMLMaps( + map1, map2 
map[cloudstorage.DmlPathKey]fileIndexKeyMap, +) map[cloudstorage.DmlPathKey]fileIndexRange { + resMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) + for dmlPathKey1, fileIndexKeyMap1 := range map1 { + dmlPathKey2, ok := map2[dmlPathKey1] + if !ok { + resMap[dmlPathKey1] = make(fileIndexRange) + for indexKey, val1 := range fileIndexKeyMap1 { + resMap[dmlPathKey1][indexKey] = indexRange{ + start: 1, + end: val1, + } + } + continue + } + for fileIndexKey, val1 := range fileIndexKeyMap1 { + val2 := dmlPathKey2[fileIndexKey] + if val1 > val2 { + if _, ok := resMap[dmlPathKey1]; !ok { + resMap[dmlPathKey1] = make(fileIndexRange) + } + resMap[dmlPathKey1][fileIndexKey] = indexRange{ + start: val2 + 1, + end: val1, + } + } + } + } + return resMap +} + +// parseDMLFilePath parses a DML file path and updates the tableDMLIdxMap +func (c *S3Consumer) parseDMLFilePath(_ context.Context, path string) error { + var dmlkey cloudstorage.DmlPathKey + fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, path) + if err != nil { + return err + } + + m, ok := c.tableDMLIdxMap[dmlkey] + if !ok { + c.tableDMLIdxMap[dmlkey] = fileIndexKeyMap{ + fileIdx.FileIndexKey: fileIdx.Idx, + } + } else if fileIdx.Idx >= m[fileIdx.FileIndexKey] { + c.tableDMLIdxMap[dmlkey][fileIdx.FileIndexKey] = fileIdx.Idx + } + return nil +} + +// getNewFiles returns newly created dml files in specific ranges +func (c *S3Consumer) getNewFiles( + ctx context.Context, +) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { + tableDMLMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) + opt := &storage.WalkOption{SubDir: ""} + + // Save a snapshot of current tableDMLIdxMap + origDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap, len(c.tableDMLIdxMap)) + for k, v := range c.tableDMLIdxMap { + m := make(fileIndexKeyMap) + maps.Copy(m, v) + origDMLIdxMap[k] = m + } + + // Walk through all files in S3 storage + if err := c.s3Storage.WalkDir(ctx, opt, func(path string, size int64) error { + if 
cloudstorage.IsSchemaFile(path) { + var schemaKey cloudstorage.SchemaPathKey + checksumInFile, err := schemaKey.ParseSchemaFilePath(path) + if err != nil { + log.Error("failed to parse schema file path, skipping", + zap.String("path", path), + zap.Error(err)) + return nil + } + c.schemaChecksumMap[schemaKey] = schemaFilePathWithChecksum{ + filepath: path, + checksum: checksumInFile, + } + return nil + } + + // Try to parse DML file path if it matches the expected extension + if strings.HasSuffix(path, c.fileExtension) { + err := c.parseDMLFilePath(ctx, path) + if err != nil { + log.Error("failed to parse dml file path, skipping", + zap.String("path", path), + zap.Error(err)) + return nil + } + } + return nil + }); err != nil { + return tableDMLMap, errors.Trace(err) + } + + // Calculate the difference to find new files + tableDMLMap = diffDMLMaps(c.tableDMLIdxMap, origDMLIdxMap) + return tableDMLMap, nil +} + +type IncrementalData struct { + DataContentSlices map[cloudstorage.FileIndexKey][][]byte + SchemaContent []byte +} + +// downloadDMLFiles downloads DML files concurrently and returns their content +func (c *S3Consumer) downloadDMLFiles( + ctx context.Context, + newFiles map[cloudstorage.DmlPathKey]fileIndexRange, +) (map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte, error) { + result := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte) + + // Prepare all file download tasks + type downloadTask struct { + dmlPathKey cloudstorage.DmlPathKey + fileIndex *cloudstorage.FileIndex + } + + var tasks []downloadTask + for dmlPathKey, fileRange := range newFiles { + for indexKey, indexRange := range fileRange { + for i := indexRange.start; i <= indexRange.end; i++ { + fileIdx := &cloudstorage.FileIndex{ + FileIndexKey: indexKey, + Idx: i, + } + tasks = append(tasks, downloadTask{ + dmlPathKey: dmlPathKey, + fileIndex: fileIdx, + }) + } + } + } + + log.Info("starting concurrent DML file download", zap.Int("totalFiles", 
len(tasks))) + + // Concurrently download files + type fileContent struct { + dmlPathKey cloudstorage.DmlPathKey + indexKey cloudstorage.FileIndexKey + idx uint64 + content []byte + } + + fileContents := make(chan fileContent, len(tasks)) + eg, egCtx := errgroup.WithContext(ctx) + for _, task := range tasks { + eg.Go(func() error { + filePath := task.dmlPathKey.GenerateDMLFilePath( + task.fileIndex, + c.fileExtension, + c.fileIndexWidth, + ) + + content, err := c.s3Storage.ReadFile(egCtx, filePath) + if err != nil { + return errors.Annotatef(err, "failed to read file: %s", filePath) + } + + // Channel writes are thread-safe, no mutex needed + fileContents <- fileContent{ + dmlPathKey: task.dmlPathKey, + indexKey: task.fileIndex.FileIndexKey, + idx: task.fileIndex.Idx, + content: content, + } + return nil + }) + } + + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + close(fileContents) + + // Group and merge file contents by DmlPathKey and FileIndexKey + type contentWithIdx struct { + idx uint64 + content []byte + } + tempResult := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][]contentWithIdx) + + for fc := range fileContents { + if tempResult[fc.dmlPathKey] == nil { + tempResult[fc.dmlPathKey] = make(map[cloudstorage.FileIndexKey][]contentWithIdx) + } + tempResult[fc.dmlPathKey][fc.indexKey] = append( + tempResult[fc.dmlPathKey][fc.indexKey], + contentWithIdx{idx: fc.idx, content: fc.content}, + ) + } + + // Merge contents in order (by idx) for each FileIndexKey + for dmlPathKey, indexKeyMap := range tempResult { + if result[dmlPathKey] == nil { + result[dmlPathKey] = make(map[cloudstorage.FileIndexKey][][]byte) + } + for indexKey, contentWithIdxs := range indexKeyMap { + // Sort by idx to maintain file order + sort.Slice(contentWithIdxs, func(i, j int) bool { + return contentWithIdxs[i].idx < contentWithIdxs[j].idx + }) + contents := make([][]byte, len(contentWithIdxs)) + for i, contentWithIdx := range contentWithIdxs { + 
contents[i] = contentWithIdx.content + } + result[dmlPathKey][indexKey] = contents + } + } + + return result, nil +} + +// downloadSchemaFiles downloads schema files concurrently for the given SchemaPathKeys +func (c *S3Consumer) downloadSchemaFiles( + ctx context.Context, + schemaPathKeys map[cloudstorage.SchemaPathKey]struct{}, +) (map[cloudstorage.SchemaPathKey][]byte, error) { + schemaContents := make(map[cloudstorage.SchemaPathKey][]byte) + schemaMutex := &sync.Mutex{} + eg, egCtx := errgroup.WithContext(ctx) + + log.Info("starting concurrent schema file download", zap.Int("totalSchemas", len(schemaPathKeys))) + + for schemaPathKey := range schemaPathKeys { + eg.Go(func() error { + checksumInFile, exists := c.schemaChecksumMap[schemaPathKey] + if !exists { + return errors.Annotatef(errors.ErrInternalCheckFailed, "schema file not found: %s", schemaPathKey.GetKey()) + } + schemaPath := checksumInFile.filepath + content, err := c.s3Storage.ReadFile(egCtx, schemaPath) + if err != nil { + return errors.Annotatef(err, "failed to read schema file: %s", schemaPath) + } + + schemaMutex.Lock() + schemaContents[schemaPathKey] = content + schemaMutex.Unlock() + + return nil + }) + } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + + return schemaContents, nil +} + +// ConsumeNewFiles downloads new files concurrently and returns their content +func (c *S3Consumer) ConsumeNewFiles(ctx context.Context) (map[cloudstorage.DmlPathKey]IncrementalData, error) { + newFiles, err := c.getNewFiles(ctx) + if err != nil { + return nil, errors.Trace(err) + } + + if len(newFiles) == 0 { + log.Info("no new dml files found") + return nil, nil + } + + // Download DML files + dmlData, err := c.downloadDMLFiles(ctx, newFiles) + if err != nil { + return nil, errors.Trace(err) + } + + // Collect unique SchemaPathKeys + schemaPathKeys := make(map[cloudstorage.SchemaPathKey]struct{}) + for dmlPathKey := range newFiles { + schemaPathKey := dmlPathKey.SchemaPathKey + 
schemaPathKeys[schemaPathKey] = struct{}{} + } + + // Download schema files + schemaContents, err := c.downloadSchemaFiles(ctx, schemaPathKeys) + if err != nil { + return nil, errors.Trace(err) + } + + // Combine DML data and schema data into result + result := make(map[cloudstorage.DmlPathKey]IncrementalData) + for dmlPathKey, dataSlices := range dmlData { + incrementalData := IncrementalData{ + DataContentSlices: dataSlices, + } + // Assign schema content if available + schemaPathKey := dmlPathKey.SchemaPathKey + if schemaContent, ok := schemaContents[schemaPathKey]; ok { + incrementalData.SchemaContent = schemaContent + } + result[dmlPathKey] = incrementalData + } + + // Log the new files found + for dmlPathKey, fileRange := range newFiles { + for indexKey, indexRange := range fileRange { + log.Info("found and downloaded new dml files", + zap.String("schema", dmlPathKey.Schema), + zap.String("table", dmlPathKey.Table), + zap.Uint64("tableVersion", dmlPathKey.TableVersion), + zap.Int64("partitionNum", dmlPathKey.PartitionNum), + zap.String("date", dmlPathKey.Date), + zap.String("dispatcherID", indexKey.DispatcherID), + zap.Bool("enableTableAcrossNodes", indexKey.EnableTableAcrossNodes), + zap.Uint64("startIndex", indexRange.start), + zap.Uint64("endIndex", indexRange.end), + zap.Int("fileCount", int(indexRange.end-indexRange.start+1))) + } + } + + return result, nil +} diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go index 335475ed42..4f24ae3e85 100644 --- a/cmd/multi-cluster-consistency-checker/main.go +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -18,6 +18,7 @@ import ( "fmt" "os" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" "github.com/spf13/cobra" ) @@ -58,7 +59,7 @@ func run(cmd *cobra.Command, args []string) { os.Exit(ExitCodeInvalidConfig) } - cfg, err := loadConfig(cfgPath) + cfg, err := config.LoadConfig(cfgPath) if err != nil { fmt.Fprintf(os.Stderr, "failed 
to load config: %v\n", err) os.Exit(ExitCodeDecodeConfigFailed) diff --git a/cmd/multi-cluster-consistency-checker/parser/decoder.go b/cmd/multi-cluster-consistency-checker/parser/decoder.go new file mode 100644 index 0000000000..2179a18b7c --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/parser/decoder.go @@ -0,0 +1,53 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package parser + +import ( + "context" + + commonType "github.com/pingcap/ticdc/pkg/common" + "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/errors" + codecCommon "github.com/pingcap/ticdc/pkg/sink/codec/common" + "github.com/pingcap/ticdc/pkg/sink/codec/csv" +) + +func defaultCsvCodecConfig(protocol config.Protocol) *codecCommon.Config { + codecConfig := codecCommon.NewConfig(protocol) + codecConfig.Delimiter = "," + codecConfig.Quote = "\"" + codecConfig.NullString = "NULL" + codecConfig.IncludeCommitTs = true + codecConfig.Terminator = "\r\n" + return codecConfig +} + +type csvDecoder struct { + codecConfig *codecCommon.Config +} + +func NewCsvDecoder() *csvDecoder { + codecConfig := defaultCsvCodecConfig(config.ProtocolCsv) + return &csvDecoder{ + codecConfig: codecConfig, + } +} + +func (d *csvDecoder) NewDecoder(ctx context.Context, tableInfo *commonType.TableInfo, content []byte) (codecCommon.Decoder, error) { + decoder, err := csv.NewDecoder(ctx, d.codecConfig, tableInfo, content) + if err != nil { + return nil, errors.Trace(err) + } + return decoder, nil +} diff --git 
a/cmd/multi-cluster-consistency-checker/parser/parser.go b/cmd/multi-cluster-consistency-checker/parser/parser.go new file mode 100644 index 0000000000..6e9dec90d1 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/parser/parser.go @@ -0,0 +1,200 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package parser + +import ( + "context" + "encoding/hex" + "encoding/json" + "time" + + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils.go" + "github.com/pingcap/ticdc/pkg/common" + commonType "github.com/pingcap/ticdc/pkg/common" + "github.com/pingcap/ticdc/pkg/common/event" + "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + codecCommon "github.com/pingcap/ticdc/pkg/sink/codec/common" + "github.com/pingcap/tidb/pkg/types" + "github.com/pingcap/tidb/pkg/util/chunk" + "github.com/pingcap/tidb/pkg/util/codec" + "go.uber.org/zap" +) + +type tableParser struct { + tableKey string + tableInfo *common.TableInfo + pkColumnOffsets map[int64]int + csvDecoder *csvDecoder +} + +func NewTableParser(tableKey string, content []byte) (*tableParser, error) { + tableParser := &tableParser{} + if err := tableParser.parseTableInfo(tableKey, content); err != nil { + return nil, errors.Trace(err) + } + tableParser.csvDecoder = NewCsvDecoder() + return tableParser, nil +} + +func (pt *tableParser) parseTableInfo(tableKey string, content []byte) error { + // Parse schema content to get tableInfo + var tableDef 
cloudstorage.TableDefinition + if err := json.Unmarshal(content, &tableDef); err != nil { + log.Error("failed to unmarshal schema content", + zap.String("tableKey", tableKey), + zap.ByteString("content", content), + zap.Error(err)) + return errors.Trace(err) + } + + tableInfo, err := tableDef.ToTableInfo() + if err != nil { + log.Error("failed to convert table definition to table info", + zap.String("tableKey", tableKey), + zap.ByteString("content", content), + zap.Error(err)) + return errors.Trace(err) + } + + pkColInfos := tableInfo.GetPrimaryKeyColumnInfos() + if len(pkColInfos) == 0 { + log.Error("table has no primary key", + zap.String("tableKey", tableKey), + zap.ByteString("content", content)) + return errors.Errorf("table %s has no primary key", tableKey) + } + + columns := tableInfo.GetColumns() + pkColumnOffsets := make(map[int64]int) + for i, pkColInfo := range pkColInfos { + if pkColInfo.Offset < 0 || pkColInfo.Offset >= len(columns) { + log.Error("primary key column offset out of range", + zap.String("tableKey", tableKey), + zap.Int("offset", pkColInfo.Offset), + zap.Int("len(columns)", len(columns)), + zap.ByteString("content", content)) + return errors.Errorf("primary key column offset out of range for column %d in table %s", pkColInfo.Offset, tableKey) + } + pkColumnOffsets[columns[pkColInfo.Offset].ID] = i + } + pt.tableKey = tableKey + pt.tableInfo = tableInfo + pt.pkColumnOffsets = pkColumnOffsets + return nil +} + +func (pt *tableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Record, error) { + originTs := uint64(0) + pkCount := 0 + colInfos := pt.tableInfo.GetColInfosForRowChangedEvent() + columnValues := make([]utils.ColumnValue, 0, len(colInfos)) + pkColumnValues := make([]types.Datum, len(pt.pkColumnOffsets)) + for _, colInfo := range colInfos { + col, ok := pt.tableInfo.GetColumnInfo(colInfo.ID) + if !ok { + log.Error("column info not found", + zap.String("tableKey", pt.tableKey), + zap.Int64("colID", colInfo.ID)) + return 
nil, errors.Errorf("column info not found for column %d in table %s", colInfo.ID, pt.tableKey) + } + rowColOffset, ok := pt.tableInfo.GetRowColumnsOffset()[colInfo.ID] + if !ok { + log.Error("row column offset not found", + zap.String("tableKey", pt.tableKey), + zap.Int64("colID", colInfo.ID)) + return nil, errors.Errorf("row column offset not found for column %d in table %s", colInfo.ID, pt.tableKey) + } + if offset, ok := pt.pkColumnOffsets[colInfo.ID]; ok { + dt := row.GetDatum(rowColOffset, &col.FieldType) + if !pkColumnValues[offset].IsNull() { + log.Error("duplicated primary key column value", + zap.String("tableKey", pt.tableKey), + zap.Int64("colID", colInfo.ID)) + return nil, errors.Errorf("duplicated primary key column value for column %d in table %s", colInfo.ID, pt.tableKey) + } + pkColumnValues[offset] = dt + pkCount += 1 + continue + } + colValue := commonType.ExtractColVal(row, col, rowColOffset) + if col.Name.O == event.OriginTsColumn { + var ok bool + originTs, ok = colValue.(uint64) + if !ok { + log.Error("origin ts column value is not uint64", + zap.String("tableKey", pt.tableKey), + zap.Int64("colID", colInfo.ID), + zap.Any("colValue", colValue)) + return nil, errors.Errorf("origin ts column value is not uint64 for column %d in table %s", colInfo.ID, pt.tableKey) + } + } else { + columnValues = append(columnValues, utils.ColumnValue{ + ColumnID: colInfo.ID, + Value: colValue, + }) + } + } + if pkCount != len(pt.pkColumnOffsets) { + log.Error("primary key column value missing", + zap.String("tableKey", pt.tableKey), + zap.Int("pkCount", pkCount), + zap.Int("len(pt.pkColumnOffsets)", len(pt.pkColumnOffsets))) + return nil, errors.Errorf("primary key column value is null for table %s", pt.tableKey) + } + pkEncoded, err := codec.EncodeKey(time.UTC, nil, pkColumnValues...) 
+ if err != nil { + return nil, errors.Annotate(err, "failed to encode primary key") + } + pk := hex.EncodeToString(pkEncoded) + return &utils.Record{ + Pk: utils.PkType(pk), + ColumnValues: columnValues, + CdcVersion: utils.CdcVersion{ + CommitTs: commitTs, + OriginTs: originTs, + }, + }, nil +} + +func (pt *tableParser) DecodeFiles(ctx context.Context, content []byte) ([]*utils.Record, error) { + records := make([]*utils.Record, 0) + + decoder, err := pt.csvDecoder.NewDecoder(ctx, pt.tableInfo, content) + if err != nil { + return nil, errors.Trace(err) + } + + for { + msgType, hasNext := decoder.HasNext() + if !hasNext { + break + } + if msgType != codecCommon.MessageTypeRow { + continue + } + dmlEvent := decoder.NextDMLEvent() + if dmlEvent == nil || dmlEvent.Rows == nil || dmlEvent.Rows.NumRows() == 0 { + continue + } + row := dmlEvent.Rows.GetRow(0) + record, err := pt.parseRecord(&row, dmlEvent.CommitTs) + if err != nil { + return nil, errors.Trace(err) + } + records = append(records, record) + } + return records, nil +} diff --git a/cmd/multi-cluster-consistency-checker/pd.go b/cmd/multi-cluster-consistency-checker/pd.go deleted file mode 100644 index a304e7c13d..0000000000 --- a/cmd/multi-cluster-consistency-checker/pd.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -import ( - "context" - "time" - - "github.com/pingcap/ticdc/pkg/errors" - "github.com/pingcap/ticdc/pkg/etcd" - "github.com/pingcap/ticdc/pkg/security" - pd "github.com/tikv/pd/client" - pdopt "github.com/tikv/pd/client/opt" - "google.golang.org/grpc" -) - -func newClient(ctx context.Context, pdAddr string, securityConfig *security.Credential) (pd.Client, *etcd.CDCEtcdClientImpl, error) { - pdClient, err := pd.NewClientWithContext( - ctx, "consistency-checker", []string{pdAddr}, securityConfig.PDSecurityOption(), - pdopt.WithCustomTimeoutOption(10*time.Second), - ) - if err != nil { - return nil, nil, errors.Trace(err) - } - - etcdCli, err := etcd.CreateRawEtcdClient(securityConfig, grpc.EmptyDialOption{}, pdAddr) - if err != nil { - return nil, nil, errors.Trace(err) - } - - cdcEtcdClient, err := etcd.NewCDCEtcdClient(ctx, etcdCli, "default") - if err != nil { - return nil, nil, errors.Trace(err) - } - - return pdClient, cdcEtcdClient, nil -} diff --git a/cmd/multi-cluster-consistency-checker/s3_watcher.go b/cmd/multi-cluster-consistency-checker/s3_watcher.go deleted file mode 100644 index 0f56d2d10d..0000000000 --- a/cmd/multi-cluster-consistency-checker/s3_watcher.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -import ( - "context" - - "github.com/pingcap/ticdc/pkg/errors" -) - -type s3Watcher struct { - checkpointWatcher *checkpointWatcher -} - -func (sw *s3Watcher) advanceS3CheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { - checkpointTs, err := sw.checkpointWatcher.advanceCheckpointTs(ctx, minCheckpointTs) - if err != nil { - return 0, errors.Annotate(err, "advance s3 checkpoint timestamp failed") - } - - // TODO: get the index updated from the s3 - - return checkpointTs, nil -} diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index ec5a9284f7..468107009f 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -15,47 +15,88 @@ package main import ( "context" + "time" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/checker" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/etcd" + "github.com/pingcap/ticdc/pkg/security" + putil "github.com/pingcap/ticdc/pkg/util" pd "github.com/tikv/pd/client" + pdopt "github.com/tikv/pd/client/opt" + "google.golang.org/grpc" ) -func runTask(ctx context.Context, cfg *Config) error { +func runTask(ctx context.Context, cfg *config.Config) error { checkpointWatchers, s3Watchers, pdClients, err := initClients(ctx, cfg) if err != nil { return errors.Trace(err) } - timeWindowAdvancer := NewTimeWindowAdvancer(checkpointWatchers, s3Watchers, pdClients) + timeWindowAdvancer := advancer.NewTimeWindowAdvancer(checkpointWatchers, s3Watchers, pdClients) + dataChecker := checker.NewDataChecker(cfg.Clusters) for { - err = timeWindowAdvancer.AdvanceTimeWindow(ctx) + newTimeWindowData, err := timeWindowAdvancer.AdvanceTimeWindow(ctx) if 
err != nil { return errors.Trace(err) } + if err := dataChecker.CheckInNextTimeWindow(ctx, newTimeWindowData); err != nil { + return errors.Trace(err) + } } } -func initClients(ctx context.Context, cfg *Config) (map[string]map[string]*checkpointWatcher, map[string]*s3Watcher, map[string]pd.Client, error) { - checkpointWatchers := make(map[string]map[string]*checkpointWatcher) - s3Watchers := make(map[string]*s3Watcher) +func initClients(ctx context.Context, cfg *config.Config) (map[string]map[string]*watcher.CheckpointWatcher, map[string]*watcher.S3Watcher, map[string]pd.Client, error) { + checkpointWatchers := make(map[string]map[string]*watcher.CheckpointWatcher) + s3Watchers := make(map[string]*watcher.S3Watcher) pdClients := make(map[string]pd.Client) for clusterID, clusterConfig := range cfg.Clusters { - pdClient, etcdClient, err := newClient(ctx, clusterConfig.PDAddr, clusterConfig.SecurityConfig) + pdClient, etcdClient, err := newPDClient(ctx, clusterConfig.PDAddr, &clusterConfig.SecurityConfig) if err != nil { return nil, nil, nil, errors.Trace(err) } - upstreamCheckpointWatchers := make(map[string]*checkpointWatcher) + upstreamCheckpointWatchers := make(map[string]*watcher.CheckpointWatcher) for downstreamClusterID, downstreamClusterChangefeedConfig := range clusterConfig.DownstreamClusterChangefeedConfig { - checkpointWatcher := NewCheckpointWatcher(clusterID, downstreamClusterID, downstreamClusterChangefeedConfig.ChangefeedID, etcdClient) + checkpointWatcher := watcher.NewCheckpointWatcher(clusterID, downstreamClusterID, downstreamClusterChangefeedConfig.ChangefeedID, etcdClient) upstreamCheckpointWatchers[downstreamClusterID] = checkpointWatcher } checkpointWatchers[clusterID] = upstreamCheckpointWatchers - s3Watcher := &s3Watcher{ - checkpointWatcher: NewCheckpointWatcher(clusterID, "s3", clusterConfig.S3ChangefeedID, etcdClient), + s3Storage, err := putil.GetExternalStorageWithDefaultTimeout(ctx, clusterConfig.S3SinkURI) + if err != nil { + return 
nil, nil, nil, errors.Trace(err) } + s3Watcher := watcher.NewS3Watcher( + watcher.NewCheckpointWatcher(clusterID, "s3", clusterConfig.S3ChangefeedID, etcdClient), + s3Storage, + ) s3Watchers[clusterID] = s3Watcher pdClients[clusterID] = pdClient } return checkpointWatchers, s3Watchers, pdClients, nil } + +func newPDClient(ctx context.Context, pdAddr string, securityConfig *security.Credential) (pd.Client, *etcd.CDCEtcdClientImpl, error) { + pdClient, err := pd.NewClientWithContext( + ctx, "consistency-checker", []string{pdAddr}, securityConfig.PDSecurityOption(), + pdopt.WithCustomTimeoutOption(10*time.Second), + ) + if err != nil { + return nil, nil, errors.Trace(err) + } + + etcdCli, err := etcd.CreateRawEtcdClient(securityConfig, grpc.EmptyDialOption{}, pdAddr) + if err != nil { + return nil, nil, errors.Trace(err) + } + + cdcEtcdClient, err := etcd.NewCDCEtcdClient(ctx, etcdCli, "default") + if err != nil { + return nil, nil, errors.Trace(err) + } + + return pdClient, cdcEtcdClient, nil +} diff --git a/cmd/multi-cluster-consistency-checker/utils.go/types.go b/cmd/multi-cluster-consistency-checker/utils.go/types.go new file mode 100644 index 0000000000..e8168dc83a --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/utils.go/types.go @@ -0,0 +1,63 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package utils + +type PkType string + +type ColumnValue struct { + ColumnID int64 + Value any +} + +type CdcVersion struct { + CommitTs uint64 + OriginTs uint64 +} + +func (v *CdcVersion) GetCompareTs() uint64 { + if v.OriginTs > 0 { + return v.OriginTs + } + return v.CommitTs +} + +type Record struct { + CdcVersion + Pk PkType + ColumnValues []ColumnValue +} + +func (r *Record) EqualDownstreamRecord(downstreamRecord *Record) bool { + if downstreamRecord == nil { + return false + } + if r.CommitTs != downstreamRecord.OriginTs { + return false + } + if r.Pk != downstreamRecord.Pk { + return false + } + if len(r.ColumnValues) != len(downstreamRecord.ColumnValues) { + return false + } + for i, columnValue := range r.ColumnValues { + if columnValue.ColumnID != downstreamRecord.ColumnValues[i].ColumnID { + return false + } + if columnValue.Value != downstreamRecord.ColumnValues[i].Value { + return false + } + } + return true +} diff --git a/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go similarity index 95% rename from cmd/multi-cluster-consistency-checker/checkpoint_watcher.go rename to cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go index a819761d03..86c56e20b8 100644 --- a/cmd/multi-cluster-consistency-checker/checkpoint_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go @@ -11,7 +11,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package main +package watcher import ( "context" @@ -25,7 +25,7 @@ import ( "go.uber.org/zap" ) -type checkpointWatcher struct { +type CheckpointWatcher struct { upstreamClusterID string downstreamClusterID string changefeedID common.ChangeFeedID @@ -35,8 +35,8 @@ type checkpointWatcher struct { func NewCheckpointWatcher( upstreamClusterID, downstreamClusterID, changefeedID string, etcdClient etcd.CDCEtcdClient, -) *checkpointWatcher { - return &checkpointWatcher{ +) *CheckpointWatcher { + return &CheckpointWatcher{ upstreamClusterID: upstreamClusterID, downstreamClusterID: downstreamClusterID, changefeedID: common.NewChangeFeedIDWithName(changefeedID, "default"), @@ -45,7 +45,7 @@ func NewCheckpointWatcher( } // advanceCheckpointTs waits for the checkpoint to exceed minCheckpointTs -func (cw *checkpointWatcher) advanceCheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { +func (cw *CheckpointWatcher) AdvanceCheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { // First, get the current chceckpoint status from etcd status, modRev, err := cw.etcdClient.GetChangeFeedStatus(ctx, cw.changefeedID) if err != nil { diff --git a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go new file mode 100644 index 0000000000..e83146f0bf --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go @@ -0,0 +1,54 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package watcher + +import ( + "context" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/consumer" + "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + "github.com/pingcap/tidb/br/pkg/storage" +) + +type S3Watcher struct { + checkpointWatcher *CheckpointWatcher + consumer *consumer.S3Consumer +} + +func NewS3Watcher( + checkpointWatcher *CheckpointWatcher, + s3Storage storage.ExternalStorage, +) *S3Watcher { + consumer := consumer.NewS3Consumer(s3Storage) + return &S3Watcher{ + checkpointWatcher: checkpointWatcher, + consumer: consumer, + } +} + +func (sw *S3Watcher) AdvanceS3CheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, map[cloudstorage.DmlPathKey]consumer.IncrementalData, error) { + checkpointTs, err := sw.checkpointWatcher.AdvanceCheckpointTs(ctx, minCheckpointTs) + if err != nil { + return 0, nil, errors.Annotate(err, "advance s3 checkpoint timestamp failed") + } + + // TODO: get the index updated from the s3 + newData, err := sw.consumer.ConsumeNewFiles(ctx) + if err != nil { + return 0, nil, errors.Annotate(err, "consume new files failed") + } + + return checkpointTs, newData, nil +} diff --git a/pkg/common/table_info.go b/pkg/common/table_info.go index 3b1d3d7cf4..c8a8c58b3b 100644 --- a/pkg/common/table_info.go +++ b/pkg/common/table_info.go @@ -525,6 +525,15 @@ func (ti *TableInfo) GetPkColInfo() *model.ColumnInfo { return ti.columnSchema.GetPkColInfo() } +func (ti *TableInfo) GetPrimaryKeyColumnInfos() []*model.IndexColumn { + for _, idx := range ti.columnSchema.Indices { + if idx.Primary { + return idx.Columns + } + } + return nil +} + // GetPrimaryKeyColumnNames returns the primary key column names func (ti *TableInfo) GetPrimaryKeyColumnNames() []string { var result []string From 186fcc764f08868e12e76fb7848027cfb343ad58 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Mon, 26 Jan 2026 11:46:45 +0800 Subject: [PATCH 04/23] draft Signed-off-by: Jianjun Liao --- 
.../parser/parser.go | 51 ++++++++++++------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/parser/parser.go b/cmd/multi-cluster-consistency-checker/parser/parser.go index 6e9dec90d1..8e730ae6f0 100644 --- a/cmd/multi-cluster-consistency-checker/parser/parser.go +++ b/cmd/multi-cluster-consistency-checker/parser/parser.go @@ -33,6 +33,31 @@ import ( "go.uber.org/zap" ) +func getPkColumnOffset(tableInfo *commonType.TableInfo) (map[int64]int, error) { + if tableInfo.PKIsHandle() { + pkColInfo := tableInfo.GetPkColInfo() + if pkColInfo == nil { + return nil, errors.Errorf("table %s has no primary key", tableInfo.GetTableName()) + } + return map[int64]int{pkColInfo.ID: 0}, nil + } + + pkColInfos := tableInfo.GetPrimaryKeyColumnInfos() + if len(pkColInfos) == 0 { + return nil, errors.Errorf("table %s has no primary key", tableInfo.GetTableName()) + } + + columns := tableInfo.GetColumns() + pkColumnOffsets := make(map[int64]int) + for i, pkColInfo := range pkColInfos { + if pkColInfo.Offset < 0 || pkColInfo.Offset >= len(columns) { + return nil, errors.Errorf("primary key column offset (%d) out of range for column (%d) in table %s", pkColInfo.Offset, len(columns), tableInfo.GetTableName()) + } + pkColumnOffsets[columns[pkColInfo.Offset].ID] = i + } + return pkColumnOffsets, nil +} + type tableParser struct { tableKey string tableInfo *common.TableInfo @@ -69,27 +94,15 @@ func (pt *tableParser) parseTableInfo(tableKey string, content []byte) error { return errors.Trace(err) } - pkColInfos := tableInfo.GetPrimaryKeyColumnInfos() - if len(pkColInfos) == 0 { - log.Error("table has no primary key", + pkColumnOffsets, err := getPkColumnOffset(tableInfo) + if err != nil { + log.Error("failed to get primary key column offsets", zap.String("tableKey", tableKey), - zap.ByteString("content", content)) - return errors.Errorf("table %s has no primary key", tableKey) + zap.ByteString("content", content), + zap.Error(err)) + return 
errors.Annotate(err, "failed to get primary key column offsets") } - columns := tableInfo.GetColumns() - pkColumnOffsets := make(map[int64]int) - for i, pkColInfo := range pkColInfos { - if pkColInfo.Offset < 0 || pkColInfo.Offset >= len(columns) { - log.Error("primary key column offset out of range", - zap.String("tableKey", tableKey), - zap.Int("offset", pkColInfo.Offset), - zap.Int("len(columns)", len(columns)), - zap.ByteString("content", content)) - return errors.Errorf("primary key column offset out of range for column %d in table %s", pkColInfo.Offset, tableKey) - } - pkColumnOffsets[columns[pkColInfo.Offset].ID] = i - } pt.tableKey = tableKey pt.tableInfo = tableInfo pt.pkColumnOffsets = pkColumnOffsets @@ -133,7 +146,7 @@ func (pt *tableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Reco if col.Name.O == event.OriginTsColumn { var ok bool originTs, ok = colValue.(uint64) - if !ok { + if !ok && colValue != nil { log.Error("origin ts column value is not uint64", zap.String("tableKey", pt.tableKey), zap.Int64("colID", colInfo.ID), From c3eca3e87615bc576ef57f39370ac8aa49604e2f Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Tue, 27 Jan 2026 18:34:31 +0800 Subject: [PATCH 05/23] draft Signed-off-by: Jianjun Liao --- .../advancer/time_window_advancer.go | 25 +- .../checker/checker.go | 18 +- .../config/config.go | 2 + .../consumer/consumer.go | 461 ++++++++++++++++++ .../consumer/s3_consumer.go | 388 --------------- cmd/multi-cluster-consistency-checker/main.go | 34 +- .../parser/parser.go | 14 +- cmd/multi-cluster-consistency-checker/task.go | 76 ++- .../{utils.go => utils}/types.go | 0 .../watcher/s3_watcher.go | 18 +- 10 files changed, 614 insertions(+), 422 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/consumer/consumer.go delete mode 100644 cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go rename cmd/multi-cluster-consistency-checker/{utils.go => utils}/types.go (100%) diff --git 
a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go index c5c2aa7d66..7fc6704cfe 100644 --- a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go @@ -43,6 +43,8 @@ type TimeWindow struct { // PDTimestampAfterTimeWindow is the max PD timestamp after the time window for each downstream cluster, // mapping from upstream cluster ID to the max PD timestamp PDTimestampAfterTimeWindow map[string]uint64 + // NextMinLeftBoundary is the minimum left boundary of the next time window for the cluster + NextMinLeftBoundary uint64 } type TimeWindowData struct { @@ -164,10 +166,18 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( newDataMap := make(map[string]map[cloudstorage.DmlPathKey]consumer.IncrementalData) eg, ctx = errgroup.WithContext(pctx) for clusterID, triplet := range t.timeWindowTriplet { - minTimeWindowRightBoundary := max(maxCheckpointTs[clusterID], maxPDTimestampAfterCheckpointTs[clusterID]) + minTimeWindowRightBoundary := max(maxCheckpointTs[clusterID], maxPDTimestampAfterCheckpointTs[clusterID], triplet[2].NextMinLeftBoundary) s3Watcher := t.s3Watcher[clusterID] eg.Go(func() error { - s3CheckpointTs, newData, err := s3Watcher.AdvanceS3CheckpointTs(ctx, minTimeWindowRightBoundary) + s3CheckpointTs, err := s3Watcher.AdvanceS3CheckpointTs(ctx, minTimeWindowRightBoundary) + if err != nil { + return errors.Trace(err) + } + newData, err := s3Watcher.ConsumeNewFiles(ctx) + if err != nil { + return errors.Trace(err) + } + pdtso, err := t.getPDTsFromCluster(ctx, clusterID) if err != nil { return errors.Trace(err) } @@ -181,6 +191,7 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( timeWindow.LeftBoundary = triplet[2].RightBoundary timeWindow.RightBoundary = s3CheckpointTs timeWindow.PDTimestampAfterTimeWindow = make(map[string]uint64) + timeWindow.NextMinLeftBoundary = pdtso 
maps.Copy(timeWindow.PDTimestampAfterTimeWindow, pdtsos) newTimeWindow[clusterID] = timeWindow lock.Unlock() @@ -206,6 +217,16 @@ func (t *TimeWindowAdvancer) updateTimeWindow(newTimeWindow map[string]TimeWindo } } +func (t *TimeWindowAdvancer) getPDTsFromCluster(ctx context.Context, clusterID string) (uint64, error) { + pdClient := t.pdClients[clusterID] + phyTs, logicTs, err := pdClient.GetTS(ctx) + if err != nil { + return 0, errors.Trace(err) + } + ts := oracle.ComposeTS(phyTs, logicTs) + return ts, nil +} + func (t *TimeWindowAdvancer) getPDTsFromOtherClusters(pctx context.Context, clusterID string) (map[string]uint64, error) { var lock sync.Mutex pdtsos := make(map[string]uint64) diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index 1f6d626732..e95bd39d0f 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -20,8 +20,7 @@ import ( "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/parser" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils.go" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/errors" "go.uber.org/zap" ) @@ -367,13 +366,13 @@ func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, pk uti if clusterDataChecker.clusterID == downstreamClusterID { continue } - if clusterDataChecker.findClusterUpstreamDataInTimeWindow(1, pk, commitTs) { + if clusterDataChecker.findClusterUpstreamDataInTimeWindow(0, pk, commitTs) { return true } - if clusterDataChecker.findClusterUpstreamDataInTimeWindow(2, pk, commitTs) { + if clusterDataChecker.findClusterUpstreamDataInTimeWindow(1, pk, commitTs) { return true } - if 
clusterDataChecker.findClusterUpstreamDataInTimeWindow(3, pk, commitTs) { + if clusterDataChecker.findClusterUpstreamDataInTimeWindow(2, pk, commitTs) { return true } } @@ -403,16 +402,11 @@ func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindow if err := clusterDataChecker.PrepareNextTimeWindowData(timeWindowData.TimeWindow); err != nil { return errors.Trace(err) } - for dmlPathKey, incrementalData := range timeWindowData.Data { - tableParser, err := parser.NewTableParser(dmlPathKey.GetKey(), incrementalData.SchemaContent) - if err != nil { - return errors.Trace(err) - } - + for _, incrementalData := range timeWindowData.Data { // Parse CSV data from all file slices for _, contents := range incrementalData.DataContentSlices { for _, content := range contents { - records, err := tableParser.DecodeFiles(ctx, content) + records, err := incrementalData.Parser.DecodeFiles(ctx, content) if err != nil { return errors.Trace(err) } diff --git a/cmd/multi-cluster-consistency-checker/config/config.go b/cmd/multi-cluster-consistency-checker/config/config.go index a726e3c1d2..3a5405dd84 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.go +++ b/cmd/multi-cluster-consistency-checker/config/config.go @@ -37,6 +37,8 @@ type GlobalConfig struct { // For example: // Timeout time.Duration `toml:"timeout" json:"timeout"` // RetryCount int `toml:"retry-count" json:"retry-count"` + LogLevel string `toml:"log-level" json:"log-level"` + Tables map[string]map[string]struct{} `toml:"tables" json:"tables"` } type DownstreamClusterChangefeedConfig struct { diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer.go b/cmd/multi-cluster-consistency-checker/consumer/consumer.go new file mode 100644 index 0000000000..e8ca91c4c8 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer.go @@ -0,0 +1,461 @@ +// Copyright 2026 PingCAP, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package consumer + +import ( + "context" + "fmt" + "maps" + "path" + "strings" + "sync" + + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/parser" + "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + "github.com/pingcap/tidb/br/pkg/storage" + "go.uber.org/zap" + "golang.org/x/sync/errgroup" +) + +type ( + fileIndexRange map[cloudstorage.FileIndexKey]indexRange + fileIndexKeyMap map[cloudstorage.FileIndexKey]uint64 +) + +type indexRange struct { + start uint64 + end uint64 +} + +func updateTableDMLIdxMap( + tableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap, + dmlkey cloudstorage.DmlPathKey, + fileIdx *cloudstorage.FileIndex, +) { + m, ok := tableDMLIdxMap[dmlkey] + if !ok { + tableDMLIdxMap[dmlkey] = fileIndexKeyMap{ + fileIdx.FileIndexKey: fileIdx.Idx, + } + } else if fileIdx.Idx > m[fileIdx.FileIndexKey] { + m[fileIdx.FileIndexKey] = fileIdx.Idx + } +} + +type schemaParser struct { + path string + parser *parser.TableParser +} + +type schemaKey struct { + schema string + table string +} + +type IncrementalData struct { + DataContentSlices map[cloudstorage.FileIndexKey][][]byte + Parser *parser.TableParser +} + +type Consumer struct { + s3Storage storage.ExternalStorage + fileExtension string + dateSeparator string + fileIndexWidth int + tables map[string]map[string]struct{} + + versionMapMu sync.RWMutex + currentTableVersionMap map[schemaKey]uint64 + 
tableDMLIdxMapMu sync.Mutex + tableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap + schemaParserMapMu sync.RWMutex + schemaParserMap map[cloudstorage.SchemaPathKey]schemaParser +} + +func NewConsumer( + s3Storage storage.ExternalStorage, + tables map[string]map[string]struct{}, +) *Consumer { + return &Consumer{ + s3Storage: s3Storage, + fileExtension: ".csv", + dateSeparator: config.DateSeparatorDay.String(), + fileIndexWidth: config.DefaultFileIndexWidth, + tables: tables, + currentTableVersionMap: make(map[schemaKey]uint64, 0), + tableDMLIdxMap: make(map[cloudstorage.DmlPathKey]fileIndexKeyMap), + schemaParserMap: make(map[cloudstorage.SchemaPathKey]schemaParser), + } +} + +// getCurrentTableVersion returns the current table version for a given schema and table +func (c *Consumer) getCurrentTableVersion(schema, table string) uint64 { + tableKey := schemaKey{ + schema: schema, + table: table, + } + c.versionMapMu.RLock() + currentVersion := c.currentTableVersionMap[tableKey] + c.versionMapMu.RUnlock() + return currentVersion +} + +// updateCurrentTableVersion updates the current table version for a given schema and table +func (c *Consumer) updateCurrentTableVersion(schema, table string, version uint64) { + tableKey := schemaKey{ + schema: schema, + table: table, + } + c.versionMapMu.Lock() + c.currentTableVersionMap[tableKey] = version + c.versionMapMu.Unlock() +} + +// getSchemaParser returns the schema parser for a given schema and table version +func (c *Consumer) getSchemaParser(schema, table string, version uint64) (*parser.TableParser, error) { + schemaPathKey := cloudstorage.SchemaPathKey{ + Schema: schema, + Table: table, + TableVersion: version, + } + c.schemaParserMapMu.RLock() + schemaParser, ok := c.schemaParserMap[schemaPathKey] + c.schemaParserMapMu.RUnlock() + if !ok { + return nil, errors.Errorf("schema parser not found for schema: %s, table: %s, version: %d", schema, table, version) + } + return schemaParser.parser, nil +} + +// 
setSchemaParser sets the schema parser for a given schema and table version +func (c *Consumer) setSchemaParser(schemaPathKey cloudstorage.SchemaPathKey, filePath string, parser *parser.TableParser) { + c.schemaParserMapMu.Lock() + c.schemaParserMap[schemaPathKey] = schemaParser{ + path: filePath, + parser: parser, + } + c.schemaParserMapMu.Unlock() +} + +// downloadSchemaFiles downloads schema files concurrently for given schema path keys +func (c *Consumer) downloadSchemaFiles( + ctx context.Context, + newVersionPaths map[cloudstorage.SchemaPathKey]string, +) error { + eg, egCtx := errgroup.WithContext(ctx) + + log.Debug("starting concurrent schema file download", zap.Int("totalSchemas", len(newVersionPaths))) + for schemaPathKey, filePath := range newVersionPaths { + eg.Go(func() error { + content, err := c.s3Storage.ReadFile(egCtx, filePath) + if err != nil { + return errors.Annotatef(err, "failed to read schema file: %s", filePath) + } + + parser, err := parser.NewTableParser(schemaPathKey.GetKey(), content) + if err != nil { + return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) + } + + c.setSchemaParser(schemaPathKey, filePath, parser) + return nil + }) + } + if err := eg.Wait(); err != nil { + return errors.Trace(err) + } + return nil +} + +func (c *Consumer) discoverAndDownloadNewTableVersions( + ctx context.Context, + schema, table string, +) ([]uint64, error) { + currentVersion := c.getCurrentTableVersion(schema, table) + metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) + opt := &storage.WalkOption{ + SubDir: metaSubDir, + ObjPrefix: "schema_", + } + + var scanVersions []uint64 + newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) + versionSet := make(map[uint64]struct{}) + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { + if !cloudstorage.IsSchemaFile(filePath) { + return nil + } + var schemaKey cloudstorage.SchemaPathKey + _, err := 
schemaKey.ParseSchemaFilePath(filePath) + if err != nil { + log.Error("failed to parse schema file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } + version := schemaKey.TableVersion + if version > currentVersion { + if _, exists := versionSet[version]; !exists { + versionSet[version] = struct{}{} + scanVersions = append(scanVersions, version) + } + } + + newVersionPaths[schemaKey] = filePath + return nil + }); err != nil { + return nil, errors.Trace(err) + } + + // download new version schema files concurrently + if err := c.downloadSchemaFiles(ctx, newVersionPaths); err != nil { + return nil, errors.Trace(err) + } + + if currentVersion > 0 { + scanVersions = append(scanVersions, currentVersion) + } + return scanVersions, nil +} + +func (c *Consumer) diffNewTableDMLIdxMap( + newTableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap, +) map[cloudstorage.DmlPathKey]fileIndexRange { + resMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) + c.tableDMLIdxMapMu.Lock() + defer c.tableDMLIdxMapMu.Unlock() + for newDMLPathKey, newFileIndexKeyMap := range newTableDMLIdxMap { + origFileIndexKeyMap, ok := c.tableDMLIdxMap[newDMLPathKey] + if !ok { + c.tableDMLIdxMap[newDMLPathKey] = newFileIndexKeyMap + resMap[newDMLPathKey] = make(fileIndexRange) + for indexKey, newEndVal := range newFileIndexKeyMap { + resMap[newDMLPathKey][indexKey] = indexRange{ + start: 1, + end: newEndVal, + } + } + continue + } + for indexKey, newEndVal := range newFileIndexKeyMap { + origEndVal := origFileIndexKeyMap[indexKey] + if newEndVal > origEndVal { + origFileIndexKeyMap[indexKey] = newEndVal + if _, ok := resMap[newDMLPathKey]; !ok { + resMap[newDMLPathKey] = make(fileIndexRange) + } + resMap[newDMLPathKey][indexKey] = indexRange{ + start: origEndVal + 1, + end: newEndVal, + } + } + } + } + return resMap +} + +func (c *Consumer) getNewFilesForSchemaPathKey( + ctx context.Context, + schema, table string, + version uint64, +) 
(map[cloudstorage.DmlPathKey]fileIndexRange, error) { + schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version)) + opt := &storage.WalkOption{SubDir: schemaPrefix} + + // Save a snapshot of current tableDMLIdxMap + origDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap, len(c.tableDMLIdxMap)) + for k, v := range c.tableDMLIdxMap { + m := make(fileIndexKeyMap) + maps.Copy(m, v) + origDMLIdxMap[k] = m + } + + // Walk through all files in S3 storage + newTableDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) + if err := c.s3Storage.WalkDir(ctx, opt, func(path string, size int64) error { + // Try to parse DML file path if it matches the expected extension + if strings.HasSuffix(path, c.fileExtension) { + var dmlkey cloudstorage.DmlPathKey + fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, path) + if err != nil { + log.Error("failed to parse dml file path, skipping", + zap.String("path", path), + zap.Error(err)) + return nil + } + updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) + } + return nil + }); err != nil { + return nil, errors.Trace(err) + } + + // Calculate the difference to find new files + return c.diffNewTableDMLIdxMap(newTableDMLIdxMap), nil +} + +func (c *Consumer) downloadDMLFiles( + ctx context.Context, + schema, table string, + version uint64, +) (map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte, error) { + newFiles, err := c.getNewFilesForSchemaPathKey(ctx, schema, table, version) + if err != nil { + return nil, errors.Trace(err) + } + + if len(newFiles) == 0 { + return nil, nil + } + + result := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte) + + // Prepare all file download tasks + type downloadTask struct { + dmlPathKey cloudstorage.DmlPathKey + fileIndex cloudstorage.FileIndex + } + + var tasks []downloadTask + for dmlPathKey, fileRange := range newFiles { + for indexKey, indexRange := range fileRange { + log.Debug("prepare to download new dml file in 
index range", + zap.String("schema", dmlPathKey.Schema), + zap.String("table", dmlPathKey.Table), + zap.Uint64("version", dmlPathKey.TableVersion), + zap.Int64("partitionNum", dmlPathKey.PartitionNum), + zap.String("date", dmlPathKey.Date), + zap.String("dispatcherID", indexKey.DispatcherID), + zap.Bool("enableTableAcrossNodes", indexKey.EnableTableAcrossNodes), + zap.Uint64("startIndex", indexRange.start), + zap.Uint64("endIndex", indexRange.end)) + for i := indexRange.start; i <= indexRange.end; i++ { + tasks = append(tasks, downloadTask{ + dmlPathKey: dmlPathKey, + fileIndex: cloudstorage.FileIndex{ + FileIndexKey: indexKey, + Idx: i, + }, + }) + } + } + } + + log.Debug("starting concurrent DML file download", zap.Int("totalFiles", len(tasks))) + + // Concurrently download files + type fileContent struct { + dmlPathKey cloudstorage.DmlPathKey + indexKey cloudstorage.FileIndexKey + idx uint64 + content []byte + } + + fileContents := make(chan fileContent, len(tasks)) + eg, egCtx := errgroup.WithContext(ctx) + for _, task := range tasks { + eg.Go(func() error { + filePath := task.dmlPathKey.GenerateDMLFilePath( + &task.fileIndex, + c.fileExtension, + c.fileIndexWidth, + ) + + content, err := c.s3Storage.ReadFile(egCtx, filePath) + if err != nil { + return errors.Annotatef(err, "failed to read file: %s", filePath) + } + + // Channel writes are thread-safe, no mutex needed + fileContents <- fileContent{ + dmlPathKey: task.dmlPathKey, + indexKey: task.fileIndex.FileIndexKey, + idx: task.fileIndex.Idx, + content: content, + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + + // Close the channel to signal no more writes + close(fileContents) + + // Process the downloaded file contents + for fc := range fileContents { + if result[fc.dmlPathKey] == nil { + result[fc.dmlPathKey] = make(map[cloudstorage.FileIndexKey][][]byte) + } + result[fc.dmlPathKey][fc.indexKey] = append( + result[fc.dmlPathKey][fc.indexKey], + 
fc.content, + ) + } + + return result, nil +} + +func (c *Consumer) ConsumeNewFiles( + ctx context.Context, +) (map[cloudstorage.DmlPathKey]IncrementalData, error) { + var mu sync.Mutex + // Combine DML data and schema data into result + result := make(map[cloudstorage.DmlPathKey]IncrementalData) + eg, egCtx := errgroup.WithContext(ctx) + for schema, tables := range c.tables { + for table := range tables { + eg.Go(func() error { + newVersions, err := c.discoverAndDownloadNewTableVersions(egCtx, schema, table) + if err != nil { + return errors.Trace(err) + } + maxVersion := uint64(0) + for _, version := range newVersions { + maxVersion = max(maxVersion, version) + eg.Go(func() error { + dmlData, err := c.downloadDMLFiles(egCtx, schema, table, version) + if err != nil { + return errors.Trace(err) + } + parser, err := c.getSchemaParser(schema, table, version) + if err != nil { + return errors.Trace(err) + } + for dmlPathKey, dmlSlices := range dmlData { + mu.Lock() + result[dmlPathKey] = IncrementalData{ + DataContentSlices: dmlSlices, + Parser: parser, + } + mu.Unlock() + } + return nil + }) + } + c.updateCurrentTableVersion(schema, table, maxVersion) + return nil + }) + } + } + + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + return result, nil +} diff --git a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go deleted file mode 100644 index 103333134c..0000000000 --- a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go +++ /dev/null @@ -1,388 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// See the License for the specific language governing permissions and -// limitations under the License. - -package consumer - -import ( - "context" - "maps" - "sort" - "strings" - "sync" - - "github.com/pingcap/log" - "github.com/pingcap/ticdc/pkg/config" - "github.com/pingcap/ticdc/pkg/errors" - "github.com/pingcap/ticdc/pkg/sink/cloudstorage" - "github.com/pingcap/tidb/br/pkg/storage" - "go.uber.org/zap" - "golang.org/x/sync/errgroup" -) - -type ( - fileIndexRange map[cloudstorage.FileIndexKey]indexRange - fileIndexKeyMap map[cloudstorage.FileIndexKey]uint64 -) - -// indexRange defines a range of files. eg. CDC000002.csv ~ CDC000005.csv -type indexRange struct { - start uint64 - end uint64 -} - -type schemaFilePathWithChecksum struct { - filepath string - checksum uint32 -} - -type S3Consumer struct { - s3Storage storage.ExternalStorage - fileExtension string - dateSeparator string - tableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap - schemaChecksumMap map[cloudstorage.SchemaPathKey]schemaFilePathWithChecksum - fileIndexWidth int -} - -func NewS3Consumer(s3Storage storage.ExternalStorage) *S3Consumer { - // Use default values for file extension and date separator - // File extension will be detected from files, date separator defaults to day - return &S3Consumer{ - s3Storage: s3Storage, - fileExtension: ".csv", - dateSeparator: config.DateSeparatorDay.String(), - tableDMLIdxMap: make(map[cloudstorage.DmlPathKey]fileIndexKeyMap), - schemaChecksumMap: make(map[cloudstorage.SchemaPathKey]schemaFilePathWithChecksum), - fileIndexWidth: config.DefaultFileIndexWidth, - } -} - -// diffDMLMaps returns the difference between two DML index maps (map1 - map2) -func diffDMLMaps( - map1, map2 
map[cloudstorage.DmlPathKey]fileIndexKeyMap, -) map[cloudstorage.DmlPathKey]fileIndexRange { - resMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) - for dmlPathKey1, fileIndexKeyMap1 := range map1 { - dmlPathKey2, ok := map2[dmlPathKey1] - if !ok { - resMap[dmlPathKey1] = make(fileIndexRange) - for indexKey, val1 := range fileIndexKeyMap1 { - resMap[dmlPathKey1][indexKey] = indexRange{ - start: 1, - end: val1, - } - } - continue - } - for fileIndexKey, val1 := range fileIndexKeyMap1 { - val2 := dmlPathKey2[fileIndexKey] - if val1 > val2 { - if _, ok := resMap[dmlPathKey1]; !ok { - resMap[dmlPathKey1] = make(fileIndexRange) - } - resMap[dmlPathKey1][fileIndexKey] = indexRange{ - start: val2 + 1, - end: val1, - } - } - } - } - return resMap -} - -// parseDMLFilePath parses a DML file path and updates the tableDMLIdxMap -func (c *S3Consumer) parseDMLFilePath(_ context.Context, path string) error { - var dmlkey cloudstorage.DmlPathKey - fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, path) - if err != nil { - return err - } - - m, ok := c.tableDMLIdxMap[dmlkey] - if !ok { - c.tableDMLIdxMap[dmlkey] = fileIndexKeyMap{ - fileIdx.FileIndexKey: fileIdx.Idx, - } - } else if fileIdx.Idx >= m[fileIdx.FileIndexKey] { - c.tableDMLIdxMap[dmlkey][fileIdx.FileIndexKey] = fileIdx.Idx - } - return nil -} - -// getNewFiles returns newly created dml files in specific ranges -func (c *S3Consumer) getNewFiles( - ctx context.Context, -) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { - tableDMLMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) - opt := &storage.WalkOption{SubDir: ""} - - // Save a snapshot of current tableDMLIdxMap - origDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap, len(c.tableDMLIdxMap)) - for k, v := range c.tableDMLIdxMap { - m := make(fileIndexKeyMap) - maps.Copy(m, v) - origDMLIdxMap[k] = m - } - - // Walk through all files in S3 storage - if err := c.s3Storage.WalkDir(ctx, opt, func(path string, size int64) error { - if 
cloudstorage.IsSchemaFile(path) { - var schemaKey cloudstorage.SchemaPathKey - checksumInFile, err := schemaKey.ParseSchemaFilePath(path) - if err != nil { - log.Error("failed to parse schema file path, skipping", - zap.String("path", path), - zap.Error(err)) - return nil - } - c.schemaChecksumMap[schemaKey] = schemaFilePathWithChecksum{ - filepath: path, - checksum: checksumInFile, - } - return nil - } - - // Try to parse DML file path if it matches the expected extension - if strings.HasSuffix(path, c.fileExtension) { - err := c.parseDMLFilePath(ctx, path) - if err != nil { - log.Error("failed to parse dml file path, skipping", - zap.String("path", path), - zap.Error(err)) - return nil - } - } - return nil - }); err != nil { - return tableDMLMap, errors.Trace(err) - } - - // Calculate the difference to find new files - tableDMLMap = diffDMLMaps(c.tableDMLIdxMap, origDMLIdxMap) - return tableDMLMap, nil -} - -type IncrementalData struct { - DataContentSlices map[cloudstorage.FileIndexKey][][]byte - SchemaContent []byte -} - -// downloadDMLFiles downloads DML files concurrently and returns their content -func (c *S3Consumer) downloadDMLFiles( - ctx context.Context, - newFiles map[cloudstorage.DmlPathKey]fileIndexRange, -) (map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte, error) { - result := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte) - - // Prepare all file download tasks - type downloadTask struct { - dmlPathKey cloudstorage.DmlPathKey - fileIndex *cloudstorage.FileIndex - } - - var tasks []downloadTask - for dmlPathKey, fileRange := range newFiles { - for indexKey, indexRange := range fileRange { - for i := indexRange.start; i <= indexRange.end; i++ { - fileIdx := &cloudstorage.FileIndex{ - FileIndexKey: indexKey, - Idx: i, - } - tasks = append(tasks, downloadTask{ - dmlPathKey: dmlPathKey, - fileIndex: fileIdx, - }) - } - } - } - - log.Info("starting concurrent DML file download", zap.Int("totalFiles", 
len(tasks))) - - // Concurrently download files - type fileContent struct { - dmlPathKey cloudstorage.DmlPathKey - indexKey cloudstorage.FileIndexKey - idx uint64 - content []byte - } - - fileContents := make(chan fileContent, len(tasks)) - eg, egCtx := errgroup.WithContext(ctx) - for _, task := range tasks { - eg.Go(func() error { - filePath := task.dmlPathKey.GenerateDMLFilePath( - task.fileIndex, - c.fileExtension, - c.fileIndexWidth, - ) - - content, err := c.s3Storage.ReadFile(egCtx, filePath) - if err != nil { - return errors.Annotatef(err, "failed to read file: %s", filePath) - } - - // Channel writes are thread-safe, no mutex needed - fileContents <- fileContent{ - dmlPathKey: task.dmlPathKey, - indexKey: task.fileIndex.FileIndexKey, - idx: task.fileIndex.Idx, - content: content, - } - return nil - }) - } - - if err := eg.Wait(); err != nil { - return nil, errors.Trace(err) - } - close(fileContents) - - // Group and merge file contents by DmlPathKey and FileIndexKey - type contentWithIdx struct { - idx uint64 - content []byte - } - tempResult := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][]contentWithIdx) - - for fc := range fileContents { - if tempResult[fc.dmlPathKey] == nil { - tempResult[fc.dmlPathKey] = make(map[cloudstorage.FileIndexKey][]contentWithIdx) - } - tempResult[fc.dmlPathKey][fc.indexKey] = append( - tempResult[fc.dmlPathKey][fc.indexKey], - contentWithIdx{idx: fc.idx, content: fc.content}, - ) - } - - // Merge contents in order (by idx) for each FileIndexKey - for dmlPathKey, indexKeyMap := range tempResult { - if result[dmlPathKey] == nil { - result[dmlPathKey] = make(map[cloudstorage.FileIndexKey][][]byte) - } - for indexKey, contentWithIdxs := range indexKeyMap { - // Sort by idx to maintain file order - sort.Slice(contentWithIdxs, func(i, j int) bool { - return contentWithIdxs[i].idx < contentWithIdxs[j].idx - }) - contents := make([][]byte, len(contentWithIdxs)) - for i, contentWithIdx := range contentWithIdxs { - 
contents[i] = contentWithIdx.content - } - result[dmlPathKey][indexKey] = contents - } - } - - return result, nil -} - -// downloadSchemaFiles downloads schema files concurrently for the given SchemaPathKeys -func (c *S3Consumer) downloadSchemaFiles( - ctx context.Context, - schemaPathKeys map[cloudstorage.SchemaPathKey]struct{}, -) (map[cloudstorage.SchemaPathKey][]byte, error) { - schemaContents := make(map[cloudstorage.SchemaPathKey][]byte) - schemaMutex := &sync.Mutex{} - eg, egCtx := errgroup.WithContext(ctx) - - log.Info("starting concurrent schema file download", zap.Int("totalSchemas", len(schemaPathKeys))) - - for schemaPathKey := range schemaPathKeys { - eg.Go(func() error { - checksumInFile, exists := c.schemaChecksumMap[schemaPathKey] - if !exists { - return errors.Annotatef(errors.ErrInternalCheckFailed, "schema file not found: %s", schemaPathKey.GetKey()) - } - schemaPath := checksumInFile.filepath - content, err := c.s3Storage.ReadFile(egCtx, schemaPath) - if err != nil { - return errors.Annotatef(err, "failed to read schema file: %s", schemaPath) - } - - schemaMutex.Lock() - schemaContents[schemaPathKey] = content - schemaMutex.Unlock() - - return nil - }) - } - if err := eg.Wait(); err != nil { - return nil, errors.Trace(err) - } - - return schemaContents, nil -} - -// ConsumeNewFiles downloads new files concurrently and returns their content -func (c *S3Consumer) ConsumeNewFiles(ctx context.Context) (map[cloudstorage.DmlPathKey]IncrementalData, error) { - newFiles, err := c.getNewFiles(ctx) - if err != nil { - return nil, errors.Trace(err) - } - - if len(newFiles) == 0 { - log.Info("no new dml files found") - return nil, nil - } - - // Download DML files - dmlData, err := c.downloadDMLFiles(ctx, newFiles) - if err != nil { - return nil, errors.Trace(err) - } - - // Collect unique SchemaPathKeys - schemaPathKeys := make(map[cloudstorage.SchemaPathKey]struct{}) - for dmlPathKey := range newFiles { - schemaPathKey := dmlPathKey.SchemaPathKey - 
schemaPathKeys[schemaPathKey] = struct{}{} - } - - // Download schema files - schemaContents, err := c.downloadSchemaFiles(ctx, schemaPathKeys) - if err != nil { - return nil, errors.Trace(err) - } - - // Combine DML data and schema data into result - result := make(map[cloudstorage.DmlPathKey]IncrementalData) - for dmlPathKey, dataSlices := range dmlData { - incrementalData := IncrementalData{ - DataContentSlices: dataSlices, - } - // Assign schema content if available - schemaPathKey := dmlPathKey.SchemaPathKey - if schemaContent, ok := schemaContents[schemaPathKey]; ok { - incrementalData.SchemaContent = schemaContent - } - result[dmlPathKey] = incrementalData - } - - // Log the new files found - for dmlPathKey, fileRange := range newFiles { - for indexKey, indexRange := range fileRange { - log.Info("found and downloaded new dml files", - zap.String("schema", dmlPathKey.Schema), - zap.String("table", dmlPathKey.Table), - zap.Uint64("tableVersion", dmlPathKey.TableVersion), - zap.Int64("partitionNum", dmlPathKey.PartitionNum), - zap.String("date", dmlPathKey.Date), - zap.String("dispatcherID", indexKey.DispatcherID), - zap.Bool("enableTableAcrossNodes", indexKey.EnableTableAcrossNodes), - zap.Uint64("startIndex", indexRange.start), - zap.Uint64("endIndex", indexRange.end), - zap.Int("fileCount", int(indexRange.end-indexRange.start+1))) - } - } - - return result, nil -} diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go index 4f24ae3e85..4494d08ab0 100644 --- a/cmd/multi-cluster-consistency-checker/main.go +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -17,6 +17,8 @@ import ( "context" "fmt" "os" + "os/signal" + "syscall" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" "github.com/spf13/cobra" @@ -72,5 +74,35 @@ func run(cmd *cobra.Command, args []string) { fmt.Printf(" S3 Sink URI: %s\n", cluster.S3SinkURI) } - runTask(context.Background(), cfg) + // Create a context that can 
be cancelled by signals + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Set up signal handling for graceful shutdown + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) + + // Start the task in a goroutine + errChan := make(chan error, 1) + go func() { + errChan <- runTask(ctx, cfg) + }() + + // Wait for either a signal or task completion + select { + case sig := <-sigChan: + fmt.Fprintf(os.Stdout, "\nReceived signal: %v, shutting down gracefully...\n", sig) + cancel() + // Wait for the task to finish + if err := <-errChan; err != nil && err != context.Canceled { + fmt.Fprintf(os.Stderr, "task error: %v\n", err) + os.Exit(ExitCodeExecuteFailed) + } + fmt.Fprintf(os.Stdout, "Shutdown complete\n") + case err := <-errChan: + if err != nil { + fmt.Fprintf(os.Stderr, "failed to run task: %v\n", err) + os.Exit(ExitCodeExecuteFailed) + } + } } diff --git a/cmd/multi-cluster-consistency-checker/parser/parser.go b/cmd/multi-cluster-consistency-checker/parser/parser.go index 8e730ae6f0..ad222468cf 100644 --- a/cmd/multi-cluster-consistency-checker/parser/parser.go +++ b/cmd/multi-cluster-consistency-checker/parser/parser.go @@ -20,7 +20,7 @@ import ( "time" "github.com/pingcap/log" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils.go" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/common" commonType "github.com/pingcap/ticdc/pkg/common" "github.com/pingcap/ticdc/pkg/common/event" @@ -58,15 +58,15 @@ func getPkColumnOffset(tableInfo *commonType.TableInfo) (map[int64]int, error) { return pkColumnOffsets, nil } -type tableParser struct { +type TableParser struct { tableKey string tableInfo *common.TableInfo pkColumnOffsets map[int64]int csvDecoder *csvDecoder } -func NewTableParser(tableKey string, content []byte) (*tableParser, error) { - tableParser := &tableParser{} +func NewTableParser(tableKey string, content []byte) 
(*TableParser, error) { + tableParser := &TableParser{} if err := tableParser.parseTableInfo(tableKey, content); err != nil { return nil, errors.Trace(err) } @@ -74,7 +74,7 @@ func NewTableParser(tableKey string, content []byte) (*tableParser, error) { return tableParser, nil } -func (pt *tableParser) parseTableInfo(tableKey string, content []byte) error { +func (pt *TableParser) parseTableInfo(tableKey string, content []byte) error { // Parse schema content to get tableInfo var tableDef cloudstorage.TableDefinition if err := json.Unmarshal(content, &tableDef); err != nil { @@ -109,7 +109,7 @@ func (pt *tableParser) parseTableInfo(tableKey string, content []byte) error { return nil } -func (pt *tableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Record, error) { +func (pt *TableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Record, error) { originTs := uint64(0) pkCount := 0 colInfos := pt.tableInfo.GetColInfosForRowChangedEvent() @@ -182,7 +182,7 @@ func (pt *tableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Reco }, nil } -func (pt *tableParser) DecodeFiles(ctx context.Context, content []byte) ([]*utils.Record, error) { +func (pt *TableParser) DecodeFiles(ctx context.Context, content []byte) ([]*utils.Record, error) { records := make([]*utils.Record, 0) decoder, err := pt.csvDecoder.NewDecoder(ctx, pt.tableInfo, content) diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index 468107009f..b008075755 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -17,6 +17,7 @@ import ( "context" "time" + "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/checker" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" @@ -27,56 +28,86 @@ import ( putil "github.com/pingcap/ticdc/pkg/util" pd 
"github.com/tikv/pd/client" pdopt "github.com/tikv/pd/client/opt" + "go.uber.org/zap" "google.golang.org/grpc" ) func runTask(ctx context.Context, cfg *config.Config) error { - checkpointWatchers, s3Watchers, pdClients, err := initClients(ctx, cfg) + checkpointWatchers, s3Watchers, pdClients, etcdClients, err := initClients(ctx, cfg) if err != nil { return errors.Trace(err) } + // Ensure cleanup happens even if there's an error + defer cleanupClients(pdClients, etcdClients) timeWindowAdvancer := advancer.NewTimeWindowAdvancer(checkpointWatchers, s3Watchers, pdClients) dataChecker := checker.NewDataChecker(cfg.Clusters) + + log.Info("Starting consistency checker task") for { + // Check if context is cancelled before starting a new iteration + select { + case <-ctx.Done(): + log.Info("Context cancelled, shutting down gracefully") + return ctx.Err() + default: + } + newTimeWindowData, err := timeWindowAdvancer.AdvanceTimeWindow(ctx) if err != nil { return errors.Trace(err) } + if err := dataChecker.CheckInNextTimeWindow(ctx, newTimeWindowData); err != nil { return errors.Trace(err) } } } -func initClients(ctx context.Context, cfg *config.Config) (map[string]map[string]*watcher.CheckpointWatcher, map[string]*watcher.S3Watcher, map[string]pd.Client, error) { +func initClients(ctx context.Context, cfg *config.Config) ( + map[string]map[string]*watcher.CheckpointWatcher, + map[string]*watcher.S3Watcher, + map[string]pd.Client, + map[string]*etcd.CDCEtcdClientImpl, + error, +) { checkpointWatchers := make(map[string]map[string]*watcher.CheckpointWatcher) s3Watchers := make(map[string]*watcher.S3Watcher) pdClients := make(map[string]pd.Client) + etcdClients := make(map[string]*etcd.CDCEtcdClientImpl) + for clusterID, clusterConfig := range cfg.Clusters { pdClient, etcdClient, err := newPDClient(ctx, clusterConfig.PDAddr, &clusterConfig.SecurityConfig) if err != nil { - return nil, nil, nil, errors.Trace(err) + // Clean up already created clients before returning error + 
cleanupClients(pdClients, etcdClients) + return nil, nil, nil, nil, errors.Trace(err) } + etcdClients[clusterID] = etcdClient + upstreamCheckpointWatchers := make(map[string]*watcher.CheckpointWatcher) for downstreamClusterID, downstreamClusterChangefeedConfig := range clusterConfig.DownstreamClusterChangefeedConfig { checkpointWatcher := watcher.NewCheckpointWatcher(clusterID, downstreamClusterID, downstreamClusterChangefeedConfig.ChangefeedID, etcdClient) upstreamCheckpointWatchers[downstreamClusterID] = checkpointWatcher } checkpointWatchers[clusterID] = upstreamCheckpointWatchers + s3Storage, err := putil.GetExternalStorageWithDefaultTimeout(ctx, clusterConfig.S3SinkURI) if err != nil { - return nil, nil, nil, errors.Trace(err) + // Clean up already created clients before returning error + cleanupClients(pdClients, etcdClients) + return nil, nil, nil, nil, errors.Trace(err) } s3Watcher := watcher.NewS3Watcher( watcher.NewCheckpointWatcher(clusterID, "s3", clusterConfig.S3ChangefeedID, etcdClient), s3Storage, + cfg.GlobalConfig.Tables, ) s3Watchers[clusterID] = s3Watcher pdClients[clusterID] = pdClient } - return checkpointWatchers, s3Watchers, pdClients, nil + return checkpointWatchers, s3Watchers, pdClients, etcdClients, nil } func newPDClient(ctx context.Context, pdAddr string, securityConfig *security.Credential) (pd.Client, *etcd.CDCEtcdClientImpl, error) { @@ -90,13 +121,48 @@ func newPDClient(ctx context.Context, pdAddr string, securityConfig *security.Cr etcdCli, err := etcd.CreateRawEtcdClient(securityConfig, grpc.EmptyDialOption{}, pdAddr) if err != nil { + // Clean up PD client if etcd client creation fails + if pdClient != nil { + pdClient.Close() + } return nil, nil, errors.Trace(err) } cdcEtcdClient, err := etcd.NewCDCEtcdClient(ctx, etcdCli, "default") if err != nil { + // Clean up resources if CDC etcd client creation fails + etcdCli.Close() + pdClient.Close() return nil, nil, errors.Trace(err) } return pdClient, cdcEtcdClient, nil } + +// 
cleanupClients closes all PD and etcd clients gracefully +func cleanupClients(pdClients map[string]pd.Client, etcdClients map[string]*etcd.CDCEtcdClientImpl) { + log.Info("Cleaning up clients", zap.Int("pdClients", len(pdClients)), zap.Int("etcdClients", len(etcdClients))) + + // Close PD clients + for clusterID, pdClient := range pdClients { + if pdClient != nil { + pdClient.Close() + log.Debug("PD client closed", zap.String("clusterID", clusterID)) + } + } + + // Close etcd clients + for clusterID, etcdClient := range etcdClients { + if etcdClient != nil { + if err := etcdClient.Close(); err != nil { + log.Warn("Failed to close etcd client", + zap.String("clusterID", clusterID), + zap.Error(err)) + } else { + log.Debug("Etcd client closed", zap.String("clusterID", clusterID)) + } + } + } + + log.Info("Client cleanup completed") +} diff --git a/cmd/multi-cluster-consistency-checker/utils.go/types.go b/cmd/multi-cluster-consistency-checker/utils/types.go similarity index 100% rename from cmd/multi-cluster-consistency-checker/utils.go/types.go rename to cmd/multi-cluster-consistency-checker/utils/types.go diff --git a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go index e83146f0bf..fa8cc67bc4 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go @@ -24,31 +24,35 @@ import ( type S3Watcher struct { checkpointWatcher *CheckpointWatcher - consumer *consumer.S3Consumer + consumer *consumer.Consumer } func NewS3Watcher( checkpointWatcher *CheckpointWatcher, s3Storage storage.ExternalStorage, + tables map[string]map[string]struct{}, ) *S3Watcher { - consumer := consumer.NewS3Consumer(s3Storage) + consumer := consumer.NewConsumer(s3Storage, tables) return &S3Watcher{ checkpointWatcher: checkpointWatcher, consumer: consumer, } } -func (sw *S3Watcher) AdvanceS3CheckpointTs(ctx context.Context, minCheckpointTs uint64) 
(uint64, map[cloudstorage.DmlPathKey]consumer.IncrementalData, error) { +func (sw *S3Watcher) AdvanceS3CheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { checkpointTs, err := sw.checkpointWatcher.AdvanceCheckpointTs(ctx, minCheckpointTs) if err != nil { - return 0, nil, errors.Annotate(err, "advance s3 checkpoint timestamp failed") + return 0, errors.Annotate(err, "advance s3 checkpoint timestamp failed") } + return checkpointTs, nil +} + +func (sw *S3Watcher) ConsumeNewFiles(ctx context.Context) (map[cloudstorage.DmlPathKey]consumer.IncrementalData, error) { // TODO: get the index updated from the s3 newData, err := sw.consumer.ConsumeNewFiles(ctx) if err != nil { - return 0, nil, errors.Annotate(err, "consume new files failed") + return nil, errors.Annotate(err, "consume new files failed") } - - return checkpointTs, newData, nil + return newData, nil } From 5404fd87d4538894383f8af68d918dd9fcc0d3a4 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Tue, 27 Jan 2026 22:08:08 +0800 Subject: [PATCH 06/23] draft Signed-off-by: Jianjun Liao --- .../config/config.example.toml | 5 +++++ cmd/multi-cluster-consistency-checker/config/config.go | 9 ++------- .../consumer/consumer.go | 6 +++--- .../watcher/s3_watcher.go | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/config/config.example.toml b/cmd/multi-cluster-consistency-checker/config/config.example.toml index 89206bb13a..6680dcd76c 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.example.toml +++ b/cmd/multi-cluster-consistency-checker/config/config.example.toml @@ -5,6 +5,11 @@ # timeout = "30s" # retry-count = 3 +# Tables configuration +[global.tables] + schema1 = ["table1", "table2"] + schema2 = ["table1", "table2"] + # Cluster configurations [clusters] # First cluster configuration diff --git a/cmd/multi-cluster-consistency-checker/config/config.go b/cmd/multi-cluster-consistency-checker/config/config.go index 
3a5405dd84..72a233c1ce 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.go +++ b/cmd/multi-cluster-consistency-checker/config/config.go @@ -31,14 +31,9 @@ type Config struct { } // GlobalConfig contains global configuration settings -// This is reserved for future use type GlobalConfig struct { - // Add global configuration fields here as needed - // For example: - // Timeout time.Duration `toml:"timeout" json:"timeout"` - // RetryCount int `toml:"retry-count" json:"retry-count"` - LogLevel string `toml:"log-level" json:"log-level"` - Tables map[string]map[string]struct{} `toml:"tables" json:"tables"` + LogLevel string `toml:"log-level" json:"log-level"` + Tables map[string][]string `toml:"tables" json:"tables"` } type DownstreamClusterChangefeedConfig struct { diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer.go b/cmd/multi-cluster-consistency-checker/consumer/consumer.go index e8ca91c4c8..eef3e80976 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer.go @@ -76,7 +76,7 @@ type Consumer struct { fileExtension string dateSeparator string fileIndexWidth int - tables map[string]map[string]struct{} + tables map[string][]string versionMapMu sync.RWMutex currentTableVersionMap map[schemaKey]uint64 @@ -88,7 +88,7 @@ type Consumer struct { func NewConsumer( s3Storage storage.ExternalStorage, - tables map[string]map[string]struct{}, + tables map[string][]string, ) *Consumer { return &Consumer{ s3Storage: s3Storage, @@ -419,7 +419,7 @@ func (c *Consumer) ConsumeNewFiles( result := make(map[cloudstorage.DmlPathKey]IncrementalData) eg, egCtx := errgroup.WithContext(ctx) for schema, tables := range c.tables { - for table := range tables { + for _, table := range tables { eg.Go(func() error { newVersions, err := c.discoverAndDownloadNewTableVersions(egCtx, schema, table) if err != nil { diff --git a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go 
b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go index fa8cc67bc4..bd4e446ee7 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go @@ -30,7 +30,7 @@ type S3Watcher struct { func NewS3Watcher( checkpointWatcher *CheckpointWatcher, s3Storage storage.ExternalStorage, - tables map[string]map[string]struct{}, + tables map[string][]string, ) *S3Watcher { consumer := consumer.NewConsumer(s3Storage, tables) return &S3Watcher{ From 2f8d923fd072cb2b6d56f9721d2590cd04432c23 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 28 Jan 2026 13:05:03 +0800 Subject: [PATCH 07/23] draft Signed-off-by: Jianjun Liao --- .../advancer/time_window_advancer.go | 4 +- .../parser/decoder.go | 8 ++-- .../parser/parser.go | 19 +++++---- .../recorder/recorder.go | 42 +++++++++++++++++++ cmd/multi-cluster-consistency-checker/task.go | 4 ++ .../watcher/checkpoint_watcher.go | 6 +-- 6 files changed, 65 insertions(+), 18 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/recorder/recorder.go diff --git a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go index 7fc6704cfe..b69287f0e6 100644 --- a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go @@ -108,7 +108,7 @@ func NewTimeWindowAdvancer( func (t *TimeWindowAdvancer) AdvanceTimeWindow( pctx context.Context, ) (map[string]TimeWindowData, error) { - log.Info("advance time window", zap.Uint64("round", t.round)) + log.Debug("advance time window", zap.Uint64("round", t.round)) // mapping from upstream cluster ID to the downstream cluster ID to the min checkpoint timestamp minCheckpointTsMap := make(map[string]map[string]uint64) maxTimeWindowRightBoundary := uint64(0) @@ -213,7 +213,7 @@ func (t *TimeWindowAdvancer) 
updateTimeWindow(newTimeWindow map[string]TimeWindo triplet[1] = triplet[2] triplet[2] = timeWindow t.timeWindowTriplet[clusterID] = triplet - log.Info("update time window", zap.String("clusterID", clusterID), zap.Any("timeWindow", timeWindow)) + log.Debug("update time window", zap.String("clusterID", clusterID), zap.Any("timeWindow", timeWindow)) } } diff --git a/cmd/multi-cluster-consistency-checker/parser/decoder.go b/cmd/multi-cluster-consistency-checker/parser/decoder.go index 2179a18b7c..97e2d0eb3f 100644 --- a/cmd/multi-cluster-consistency-checker/parser/decoder.go +++ b/cmd/multi-cluster-consistency-checker/parser/decoder.go @@ -25,11 +25,11 @@ import ( func defaultCsvCodecConfig(protocol config.Protocol) *codecCommon.Config { codecConfig := codecCommon.NewConfig(protocol) - codecConfig.Delimiter = "," - codecConfig.Quote = "\"" - codecConfig.NullString = "NULL" + codecConfig.Delimiter = config.Comma + codecConfig.Quote = string(config.DoubleQuoteChar) + codecConfig.NullString = config.NULL codecConfig.IncludeCommitTs = true - codecConfig.Terminator = "\r\n" + codecConfig.Terminator = config.CRLF return codecConfig } diff --git a/cmd/multi-cluster-consistency-checker/parser/parser.go b/cmd/multi-cluster-consistency-checker/parser/parser.go index ad222468cf..5af64a07db 100644 --- a/cmd/multi-cluster-consistency-checker/parser/parser.go +++ b/cmd/multi-cluster-consistency-checker/parser/parser.go @@ -142,18 +142,19 @@ func (pt *TableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Reco pkCount += 1 continue } - colValue := commonType.ExtractColVal(row, col, rowColOffset) if col.Name.O == event.OriginTsColumn { - var ok bool - originTs, ok = colValue.(uint64) - if !ok && colValue != nil { - log.Error("origin ts column value is not uint64", - zap.String("tableKey", pt.tableKey), - zap.Int64("colID", colInfo.ID), - zap.Any("colValue", colValue)) - return nil, errors.Errorf("origin ts column value is not uint64 for column %d in table %s", colInfo.ID, 
pt.tableKey) + if !row.IsNull(rowColOffset) { + d := row.GetDatum(rowColOffset, &col.FieldType) + if d.Kind() != types.KindInt64 && d.Kind() != types.KindUint64 { + log.Error("origin ts column value is not int64 or uint64", + zap.String("tableKey", pt.tableKey), + zap.String("datum", d.String())) + return nil, errors.Errorf("origin ts column value is not int64 or uint64 for column %d in table %s", colInfo.ID, pt.tableKey) + } + originTs = d.GetUint64() } } else { + colValue := commonType.ExtractColVal(row, col, rowColOffset) columnValues = append(columnValues, utils.ColumnValue{ ColumnID: colInfo.ID, Value: colValue, diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder.go b/cmd/multi-cluster-consistency-checker/recorder/recorder.go new file mode 100644 index 0000000000..4b45814dbc --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder.go @@ -0,0 +1,42 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package recorder + +import ( + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" + "go.uber.org/zap" +) + +type Recorder struct { + round uint64 +} + +func NewRecorder() *Recorder { + return &Recorder{ + round: 0, + } +} + +func (r *Recorder) RecordTimeWindow(timeWindowData map[string]advancer.TimeWindowData) { + for clusterID, timeWindow := range timeWindowData { + log.Info("time window advanced", + zap.Uint64("round", r.round), + zap.String("clusterID", clusterID), + zap.Uint64("window left boundary", timeWindow.LeftBoundary), + zap.Uint64("window right boundary", timeWindow.RightBoundary), + zap.Any("checkpoint ts", timeWindow.CheckpointTs)) + } + r.round += 1 +} diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index b008075755..b6f404722e 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -21,6 +21,7 @@ import ( "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/checker" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/etcd" @@ -41,6 +42,7 @@ func runTask(ctx context.Context, cfg *config.Config) error { defer cleanupClients(pdClients, etcdClients) timeWindowAdvancer := advancer.NewTimeWindowAdvancer(checkpointWatchers, s3Watchers, pdClients) + recorder := recorder.NewRecorder() dataChecker := checker.NewDataChecker(cfg.Clusters) log.Info("Starting consistency checker task") @@ -61,6 +63,8 @@ func runTask(ctx context.Context, cfg *config.Config) error { if err := dataChecker.CheckInNextTimeWindow(ctx, newTimeWindowData); err != nil { return errors.Trace(err) } + + 
recorder.RecordTimeWindow(newTimeWindowData) } } diff --git a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go index 86c56e20b8..02238fd3f7 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go @@ -55,7 +55,7 @@ func (cw *CheckpointWatcher) AdvanceCheckpointTs(ctx context.Context, minCheckpo // Watch for checkpoint updates watchCtx, cancel := context.WithCancel(ctx) defer cancel() - log.Info("Starting to watch checkpoint", + log.Debug("Starting to watch checkpoint", zap.String("changefeedID", cw.changefeedID.String()), zap.String("statusKey", statusKey), zap.String("upstreamClusterID", cw.upstreamClusterID), @@ -96,14 +96,14 @@ func (cw *CheckpointWatcher) AdvanceCheckpointTs(ctx context.Context, minCheckpo } checkpointTs := status.CheckpointTs - log.Info("Checkpoint updated", + log.Debug("Checkpoint updated", zap.String("changefeedID", cw.changefeedID.String()), zap.Uint64("checkpoint", checkpointTs), zap.Uint64("minCheckpointTs", minCheckpointTs)) // Check if checkpoint exceeds minCheckpointTs if checkpointTs > minCheckpointTs { - log.Info("Checkpoint exceeds minCheckpointTs, getting TSO from downstream", + log.Debug("Checkpoint exceeds minCheckpointTs, getting TSO from downstream", zap.String("changefeedID", cw.changefeedID.String()), zap.Uint64("checkpoint", checkpointTs)) return checkpointTs, nil From adda97039295e02c340a106c8e4b3fa00fad1745 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 28 Jan 2026 15:47:20 +0800 Subject: [PATCH 08/23] add report directory Signed-off-by: Jianjun Liao --- .../checker/checker.go | 34 +++- .../config/config.example.toml | 6 +- .../config/config.go | 5 +- .../recorder/recorder.go | 26 +++- .../recorder/types.go | 146 ++++++++++++++++++ cmd/multi-cluster-consistency-checker/task.go | 7 +- 6 files changed, 204 insertions(+), 20 
deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/recorder/types.go diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index e95bd39d0f..b27c95035a 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -20,6 +20,7 @@ import ( "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/errors" "go.uber.org/zap" @@ -42,7 +43,7 @@ func newClusterViolationChecker(clusterID string) *clusterViolationChecker { } } -func (c *clusterViolationChecker) Check(r *utils.Record) { +func (c *clusterViolationChecker) Check(r *utils.Record, report *recorder.ClusterReport) { entry, exists := c.twoPreviousTimeWindowKeyVersionCache[r.Pk] if !exists { c.twoPreviousTimeWindowKeyVersionCache[r.Pk] = versionCacheEntry{ @@ -63,6 +64,8 @@ func (c *clusterViolationChecker) Check(r *utils.Record) { zap.String("clusterID", c.clusterID), zap.Any("entry", entry), zap.Any("record", r)) + report.AddLWWViolationItem(string(r.Pk), entry.cdcVersion.OriginTs, entry.cdcVersion.CommitTs, r.OriginTs, r.CommitTs) + return } c.twoPreviousTimeWindowKeyVersionCache[r.Pk] = versionCacheEntry{ previous: 0, @@ -146,6 +149,8 @@ type clusterDataChecker struct { overDataCaches []*utils.Record clusterViolationChecker *clusterViolationChecker + + report *recorder.ClusterReport } func newClusterDataChecker(clusterID string) *clusterDataChecker { @@ -233,12 +238,14 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { zap.String("upstreamClusterID", cd.clusterID), zap.String("downstreamClusterID", downstreamClusterID), zap.Any("record", 
record)) + cd.report.AddDataLossItem(downstreamClusterID, string(record.Pk), record.OriginTs, record.CommitTs, false) } else if !record.EqualDownstreamRecord(downstreamRecord) { // data inconsistent detected log.Error("data inconsistent detected", zap.String("upstreamClusterID", cd.clusterID), zap.String("downstreamClusterID", downstreamClusterID), zap.Any("record", record)) + cd.report.AddDataLossItem(downstreamClusterID, string(record.Pk), record.OriginTs, record.CommitTs, true) } } } @@ -259,12 +266,14 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { zap.String("upstreamClusterID", cd.clusterID), zap.String("downstreamClusterID", downstreamClusterID), zap.Any("record", record)) + cd.report.AddDataLossItem(downstreamClusterID, string(record.Pk), record.OriginTs, record.CommitTs, false) } else if !record.EqualDownstreamRecord(downstreamRecord) { // data inconsistent detected log.Error("data inconsistent detected", zap.String("upstreamClusterID", cd.clusterID), zap.String("downstreamClusterID", downstreamClusterID), zap.Any("record", record)) + cd.report.AddDataLossItem(downstreamClusterID, string(record.Pk), record.OriginTs, record.CommitTs, true) } } } @@ -282,6 +291,7 @@ func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) { log.Error("data redundant detected", zap.String("downstreamClusterID", cd.clusterID), zap.Any("record", record)) + cd.report.AddDataRedundantItem(string(record.Pk), record.OriginTs, record.CommitTs) } } } @@ -302,7 +312,7 @@ func (cd *clusterDataChecker) lwwViolationDetection() { return pkRecords[i].CommitTs < pkRecords[j].CommitTs }) for _, record := range pkRecords { - cd.clusterViolationChecker.Check(record) + cd.clusterViolationChecker.Check(record, cd.report) } } for pk, downstreamRecords := range cd.timeWindowDataCaches[2].downstreamDataCache { @@ -317,7 +327,7 @@ func (cd *clusterDataChecker) lwwViolationDetection() { return pkRecords[i].CommitTs < pkRecords[j].CommitTs }) for _, record 
:= range pkRecords { - cd.clusterViolationChecker.Check(record) + cd.clusterViolationChecker.Check(record, cd.report) } } @@ -325,6 +335,7 @@ func (cd *clusterDataChecker) lwwViolationDetection() { } func (cd *clusterDataChecker) Check(checker *DataChecker) { + cd.report = recorder.NewClusterReport(cd.clusterID) // CHECK 1 - Data Loss Detection cd.dataLossDetection(checker) // CHECK 2 - Data Redundant Detection @@ -333,7 +344,12 @@ func (cd *clusterDataChecker) Check(checker *DataChecker) { cd.lwwViolationDetection() } +func (cd *clusterDataChecker) GetReport() *recorder.ClusterReport { + return cd.report +} + type DataChecker struct { + round uint64 clusterDataCheckers map[string]*clusterDataChecker } @@ -343,6 +359,7 @@ func NewDataChecker(clusterConfig map[string]config.ClusterConfig) *DataChecker clusterDataChecker[clusterID] = newClusterDataChecker(clusterID) } return &DataChecker{ + round: 0, clusterDataCheckers: clusterDataChecker, } } @@ -379,15 +396,18 @@ func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, pk uti return false } -func (c *DataChecker) CheckInNextTimeWindow(ctx context.Context, newTimeWindowData map[string]advancer.TimeWindowData) error { +func (c *DataChecker) CheckInNextTimeWindow(ctx context.Context, newTimeWindowData map[string]advancer.TimeWindowData) (*recorder.Report, error) { if err := c.decodeNewTimeWindowData(ctx, newTimeWindowData); err != nil { log.Error("failed to decode new time window data", zap.Error(err)) - return errors.Annotate(err, "failed to decode new time window data") + return nil, errors.Annotate(err, "failed to decode new time window data") } - for _, clusterDataChecker := range c.clusterDataCheckers { + report := recorder.NewReport(c.round) + for clusterID, clusterDataChecker := range c.clusterDataCheckers { clusterDataChecker.Check(c) + report.AddClusterReport(clusterID, clusterDataChecker.GetReport()) } - return nil + c.round += 1 + return report, nil } func (c *DataChecker) 
decodeNewTimeWindowData(ctx context.Context, newTimeWindowData map[string]advancer.TimeWindowData) error { diff --git a/cmd/multi-cluster-consistency-checker/config/config.example.toml b/cmd/multi-cluster-consistency-checker/config/config.example.toml index 6680dcd76c..eba9c340d1 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.example.toml +++ b/cmd/multi-cluster-consistency-checker/config/config.example.toml @@ -2,8 +2,10 @@ # Global configuration (reserved for future use) [global] -# timeout = "30s" -# retry-count = 3 + +# Report configuration +[global.report-dir] + report-dir = "/tmp/multi-cluster-consistency-checker-reports" # Tables configuration [global.tables] diff --git a/cmd/multi-cluster-consistency-checker/config/config.go b/cmd/multi-cluster-consistency-checker/config/config.go index 72a233c1ce..d280eb61cc 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.go +++ b/cmd/multi-cluster-consistency-checker/config/config.go @@ -32,8 +32,9 @@ type Config struct { // GlobalConfig contains global configuration settings type GlobalConfig struct { - LogLevel string `toml:"log-level" json:"log-level"` - Tables map[string][]string `toml:"tables" json:"tables"` + LogLevel string `toml:"log-level" json:"log-level"` + ReportDir string `toml:"report-dir" json:"report-dir"` + Tables map[string][]string `toml:"tables" json:"tables"` } type DownstreamClusterChangefeedConfig struct { diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder.go b/cmd/multi-cluster-consistency-checker/recorder/recorder.go index 4b45814dbc..7fd22a6955 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder.go @@ -14,29 +14,43 @@ package recorder import ( + "fmt" + "os" + "path" + "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" + "github.com/pingcap/ticdc/pkg/errors" "go.uber.org/zap" ) type Recorder struct { - round uint64 + 
recordDir string } -func NewRecorder() *Recorder { +func NewRecorder(reportDir string) *Recorder { return &Recorder{ - round: 0, + recordDir: reportDir, } } -func (r *Recorder) RecordTimeWindow(timeWindowData map[string]advancer.TimeWindowData) { +func (r *Recorder) RecordTimeWindow(timeWindowData map[string]advancer.TimeWindowData, report *Report) error { for clusterID, timeWindow := range timeWindowData { log.Info("time window advanced", - zap.Uint64("round", r.round), + zap.Uint64("round", report.Round), zap.String("clusterID", clusterID), zap.Uint64("window left boundary", timeWindow.LeftBoundary), zap.Uint64("window right boundary", timeWindow.RightBoundary), zap.Any("checkpoint ts", timeWindow.CheckpointTs)) } - r.round += 1 + if err := r.flushReport(report); err != nil { + return errors.Trace(err) + } + return nil +} + +func (r *Recorder) flushReport(report *Report) error { + filename := path.Join(r.recordDir, fmt.Sprintf("report-%d.log", report.Round)) + data := report.MarshalReport() + return os.WriteFile(filename, []byte(data), 0644) } diff --git a/cmd/multi-cluster-consistency-checker/recorder/types.go b/cmd/multi-cluster-consistency-checker/recorder/types.go new file mode 100644 index 0000000000..d1786c8c9d --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go @@ -0,0 +1,146 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package recorder + +import ( + "fmt" + "strings" +) + +type DataLossItem struct { + DownstreamClusterID string + PK string + OriginTS uint64 + CommitTS uint64 + Inconsistent bool +} + +func (item *DataLossItem) String() string { + errType := "data loss" + if item.Inconsistent { + errType = "data inconsistent" + } + return fmt.Sprintf("downstream cluster: %s, pk: %s, origin ts: %d, commit ts: %d, type: %s", item.DownstreamClusterID, item.PK, item.OriginTS, item.CommitTS, errType) +} + +type DataRedundantItem struct { + PK string + OriginTS uint64 + CommitTS uint64 +} + +func (item *DataRedundantItem) String() string { + return fmt.Sprintf("pk: %s, origin ts: %d, commit ts: %d", item.PK, item.OriginTS, item.CommitTS) +} + +type LWWViolationItem struct { + PK string + ExistingOriginTS uint64 + ExistingCommitTS uint64 + OriginTS uint64 + CommitTS uint64 +} + +func (item *LWWViolationItem) String() string { + return fmt.Sprintf( + "pk: %s, existing origin ts: %d, existing commit ts: %d, origin ts: %d, commit ts: %d", + item.PK, item.ExistingOriginTS, item.ExistingCommitTS, item.OriginTS, item.CommitTS) +} + +type ClusterReport struct { + ClusterID string + + DataLossItems []DataLossItem + DataRedundantItems []DataRedundantItem + LWWViolationItems []LWWViolationItem +} + +func NewClusterReport(clusterID string) *ClusterReport { + return &ClusterReport{ + ClusterID: clusterID, + DataLossItems: make([]DataLossItem, 0), + DataRedundantItems: make([]DataRedundantItem, 0), + LWWViolationItems: make([]LWWViolationItem, 0), + } +} + +func (r *ClusterReport) AddDataLossItem(downstreamClusterID, pk string, originTS, commitTS uint64, inconsistent bool) { + r.DataLossItems = append(r.DataLossItems, DataLossItem{ + DownstreamClusterID: downstreamClusterID, + PK: pk, + OriginTS: originTS, + CommitTS: commitTS, + Inconsistent: inconsistent, + }) +} + +func (r *ClusterReport) AddDataRedundantItem(pk string, originTS, commitTS uint64) { + r.DataRedundantItems = 
append(r.DataRedundantItems, DataRedundantItem{ + PK: pk, + OriginTS: originTS, + CommitTS: commitTS, + }) +} + +func (r *ClusterReport) AddLWWViolationItem( + pk string, + existingOriginTS, existingCommitTS uint64, + originTS, commitTS uint64, +) { + r.LWWViolationItems = append(r.LWWViolationItems, LWWViolationItem{ + PK: pk, + ExistingOriginTS: existingOriginTS, + ExistingCommitTS: existingCommitTS, + OriginTS: originTS, + CommitTS: commitTS, + }) +} + +type Report struct { + Round uint64 + ClusterReports map[string]*ClusterReport +} + +func NewReport(round uint64) *Report { + return &Report{ + Round: round, + ClusterReports: make(map[string]*ClusterReport), + } +} + +func (r *Report) AddClusterReport(clusterID string, clusterReport *ClusterReport) { + r.ClusterReports[clusterID] = clusterReport +} + +func (r *Report) MarshalReport() string { + var reportMsg strings.Builder + reportMsg.WriteString(fmt.Sprintf("round: %d\n", r.Round)) + for clusterID, clusterReport := range r.ClusterReports { + reportMsg.WriteString(fmt.Sprintf("\n[cluster: %s]\n", clusterID)) + reportMsg.WriteString(fmt.Sprintf(" - [data loss items: %d]\n", len(clusterReport.DataLossItems))) + for _, dataLossItem := range clusterReport.DataLossItems { + reportMsg.WriteString(fmt.Sprintf(" - [%s]\n", dataLossItem.String())) + } + reportMsg.WriteString(fmt.Sprintf(" - [data redundant items: %d]\n", len(clusterReport.DataRedundantItems))) + for _, dataRedundantItem := range clusterReport.DataRedundantItems { + reportMsg.WriteString(fmt.Sprintf(" - [%s]\n", dataRedundantItem.String())) + } + reportMsg.WriteString(fmt.Sprintf(" - [lww violation items: %d]\n", len(clusterReport.LWWViolationItems))) + for _, lwwViolationItem := range clusterReport.LWWViolationItems { + reportMsg.WriteString(fmt.Sprintf(" - [%s]\n", lwwViolationItem.String())) + } + } + reportMsg.WriteString("\n") + return reportMsg.String() +} diff --git a/cmd/multi-cluster-consistency-checker/task.go 
b/cmd/multi-cluster-consistency-checker/task.go index b6f404722e..10f3161ddb 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -42,7 +42,7 @@ func runTask(ctx context.Context, cfg *config.Config) error { defer cleanupClients(pdClients, etcdClients) timeWindowAdvancer := advancer.NewTimeWindowAdvancer(checkpointWatchers, s3Watchers, pdClients) - recorder := recorder.NewRecorder() + recorder := recorder.NewRecorder(cfg.GlobalConfig.ReportDir) dataChecker := checker.NewDataChecker(cfg.Clusters) log.Info("Starting consistency checker task") @@ -60,11 +60,12 @@ func runTask(ctx context.Context, cfg *config.Config) error { return errors.Trace(err) } - if err := dataChecker.CheckInNextTimeWindow(ctx, newTimeWindowData); err != nil { + report, err := dataChecker.CheckInNextTimeWindow(ctx, newTimeWindowData) + if err != nil { return errors.Trace(err) } - recorder.RecordTimeWindow(newTimeWindowData) + recorder.RecordTimeWindow(newTimeWindowData, report) } } From 9c88adbb4133aa3035ef35871faf38f7f1156d85 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 28 Jan 2026 16:14:04 +0800 Subject: [PATCH 09/23] fix bug Signed-off-by: Jianjun Liao --- .../config/config.example.toml | 8 +++++--- cmd/multi-cluster-consistency-checker/main.go | 18 ++++++++++++++++++ .../recorder/recorder.go | 10 +++++++--- cmd/multi-cluster-consistency-checker/task.go | 9 +++++++-- 4 files changed, 37 insertions(+), 8 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/config/config.example.toml b/cmd/multi-cluster-consistency-checker/config/config.example.toml index eba9c340d1..ddae05c415 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.example.toml +++ b/cmd/multi-cluster-consistency-checker/config/config.example.toml @@ -1,11 +1,13 @@ # Example configuration file for multi-cluster consistency checker -# Global configuration (reserved for future use) +# Global configuration [global] +# Log level: debug, info, 
warn, error, fatal, panic +log-level = "info" + # Report configuration -[global.report-dir] - report-dir = "/tmp/multi-cluster-consistency-checker-reports" +report-dir = "/tmp/multi-cluster-consistency-checker-reports" # Tables configuration [global.tables] diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go index 4494d08ab0..bb46185a1a 100644 --- a/cmd/multi-cluster-consistency-checker/main.go +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -20,8 +20,11 @@ import ( "os/signal" "syscall" + "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/pkg/logger" "github.com/spf13/cobra" + "go.uber.org/zap" ) var ( @@ -67,6 +70,21 @@ func run(cmd *cobra.Command, args []string) { os.Exit(ExitCodeDecodeConfigFailed) } + // Initialize logger with configured log level + logLevel := cfg.GlobalConfig.LogLevel + if logLevel == "" { + logLevel = "info" // default log level + } + loggerConfig := &logger.Config{ + Level: logLevel, + } + err = logger.InitLogger(loggerConfig) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to init logger: %v\n", err) + os.Exit(ExitCodeExecuteFailed) + } + log.Info("Logger initialized", zap.String("level", logLevel)) + fmt.Printf("Loaded configuration with %d cluster(s)\n", len(cfg.Clusters)) for name, cluster := range cfg.Clusters { fmt.Printf(" Cluster: %s\n", name) diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder.go b/cmd/multi-cluster-consistency-checker/recorder/recorder.go index 7fd22a6955..9ec68139e1 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder.go @@ -28,10 +28,14 @@ type Recorder struct { recordDir string } -func NewRecorder(reportDir string) *Recorder { +func NewRecorder(reportDir string) (*Recorder, error) { + err := os.MkdirAll(reportDir, 0755) + if err != nil { + return nil, errors.Trace(err) + } 
return &Recorder{ recordDir: reportDir, - } + }, nil } func (r *Recorder) RecordTimeWindow(timeWindowData map[string]advancer.TimeWindowData, report *Report) error { @@ -52,5 +56,5 @@ func (r *Recorder) RecordTimeWindow(timeWindowData map[string]advancer.TimeWindo func (r *Recorder) flushReport(report *Report) error { filename := path.Join(r.recordDir, fmt.Sprintf("report-%d.log", report.Round)) data := report.MarshalReport() - return os.WriteFile(filename, []byte(data), 0644) + return os.WriteFile(filename, []byte(data), 0600) } diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index 10f3161ddb..2ab95e085e 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -42,7 +42,10 @@ func runTask(ctx context.Context, cfg *config.Config) error { defer cleanupClients(pdClients, etcdClients) timeWindowAdvancer := advancer.NewTimeWindowAdvancer(checkpointWatchers, s3Watchers, pdClients) - recorder := recorder.NewRecorder(cfg.GlobalConfig.ReportDir) + recorder, err := recorder.NewRecorder(cfg.GlobalConfig.ReportDir) + if err != nil { + return errors.Trace(err) + } dataChecker := checker.NewDataChecker(cfg.Clusters) log.Info("Starting consistency checker task") @@ -65,7 +68,9 @@ func runTask(ctx context.Context, cfg *config.Config) error { return errors.Trace(err) } - recorder.RecordTimeWindow(newTimeWindowData, report) + if err := recorder.RecordTimeWindow(newTimeWindowData, report); err != nil { + return errors.Trace(err) + } } } From e4af718aae1a6ae5874fa68534483845236803d8 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Fri, 30 Jan 2026 12:40:15 +0800 Subject: [PATCH 10/23] revert enable active active check configuration Signed-off-by: Jianjun Liao --- api/v2/model.go | 23 +++++------ .../dispatcher/basic_dispatcher.go | 3 +- .../basic_dispatcher_active_active_test.go | 1 - .../dispatcher/basic_dispatcher_info.go | 39 +++++++++---------- 
.../dispatcher/event_dispatcher_test.go | 4 -- .../dispatcher/redo_dispatcher_test.go | 1 - .../dispatchermanager/dispatcher_manager.go | 1 - .../dispatcher_manager_test.go | 2 - pkg/common/event/active_active.go | 9 +---- pkg/common/event/active_active_test.go | 12 +++--- pkg/config/changefeed.go | 2 - pkg/config/replica_config.go | 1 - 12 files changed, 36 insertions(+), 62 deletions(-) diff --git a/api/v2/model.go b/api/v2/model.go index c2a949d3c3..01927bbc44 100644 --- a/api/v2/model.go +++ b/api/v2/model.go @@ -193,7 +193,6 @@ type ReplicaConfig struct { EnableTableMonitor *bool `json:"enable_table_monitor,omitempty"` BDRMode *bool `json:"bdr_mode,omitempty"` EnableActiveActive *bool `json:"enable_active_active,omitempty"` - EnableActiveActiveCheck *bool `json:"enable_active_active_check,omitempty"` ActiveActiveProgressInterval *JSONDuration `json:"active_active_progress_interval,omitempty"` ActiveActiveSyncStatsInterval *JSONDuration `json:"active_active_sync_stats_interval,omitempty"` @@ -251,9 +250,6 @@ func (c *ReplicaConfig) toInternalReplicaConfigWithOriginConfig( if c.EnableActiveActive != nil { res.EnableActiveActive = c.EnableActiveActive } - if c.EnableActiveActiveCheck != nil { - res.EnableActiveActiveCheck = c.EnableActiveActiveCheck - } if c.Filter != nil { efs := make([]*config.EventFilterRule, 0, len(c.Filter.EventFilters)) @@ -635,16 +631,15 @@ func ToAPIReplicaConfig(c *config.ReplicaConfig) *ReplicaConfig { cloned := c.Clone() res := &ReplicaConfig{ - MemoryQuota: cloned.MemoryQuota, - CaseSensitive: cloned.CaseSensitive, - ForceReplicate: cloned.ForceReplicate, - IgnoreIneligibleTable: cloned.IgnoreIneligibleTable, - CheckGCSafePoint: cloned.CheckGCSafePoint, - EnableSyncPoint: cloned.EnableSyncPoint, - EnableTableMonitor: cloned.EnableTableMonitor, - BDRMode: cloned.BDRMode, - EnableActiveActive: cloned.EnableActiveActive, - EnableActiveActiveCheck: cloned.EnableActiveActiveCheck, + MemoryQuota: cloned.MemoryQuota, + CaseSensitive: 
cloned.CaseSensitive, + ForceReplicate: cloned.ForceReplicate, + IgnoreIneligibleTable: cloned.IgnoreIneligibleTable, + CheckGCSafePoint: cloned.CheckGCSafePoint, + EnableSyncPoint: cloned.EnableSyncPoint, + EnableTableMonitor: cloned.EnableTableMonitor, + BDRMode: cloned.BDRMode, + EnableActiveActive: cloned.EnableActiveActive, } if cloned.SyncPointInterval != nil { diff --git a/downstreamadapter/dispatcher/basic_dispatcher.go b/downstreamadapter/dispatcher/basic_dispatcher.go index be604915b0..eddb932ecc 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher.go +++ b/downstreamadapter/dispatcher/basic_dispatcher.go @@ -249,8 +249,7 @@ func (d *BasicDispatcher) AddDMLEventsToSink(events []*commonEvent.DMLEvent) boo // FilterDMLEvent returns the original event for normal tables and only // allocates a new event when the table needs active-active or soft-delete // processing. Skip is only true when every row in the event should be dropped. - filtered, skip, err := commonEvent.FilterDMLEvent( - event, d.sharedInfo.enableActiveActive, d.sharedInfo.enableActiveActiveCheck) + filtered, skip, err := commonEvent.FilterDMLEvent(event, d.sharedInfo.enableActiveActive) if err != nil { d.HandleError(err) continue diff --git a/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go b/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go index e99d10d1a3..76db356127 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go +++ b/downstreamadapter/dispatcher/basic_dispatcher_active_active_test.go @@ -131,7 +131,6 @@ func newTestBasicDispatcher(t *testing.T, sinkType common.SinkType, enableActive false, enableActiveActive, false, - false, nil, nil, nil, diff --git a/downstreamadapter/dispatcher/basic_dispatcher_info.go b/downstreamadapter/dispatcher/basic_dispatcher_info.go index 193bdf40fd..db8f9de0b8 100644 --- a/downstreamadapter/dispatcher/basic_dispatcher_info.go +++ b/downstreamadapter/dispatcher/basic_dispatcher_info.go 
@@ -29,12 +29,11 @@ import ( // This eliminates the need to pass these parameters individually to each dispatcher. type SharedInfo struct { // Basic configuration - changefeedID common.ChangeFeedID - timezone string - bdrMode bool - enableActiveActive bool - enableActiveActiveCheck bool - outputRawChangeEvent bool + changefeedID common.ChangeFeedID + timezone string + bdrMode bool + enableActiveActive bool + outputRawChangeEvent bool // Configuration objects integrityConfig *eventpb.IntegrityConfig @@ -76,7 +75,6 @@ func NewSharedInfo( timezone string, bdrMode bool, enableActiveActive bool, - enableActiveActiveCheck bool, outputRawChangeEvent bool, integrityConfig *eventpb.IntegrityConfig, filterConfig *eventpb.FilterConfig, @@ -88,20 +86,19 @@ func NewSharedInfo( errCh chan error, ) *SharedInfo { sharedInfo := &SharedInfo{ - changefeedID: changefeedID, - timezone: timezone, - bdrMode: bdrMode, - enableActiveActive: enableActiveActive, - enableActiveActiveCheck: enableActiveActiveCheck, - outputRawChangeEvent: outputRawChangeEvent, - integrityConfig: integrityConfig, - filterConfig: filterConfig, - syncPointConfig: syncPointConfig, - enableSplittableCheck: enableSplittableCheck, - statusesChan: statusesChan, - blockStatusesChan: blockStatusesChan, - blockExecutor: newBlockEventExecutor(), - errCh: errCh, + changefeedID: changefeedID, + timezone: timezone, + bdrMode: bdrMode, + enableActiveActive: enableActiveActive, + outputRawChangeEvent: outputRawChangeEvent, + integrityConfig: integrityConfig, + filterConfig: filterConfig, + syncPointConfig: syncPointConfig, + enableSplittableCheck: enableSplittableCheck, + statusesChan: statusesChan, + blockStatusesChan: blockStatusesChan, + blockExecutor: newBlockEventExecutor(), + errCh: errCh, } if txnAtomicity != nil { diff --git a/downstreamadapter/dispatcher/event_dispatcher_test.go b/downstreamadapter/dispatcher/event_dispatcher_test.go index 1de9b9ddee..2265889a49 100644 --- 
a/downstreamadapter/dispatcher/event_dispatcher_test.go +++ b/downstreamadapter/dispatcher/event_dispatcher_test.go @@ -73,7 +73,6 @@ func newDispatcherForTest(sink sink.Sink, tableSpan *heartbeatpb.TableSpan) *Eve false, false, false, - false, nil, nil, &syncpoint.SyncPointConfig{ @@ -813,7 +812,6 @@ func TestDispatcherSplittableCheck(t *testing.T) { false, false, false, - false, nil, nil, &syncpoint.SyncPointConfig{ @@ -924,7 +922,6 @@ func TestDispatcher_SkipDMLAsStartTs_FilterCorrectly(t *testing.T) { false, false, false, - false, nil, nil, &syncpoint.SyncPointConfig{ @@ -1005,7 +1002,6 @@ func TestDispatcher_SkipDMLAsStartTs_Disabled(t *testing.T) { false, false, false, - false, nil, nil, &syncpoint.SyncPointConfig{ diff --git a/downstreamadapter/dispatcher/redo_dispatcher_test.go b/downstreamadapter/dispatcher/redo_dispatcher_test.go index c8609ac3b7..4c5ff02eb5 100644 --- a/downstreamadapter/dispatcher/redo_dispatcher_test.go +++ b/downstreamadapter/dispatcher/redo_dispatcher_test.go @@ -42,7 +42,6 @@ func newRedoDispatcherForTest(sink sink.Sink, tableSpan *heartbeatpb.TableSpan) false, false, false, - false, nil, nil, nil, // redo dispatcher doesn't need syncPointConfig diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager.go b/downstreamadapter/dispatchermanager/dispatcher_manager.go index 4696ff5dc9..d8a1d729b3 100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager.go +++ b/downstreamadapter/dispatchermanager/dispatcher_manager.go @@ -234,7 +234,6 @@ func NewDispatcherManager( manager.config.TimeZone, manager.config.BDRMode, manager.config.EnableActiveActive, - manager.config.EnableActiveActiveCheck, outputRawChangeEvent, integrityCfg, filterCfg, diff --git a/downstreamadapter/dispatchermanager/dispatcher_manager_test.go b/downstreamadapter/dispatchermanager/dispatcher_manager_test.go index a168a78b09..575489752f 100644 --- a/downstreamadapter/dispatchermanager/dispatcher_manager_test.go +++ 
b/downstreamadapter/dispatchermanager/dispatcher_manager_test.go @@ -53,7 +53,6 @@ func createTestDispatcher(t *testing.T, manager *DispatcherManager, id common.Di false, false, false, - false, nil, nil, nil, @@ -112,7 +111,6 @@ func createTestManager(t *testing.T) *DispatcherManager { "system", manager.config.BDRMode, manager.config.EnableActiveActive, - false, false, // outputRawChangeEvent nil, // integrityConfig nil, // filterConfig diff --git a/pkg/common/event/active_active.go b/pkg/common/event/active_active.go index 0095fd0be9..7c1ce77e4c 100644 --- a/pkg/common/event/active_active.go +++ b/pkg/common/event/active_active.go @@ -55,7 +55,6 @@ func EvaluateRowPolicy( tableInfo *common.TableInfo, row *RowChange, enableActiveActive bool, - enableActiveActiveCheck bool, ) (RowPolicyDecision, error) { if tableInfo == nil || row == nil { return RowPolicyKeep, nil @@ -76,10 +75,6 @@ func EvaluateRowPolicy( return RowPolicyKeep, nil } - if enableActiveActiveCheck { - return RowPolicyKeep, nil - } - if row.RowType != common.RowTypeUpdate { return RowPolicyKeep, nil } @@ -157,7 +152,7 @@ func ApplyRowPolicyDecision(row *RowChange, decision RowPolicyDecision) { // // It returns the possibly modified event, whether the event should be skipped entirely, // and an error if evaluation fails. 
-func FilterDMLEvent(event *DMLEvent, enableActiveActive bool, enableActiveActiveCheck bool) (*DMLEvent, bool, error) { +func FilterDMLEvent(event *DMLEvent, enableActiveActive bool) (*DMLEvent, bool, error) { if event == nil { return nil, true, nil } @@ -190,7 +185,7 @@ func FilterDMLEvent(event *DMLEvent, enableActiveActive bool, enableActiveActive break } - decision, err := EvaluateRowPolicy(tableInfo, &row, enableActiveActive, enableActiveActiveCheck) + decision, err := EvaluateRowPolicy(tableInfo, &row, enableActiveActive) if err != nil { event.Rewind() return nil, false, err diff --git a/pkg/common/event/active_active_test.go b/pkg/common/event/active_active_test.go index 335096e6b9..c145840a4b 100644 --- a/pkg/common/event/active_active_test.go +++ b/pkg/common/event/active_active_test.go @@ -32,7 +32,7 @@ func TestFilterDMLEventNormalTablePassthrough(t *testing.T) { {int64(1)}, }) - filtered, skip, err := FilterDMLEvent(event, false, false) + filtered, skip, err := FilterDMLEvent(event, false) require.NoError(t, err) require.False(t, skip) require.Equal(t, event, filtered) @@ -48,7 +48,7 @@ func TestFilterDMLEventActiveActiveWithEnableDropsDeletes(t *testing.T) { {int64(2), nil}, // insert row }) - filtered, skip, err := FilterDMLEvent(event, true, false) + filtered, skip, err := FilterDMLEvent(event, true) require.NoError(t, err) require.False(t, skip) require.NotEqual(t, event, filtered) @@ -73,7 +73,7 @@ func TestFilterDMLEventSoftDeleteConvertUpdate(t *testing.T) { {int64(1), ts}, // post row with soft delete timestamp }) - filtered, skip, err := FilterDMLEvent(event, false, false) + filtered, skip, err := FilterDMLEvent(event, false) require.NoError(t, err) require.False(t, skip) require.NotEqual(t, event, filtered) @@ -98,7 +98,7 @@ func TestFilterDMLEventActiveActiveConvertWhenDisabled(t *testing.T) { {int64(2), ts}, }) - filtered, skip, err := FilterDMLEvent(event, false, false) + filtered, skip, err := FilterDMLEvent(event, false) 
require.NoError(t, err) require.False(t, skip) require.NotEqual(t, event, filtered) @@ -123,7 +123,7 @@ func TestFilterDMLEventActiveActiveKeepUpdateWhenEnabled(t *testing.T) { {int64(3), ts}, }) - filtered, skip, err := FilterDMLEvent(event, true, false) + filtered, skip, err := FilterDMLEvent(event, true) require.NoError(t, err) require.False(t, skip) require.Equal(t, event, filtered) @@ -146,7 +146,7 @@ func TestFilterDMLEventAllRowsSkipped(t *testing.T) { {int64(1), nil}, }) - filtered, skip, err := FilterDMLEvent(event, false, false) + filtered, skip, err := FilterDMLEvent(event, false) require.NoError(t, err) require.True(t, skip) require.Nil(t, filtered) diff --git a/pkg/config/changefeed.go b/pkg/config/changefeed.go index 385b8f29e9..4334096bdc 100644 --- a/pkg/config/changefeed.go +++ b/pkg/config/changefeed.go @@ -202,7 +202,6 @@ type ChangefeedConfig struct { Epoch uint64 `json:"epoch"` BDRMode bool `json:"bdr_mode" default:"false"` EnableActiveActive bool `json:"enable_active_active" default:"false"` - EnableActiveActiveCheck bool `json:"enable_active_active_check" default:"false"` ActiveActiveProgressInterval time.Duration `json:"active_active_progress_interval" default:"30m"` ActiveActiveSyncStatsInterval time.Duration `json:"active_active_sync_stats_interval" default:"1m"` // redo releated @@ -282,7 +281,6 @@ func (info *ChangeFeedInfo) ToChangefeedConfig() *ChangefeedConfig { Epoch: info.Epoch, BDRMode: util.GetOrZero(info.Config.BDRMode), EnableActiveActive: util.GetOrZero(info.Config.EnableActiveActive), - EnableActiveActiveCheck: util.GetOrZero(info.Config.EnableActiveActiveCheck), ActiveActiveProgressInterval: util.GetOrZero(info.Config.ActiveActiveProgressInterval), ActiveActiveSyncStatsInterval: util.GetOrZero(info.Config.ActiveActiveSyncStatsInterval), TimeZone: GetGlobalServerConfig().TZ, diff --git a/pkg/config/replica_config.go b/pkg/config/replica_config.go index 13de1bbcba..0a70d749b9 100644 --- a/pkg/config/replica_config.go +++ 
b/pkg/config/replica_config.go @@ -174,7 +174,6 @@ type replicaConfig struct { SyncedStatus *SyncedStatusConfig `toml:"synced-status" json:"synced-status,omitempty"` EnableActiveActive *bool `toml:"enable-active-active" json:"enable-active-active,omitempty"` - EnableActiveActiveCheck *bool `toml:"enable-active-active-check" json:"enable-active-active-check,omitempty"` ActiveActiveProgressInterval *time.Duration `toml:"active-active-progress-interval" json:"active-active-progress-interval,omitempty"` // ActiveActiveSyncStatsInterval controls how often MySQL/TiDB sink queries // TiDB session variable @@tidb_cdc_active_active_sync_stats for conflict statistics. From 6864e6f34df6bee10deb995b5649f5b1de4ea947 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Fri, 30 Jan 2026 12:52:47 +0800 Subject: [PATCH 11/23] refine the code Signed-off-by: Jianjun Liao --- .../consumer/consumer.go | 408 +-------------- .../consumer/s3_consumer.go | 475 ++++++++++++++++++ .../parser/decoder.go | 28 ++ .../parser/parser.go | 22 +- .../recorder/recorder.go | 6 +- .../recorder/types.go | 44 +- .../watcher/s3_watcher.go | 4 +- 7 files changed, 567 insertions(+), 420 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer.go b/cmd/multi-cluster-consistency-checker/consumer/consumer.go index eef3e80976..8b4a8731c6 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer.go @@ -14,23 +14,16 @@ package consumer import ( - "context" - "fmt" - "maps" - "path" - "strings" - "sync" - - "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/parser" - "github.com/pingcap/ticdc/pkg/config" - "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" - "github.com/pingcap/tidb/br/pkg/storage" - "go.uber.org/zap" - "golang.org/x/sync/errgroup" ) +type 
versionKey struct { + version uint64 + versionPath string + dataPath string +} + type ( fileIndexRange map[cloudstorage.FileIndexKey]indexRange fileIndexKeyMap map[cloudstorage.FileIndexKey]uint64 @@ -70,392 +63,3 @@ type IncrementalData struct { DataContentSlices map[cloudstorage.FileIndexKey][][]byte Parser *parser.TableParser } - -type Consumer struct { - s3Storage storage.ExternalStorage - fileExtension string - dateSeparator string - fileIndexWidth int - tables map[string][]string - - versionMapMu sync.RWMutex - currentTableVersionMap map[schemaKey]uint64 - tableDMLIdxMapMu sync.Mutex - tableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap - schemaParserMapMu sync.RWMutex - schemaParserMap map[cloudstorage.SchemaPathKey]schemaParser -} - -func NewConsumer( - s3Storage storage.ExternalStorage, - tables map[string][]string, -) *Consumer { - return &Consumer{ - s3Storage: s3Storage, - fileExtension: ".csv", - dateSeparator: config.DateSeparatorDay.String(), - fileIndexWidth: config.DefaultFileIndexWidth, - tables: tables, - currentTableVersionMap: make(map[schemaKey]uint64, 0), - tableDMLIdxMap: make(map[cloudstorage.DmlPathKey]fileIndexKeyMap), - schemaParserMap: make(map[cloudstorage.SchemaPathKey]schemaParser), - } -} - -// getCurrentTableVersion returns the current table version for a given schema and table -func (c *Consumer) getCurrentTableVersion(schema, table string) uint64 { - tableKey := schemaKey{ - schema: schema, - table: table, - } - c.versionMapMu.RLock() - currentVersion := c.currentTableVersionMap[tableKey] - c.versionMapMu.RUnlock() - return currentVersion -} - -// updateCurrentTableVersion updates the current table version for a given schema and table -func (c *Consumer) updateCurrentTableVersion(schema, table string, version uint64) { - tableKey := schemaKey{ - schema: schema, - table: table, - } - c.versionMapMu.Lock() - c.currentTableVersionMap[tableKey] = version - c.versionMapMu.Unlock() -} - -// getSchemaParser returns the schema 
parser for a given schema and table version -func (c *Consumer) getSchemaParser(schema, table string, version uint64) (*parser.TableParser, error) { - schemaPathKey := cloudstorage.SchemaPathKey{ - Schema: schema, - Table: table, - TableVersion: version, - } - c.schemaParserMapMu.RLock() - schemaParser, ok := c.schemaParserMap[schemaPathKey] - c.schemaParserMapMu.RUnlock() - if !ok { - return nil, errors.Errorf("schema parser not found for schema: %s, table: %s, version: %d", schema, table, version) - } - return schemaParser.parser, nil -} - -// setSchemaParser sets the schema parser for a given schema and table version -func (c *Consumer) setSchemaParser(schemaPathKey cloudstorage.SchemaPathKey, filePath string, parser *parser.TableParser) { - c.schemaParserMapMu.Lock() - c.schemaParserMap[schemaPathKey] = schemaParser{ - path: filePath, - parser: parser, - } - c.schemaParserMapMu.Unlock() -} - -// downloadSchemaFiles downloads schema files concurrently for given schema path keys -func (c *Consumer) downloadSchemaFiles( - ctx context.Context, - newVersionPaths map[cloudstorage.SchemaPathKey]string, -) error { - eg, egCtx := errgroup.WithContext(ctx) - - log.Debug("starting concurrent schema file download", zap.Int("totalSchemas", len(newVersionPaths))) - for schemaPathKey, filePath := range newVersionPaths { - eg.Go(func() error { - content, err := c.s3Storage.ReadFile(egCtx, filePath) - if err != nil { - return errors.Annotatef(err, "failed to read schema file: %s", filePath) - } - - parser, err := parser.NewTableParser(schemaPathKey.GetKey(), content) - if err != nil { - return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) - } - - c.setSchemaParser(schemaPathKey, filePath, parser) - return nil - }) - } - if err := eg.Wait(); err != nil { - return errors.Trace(err) - } - return nil -} - -func (c *Consumer) discoverAndDownloadNewTableVersions( - ctx context.Context, - schema, table string, -) ([]uint64, error) { - 
currentVersion := c.getCurrentTableVersion(schema, table) - metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) - opt := &storage.WalkOption{ - SubDir: metaSubDir, - ObjPrefix: "schema_", - } - - var scanVersions []uint64 - newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) - versionSet := make(map[uint64]struct{}) - if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { - if !cloudstorage.IsSchemaFile(filePath) { - return nil - } - var schemaKey cloudstorage.SchemaPathKey - _, err := schemaKey.ParseSchemaFilePath(filePath) - if err != nil { - log.Error("failed to parse schema file path, skipping", - zap.String("path", filePath), - zap.Error(err)) - return nil - } - version := schemaKey.TableVersion - if version > currentVersion { - if _, exists := versionSet[version]; !exists { - versionSet[version] = struct{}{} - scanVersions = append(scanVersions, version) - } - } - - newVersionPaths[schemaKey] = filePath - return nil - }); err != nil { - return nil, errors.Trace(err) - } - - // download new version schema files concurrently - if err := c.downloadSchemaFiles(ctx, newVersionPaths); err != nil { - return nil, errors.Trace(err) - } - - if currentVersion > 0 { - scanVersions = append(scanVersions, currentVersion) - } - return scanVersions, nil -} - -func (c *Consumer) diffNewTableDMLIdxMap( - newTableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap, -) map[cloudstorage.DmlPathKey]fileIndexRange { - resMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) - c.tableDMLIdxMapMu.Lock() - defer c.tableDMLIdxMapMu.Unlock() - for newDMLPathKey, newFileIndexKeyMap := range newTableDMLIdxMap { - origFileIndexKeyMap, ok := c.tableDMLIdxMap[newDMLPathKey] - if !ok { - c.tableDMLIdxMap[newDMLPathKey] = newFileIndexKeyMap - resMap[newDMLPathKey] = make(fileIndexRange) - for indexKey, newEndVal := range newFileIndexKeyMap { - resMap[newDMLPathKey][indexKey] = indexRange{ - start: 1, - end: newEndVal, - } - } - continue - } - for 
indexKey, newEndVal := range newFileIndexKeyMap { - origEndVal := origFileIndexKeyMap[indexKey] - if newEndVal > origEndVal { - origFileIndexKeyMap[indexKey] = newEndVal - if _, ok := resMap[newDMLPathKey]; !ok { - resMap[newDMLPathKey] = make(fileIndexRange) - } - resMap[newDMLPathKey][indexKey] = indexRange{ - start: origEndVal + 1, - end: newEndVal, - } - } - } - } - return resMap -} - -func (c *Consumer) getNewFilesForSchemaPathKey( - ctx context.Context, - schema, table string, - version uint64, -) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { - schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version)) - opt := &storage.WalkOption{SubDir: schemaPrefix} - - // Save a snapshot of current tableDMLIdxMap - origDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap, len(c.tableDMLIdxMap)) - for k, v := range c.tableDMLIdxMap { - m := make(fileIndexKeyMap) - maps.Copy(m, v) - origDMLIdxMap[k] = m - } - - // Walk through all files in S3 storage - newTableDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) - if err := c.s3Storage.WalkDir(ctx, opt, func(path string, size int64) error { - // Try to parse DML file path if it matches the expected extension - if strings.HasSuffix(path, c.fileExtension) { - var dmlkey cloudstorage.DmlPathKey - fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, path) - if err != nil { - log.Error("failed to parse dml file path, skipping", - zap.String("path", path), - zap.Error(err)) - return nil - } - updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) - } - return nil - }); err != nil { - return nil, errors.Trace(err) - } - - // Calculate the difference to find new files - return c.diffNewTableDMLIdxMap(newTableDMLIdxMap), nil -} - -func (c *Consumer) downloadDMLFiles( - ctx context.Context, - schema, table string, - version uint64, -) (map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte, error) { - newFiles, err := c.getNewFilesForSchemaPathKey(ctx, schema, table, version) - 
if err != nil { - return nil, errors.Trace(err) - } - - if len(newFiles) == 0 { - return nil, nil - } - - result := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte) - - // Prepare all file download tasks - type downloadTask struct { - dmlPathKey cloudstorage.DmlPathKey - fileIndex cloudstorage.FileIndex - } - - var tasks []downloadTask - for dmlPathKey, fileRange := range newFiles { - for indexKey, indexRange := range fileRange { - log.Debug("prepare to download new dml file in index range", - zap.String("schema", dmlPathKey.Schema), - zap.String("table", dmlPathKey.Table), - zap.Uint64("version", dmlPathKey.TableVersion), - zap.Int64("partitionNum", dmlPathKey.PartitionNum), - zap.String("date", dmlPathKey.Date), - zap.String("dispatcherID", indexKey.DispatcherID), - zap.Bool("enableTableAcrossNodes", indexKey.EnableTableAcrossNodes), - zap.Uint64("startIndex", indexRange.start), - zap.Uint64("endIndex", indexRange.end)) - for i := indexRange.start; i <= indexRange.end; i++ { - tasks = append(tasks, downloadTask{ - dmlPathKey: dmlPathKey, - fileIndex: cloudstorage.FileIndex{ - FileIndexKey: indexKey, - Idx: i, - }, - }) - } - } - } - - log.Debug("starting concurrent DML file download", zap.Int("totalFiles", len(tasks))) - - // Concurrently download files - type fileContent struct { - dmlPathKey cloudstorage.DmlPathKey - indexKey cloudstorage.FileIndexKey - idx uint64 - content []byte - } - - fileContents := make(chan fileContent, len(tasks)) - eg, egCtx := errgroup.WithContext(ctx) - for _, task := range tasks { - eg.Go(func() error { - filePath := task.dmlPathKey.GenerateDMLFilePath( - &task.fileIndex, - c.fileExtension, - c.fileIndexWidth, - ) - - content, err := c.s3Storage.ReadFile(egCtx, filePath) - if err != nil { - return errors.Annotatef(err, "failed to read file: %s", filePath) - } - - // Channel writes are thread-safe, no mutex needed - fileContents <- fileContent{ - dmlPathKey: task.dmlPathKey, - indexKey: 
task.fileIndex.FileIndexKey, - idx: task.fileIndex.Idx, - content: content, - } - return nil - }) - } - if err := eg.Wait(); err != nil { - return nil, errors.Trace(err) - } - - // Close the channel to signal no more writes - close(fileContents) - - // Process the downloaded file contents - for fc := range fileContents { - if result[fc.dmlPathKey] == nil { - result[fc.dmlPathKey] = make(map[cloudstorage.FileIndexKey][][]byte) - } - result[fc.dmlPathKey][fc.indexKey] = append( - result[fc.dmlPathKey][fc.indexKey], - fc.content, - ) - } - - return result, nil -} - -func (c *Consumer) ConsumeNewFiles( - ctx context.Context, -) (map[cloudstorage.DmlPathKey]IncrementalData, error) { - var mu sync.Mutex - // Combine DML data and schema data into result - result := make(map[cloudstorage.DmlPathKey]IncrementalData) - eg, egCtx := errgroup.WithContext(ctx) - for schema, tables := range c.tables { - for _, table := range tables { - eg.Go(func() error { - newVersions, err := c.discoverAndDownloadNewTableVersions(egCtx, schema, table) - if err != nil { - return errors.Trace(err) - } - maxVersion := uint64(0) - for _, version := range newVersions { - maxVersion = max(maxVersion, version) - eg.Go(func() error { - dmlData, err := c.downloadDMLFiles(egCtx, schema, table, version) - if err != nil { - return errors.Trace(err) - } - parser, err := c.getSchemaParser(schema, table, version) - if err != nil { - return errors.Trace(err) - } - for dmlPathKey, dmlSlices := range dmlData { - mu.Lock() - result[dmlPathKey] = IncrementalData{ - DataContentSlices: dmlSlices, - Parser: parser, - } - mu.Unlock() - } - return nil - }) - } - c.updateCurrentTableVersion(schema, table, maxVersion) - return nil - }) - } - } - - if err := eg.Wait(); err != nil { - return nil, errors.Trace(err) - } - return result, nil -} diff --git a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go new file mode 100644 index 
0000000000..1cb5cca3e6 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go @@ -0,0 +1,475 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package consumer + +import ( + "context" + "fmt" + "path" + "strings" + "sync" + + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/parser" + "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + "github.com/pingcap/tidb/br/pkg/storage" + "go.uber.org/zap" + "golang.org/x/sync/errgroup" +) + +type CurrentTableVersion struct { + mu sync.RWMutex + currentTableVersionMap map[schemaKey]versionKey +} + +func NewCurrentTableVersion() *CurrentTableVersion { + return &CurrentTableVersion{ + currentTableVersionMap: make(map[schemaKey]versionKey), + } +} + +// GetCurrentTableVersion returns the current table version for a given schema and table +func (cvt *CurrentTableVersion) GetCurrentTableVersion(schema, table string) versionKey { + cvt.mu.RLock() + defer cvt.mu.RUnlock() + return cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] +} + +// UpdateCurrentTableVersion updates the current table version for a given schema and table +func (cvt *CurrentTableVersion) UpdateCurrentTableVersion(schema, table string, version versionKey) { + cvt.mu.Lock() + defer cvt.mu.Unlock() + cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] = version +} + +type SchemaParser struct { + mu sync.RWMutex + 
schemaParserMap map[cloudstorage.SchemaPathKey]schemaParser +} + +func NewSchemaParser() *SchemaParser { + return &SchemaParser{ + schemaParserMap: make(map[cloudstorage.SchemaPathKey]schemaParser), + } +} + +// GetSchemaParser returns the schema parser for a given schema and table version +func (sp *SchemaParser) GetSchemaParser(schema, table string, version uint64) (*parser.TableParser, error) { + schemaPathKey := cloudstorage.SchemaPathKey{ + Schema: schema, + Table: table, + TableVersion: version, + } + sp.mu.RLock() + schemaParser, ok := sp.schemaParserMap[schemaPathKey] + sp.mu.RUnlock() + if !ok { + return nil, errors.Errorf("schema parser not found for schema: %s, table: %s, version: %d", schema, table, version) + } + return schemaParser.parser, nil +} + +// SetSchemaParser sets the schema parser for a given schema and table version +func (sp *SchemaParser) SetSchemaParser(schemaPathKey cloudstorage.SchemaPathKey, filePath string, parser *parser.TableParser) { + sp.mu.Lock() + sp.schemaParserMap[schemaPathKey] = schemaParser{ + path: filePath, + parser: parser, + } + sp.mu.Unlock() +} + +// RemoveSchemaParserWithCondition removes the schema parser for a given condition +func (sp *SchemaParser) RemoveSchemaParserWithCondition(condition func(schemaPathKey cloudstorage.SchemaPathKey) bool) { + sp.mu.Lock() + for schemaPathkey := range sp.schemaParserMap { + if condition(schemaPathkey) { + delete(sp.schemaParserMap, schemaPathkey) + } + } + sp.mu.Unlock() +} + +type TableDMLIdx struct { + mu sync.Mutex + tableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap +} + +func NewTableDMLIdex() *TableDMLIdx { + return &TableDMLIdx{ + tableDMLIdxMap: make(map[cloudstorage.DmlPathKey]fileIndexKeyMap), + } +} + +func (t *TableDMLIdx) DiffNewTableDMLIdxMap( + newTableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap, +) map[cloudstorage.DmlPathKey]fileIndexRange { + resMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) + t.mu.Lock() + defer t.mu.Unlock() + for 
newDMLPathKey, newFileIndexKeyMap := range newTableDMLIdxMap { + origFileIndexKeyMap, ok := t.tableDMLIdxMap[newDMLPathKey] + if !ok { + t.tableDMLIdxMap[newDMLPathKey] = newFileIndexKeyMap + resMap[newDMLPathKey] = make(fileIndexRange) + for indexKey, newEndVal := range newFileIndexKeyMap { + resMap[newDMLPathKey][indexKey] = indexRange{ + start: 1, + end: newEndVal, + } + } + continue + } + for indexKey, newEndVal := range newFileIndexKeyMap { + origEndVal := origFileIndexKeyMap[indexKey] + if newEndVal > origEndVal { + origFileIndexKeyMap[indexKey] = newEndVal + if _, ok := resMap[newDMLPathKey]; !ok { + resMap[newDMLPathKey] = make(fileIndexRange) + } + } + resMap[newDMLPathKey][indexKey] = indexRange{ + start: origEndVal + 1, + end: newEndVal, + } + } + } + return resMap +} + +type S3Consumer struct { + s3Storage storage.ExternalStorage + fileExtension string + dateSeparator string + fileIndexWidth int + tables map[string][]string + + currentTableVersion *CurrentTableVersion + tableDMLIdx *TableDMLIdx + schemaParser *SchemaParser +} + +func NewS3Consumer( + s3Storage storage.ExternalStorage, + tables map[string][]string, +) *S3Consumer { + return &S3Consumer{ + s3Storage: s3Storage, + fileExtension: ".json", + dateSeparator: config.DateSeparatorDay.String(), + fileIndexWidth: config.DefaultFileIndexWidth, + tables: tables, + + currentTableVersion: NewCurrentTableVersion(), + tableDMLIdx: NewTableDMLIdex(), + schemaParser: NewSchemaParser(), + } +} + +// downloadSchemaFiles downloads schema files concurrently for given schema path keys +func (c *S3Consumer) downloadSchemaFiles( + ctx context.Context, + newVersionPaths map[cloudstorage.SchemaPathKey]string, +) error { + eg, egCtx := errgroup.WithContext(ctx) + + log.Debug("starting concurrent schema file download", zap.Int("totalSchemas", len(newVersionPaths))) + for schemaPathKey, filePath := range newVersionPaths { + eg.Go(func() error { + content, err := c.s3Storage.ReadFile(egCtx, filePath) + if err != nil { 
+ return errors.Annotatef(err, "failed to read schema file: %s", filePath) + } + + // Use canal-json decoder for S3 sink with .json file extension + parser, err := parser.NewTableParserWithFormat(schemaPathKey.GetKey(), content, config.ProtocolCanalJSON) + if err != nil { + return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) + } + + c.schemaParser.SetSchemaParser(schemaPathKey, filePath, parser) + return nil + }) + } + if err := eg.Wait(); err != nil { + return errors.Trace(err) + } + return nil +} + +func (c *S3Consumer) discoverAndDownloadNewTableVersions( + ctx context.Context, + schema, table string, +) ([]versionKey, error) { + currentVersion := c.currentTableVersion.GetCurrentTableVersion(schema, table) + metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) + opt := &storage.WalkOption{ + SubDir: metaSubDir, + ObjPrefix: "schema_", + // TODO: StartAfter: currentVersion.versionPath, + } + + var scanVersions []versionKey + newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { + if !cloudstorage.IsSchemaFile(filePath) { + return nil + } + var schemaKey cloudstorage.SchemaPathKey + _, err := schemaKey.ParseSchemaFilePath(filePath) + if err != nil { + log.Error("failed to parse schema file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } + version := schemaKey.TableVersion + if version > currentVersion.version { + if _, exists := newVersionPaths[schemaKey]; !exists { + scanVersions = append(scanVersions, versionKey{ + version: version, + versionPath: filePath, + }) + } + newVersionPaths[schemaKey] = filePath + } + return nil + }); err != nil { + return nil, errors.Trace(err) + } + + // download new version schema files concurrently + if err := c.downloadSchemaFiles(ctx, newVersionPaths); err != nil { + return nil, errors.Trace(err) + } + + if currentVersion.version > 0 { + scanVersions = 
append(scanVersions, currentVersion) + } + return scanVersions, nil +} + +func (c *S3Consumer) getNewFilesForSchemaPathKey( + ctx context.Context, + schema, table string, + version *versionKey, +) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { + schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version.version)) + opt := &storage.WalkOption{ + SubDir: schemaPrefix, + // TODO: StartAfter: version.dataPath, + } + + newTableDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) + maxFilePath := "" + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { + // Try to parse DML file path if it matches the expected extension + if strings.HasSuffix(filePath, c.fileExtension) { + var dmlkey cloudstorage.DmlPathKey + fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, filePath) + if err != nil { + log.Error("failed to parse dml file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } + updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) + } + maxFilePath = filePath + return nil + }); err != nil { + return nil, errors.Trace(err) + } + + version.dataPath = maxFilePath + return c.tableDMLIdx.DiffNewTableDMLIdxMap(newTableDMLIdxMap), nil +} + +func (c *S3Consumer) downloadDMLFiles( + ctx context.Context, + newFiles map[cloudstorage.DmlPathKey]fileIndexRange, +) (map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte, error) { + if len(newFiles) == 0 { + return nil, nil + } + + result := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte) + type downloadTask struct { + dmlPathKey cloudstorage.DmlPathKey + fileIndex cloudstorage.FileIndex + } + + var tasks []downloadTask + for dmlPathKey, fileRange := range newFiles { + for indexKey, indexRange := range fileRange { + log.Debug("prepare to download new dml file in index range", + zap.String("schema", dmlPathKey.Schema), + zap.String("table", dmlPathKey.Table), + zap.Uint64("version", 
dmlPathKey.TableVersion), + zap.Int64("partitionNum", dmlPathKey.PartitionNum), + zap.String("date", dmlPathKey.Date), + zap.String("dispatcherID", indexKey.DispatcherID), + zap.Bool("enableTableAcrossNodes", indexKey.EnableTableAcrossNodes), + zap.Uint64("startIndex", indexRange.start), + zap.Uint64("endIndex", indexRange.end)) + for i := indexRange.start; i <= indexRange.end; i++ { + tasks = append(tasks, downloadTask{ + dmlPathKey: dmlPathKey, + fileIndex: cloudstorage.FileIndex{ + FileIndexKey: indexKey, + Idx: i, + }, + }) + } + } + } + + log.Debug("starting concurrent DML file download", zap.Int("totalFiles", len(tasks))) + + // Concurrently download files + type fileContent struct { + dmlPathKey cloudstorage.DmlPathKey + indexKey cloudstorage.FileIndexKey + idx uint64 + content []byte + } + + fileContents := make(chan fileContent, len(tasks)) + eg, egCtx := errgroup.WithContext(ctx) + for _, task := range tasks { + eg.Go(func() error { + filePath := task.dmlPathKey.GenerateDMLFilePath( + &task.fileIndex, + c.fileExtension, + c.fileIndexWidth, + ) + + content, err := c.s3Storage.ReadFile(egCtx, filePath) + if err != nil { + return errors.Annotatef(err, "failed to read file: %s", filePath) + } + + // Channel writes are thread-safe, no mutex needed + fileContents <- fileContent{ + dmlPathKey: task.dmlPathKey, + indexKey: task.fileIndex.FileIndexKey, + idx: task.fileIndex.Idx, + content: content, + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + + // Close the channel to signal no more writes + close(fileContents) + + // Process the downloaded file contents + for fc := range fileContents { + if result[fc.dmlPathKey] == nil { + result[fc.dmlPathKey] = make(map[cloudstorage.FileIndexKey][][]byte) + } + result[fc.dmlPathKey][fc.indexKey] = append( + result[fc.dmlPathKey][fc.indexKey], + fc.content, + ) + } + + return result, nil +} + +func (c *S3Consumer) downloadNewFilesWithVersions( + ctx context.Context, + 
schema, table string, + scanVersions []versionKey, + consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *parser.TableParser), +) error { + var maxVersion *versionKey + eg, egCtx := errgroup.WithContext(ctx) + for _, version := range scanVersions { + if maxVersion == nil || maxVersion.version < version.version { + maxVersion = &version + } + eg.Go(func() error { + newFiles, err := c.getNewFilesForSchemaPathKey(egCtx, schema, table, &version) + if err != nil { + return errors.Trace(err) + } + dmlData, err := c.downloadDMLFiles(egCtx, newFiles) + if err != nil { + return errors.Trace(err) + } + parser, err := c.schemaParser.GetSchemaParser(schema, table, version.version) + if err != nil { + return errors.Trace(err) + } + for dmlPathKey, dmlSlices := range dmlData { + consumeFunc(dmlPathKey, dmlSlices, parser) + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return errors.Trace(err) + } + if maxVersion != nil { + c.currentTableVersion.UpdateCurrentTableVersion(schema, table, *maxVersion) + } + return nil +} + +func (c *S3Consumer) ConsumeNewFiles(ctx context.Context) (map[cloudstorage.DmlPathKey]IncrementalData, error) { + var mu sync.Mutex + // Combine DML data and schema data into result + result := make(map[cloudstorage.DmlPathKey]IncrementalData) + eg, egCtx := errgroup.WithContext(ctx) + for schema, tables := range c.tables { + for _, table := range tables { + eg.Go(func() error { + scanVersions, err := c.discoverAndDownloadNewTableVersions(egCtx, schema, table) + if err != nil { + return errors.Trace(err) + } + if err := c.downloadNewFilesWithVersions( + egCtx, schema, table, scanVersions, + func( + dmlPathKey cloudstorage.DmlPathKey, + dmlSlices map[cloudstorage.FileIndexKey][][]byte, + parser *parser.TableParser, + ) { + mu.Lock() + result[dmlPathKey] = IncrementalData{ + DataContentSlices: dmlSlices, + Parser: parser, + } + mu.Unlock() + }, + ); err != nil { + return errors.Trace(err) + 
} + + return nil + }) + } + } + + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + return result, nil +} diff --git a/cmd/multi-cluster-consistency-checker/parser/decoder.go b/cmd/multi-cluster-consistency-checker/parser/decoder.go index 97e2d0eb3f..2c768927e9 100644 --- a/cmd/multi-cluster-consistency-checker/parser/decoder.go +++ b/cmd/multi-cluster-consistency-checker/parser/decoder.go @@ -19,6 +19,7 @@ import ( commonType "github.com/pingcap/ticdc/pkg/common" "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/codec/canal" codecCommon "github.com/pingcap/ticdc/pkg/sink/codec/common" "github.com/pingcap/ticdc/pkg/sink/codec/csv" ) @@ -51,3 +52,30 @@ func (d *csvDecoder) NewDecoder(ctx context.Context, tableInfo *commonType.Table } return decoder, nil } + +func defaultCanalJSONCodecConfig(protocol config.Protocol) *codecCommon.Config { + codecConfig := codecCommon.NewConfig(protocol) + // Always enable tidb extension for canal-json protocol + // because we need to get the commit ts from the extension field. 
+ codecConfig.EnableTiDBExtension = true + return codecConfig +} + +type canalJSONDecoder struct { + codecConfig *codecCommon.Config +} + +func NewCanalJSONDecoder() *canalJSONDecoder { + codecConfig := defaultCanalJSONCodecConfig(config.ProtocolCanalJSON) + return &canalJSONDecoder{ + codecConfig: codecConfig, + } +} + +func (d *canalJSONDecoder) NewDecoder(ctx context.Context, tableInfo *commonType.TableInfo, content []byte) (codecCommon.Decoder, error) { + // For S3 sink with canal-json format, use NewTxnDecoder + // which is designed for batch decoding from storage + decoder := canal.NewTxnDecoder(d.codecConfig) + decoder.AddKeyValue(nil, content) + return decoder, nil +} diff --git a/cmd/multi-cluster-consistency-checker/parser/parser.go b/cmd/multi-cluster-consistency-checker/parser/parser.go index 5af64a07db..c342a0a1d8 100644 --- a/cmd/multi-cluster-consistency-checker/parser/parser.go +++ b/cmd/multi-cluster-consistency-checker/parser/parser.go @@ -24,6 +24,7 @@ import ( "github.com/pingcap/ticdc/pkg/common" commonType "github.com/pingcap/ticdc/pkg/common" "github.com/pingcap/ticdc/pkg/common/event" + "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" codecCommon "github.com/pingcap/ticdc/pkg/sink/codec/common" @@ -58,19 +59,34 @@ func getPkColumnOffset(tableInfo *commonType.TableInfo) (map[int64]int, error) { return pkColumnOffsets, nil } +type decoderFactory interface { + NewDecoder(ctx context.Context, tableInfo *commonType.TableInfo, content []byte) (codecCommon.Decoder, error) +} + type TableParser struct { tableKey string tableInfo *common.TableInfo pkColumnOffsets map[int64]int - csvDecoder *csvDecoder + decoderFactory decoderFactory } func NewTableParser(tableKey string, content []byte) (*TableParser, error) { + return NewTableParserWithFormat(tableKey, content, config.ProtocolCsv) +} + +func NewTableParserWithFormat(tableKey string, content []byte, protocol config.Protocol) 
(*TableParser, error) { tableParser := &TableParser{} if err := tableParser.parseTableInfo(tableKey, content); err != nil { return nil, errors.Trace(err) } - tableParser.csvDecoder = NewCsvDecoder() + switch protocol { + case config.ProtocolCsv: + tableParser.decoderFactory = NewCsvDecoder() + case config.ProtocolCanalJSON: + tableParser.decoderFactory = NewCanalJSONDecoder() + default: + return nil, errors.Errorf("unsupported protocol: %s", protocol) + } return tableParser, nil } @@ -186,7 +202,7 @@ func (pt *TableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Reco func (pt *TableParser) DecodeFiles(ctx context.Context, content []byte) ([]*utils.Record, error) { records := make([]*utils.Record, 0) - decoder, err := pt.csvDecoder.NewDecoder(ctx, pt.tableInfo, content) + decoder, err := pt.decoderFactory.NewDecoder(ctx, pt.tableInfo, content) if err != nil { return nil, errors.Trace(err) } diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder.go b/cmd/multi-cluster-consistency-checker/recorder/recorder.go index 9ec68139e1..536d73974d 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder.go @@ -47,8 +47,10 @@ func (r *Recorder) RecordTimeWindow(timeWindowData map[string]advancer.TimeWindo zap.Uint64("window right boundary", timeWindow.RightBoundary), zap.Any("checkpoint ts", timeWindow.CheckpointTs)) } - if err := r.flushReport(report); err != nil { - return errors.Trace(err) + if report.NeedFlush() { + if err := r.flushReport(report); err != nil { + return errors.Trace(err) + } } return nil } diff --git a/cmd/multi-cluster-consistency-checker/recorder/types.go b/cmd/multi-cluster-consistency-checker/recorder/types.go index d1786c8c9d..c587d23a1a 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go @@ -64,6 +64,8 @@ type ClusterReport struct { DataLossItems []DataLossItem 
DataRedundantItems []DataRedundantItem LWWViolationItems []LWWViolationItem + + needFlush bool } func NewClusterReport(clusterID string) *ClusterReport { @@ -72,6 +74,7 @@ func NewClusterReport(clusterID string) *ClusterReport { DataLossItems: make([]DataLossItem, 0), DataRedundantItems: make([]DataRedundantItem, 0), LWWViolationItems: make([]LWWViolationItem, 0), + needFlush: false, } } @@ -83,6 +86,7 @@ func (r *ClusterReport) AddDataLossItem(downstreamClusterID, pk string, originTS CommitTS: commitTS, Inconsistent: inconsistent, }) + r.needFlush = true } func (r *ClusterReport) AddDataRedundantItem(pk string, originTS, commitTS uint64) { @@ -91,6 +95,7 @@ func (r *ClusterReport) AddDataRedundantItem(pk string, originTS, commitTS uint6 OriginTS: originTS, CommitTS: commitTS, }) + r.needFlush = true } func (r *ClusterReport) AddLWWViolationItem( @@ -105,42 +110,59 @@ func (r *ClusterReport) AddLWWViolationItem( OriginTS: originTS, CommitTS: commitTS, }) + r.needFlush = true } type Report struct { Round uint64 ClusterReports map[string]*ClusterReport + needFlush bool } func NewReport(round uint64) *Report { return &Report{ Round: round, ClusterReports: make(map[string]*ClusterReport), + needFlush: false, } } func (r *Report) AddClusterReport(clusterID string, clusterReport *ClusterReport) { r.ClusterReports[clusterID] = clusterReport + r.needFlush = r.needFlush || clusterReport.needFlush } func (r *Report) MarshalReport() string { var reportMsg strings.Builder - reportMsg.WriteString(fmt.Sprintf("round: %d\n", r.Round)) + fmt.Fprintf(&reportMsg, "round: %d\n", r.Round) for clusterID, clusterReport := range r.ClusterReports { - reportMsg.WriteString(fmt.Sprintf("\n[cluster: %s]\n", clusterID)) - reportMsg.WriteString(fmt.Sprintf(" - [data loss items: %d]\n", len(clusterReport.DataLossItems))) - for _, dataLossItem := range clusterReport.DataLossItems { - reportMsg.WriteString(fmt.Sprintf(" - [%s]\n", dataLossItem.String())) + if !clusterReport.needFlush { + continue 
+ } + fmt.Fprintf(&reportMsg, "\n[cluster: %s]\n", clusterID) + if len(clusterReport.DataLossItems) > 0 { + fmt.Fprintf(&reportMsg, " - [data loss items: %d]\n", len(clusterReport.DataLossItems)) + for _, dataLossItem := range clusterReport.DataLossItems { + fmt.Fprintf(&reportMsg, " - [%s]\n", dataLossItem.String()) + } } - reportMsg.WriteString(fmt.Sprintf(" - [data redundant items: %d]\n", len(clusterReport.DataRedundantItems))) - for _, dataRedundantItem := range clusterReport.DataRedundantItems { - reportMsg.WriteString(fmt.Sprintf(" - [%s]\n", dataRedundantItem.String())) + if len(clusterReport.DataRedundantItems) > 0 { + fmt.Fprintf(&reportMsg, " - [data redundant items: %d]\n", len(clusterReport.DataRedundantItems)) + for _, dataRedundantItem := range clusterReport.DataRedundantItems { + fmt.Fprintf(&reportMsg, " - [%s]\n", dataRedundantItem.String()) + } } - reportMsg.WriteString(fmt.Sprintf(" - [lww violation items: %d]\n", len(clusterReport.LWWViolationItems))) - for _, lwwViolationItem := range clusterReport.LWWViolationItems { - reportMsg.WriteString(fmt.Sprintf(" - [%s]\n", lwwViolationItem.String())) + if len(clusterReport.LWWViolationItems) > 0 { + fmt.Fprintf(&reportMsg, " - [lww violation items: %d]\n", len(clusterReport.LWWViolationItems)) + for _, lwwViolationItem := range clusterReport.LWWViolationItems { + fmt.Fprintf(&reportMsg, " - [%s]\n", lwwViolationItem.String()) + } } } reportMsg.WriteString("\n") return reportMsg.String() } + +func (r *Report) NeedFlush() bool { + return r.needFlush +} diff --git a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go index bd4e446ee7..2d6229ce7d 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go @@ -24,7 +24,7 @@ import ( type S3Watcher struct { checkpointWatcher *CheckpointWatcher - consumer *consumer.Consumer + consumer *consumer.S3Consumer } 
func NewS3Watcher( @@ -32,7 +32,7 @@ func NewS3Watcher( s3Storage storage.ExternalStorage, tables map[string][]string, ) *S3Watcher { - consumer := consumer.NewConsumer(s3Storage, tables) + consumer := consumer.NewS3Consumer(s3Storage, tables) return &S3Watcher{ checkpointWatcher: checkpointWatcher, consumer: consumer, From bd9ec7430db75b39a96e7cfde92eab483d3d2bb2 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 4 Feb 2026 12:44:31 +0800 Subject: [PATCH 12/23] support checkpoint Signed-off-by: Jianjun Liao --- .../advancer/time_window_advancer.go | 112 ++++-- .../advancer/time_window_advancer_test.go | 51 +++ .../checker/checker.go | 110 +++++- .../checker/checker_test.go | 348 ++++++++++++++++++ .../config/config.example.toml | 4 +- .../config/config.go | 6 +- .../config/config_test.go | 244 ++++++++++++ .../consumer/consumer.go | 15 +- .../consumer/s3_consumer.go | 279 ++++++++++++-- .../recorder/recorder.go | 83 ++++- .../recorder/types.go | 130 +++++-- cmd/multi-cluster-consistency-checker/task.go | 9 +- .../{parser => utils}/decoder.go | 2 +- .../{parser => utils}/parser.go | 19 +- .../utils/types.go | 46 +++ .../watcher/s3_watcher.go | 16 +- 16 files changed, 1320 insertions(+), 154 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go create mode 100644 cmd/multi-cluster-consistency-checker/checker/checker_test.go create mode 100644 cmd/multi-cluster-consistency-checker/config/config_test.go rename cmd/multi-cluster-consistency-checker/{parser => utils}/decoder.go (99%) rename cmd/multi-cluster-consistency-checker/{parser => utils}/parser.go (94%) diff --git a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go index b69287f0e6..d2ed64a4cf 100644 --- a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go @@ -19,7 
+19,8 @@ import ( "sync" "github.com/pingcap/log" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/consumer" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" @@ -29,35 +30,12 @@ import ( "golang.org/x/sync/errgroup" ) -// TimeWindow is the time window of the cluster, including the left boundary, right boundary and checkpoint ts -// Assert 1: LeftBoundary < CheckpointTs < RightBoundary -// Assert 2: The other cluster's checkpoint timestamp of next time window should be larger than the PDTimestampAfterTimeWindow saved in this cluster's time window -// Assert 3: CheckpointTs of this cluster should be larger than other clusters' RightBoundary of previous time window -// Assert 4: RightBoundary of this cluster should be larger than other clusters' CheckpointTs of this time window -type TimeWindow struct { - LeftBoundary uint64 - RightBoundary uint64 - // CheckpointTs is the checkpoint timestamp for each changefeed from upstream cluster, - // mapping from downstream cluster ID to the checkpoint timestamp - CheckpointTs map[string]uint64 - // PDTimestampAfterTimeWindow is the max PD timestamp after the time window for each downstream cluster, - // mapping from upstream cluster ID to the max PD timestamp - PDTimestampAfterTimeWindow map[string]uint64 - // NextMinLeftBoundary is the minimum left boundary of the next time window for the cluster - NextMinLeftBoundary uint64 -} - -type TimeWindowData struct { - TimeWindow - Data map[cloudstorage.DmlPathKey]consumer.IncrementalData -} - type TimeWindowAdvancer struct { // round is the current round of the time window round uint64 // timeWindowTriplet is the triplet of adjacent time windows, mapping from cluster ID to the triplet - timeWindowTriplet 
map[string][3]TimeWindow + timeWindowTriplet map[string][3]utils.TimeWindow // checkpointWatcher is the Active-Active checkpoint watcher for each cluster, // mapping from cluster ID to the downstream cluster ID to the checkpoint watcher @@ -71,21 +49,72 @@ type TimeWindowAdvancer struct { } func NewTimeWindowAdvancer( + ctx context.Context, checkpointWatchers map[string]map[string]*watcher.CheckpointWatcher, s3Watchers map[string]*watcher.S3Watcher, pdClients map[string]pd.Client, -) *TimeWindowAdvancer { - timeWindowTriplet := make(map[string][3]TimeWindow) + checkpoint *recorder.Checkpoint, +) (*TimeWindowAdvancer, map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, error) { + timeWindowTriplet := make(map[string][3]utils.TimeWindow) for clusterID := range pdClients { - timeWindowTriplet[clusterID] = [3]TimeWindow{} + timeWindowTriplet[clusterID] = [3]utils.TimeWindow{} } - return &TimeWindowAdvancer{ + advancer := &TimeWindowAdvancer{ round: 0, timeWindowTriplet: timeWindowTriplet, checkpointWatcher: checkpointWatchers, s3Watcher: s3Watchers, pdClients: pdClients, } + newDataMap, err := advancer.initializeFromCheckpoint(ctx, checkpoint) + if err != nil { + return nil, nil, errors.Trace(err) + } + return advancer, newDataMap, nil +} + +func (t *TimeWindowAdvancer) initializeFromCheckpoint( + ctx context.Context, + checkpoint *recorder.Checkpoint, +) (map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, error) { + if checkpoint == nil { + return nil, nil + } + if checkpoint.CheckpointItems[2] == nil { + return nil, nil + } + t.round = checkpoint.CheckpointItems[2].Round + 1 + for clusterID := range t.timeWindowTriplet { + newTimeWindows := [3]utils.TimeWindow{} + newTimeWindows[2] = checkpoint.CheckpointItems[2].ClusterInfo[clusterID].TimeWindow + if checkpoint.CheckpointItems[1] != nil { + newTimeWindows[1] = checkpoint.CheckpointItems[1].ClusterInfo[clusterID].TimeWindow + } + if checkpoint.CheckpointItems[0] != nil { + newTimeWindows[0] = 
checkpoint.CheckpointItems[0].ClusterInfo[clusterID].TimeWindow + } + t.timeWindowTriplet[clusterID] = newTimeWindows + } + + var mu sync.Mutex + newDataMap := make(map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData) + eg, egCtx := errgroup.WithContext(ctx) + for clusterID, s3Watcher := range t.s3Watcher { + eg.Go(func() error { + newData, err := s3Watcher.InitializeFromCheckpoint(egCtx, clusterID, checkpoint) + if err != nil { + return errors.Trace(err) + } + mu.Lock() + newDataMap[clusterID] = newData + mu.Unlock() + return nil + }) + } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + return newDataMap, nil } // AdvanceTimeWindow advances the time window for each cluster. Here is the steps: @@ -107,7 +136,7 @@ func NewTimeWindowAdvancer( // For any cluster, the time window should be updated to the new time window. func (t *TimeWindowAdvancer) AdvanceTimeWindow( pctx context.Context, -) (map[string]TimeWindowData, error) { +) (map[string]utils.TimeWindowData, error) { log.Debug("advance time window", zap.Uint64("round", t.round)) // mapping from upstream cluster ID to the downstream cluster ID to the min checkpoint timestamp minCheckpointTsMap := make(map[string]map[string]uint64) @@ -123,7 +152,7 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( } var lock sync.Mutex - newTimeWindow := make(map[string]TimeWindow) + newTimeWindow := make(map[string]utils.TimeWindow) maxPDTimestampAfterCheckpointTs := make(map[string]uint64) // for cluster ID, the max checkpoint timestamp is maximum of checkpoint from cluster to other clusters and checkpoint from other clusters to cluster maxCheckpointTs := make(map[string]uint64) @@ -163,7 +192,8 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( } // Update the time window for each cluster - newDataMap := make(map[string]map[cloudstorage.DmlPathKey]consumer.IncrementalData) + newDataMap := make(map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData) + maxVersionMap := 
make(map[string]map[utils.SchemaTableKey]utils.VersionKey) eg, ctx = errgroup.WithContext(pctx) for clusterID, triplet := range t.timeWindowTriplet { minTimeWindowRightBoundary := max(maxCheckpointTs[clusterID], maxPDTimestampAfterCheckpointTs[clusterID], triplet[2].NextMinLeftBoundary) @@ -173,7 +203,7 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( if err != nil { return errors.Trace(err) } - newData, err := s3Watcher.ConsumeNewFiles(ctx) + newData, maxClusterVersionMap, err := s3Watcher.ConsumeNewFiles(ctx) if err != nil { return errors.Trace(err) } @@ -187,6 +217,7 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( } lock.Lock() newDataMap[clusterID] = newData + maxVersionMap[clusterID] = maxClusterVersionMap timeWindow := newTimeWindow[clusterID] timeWindow.LeftBoundary = triplet[2].RightBoundary timeWindow.RightBoundary = s3CheckpointTs @@ -203,10 +234,10 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( } t.updateTimeWindow(newTimeWindow) t.round += 1 - return newTimeWindowData(newTimeWindow, newDataMap), nil + return newTimeWindowData(newTimeWindow, newDataMap, maxVersionMap), nil } -func (t *TimeWindowAdvancer) updateTimeWindow(newTimeWindow map[string]TimeWindow) { +func (t *TimeWindowAdvancer) updateTimeWindow(newTimeWindow map[string]utils.TimeWindow) { for clusterID, timeWindow := range newTimeWindow { triplet := t.timeWindowTriplet[clusterID] triplet[0] = triplet[1] @@ -254,12 +285,17 @@ func (t *TimeWindowAdvancer) getPDTsFromOtherClusters(pctx context.Context, clus return pdtsos, nil } -func newTimeWindowData(newTimeWindow map[string]TimeWindow, newDataMap map[string]map[cloudstorage.DmlPathKey]consumer.IncrementalData) map[string]TimeWindowData { - timeWindowDatas := make(map[string]TimeWindowData) +func newTimeWindowData( + newTimeWindow map[string]utils.TimeWindow, + newDataMap map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, + maxVersionMap map[string]map[utils.SchemaTableKey]utils.VersionKey, +) map[string]utils.TimeWindowData 
{ + timeWindowDatas := make(map[string]utils.TimeWindowData) for clusterID, timeWindow := range newTimeWindow { - timeWindowDatas[clusterID] = TimeWindowData{ + timeWindowDatas[clusterID] = utils.TimeWindowData{ TimeWindow: timeWindow, Data: newDataMap[clusterID], + MaxVersion: maxVersionMap[clusterID], } } return timeWindowDatas diff --git a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go new file mode 100644 index 0000000000..ed9fdb54cf --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go @@ -0,0 +1,51 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package advancer + +import ( + "context" + "testing" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" + "github.com/stretchr/testify/require" + pd "github.com/tikv/pd/client" +) + +func TestNewTimeWindowAdvancer(t *testing.T) { + t.Parallel() + + t.Run("create time window advancer", func(t *testing.T) { + t.Parallel() + checkpointWatchers := map[string]map[string]*watcher.CheckpointWatcher{ + "cluster1": {}, + "cluster2": {}, + } + s3Watchers := map[string]*watcher.S3Watcher{ + "cluster1": nil, + "cluster2": nil, + } + pdClients := map[string]pd.Client{ + "cluster1": nil, + "cluster2": nil, + } + + advancer, _, err := NewTimeWindowAdvancer(context.Background(), checkpointWatchers, s3Watchers, pdClients, nil) + require.NoError(t, err) + require.NotNil(t, advancer) + require.Equal(t, uint64(0), advancer.round) + require.Len(t, advancer.timeWindowTriplet, 2) + require.Contains(t, advancer.timeWindowTriplet, "cluster1") + require.Contains(t, advancer.timeWindowTriplet, "cluster2") + }) +} diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index b27c95035a..02f9885ea8 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -18,11 +18,11 @@ import ( "sort" "github.com/pingcap/log" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" "go.uber.org/zap" ) @@ -43,6 +43,25 @@ func newClusterViolationChecker(clusterID string) *clusterViolationChecker { } } +func (c *clusterViolationChecker) NewRecordFromCheckpoint(record *utils.Record, previous int) { + entry, exists 
:= c.twoPreviousTimeWindowKeyVersionCache[record.Pk] + if !exists { + c.twoPreviousTimeWindowKeyVersionCache[record.Pk] = versionCacheEntry{ + previous: previous, + cdcVersion: record.CdcVersion, + } + return + } + entryCompareTs := entry.cdcVersion.GetCompareTs() + recordCompareTs := record.GetCompareTs() + if entryCompareTs < recordCompareTs { + c.twoPreviousTimeWindowKeyVersionCache[record.Pk] = versionCacheEntry{ + previous: previous, + cdcVersion: record.CdcVersion, + } + } +} + func (c *clusterViolationChecker) Check(r *utils.Record, report *recorder.ClusterReport) { entry, exists := c.twoPreviousTimeWindowKeyVersionCache[r.Pk] if !exists { @@ -163,7 +182,58 @@ func newClusterDataChecker(clusterID string) *clusterDataChecker { } } -func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow advancer.TimeWindow) error { +func (cd *clusterDataChecker) InitializeFromCheckpoint( + ctx context.Context, + checkpointDataMap map[cloudstorage.DmlPathKey]utils.IncrementalData, + checkpoint *recorder.Checkpoint, +) error { + if checkpoint == nil { + return nil + } + if checkpoint.CheckpointItems[2] == nil { + return nil + } + clusterInfo := checkpoint.CheckpointItems[2].ClusterInfo[cd.clusterID] + cd.rightBoundary = clusterInfo.TimeWindow.RightBoundary + cd.timeWindowDataCaches[2] = newTimeWindowDataCache( + clusterInfo.TimeWindow.LeftBoundary, clusterInfo.TimeWindow.RightBoundary, clusterInfo.TimeWindow.CheckpointTs) + if checkpoint.CheckpointItems[1] != nil { + clusterInfo = checkpoint.CheckpointItems[1].ClusterInfo[cd.clusterID] + cd.timeWindowDataCaches[1] = newTimeWindowDataCache( + clusterInfo.TimeWindow.LeftBoundary, clusterInfo.TimeWindow.RightBoundary, clusterInfo.TimeWindow.CheckpointTs) + } + for _, incrementalData := range checkpointDataMap { + for _, contents := range incrementalData.DataContentSlices { + for _, content := range contents { + records, err := incrementalData.Parser.DecodeFiles(ctx, content) + if err != nil { + return errors.Trace(err) 
+ } + for _, record := range records { + cd.newRecordFromCheckpoint(record) + } + } + } + } + return nil +} + +func (cd *clusterDataChecker) newRecordFromCheckpoint(record *utils.Record) { + if record.CommitTs > cd.rightBoundary { + cd.overDataCaches = append(cd.overDataCaches, record) + return + } + if cd.timeWindowDataCaches[2].leftBoundary < record.CommitTs { + cd.timeWindowDataCaches[2].NewRecord(record) + cd.clusterViolationChecker.NewRecordFromCheckpoint(record, 1) + + } else if cd.timeWindowDataCaches[1].leftBoundary < record.CommitTs { + cd.timeWindowDataCaches[1].NewRecord(record) + cd.clusterViolationChecker.NewRecordFromCheckpoint(record, 2) + } +} + +func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow utils.TimeWindow) error { if timeWindow.LeftBoundary != cd.rightBoundary { return errors.Errorf("time window left boundary(%d) mismatch right boundary ts(%d)", timeWindow.LeftBoundary, cd.rightBoundary) } @@ -350,18 +420,35 @@ func (cd *clusterDataChecker) GetReport() *recorder.ClusterReport { type DataChecker struct { round uint64 + checkableRound uint64 clusterDataCheckers map[string]*clusterDataChecker } -func NewDataChecker(clusterConfig map[string]config.ClusterConfig) *DataChecker { +func NewDataChecker(ctx context.Context, clusterConfig map[string]config.ClusterConfig, checkpointDataMap map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, checkpoint *recorder.Checkpoint) *DataChecker { clusterDataChecker := make(map[string]*clusterDataChecker) for clusterID := range clusterConfig { clusterDataChecker[clusterID] = newClusterDataChecker(clusterID) } - return &DataChecker{ + checker := &DataChecker{ round: 0, + checkableRound: 0, clusterDataCheckers: clusterDataChecker, } + checker.initializeFromCheckpoint(ctx, checkpointDataMap, checkpoint) + return checker +} + +func (c *DataChecker) initializeFromCheckpoint(ctx context.Context, checkpointDataMap map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, checkpoint 
*recorder.Checkpoint) { + if checkpoint == nil { + return + } + if checkpoint.CheckpointItems[2] == nil { + return + } + c.round = checkpoint.CheckpointItems[2].Round + 1 + for _, clusterDataChecker := range c.clusterDataCheckers { + clusterDataChecker.InitializeFromCheckpoint(ctx, checkpointDataMap[clusterDataChecker.clusterID], checkpoint) + } } // FindClusterDownstreamData checks whether the record is present in the downstream data @@ -396,21 +483,25 @@ func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, pk uti return false } -func (c *DataChecker) CheckInNextTimeWindow(ctx context.Context, newTimeWindowData map[string]advancer.TimeWindowData) (*recorder.Report, error) { +func (c *DataChecker) CheckInNextTimeWindow(ctx context.Context, newTimeWindowData map[string]utils.TimeWindowData) (*recorder.Report, error) { if err := c.decodeNewTimeWindowData(ctx, newTimeWindowData); err != nil { log.Error("failed to decode new time window data", zap.Error(err)) return nil, errors.Annotate(err, "failed to decode new time window data") } report := recorder.NewReport(c.round) - for clusterID, clusterDataChecker := range c.clusterDataCheckers { - clusterDataChecker.Check(c) - report.AddClusterReport(clusterID, clusterDataChecker.GetReport()) + if c.checkableRound >= 2 { + for clusterID, clusterDataChecker := range c.clusterDataCheckers { + clusterDataChecker.Check(c) + report.AddClusterReport(clusterID, clusterDataChecker.GetReport()) + } + } else { + c.checkableRound += 1 } c.round += 1 return report, nil } -func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindowData map[string]advancer.TimeWindowData) error { +func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindowData map[string]utils.TimeWindowData) error { if len(newTimeWindowData) != len(c.clusterDataCheckers) { return errors.Errorf("number of clusters mismatch, expected %d, got %d", len(c.clusterDataCheckers), len(newTimeWindowData)) } @@ -423,7 
+514,6 @@ func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindow return errors.Trace(err) } for _, incrementalData := range timeWindowData.Data { - // Parse CSV data from all file slices for _, contents := range incrementalData.DataContentSlices { for _, content := range contents { records, err := incrementalData.Parser.DecodeFiles(ctx, content) diff --git a/cmd/multi-cluster-consistency-checker/checker/checker_test.go b/cmd/multi-cluster-consistency-checker/checker/checker_test.go new file mode 100644 index 0000000000..b6e02e5af6 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/checker/checker_test.go @@ -0,0 +1,349 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package checker + +import ( + "context" + "testing" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "github.com/stretchr/testify/require" +) + +func TestNewDataChecker(t *testing.T) { + t.Parallel() + + t.Run("create data checker", func(t *testing.T) { + t.Parallel() + clusterConfig := map[string]config.ClusterConfig{ + "cluster1": { + PDAddr: "127.0.0.1:2379", + S3SinkURI: "s3://bucket/cluster1/", + S3ChangefeedID: "s3-cf-1", + }, + "cluster2": { + PDAddr: "127.0.0.1:2479", + S3SinkURI: "s3://bucket/cluster2/", + S3ChangefeedID: "s3-cf-2", + }, + } + + checker := NewDataChecker(context.Background(), clusterConfig, nil, nil) + require.NotNil(t, checker) + require.Equal(t, uint64(0), checker.round) + require.Len(t, checker.clusterDataCheckers, 2) + require.Contains(t, checker.clusterDataCheckers, "cluster1") + require.Contains(t, checker.clusterDataCheckers, "cluster2") + }) +} + +func TestNewClusterDataChecker(t *testing.T) { + t.Parallel() + + t.Run("create cluster data checker", func(t *testing.T) { + t.Parallel() + checker := newClusterDataChecker("cluster1") + require.NotNil(t, checker) + require.Equal(t, "cluster1", checker.clusterID) + require.Equal(t, uint64(0), checker.rightBoundary) + require.NotNil(t, checker.timeWindowDataCaches) + require.NotNil(t, checker.overDataCaches) + require.NotNil(t, checker.clusterViolationChecker) + }) +} + +func TestNewClusterViolationChecker(t *testing.T) { + t.Parallel() + + t.Run("create cluster violation checker", func(t *testing.T) { + t.Parallel() + checker := newClusterViolationChecker("cluster1") + require.NotNil(t, checker) + require.Equal(t, "cluster1", checker.clusterID) + require.NotNil(t, checker.twoPreviousTimeWindowKeyVersionCache) + }) +} + +func TestClusterViolationChecker_Check(t *testing.T) { + t.Parallel() + + t.Run("check new 
record", func(t *testing.T) { + t.Parallel() + checker := newClusterViolationChecker("cluster1") + report := recorder.NewClusterReport("cluster1") + + record := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 100, + OriginTs: 0, + }, + } + + checker.Check(record, report) + require.Len(t, report.LWWViolationItems, 0) + require.Contains(t, checker.twoPreviousTimeWindowKeyVersionCache, record.Pk) + }) + + t.Run("check duplicate old version", func(t *testing.T) { + t.Parallel() + checker := newClusterViolationChecker("cluster1") + report := recorder.NewClusterReport("cluster1") + + record1 := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 100, + OriginTs: 0, + }, + } + record2 := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 50, + OriginTs: 0, + }, + } + + checker.Check(record1, report) + checker.Check(record2, report) + require.Len(t, report.LWWViolationItems, 0) // Should skip duplicate old version + }) + + t.Run("check lww violation", func(t *testing.T) { + t.Parallel() + checker := newClusterViolationChecker("cluster1") + report := recorder.NewClusterReport("cluster1") + + record1 := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 100, + OriginTs: 0, + }, + } + record2 := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 150, + OriginTs: 50, // OriginTs is less than record1's CommitTs, causing violation + }, + } + + checker.Check(record1, report) + checker.Check(record2, report) + require.Len(t, report.LWWViolationItems, 1) + require.Equal(t, "pk1", report.LWWViolationItems[0].PK) + require.Equal(t, uint64(0), report.LWWViolationItems[0].ExistingOriginTS) + require.Equal(t, uint64(100), report.LWWViolationItems[0].ExistingCommitTS) + require.Equal(t, uint64(50), report.LWWViolationItems[0].OriginTS) + require.Equal(t, uint64(150), report.LWWViolationItems[0].CommitTS) + }) +} + +func TestClusterViolationChecker_UpdateCache(t *testing.T) { + 
t.Parallel() + + t.Run("update cache", func(t *testing.T) { + t.Parallel() + checker := newClusterViolationChecker("cluster1") + report := recorder.NewClusterReport("cluster1") + + record := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 100, + OriginTs: 0, + }, + } + + checker.Check(record, report) + require.Contains(t, checker.twoPreviousTimeWindowKeyVersionCache, record.Pk) + entry := checker.twoPreviousTimeWindowKeyVersionCache[record.Pk] + require.Equal(t, 0, entry.previous) + + checker.UpdateCache() + entry = checker.twoPreviousTimeWindowKeyVersionCache[record.Pk] + require.Equal(t, 1, entry.previous) + + checker.UpdateCache() + entry = checker.twoPreviousTimeWindowKeyVersionCache[record.Pk] + require.Equal(t, 2, entry.previous) + + checker.UpdateCache() + // Entry should be removed after 2 updates + _, exists := checker.twoPreviousTimeWindowKeyVersionCache[record.Pk] + require.False(t, exists) + }) +} + +func TestNewTimeWindowDataCache(t *testing.T) { + t.Parallel() + + t.Run("create time window data cache", func(t *testing.T) { + t.Parallel() + leftBoundary := uint64(100) + rightBoundary := uint64(200) + checkpointTs := map[string]uint64{ + "cluster2": 150, + } + + cache := newTimeWindowDataCache(leftBoundary, rightBoundary, checkpointTs) + require.Equal(t, leftBoundary, cache.leftBoundary) + require.Equal(t, rightBoundary, cache.rightBoundary) + require.Equal(t, checkpointTs, cache.checkpointTs) + require.NotNil(t, cache.upstreamDataCache) + require.NotNil(t, cache.downstreamDataCache) + }) +} + +func TestTimeWindowDataCache_NewRecord(t *testing.T) { + t.Parallel() + + t.Run("add upstream record", func(t *testing.T) { + t.Parallel() + cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) + record := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 150, + OriginTs: 0, + }, + } + + cache.NewRecord(record) + require.Contains(t, cache.upstreamDataCache, record.Pk) + require.Contains(t, 
cache.upstreamDataCache[record.Pk], record.CommitTs) + }) + + t.Run("add downstream record", func(t *testing.T) { + t.Parallel() + cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) + record := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 150, + OriginTs: 100, + }, + } + + cache.NewRecord(record) + require.Contains(t, cache.downstreamDataCache, record.Pk) + require.Contains(t, cache.downstreamDataCache[record.Pk], record.OriginTs) + }) + + t.Run("skip record before left boundary", func(t *testing.T) { + t.Parallel() + cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) + record := &utils.Record{ + Pk: "pk1", + CdcVersion: utils.CdcVersion{ + CommitTs: 50, + OriginTs: 0, + }, + } + + cache.NewRecord(record) + require.NotContains(t, cache.upstreamDataCache, record.Pk) + require.NotContains(t, cache.downstreamDataCache, record.Pk) + }) +} + +func TestClusterDataChecker_PrepareNextTimeWindowData(t *testing.T) { + t.Parallel() + + t.Run("prepare next time window data", func(t *testing.T) { + t.Parallel() + checker := newClusterDataChecker("cluster1") + checker.rightBoundary = 100 + + timeWindow := utils.TimeWindow{ + LeftBoundary: 100, + RightBoundary: 200, + CheckpointTs: map[string]uint64{"cluster2": 150}, + } + + err := checker.PrepareNextTimeWindowData(timeWindow) + require.NoError(t, err) + require.Equal(t, uint64(200), checker.rightBoundary) + }) + + t.Run("mismatch left boundary", func(t *testing.T) { + t.Parallel() + checker := newClusterDataChecker("cluster1") + checker.rightBoundary = 100 + + timeWindow := utils.TimeWindow{ + LeftBoundary: 150, + RightBoundary: 200, + CheckpointTs: map[string]uint64{"cluster2": 150}, + } + + err := checker.PrepareNextTimeWindowData(timeWindow) + require.Error(t, err) + require.Contains(t, err.Error(), "mismatch") + }) +} + +func TestDataChecker_FindClusterDownstreamData(t *testing.T) { + t.Parallel() + + t.Run("find downstream data", func(t *testing.T) { + t.Parallel() + 
clusterConfig := map[string]config.ClusterConfig{ + "cluster1": { + PDAddr: "127.0.0.1:2379", + S3SinkURI: "s3://bucket/cluster1/", + S3ChangefeedID: "s3-cf-1", + }, + "cluster2": { + PDAddr: "127.0.0.1:2479", + S3SinkURI: "s3://bucket/cluster2/", + S3ChangefeedID: "s3-cf-2", + }, + } + + checker := NewDataChecker(context.Background(), clusterConfig, nil, nil) + record, skipped := checker.FindClusterDownstreamData("cluster2", "pk1", 100) + require.Nil(t, record) + require.False(t, skipped) + }) +} + +func TestDataChecker_FindClusterUpstreamData(t *testing.T) { + t.Parallel() + + t.Run("find upstream data", func(t *testing.T) { + t.Parallel() + clusterConfig := map[string]config.ClusterConfig{ + "cluster1": { + PDAddr: "127.0.0.1:2379", + S3SinkURI: "s3://bucket/cluster1/", + S3ChangefeedID: "s3-cf-1", + }, + "cluster2": { + PDAddr: "127.0.0.1:2479", + S3SinkURI: "s3://bucket/cluster2/", + S3ChangefeedID: "s3-cf-2", + }, + } + + checker := NewDataChecker(context.Background(), clusterConfig, nil, nil) + found := checker.FindClusterUpstreamData("cluster2", "pk1", 100) + require.False(t, found) + }) +} diff --git a/cmd/multi-cluster-consistency-checker/config/config.example.toml b/cmd/multi-cluster-consistency-checker/config/config.example.toml index ddae05c415..1684d7e43a 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.example.toml +++ b/cmd/multi-cluster-consistency-checker/config/config.example.toml @@ -9,8 +9,8 @@ log-level = "info" # Report configuration report-dir = "/tmp/multi-cluster-consistency-checker-reports" -# Tables configuration -[global.tables] + # Tables configuration + [global.tables] schema1 = ["table1", "table2"] schema2 = ["table1", "table2"] diff --git a/cmd/multi-cluster-consistency-checker/config/config.go b/cmd/multi-cluster-consistency-checker/config/config.go index d280eb61cc..05f6da5ed2 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.go +++ b/cmd/multi-cluster-consistency-checker/config/config.go @@ -32,9 
+32,9 @@ type Config struct { // GlobalConfig contains global configuration settings type GlobalConfig struct { - LogLevel string `toml:"log-level" json:"log-level"` - ReportDir string `toml:"report-dir" json:"report-dir"` - Tables map[string][]string `toml:"tables" json:"tables"` + LogLevel string `toml:"log-level" json:"log-level"` + DataDir string `toml:"data-dir" json:"data-dir"` + Tables map[string][]string `toml:"tables" json:"tables"` } type DownstreamClusterChangefeedConfig struct { diff --git a/cmd/multi-cluster-consistency-checker/config/config_test.go b/cmd/multi-cluster-consistency-checker/config/config_test.go new file mode 100644 index 0000000000..3b048d4a65 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/config/config_test.go @@ -0,0 +1,245 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package config + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestLoadConfig(t *testing.T) { + t.Parallel() + + t.Run("valid config", func(t *testing.T) { + t.Parallel() + // Create a temporary config file + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +data-dir = "/tmp/data" + [global.tables] + schema1 = ["table1", "table2"] + +[clusters] + [clusters.cluster1] + pd-addr = "127.0.0.1:2379" + s3-sink-uri = "s3://bucket/cluster1/" + s3-changefeed-id = "s3-cf-1" + [clusters.cluster1.downstream-cluster-changefeed-config] + cluster2 = { changefeed-id = "cf-1-to-2" } + + [clusters.cluster2] + pd-addr = "127.0.0.1:2479" + s3-sink-uri = "s3://bucket/cluster2/" + s3-changefeed-id = "s3-cf-2" + [clusters.cluster2.downstream-cluster-changefeed-config] + cluster1 = { changefeed-id = "cf-2-to-1" } +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.NoError(t, err) + require.NotNil(t, cfg) + require.Equal(t, "info", cfg.GlobalConfig.LogLevel) + require.Equal(t, "/tmp/data", cfg.GlobalConfig.DataDir) + require.Len(t, cfg.Clusters, 2) + require.Contains(t, cfg.Clusters, "cluster1") + require.Contains(t, cfg.Clusters, "cluster2") + require.Equal(t, "127.0.0.1:2379", cfg.Clusters["cluster1"].PDAddr) + require.Equal(t, "s3://bucket/cluster1/", cfg.Clusters["cluster1"].S3SinkURI) + require.Equal(t, "s3-cf-1", cfg.Clusters["cluster1"].S3ChangefeedID) + require.Len(t, cfg.Clusters["cluster1"].DownstreamClusterChangefeedConfig, 1) + require.Equal(t, "cf-1-to-2", cfg.Clusters["cluster1"].DownstreamClusterChangefeedConfig["cluster2"].ChangefeedID) + }) + + t.Run("file not exists", func(t *testing.T) { + t.Parallel() + cfg, err := LoadConfig("/nonexistent/path/config.toml") + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "config file 
does not exist") + }) + + t.Run("invalid toml", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := `invalid toml content [` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "failed to decode config file") + }) + + t.Run("no clusters", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +report-dir = "/tmp/reports" +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "at least one cluster must be configured") + }) + + t.Run("missing pd-addr", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +report-dir = "/tmp/reports" + +[clusters] + [clusters.cluster1] + s3-sink-uri = "s3://bucket/cluster1/" + s3-changefeed-id = "s3-cf-1" +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "pd-addr is required") + }) + + t.Run("missing s3-sink-uri", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +report-dir = "/tmp/reports" + +[clusters] + [clusters.cluster1] + pd-addr = "127.0.0.1:2379" + s3-changefeed-id = "s3-cf-1" +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + 
require.Contains(t, err.Error(), "s3-sink-uri is required") + }) + + t.Run("missing s3-changefeed-id", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +report-dir = "/tmp/reports" + +[clusters] + [clusters.cluster1] + pd-addr = "127.0.0.1:2379" + s3-sink-uri = "s3://bucket/cluster1/" +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "s3-changefeed-id is required") + }) + + t.Run("incomplete downstream cluster changefeed config", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +report-dir = "/tmp/reports" + +[clusters] + [clusters.cluster1] + pd-addr = "127.0.0.1:2379" + s3-sink-uri = "s3://bucket/cluster1/" + s3-changefeed-id = "s3-cf-1" + [clusters.cluster1.downstream-cluster-changefeed-config] + cluster2 = { changefeed-id = "cf-1-to-2" } + + [clusters.cluster2] + pd-addr = "127.0.0.1:2479" + s3-sink-uri = "s3://bucket/cluster2/" + s3-changefeed-id = "s3-cf-2" +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "downstream-cluster-changefeed-config is not entirely configured") + }) + + t.Run("missing changefeed-id in downstream config", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +report-dir = "/tmp/reports" + +[clusters] + [clusters.cluster1] + pd-addr = "127.0.0.1:2379" + s3-sink-uri = "s3://bucket/cluster1/" + s3-changefeed-id = "s3-cf-1" + [clusters.cluster1.downstream-cluster-changefeed-config] + 
cluster2 = {} + + [clusters.cluster2] + pd-addr = "127.0.0.1:2479" + s3-sink-uri = "s3://bucket/cluster2/" + s3-changefeed-id = "s3-cf-2" + [clusters.cluster2.downstream-cluster-changefeed-config] + cluster1 = { changefeed-id = "cf-2-to-1" } +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "changefeed-id is required") + }) +} diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer.go b/cmd/multi-cluster-consistency-checker/consumer/consumer.go index 8b4a8731c6..cbc300d9f3 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer.go @@ -14,16 +14,10 @@ package consumer import ( - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/parser" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" ) -type versionKey struct { - version uint64 - versionPath string - dataPath string -} - type ( fileIndexRange map[cloudstorage.FileIndexKey]indexRange fileIndexKeyMap map[cloudstorage.FileIndexKey]uint64 @@ -51,15 +45,10 @@ func updateTableDMLIdxMap( type schemaParser struct { path string - parser *parser.TableParser + parser *utils.TableParser } type schemaKey struct { schema string table string } - -type IncrementalData struct { - DataContentSlices map[cloudstorage.FileIndexKey][][]byte - Parser *parser.TableParser -} diff --git a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go index 1cb5cca3e6..b04b6ba96a 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go @@ -20,8 +20,10 @@ import ( "strings" "sync" + perrors "github.com/pingcap/errors" "github.com/pingcap/log" - 
"github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/parser" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" @@ -30,26 +32,28 @@ import ( "golang.org/x/sync/errgroup" ) +var ErrWalkDirEnd = perrors.Normalize("walk dir end", perrors.RFCCodeText("CDC:ErrWalkDirEnd")) + type CurrentTableVersion struct { mu sync.RWMutex - currentTableVersionMap map[schemaKey]versionKey + currentTableVersionMap map[schemaKey]utils.VersionKey } func NewCurrentTableVersion() *CurrentTableVersion { return &CurrentTableVersion{ - currentTableVersionMap: make(map[schemaKey]versionKey), + currentTableVersionMap: make(map[schemaKey]utils.VersionKey), } } // GetCurrentTableVersion returns the current table version for a given schema and table -func (cvt *CurrentTableVersion) GetCurrentTableVersion(schema, table string) versionKey { +func (cvt *CurrentTableVersion) GetCurrentTableVersion(schema, table string) utils.VersionKey { cvt.mu.RLock() defer cvt.mu.RUnlock() return cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] } // UpdateCurrentTableVersion updates the current table version for a given schema and table -func (cvt *CurrentTableVersion) UpdateCurrentTableVersion(schema, table string, version versionKey) { +func (cvt *CurrentTableVersion) UpdateCurrentTableVersion(schema, table string, version utils.VersionKey) { cvt.mu.Lock() defer cvt.mu.Unlock() cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] = version @@ -67,7 +71,7 @@ func NewSchemaParser() *SchemaParser { } // GetSchemaParser returns the schema parser for a given schema and table version -func (sp *SchemaParser) GetSchemaParser(schema, table string, version uint64) (*parser.TableParser, error) { +func (sp *SchemaParser) GetSchemaParser(schema, table string, 
version uint64) (*utils.TableParser, error) { schemaPathKey := cloudstorage.SchemaPathKey{ Schema: schema, Table: table, @@ -83,7 +87,7 @@ func (sp *SchemaParser) GetSchemaParser(schema, table string, version uint64) (* } // SetSchemaParser sets the schema parser for a given schema and table version -func (sp *SchemaParser) SetSchemaParser(schemaPathKey cloudstorage.SchemaPathKey, filePath string, parser *parser.TableParser) { +func (sp *SchemaParser) SetSchemaParser(schemaPathKey cloudstorage.SchemaPathKey, filePath string, parser *utils.TableParser) { sp.mu.Lock() sp.schemaParserMap[schemaPathKey] = schemaParser{ path: filePath, @@ -179,6 +183,197 @@ func NewS3Consumer( } } +func (c *S3Consumer) InitializeFromCheckpoint(ctx context.Context, clusterID string, checkpoint *recorder.Checkpoint) (map[cloudstorage.DmlPathKey]utils.IncrementalData, error) { + if checkpoint == nil { + return nil, nil + } + if checkpoint.CheckpointItems[2] == nil { + return nil, nil + } + scanRanges, err := checkpoint.ToScanRange(clusterID) + if err != nil { + return nil, errors.Trace(err) + } + var mu sync.Mutex + // Combine DML data and schema data into result + result := make(map[cloudstorage.DmlPathKey]utils.IncrementalData) + eg, egCtx := errgroup.WithContext(ctx) + for schemaTableKey, scanRange := range scanRanges { + eg.Go(func() error { + scanVersions, err := c.downloadSchemaFilesWithScanRange( + egCtx, schemaTableKey.Schema, schemaTableKey.Table, scanRange.StartVersionKey, scanRange.EndVersionKey, scanRange.EndDataPath) + if err != nil { + return errors.Trace(err) + } + err = c.downloadDataFilesWithScanRange( + egCtx, schemaTableKey.Schema, schemaTableKey.Table, scanVersions, scanRange, + func( + dmlPathKey cloudstorage.DmlPathKey, + dmlSlices map[cloudstorage.FileIndexKey][][]byte, + parser *utils.TableParser, + ) { + mu.Lock() + result[dmlPathKey] = utils.IncrementalData{ + DataContentSlices: dmlSlices, + Parser: parser, + } + mu.Unlock() + }, + ) + if err != nil { + return 
errors.Trace(err) + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + return result, nil +} + +func (c *S3Consumer) downloadSchemaFilesWithScanRange( + ctx context.Context, + schema, table string, + startVersionKey string, + endVersionKey string, + endDataPath string, +) ([]utils.VersionKey, error) { + metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) + opt := &storage.WalkOption{ + SubDir: metaSubDir, + ObjPrefix: "schema_", + // TODO: StartAfter: startVersionKey, + } + + var startSchemaKey, endSchemaKey cloudstorage.SchemaPathKey + _, err := startSchemaKey.ParseSchemaFilePath(startVersionKey) + if err != nil { + return nil, errors.Trace(err) + } + _, err = endSchemaKey.ParseSchemaFilePath(endVersionKey) + if err != nil { + return nil, errors.Trace(err) + } + + var scanVersions []utils.VersionKey + newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) + scanVersions = append(scanVersions, utils.VersionKey{ + Version: startSchemaKey.TableVersion, + VersionPath: startVersionKey, + }) + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { + if endVersionKey < filePath { + return ErrWalkDirEnd + } + if !cloudstorage.IsSchemaFile(filePath) { + return nil + } + var schemaKey cloudstorage.SchemaPathKey + _, err := schemaKey.ParseSchemaFilePath(filePath) + if err != nil { + log.Error("failed to parse schema file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } + if schemaKey.TableVersion > startSchemaKey.TableVersion { + if _, exists := newVersionPaths[schemaKey]; !exists { + scanVersions = append(scanVersions, utils.VersionKey{ + Version: schemaKey.TableVersion, + VersionPath: filePath, + }) + } + newVersionPaths[schemaKey] = filePath + } + return nil + }); err != nil && !ErrWalkDirEnd.Is(err) { + return nil, errors.Trace(err) + } + + if err := c.downloadSchemaFiles(ctx, newVersionPaths); err != nil { + return nil, errors.Trace(err) + } + + 
c.currentTableVersion.UpdateCurrentTableVersion(schema, table, utils.VersionKey{ + Version: endSchemaKey.TableVersion, + VersionPath: endVersionKey, + DataPath: endDataPath, + }) + + return scanVersions, nil +} + +func (c *S3Consumer) downloadDataFilesWithScanRange( + ctx context.Context, + schema, table string, + scanVersions []utils.VersionKey, + scanRange *recorder.ScanRange, + consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *utils.TableParser), +) error { + eg, egCtx := errgroup.WithContext(ctx) + for _, version := range scanVersions { + eg.Go(func() error { + newFiles, err := c.getNewFilesForSchemaPathKeyWithEndPath(egCtx, schema, table, version.Version, scanRange.StartDataPath, scanRange.EndDataPath) + if err != nil { + return errors.Trace(err) + } + dmlData, err := c.downloadDMLFiles(egCtx, newFiles) + if err != nil { + return errors.Trace(err) + } + parser, err := c.schemaParser.GetSchemaParser(schema, table, version.Version) + if err != nil { + return errors.Trace(err) + } + for dmlPathKey, dmlSlices := range dmlData { + consumeFunc(dmlPathKey, dmlSlices, parser) + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return errors.Trace(err) + } + return nil +} + +func (c *S3Consumer) getNewFilesForSchemaPathKeyWithEndPath( + ctx context.Context, + schema, table string, + version uint64, + startDataPath string, + endDataPath string, +) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { + schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version)) + opt := &storage.WalkOption{ + SubDir: schemaPrefix, + // TODO: StartAfter: startDataPath, + } + newTableDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { + if endDataPath < filePath { + return ErrWalkDirEnd + } + // Try to parse DML file path if it matches the expected extension + if strings.HasSuffix(filePath, c.fileExtension) { + var 
dmlkey cloudstorage.DmlPathKey + fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, filePath) + if err != nil { + log.Error("failed to parse dml file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } + updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) + } + return nil + }); err != nil && !ErrWalkDirEnd.Is(err) { + return nil, errors.Trace(err) + } + return c.tableDMLIdx.DiffNewTableDMLIdxMap(newTableDMLIdxMap), nil +} + // downloadSchemaFiles downloads schema files concurrently for given schema path keys func (c *S3Consumer) downloadSchemaFiles( ctx context.Context, @@ -195,7 +390,7 @@ func (c *S3Consumer) downloadSchemaFiles( } // Use canal-json decoder for S3 sink with .json file extension - parser, err := parser.NewTableParserWithFormat(schemaPathKey.GetKey(), content, config.ProtocolCanalJSON) + parser, err := utils.NewTableParserWithFormat(schemaPathKey.GetKey(), content, config.ProtocolCanalJSON) if err != nil { return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) } @@ -213,7 +408,7 @@ func (c *S3Consumer) downloadSchemaFiles( func (c *S3Consumer) discoverAndDownloadNewTableVersions( ctx context.Context, schema, table string, -) ([]versionKey, error) { +) ([]utils.VersionKey, error) { currentVersion := c.currentTableVersion.GetCurrentTableVersion(schema, table) metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) opt := &storage.WalkOption{ @@ -222,7 +417,7 @@ func (c *S3Consumer) discoverAndDownloadNewTableVersions( // TODO: StartAfter: currentVersion.versionPath, } - var scanVersions []versionKey + var scanVersions []utils.VersionKey newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { if !cloudstorage.IsSchemaFile(filePath) { @@ -237,11 +432,11 @@ func (c *S3Consumer) discoverAndDownloadNewTableVersions( return nil } version := schemaKey.TableVersion - if version > 
currentVersion.version { + if version > currentVersion.Version { if _, exists := newVersionPaths[schemaKey]; !exists { - scanVersions = append(scanVersions, versionKey{ - version: version, - versionPath: filePath, + scanVersions = append(scanVersions, utils.VersionKey{ + Version: version, + VersionPath: filePath, }) } newVersionPaths[schemaKey] = filePath @@ -256,7 +451,7 @@ func (c *S3Consumer) discoverAndDownloadNewTableVersions( return nil, errors.Trace(err) } - if currentVersion.version > 0 { + if currentVersion.Version > 0 { scanVersions = append(scanVersions, currentVersion) } return scanVersions, nil @@ -265,9 +460,9 @@ func (c *S3Consumer) discoverAndDownloadNewTableVersions( func (c *S3Consumer) getNewFilesForSchemaPathKey( ctx context.Context, schema, table string, - version *versionKey, + version *utils.VersionKey, ) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { - schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version.version)) + schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version.Version)) opt := &storage.WalkOption{ SubDir: schemaPrefix, // TODO: StartAfter: version.dataPath, @@ -294,7 +489,7 @@ func (c *S3Consumer) getNewFilesForSchemaPathKey( return nil, errors.Trace(err) } - version.dataPath = maxFilePath + version.DataPath = maxFilePath return c.tableDMLIdx.DiffNewTableDMLIdxMap(newTableDMLIdxMap), nil } @@ -396,17 +591,18 @@ func (c *S3Consumer) downloadDMLFiles( func (c *S3Consumer) downloadNewFilesWithVersions( ctx context.Context, schema, table string, - scanVersions []versionKey, - consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *parser.TableParser), -) error { - var maxVersion *versionKey + scanVersions []utils.VersionKey, + consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *utils.TableParser), +) (*utils.VersionKey, error) { + var maxVersion *utils.VersionKey eg, egCtx := 
errgroup.WithContext(ctx) for _, version := range scanVersions { - if maxVersion == nil || maxVersion.version < version.version { - maxVersion = &version + versionp := &version + if maxVersion == nil || maxVersion.Version < version.Version { + maxVersion = versionp } eg.Go(func() error { - newFiles, err := c.getNewFilesForSchemaPathKey(egCtx, schema, table, &version) + newFiles, err := c.getNewFilesForSchemaPathKey(egCtx, schema, table, versionp) if err != nil { return errors.Trace(err) } @@ -414,7 +610,7 @@ func (c *S3Consumer) downloadNewFilesWithVersions( if err != nil { return errors.Trace(err) } - parser, err := c.schemaParser.GetSchemaParser(schema, table, version.version) + parser, err := c.schemaParser.GetSchemaParser(schema, table, versionp.Version) if err != nil { return errors.Trace(err) } @@ -425,18 +621,22 @@ func (c *S3Consumer) downloadNewFilesWithVersions( }) } if err := eg.Wait(); err != nil { - return errors.Trace(err) + return nil, errors.Trace(err) } if maxVersion != nil { c.currentTableVersion.UpdateCurrentTableVersion(schema, table, *maxVersion) } - return nil + return maxVersion, nil } -func (c *S3Consumer) ConsumeNewFiles(ctx context.Context) (map[cloudstorage.DmlPathKey]IncrementalData, error) { +func (c *S3Consumer) ConsumeNewFiles( + ctx context.Context, +) (map[cloudstorage.DmlPathKey]utils.IncrementalData, map[utils.SchemaTableKey]utils.VersionKey, error) { var mu sync.Mutex // Combine DML data and schema data into result - result := make(map[cloudstorage.DmlPathKey]IncrementalData) + result := make(map[cloudstorage.DmlPathKey]utils.IncrementalData) + var versionMu sync.Mutex + maxVersionMap := make(map[utils.SchemaTableKey]utils.VersionKey) eg, egCtx := errgroup.WithContext(ctx) for schema, tables := range c.tables { for _, table := range tables { @@ -445,31 +645,36 @@ func (c *S3Consumer) ConsumeNewFiles(ctx context.Context) (map[cloudstorage.DmlP if err != nil { return errors.Trace(err) } - if err := c.downloadNewFilesWithVersions( + 
maxVersion, err := c.downloadNewFilesWithVersions( egCtx, schema, table, scanVersions, func( dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, - parser *parser.TableParser, + parser *utils.TableParser, ) { mu.Lock() - result[dmlPathKey] = IncrementalData{ + result[dmlPathKey] = utils.IncrementalData{ DataContentSlices: dmlSlices, Parser: parser, } mu.Unlock() }, - ); err != nil { + ) + if err != nil { return errors.Trace(err) } - + if maxVersion != nil { + versionMu.Lock() + maxVersionMap[utils.SchemaTableKey{Schema: schema, Table: table}] = *maxVersion + versionMu.Unlock() + } return nil }) } } if err := eg.Wait(); err != nil { - return nil, errors.Trace(err) + return nil, nil, errors.Trace(err) } - return result, nil + return result, maxVersionMap, nil } diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder.go b/cmd/multi-cluster-consistency-checker/recorder/recorder.go index 536d73974d..5b6bdffb4f 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder.go @@ -14,31 +14,63 @@ package recorder import ( + "encoding/json" "fmt" "os" - "path" + "path/filepath" "github.com/pingcap/log" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/errors" "go.uber.org/zap" ) type Recorder struct { - recordDir string + reportDir string + checkpointDir string + + checkpoint *Checkpoint } -func NewRecorder(reportDir string) (*Recorder, error) { - err := os.MkdirAll(reportDir, 0755) - if err != nil { +func NewRecorder(dataDir string) (*Recorder, error) { + if err := os.MkdirAll(filepath.Join(dataDir, "report"), 0755); err != nil { + return nil, errors.Trace(err) + } + if err := os.MkdirAll(filepath.Join(dataDir, "checkpoint"), 0755); err != nil { return nil, errors.Trace(err) } - return &Recorder{ - recordDir: reportDir, - }, nil + 
r := &Recorder{ + reportDir: filepath.Join(dataDir, "report"), + checkpointDir: filepath.Join(dataDir, "checkpoint"), + + checkpoint: NewCheckpoint(), + } + return r, r.initializeCheckpoint() +} + +func (r *Recorder) GetCheckpoint() *Checkpoint { + return r.checkpoint } -func (r *Recorder) RecordTimeWindow(timeWindowData map[string]advancer.TimeWindowData, report *Report) error { +func (r *Recorder) initializeCheckpoint() error { + _, err := os.Stat(filepath.Join(r.checkpointDir, "checkpoint.json")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return errors.Trace(err) + } + data, err := os.ReadFile(filepath.Join(r.checkpointDir, "checkpoint.json")) + if err != nil { + return errors.Trace(err) + } + if err := json.Unmarshal(data, r.checkpoint); err != nil { + return errors.Trace(err) + } + return nil +} + +func (r *Recorder) RecordTimeWindow(timeWindowData map[string]utils.TimeWindowData, report *Report) error { for clusterID, timeWindow := range timeWindowData { log.Info("time window advanced", zap.Uint64("round", report.Round), @@ -52,11 +84,38 @@ func (r *Recorder) RecordTimeWindow(timeWindowData map[string]advancer.TimeWindo return errors.Trace(err) } } + if err := r.flushCheckpoint(report.Round, timeWindowData); err != nil { + return errors.Trace(err) + } return nil } func (r *Recorder) flushReport(report *Report) error { - filename := path.Join(r.recordDir, fmt.Sprintf("report-%d.log", report.Round)) + filename := filepath.Join(r.reportDir, fmt.Sprintf("report-%d.report", report.Round)) data := report.MarshalReport() - return os.WriteFile(filename, []byte(data), 0600) + if err := os.WriteFile(filename, []byte(data), 0600); err != nil { + return errors.Trace(err) + } + filename = filepath.Join(r.reportDir, fmt.Sprintf("report-%d.json", report.Round)) + dataBytes, err := json.Marshal(report) + if err != nil { + return errors.Trace(err) + } + if err := os.WriteFile(filename, dataBytes, 0600); err != nil { + return errors.Trace(err) + } + return 
nil +} + +func (r *Recorder) flushCheckpoint(round uint64, timeWindowData map[string]utils.TimeWindowData) error { + r.checkpoint.NewTimeWindowData(round, timeWindowData) + filename := filepath.Join(r.checkpointDir, "checkpoint.json") + data, err := json.Marshal(r.checkpoint) + if err != nil { + return errors.Trace(err) + } + if err := os.WriteFile(filename, data, 0600); err != nil { + return errors.Trace(err) + } + return nil } diff --git a/cmd/multi-cluster-consistency-checker/recorder/types.go b/cmd/multi-cluster-consistency-checker/recorder/types.go index c587d23a1a..a742dc1837 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go @@ -16,14 +16,17 @@ package recorder import ( "fmt" "strings" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "github.com/pingcap/ticdc/pkg/errors" ) type DataLossItem struct { - DownstreamClusterID string - PK string - OriginTS uint64 - CommitTS uint64 - Inconsistent bool + DownstreamClusterID string `json:"downstream_cluster_id"` + PK string `json:"pk"` + OriginTS uint64 `json:"origin_ts"` + CommitTS uint64 `json:"commit_ts"` + Inconsistent bool `json:"inconsistent"` } func (item *DataLossItem) String() string { @@ -35,9 +38,9 @@ func (item *DataLossItem) String() string { } type DataRedundantItem struct { - PK string - OriginTS uint64 - CommitTS uint64 + PK string `json:"pk"` + OriginTS uint64 `json:"origin_ts"` + CommitTS uint64 `json:"commit_ts"` } func (item *DataRedundantItem) String() string { @@ -45,11 +48,11 @@ func (item *DataRedundantItem) String() string { } type LWWViolationItem struct { - PK string - ExistingOriginTS uint64 - ExistingCommitTS uint64 - OriginTS uint64 - CommitTS uint64 + PK string `json:"pk"` + ExistingOriginTS uint64 `json:"existing_origin_ts"` + ExistingCommitTS uint64 `json:"existing_commit_ts"` + OriginTS uint64 `json:"origin_ts"` + CommitTS uint64 `json:"commit_ts"` } func (item 
*LWWViolationItem) String() string { @@ -59,13 +62,13 @@ func (item *LWWViolationItem) String() string { } type ClusterReport struct { - ClusterID string + ClusterID string `json:"cluster_id"` - DataLossItems []DataLossItem - DataRedundantItems []DataRedundantItem - LWWViolationItems []LWWViolationItem + DataLossItems []DataLossItem `json:"data_loss_items"` // data loss items + DataRedundantItems []DataRedundantItem `json:"data_redundant_items"` // data redundant items + LWWViolationItems []LWWViolationItem `json:"lww_violation_items"` // lww violation items - needFlush bool + needFlush bool `json:"-"` } func NewClusterReport(clusterID string) *ClusterReport { @@ -114,9 +117,9 @@ func (r *ClusterReport) AddLWWViolationItem( } type Report struct { - Round uint64 - ClusterReports map[string]*ClusterReport - needFlush bool + Round uint64 `json:"round"` + ClusterReports map[string]*ClusterReport `json:"cluster_reports"` + needFlush bool `json:"-"` } func NewReport(round uint64) *Report { @@ -166,3 +169,88 @@ func (r *Report) MarshalReport() string { func (r *Report) NeedFlush() bool { return r.needFlush } + +type CheckpointClusterInfo struct { + TimeWindow utils.TimeWindow `json:"time_window"` + MaxVersion map[utils.SchemaTableKey]utils.VersionKey `json:"max_version"` +} + +type CheckpointItem struct { + Round uint64 `json:"round"` + ClusterInfo map[string]CheckpointClusterInfo `json:"cluster_info"` +} + +type Checkpoint struct { + CheckpointItems [3]*CheckpointItem `json:"checkpoint_items"` +} + +func NewCheckpoint() *Checkpoint { + return &Checkpoint{ + CheckpointItems: [3]*CheckpointItem{ + nil, + nil, + nil, + }, + } +} + +func (c *Checkpoint) NewTimeWindowData(round uint64, timeWindowData map[string]utils.TimeWindowData) { + newCheckpointItem := CheckpointItem{ + Round: round, + ClusterInfo: make(map[string]CheckpointClusterInfo), + } + for downstreamClusterID, timeWindow := range timeWindowData { + newCheckpointItem.ClusterInfo[downstreamClusterID] = 
CheckpointClusterInfo{ + TimeWindow: timeWindow.TimeWindow, + MaxVersion: timeWindow.MaxVersion, + } + } + c.CheckpointItems[0] = c.CheckpointItems[1] + c.CheckpointItems[1] = c.CheckpointItems[2] + c.CheckpointItems[2] = &newCheckpointItem +} + +type ScanRange struct { + StartVersionKey string + EndVersionKey string + StartDataPath string + EndDataPath string +} + +func (c *Checkpoint) ToScanRange(clusterID string) (map[utils.SchemaTableKey]*ScanRange, error) { + result := make(map[utils.SchemaTableKey]*ScanRange) + if c.CheckpointItems[2] == nil { + return result, nil + } + for schemaTableKey, versionKey := range c.CheckpointItems[2].ClusterInfo[clusterID].MaxVersion { + result[schemaTableKey] = &ScanRange{ + StartVersionKey: versionKey.VersionPath, + EndVersionKey: versionKey.VersionPath, + StartDataPath: versionKey.DataPath, + EndDataPath: versionKey.DataPath, + } + } + if c.CheckpointItems[1] == nil { + return result, nil + } + for schemaTableKey, versionKey := range c.CheckpointItems[1].ClusterInfo[clusterID].MaxVersion { + scanRange, ok := result[schemaTableKey] + if !ok { + return nil, errors.Errorf("schema table key %s.%s not found in result", schemaTableKey.Schema, schemaTableKey.Table) + } + scanRange.StartVersionKey = versionKey.VersionPath + scanRange.StartDataPath = versionKey.DataPath + } + if c.CheckpointItems[0] == nil { + return result, nil + } + for schemaTableKey, versionKey := range c.CheckpointItems[0].ClusterInfo[clusterID].MaxVersion { + scanRange, ok := result[schemaTableKey] + if !ok { + return nil, errors.Errorf("schema table key %s.%s not found in result", schemaTableKey.Schema, schemaTableKey.Table) + } + scanRange.StartVersionKey = versionKey.VersionPath + scanRange.StartDataPath = versionKey.DataPath + } + return result, nil +} diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index 2ab95e085e..d39c2cc831 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ 
b/cmd/multi-cluster-consistency-checker/task.go @@ -41,12 +41,15 @@ func runTask(ctx context.Context, cfg *config.Config) error { // Ensure cleanup happens even if there's an error defer cleanupClients(pdClients, etcdClients) - timeWindowAdvancer := advancer.NewTimeWindowAdvancer(checkpointWatchers, s3Watchers, pdClients) - recorder, err := recorder.NewRecorder(cfg.GlobalConfig.ReportDir) + recorder, err := recorder.NewRecorder(cfg.GlobalConfig.DataDir) if err != nil { return errors.Trace(err) } - dataChecker := checker.NewDataChecker(cfg.Clusters) + timeWindowAdvancer, checkpointDataMap, err := advancer.NewTimeWindowAdvancer(ctx, checkpointWatchers, s3Watchers, pdClients, recorder.GetCheckpoint()) + if err != nil { + return errors.Trace(err) + } + dataChecker := checker.NewDataChecker(ctx, cfg.Clusters, checkpointDataMap, recorder.GetCheckpoint()) log.Info("Starting consistency checker task") for { diff --git a/cmd/multi-cluster-consistency-checker/parser/decoder.go b/cmd/multi-cluster-consistency-checker/utils/decoder.go similarity index 99% rename from cmd/multi-cluster-consistency-checker/parser/decoder.go rename to cmd/multi-cluster-consistency-checker/utils/decoder.go index 2c768927e9..522102c3bb 100644 --- a/cmd/multi-cluster-consistency-checker/parser/decoder.go +++ b/cmd/multi-cluster-consistency-checker/utils/decoder.go @@ -11,7 +11,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package parser +package utils import ( "context" diff --git a/cmd/multi-cluster-consistency-checker/parser/parser.go b/cmd/multi-cluster-consistency-checker/utils/parser.go similarity index 94% rename from cmd/multi-cluster-consistency-checker/parser/parser.go rename to cmd/multi-cluster-consistency-checker/utils/parser.go index c342a0a1d8..d77f2f89bb 100644 --- a/cmd/multi-cluster-consistency-checker/parser/parser.go +++ b/cmd/multi-cluster-consistency-checker/utils/parser.go @@ -11,7 +11,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package parser +package utils import ( "context" @@ -20,7 +20,6 @@ import ( "time" "github.com/pingcap/log" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/common" commonType "github.com/pingcap/ticdc/pkg/common" "github.com/pingcap/ticdc/pkg/common/event" @@ -125,11 +124,11 @@ func (pt *TableParser) parseTableInfo(tableKey string, content []byte) error { return nil } -func (pt *TableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Record, error) { +func (pt *TableParser) parseRecord(row *chunk.Row, commitTs uint64) (*Record, error) { originTs := uint64(0) pkCount := 0 colInfos := pt.tableInfo.GetColInfosForRowChangedEvent() - columnValues := make([]utils.ColumnValue, 0, len(colInfos)) + columnValues := make([]ColumnValue, 0, len(colInfos)) pkColumnValues := make([]types.Datum, len(pt.pkColumnOffsets)) for _, colInfo := range colInfos { col, ok := pt.tableInfo.GetColumnInfo(colInfo.ID) @@ -171,7 +170,7 @@ func (pt *TableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Reco } } else { colValue := commonType.ExtractColVal(row, col, rowColOffset) - columnValues = append(columnValues, utils.ColumnValue{ + columnValues = append(columnValues, ColumnValue{ ColumnID: colInfo.ID, Value: colValue, }) @@ -189,18 +188,18 @@ func (pt *TableParser) parseRecord(row *chunk.Row, commitTs uint64) (*utils.Reco 
return nil, errors.Annotate(err, "failed to encode primary key") } pk := hex.EncodeToString(pkEncoded) - return &utils.Record{ - Pk: utils.PkType(pk), + return &Record{ + Pk: PkType(pk), ColumnValues: columnValues, - CdcVersion: utils.CdcVersion{ + CdcVersion: CdcVersion{ CommitTs: commitTs, OriginTs: originTs, }, }, nil } -func (pt *TableParser) DecodeFiles(ctx context.Context, content []byte) ([]*utils.Record, error) { - records := make([]*utils.Record, 0) +func (pt *TableParser) DecodeFiles(ctx context.Context, content []byte) ([]*Record, error) { + records := make([]*Record, 0) decoder, err := pt.decoderFactory.NewDecoder(ctx, pt.tableInfo, content) if err != nil { diff --git a/cmd/multi-cluster-consistency-checker/utils/types.go b/cmd/multi-cluster-consistency-checker/utils/types.go index e8168dc83a..51ad1e3e2a 100644 --- a/cmd/multi-cluster-consistency-checker/utils/types.go +++ b/cmd/multi-cluster-consistency-checker/utils/types.go @@ -13,6 +13,10 @@ package utils +import ( + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" +) + type PkType string type ColumnValue struct { @@ -61,3 +65,45 @@ func (r *Record) EqualDownstreamRecord(downstreamRecord *Record) bool { } return true } + +type SchemaTableKey struct { + Schema string + Table string +} + +type VersionKey struct { + Version uint64 + // Version Path is a hint for the next version path to scan + VersionPath string + // Data Path is a hint for the next data path to scan + DataPath string +} + +// TimeWindow is the time window of the cluster, including the left boundary, right boundary and checkpoint ts +// Assert 1: LeftBoundary < CheckpointTs < RightBoundary +// Assert 2: The other cluster's checkpoint timestamp of next time window should be larger than the PDTimestampAfterTimeWindow saved in this cluster's time window +// Assert 3: CheckpointTs of this cluster should be larger than other clusters' RightBoundary of previous time window +// Assert 4: RightBoundary of this cluster should be larger than 
other clusters' CheckpointTs of this time window +type TimeWindow struct { + LeftBoundary uint64 `json:"left_boundary"` + RightBoundary uint64 `json:"right_boundary"` + // CheckpointTs is the checkpoint timestamp for each changefeed from upstream cluster, + // mapping from downstream cluster ID to the checkpoint timestamp + CheckpointTs map[string]uint64 `json:"checkpoint_ts"` + // PDTimestampAfterTimeWindow is the max PD timestamp after the time window for each downstream cluster, + // mapping from upstream cluster ID to the max PD timestamp + PDTimestampAfterTimeWindow map[string]uint64 `json:"pd_timestamp_after_time_window"` + // NextMinLeftBoundary is the minimum left boundary of the next time window for the cluster + NextMinLeftBoundary uint64 `json:"next_min_left_boundary"` +} + +type TimeWindowData struct { + TimeWindow + Data map[cloudstorage.DmlPathKey]IncrementalData + MaxVersion map[SchemaTableKey]VersionKey +} + +type IncrementalData struct { + DataContentSlices map[cloudstorage.FileIndexKey][][]byte + Parser *TableParser +} diff --git a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go index 2d6229ce7d..3e8d0dda35 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go @@ -17,6 +17,8 @@ import ( "context" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/consumer" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" "github.com/pingcap/tidb/br/pkg/storage" @@ -39,6 +41,10 @@ func NewS3Watcher( } } +func (sw *S3Watcher) InitializeFromCheckpoint(ctx context.Context, clusterID string, checkpoint *recorder.Checkpoint) (map[cloudstorage.DmlPathKey]utils.IncrementalData, error) { + return 
sw.consumer.InitializeFromCheckpoint(ctx, clusterID, checkpoint) +} + func (sw *S3Watcher) AdvanceS3CheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { checkpointTs, err := sw.checkpointWatcher.AdvanceCheckpointTs(ctx, minCheckpointTs) if err != nil { @@ -48,11 +54,13 @@ func (sw *S3Watcher) AdvanceS3CheckpointTs(ctx context.Context, minCheckpointTs return checkpointTs, nil } -func (sw *S3Watcher) ConsumeNewFiles(ctx context.Context) (map[cloudstorage.DmlPathKey]consumer.IncrementalData, error) { +func (sw *S3Watcher) ConsumeNewFiles( + ctx context.Context, +) (map[cloudstorage.DmlPathKey]utils.IncrementalData, map[utils.SchemaTableKey]utils.VersionKey, error) { // TODO: get the index updated from the s3 - newData, err := sw.consumer.ConsumeNewFiles(ctx) + newData, maxVersionMap, err := sw.consumer.ConsumeNewFiles(ctx) if err != nil { - return nil, errors.Annotate(err, "consume new files failed") + return nil, nil, errors.Annotate(err, "consume new files failed") } - return newData, nil + return newData, maxVersionMap, nil } From 3069517e887740fe3c5d767f01fb094fff98c1fc Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 4 Feb 2026 12:46:19 +0800 Subject: [PATCH 13/23] support checkpoint Signed-off-by: Jianjun Liao --- cmd/multi-cluster-consistency-checker/checker/checker.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index 02f9885ea8..e55dc5022e 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -446,6 +446,7 @@ func (c *DataChecker) initializeFromCheckpoint(ctx context.Context, checkpointDa return } c.round = checkpoint.CheckpointItems[2].Round + 1 + c.checkableRound = checkpoint.CheckpointItems[2].Round for _, clusterDataChecker := range c.clusterDataCheckers { clusterDataChecker.InitializeFromCheckpoint(ctx, 
checkpointDataMap[clusterDataChecker.clusterID], checkpoint) } From fd938defa7002b08a16a581d513b5690e8fa80bd Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 4 Feb 2026 13:53:42 +0800 Subject: [PATCH 14/23] support checkpoint Signed-off-by: Jianjun Liao --- cmd/multi-cluster-consistency-checker/main.go | 4 +- .../recorder/types.go | 38 +++++++++++++------ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go index bb46185a1a..90646e2b7c 100644 --- a/cmd/multi-cluster-consistency-checker/main.go +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -103,7 +103,9 @@ func run(cmd *cobra.Command, args []string) { // Start the task in a goroutine errChan := make(chan error, 1) go func() { - errChan <- runTask(ctx, cfg) + err := runTask(ctx, cfg) + log.Error("task error", zap.Error(err)) + errChan <- err }() // Wait for either a signal or task completion diff --git a/cmd/multi-cluster-consistency-checker/recorder/types.go b/cmd/multi-cluster-consistency-checker/recorder/types.go index a742dc1837..6c858eb99e 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go @@ -170,9 +170,25 @@ func (r *Report) NeedFlush() bool { return r.needFlush } +type SchemaTableVersionKey struct { + utils.SchemaTableKey + utils.VersionKey +} + +func NewSchemaTableVersionKeyFromVersionKeyMap(versionKeyMap map[utils.SchemaTableKey]utils.VersionKey) []SchemaTableVersionKey { + result := make([]SchemaTableVersionKey, 0, len(versionKeyMap)) + for schemaTableKey, versionKey := range versionKeyMap { + result = append(result, SchemaTableVersionKey{ + SchemaTableKey: schemaTableKey, + VersionKey: versionKey, + }) + } + return result +} + type CheckpointClusterInfo struct { - TimeWindow utils.TimeWindow `json:"time_window"` - MaxVersion map[utils.SchemaTableKey]utils.VersionKey `json:"max_version"` + TimeWindow 
utils.TimeWindow `json:"time_window"` + MaxVersion []SchemaTableVersionKey `json:"max_version"` } type CheckpointItem struct { @@ -202,7 +218,7 @@ func (c *Checkpoint) NewTimeWindowData(round uint64, timeWindowData map[string]u for downstreamClusterID, timeWindow := range timeWindowData { newCheckpointItem.ClusterInfo[downstreamClusterID] = CheckpointClusterInfo{ TimeWindow: timeWindow.TimeWindow, - MaxVersion: timeWindow.MaxVersion, + MaxVersion: NewSchemaTableVersionKeyFromVersionKeyMap(timeWindow.MaxVersion), } } c.CheckpointItems[0] = c.CheckpointItems[1] @@ -222,8 +238,8 @@ func (c *Checkpoint) ToScanRange(clusterID string) (map[utils.SchemaTableKey]*Sc if c.CheckpointItems[2] == nil { return result, nil } - for schemaTableKey, versionKey := range c.CheckpointItems[2].ClusterInfo[clusterID].MaxVersion { - result[schemaTableKey] = &ScanRange{ + for _, versionKey := range c.CheckpointItems[2].ClusterInfo[clusterID].MaxVersion { + result[versionKey.SchemaTableKey] = &ScanRange{ StartVersionKey: versionKey.VersionPath, EndVersionKey: versionKey.VersionPath, StartDataPath: versionKey.DataPath, @@ -233,10 +249,10 @@ func (c *Checkpoint) ToScanRange(clusterID string) (map[utils.SchemaTableKey]*Sc if c.CheckpointItems[1] == nil { return result, nil } - for schemaTableKey, versionKey := range c.CheckpointItems[1].ClusterInfo[clusterID].MaxVersion { - scanRange, ok := result[schemaTableKey] + for _, versionKey := range c.CheckpointItems[1].ClusterInfo[clusterID].MaxVersion { + scanRange, ok := result[versionKey.SchemaTableKey] if !ok { - return nil, errors.Errorf("schema table key %s.%s not found in result", schemaTableKey.Schema, schemaTableKey.Table) + return nil, errors.Errorf("schema table key %s.%s not found in result", versionKey.Schema, versionKey.Table) } scanRange.StartVersionKey = versionKey.VersionPath scanRange.StartDataPath = versionKey.DataPath @@ -244,10 +260,10 @@ func (c *Checkpoint) ToScanRange(clusterID string) (map[utils.SchemaTableKey]*Sc if 
c.CheckpointItems[0] == nil { return result, nil } - for schemaTableKey, versionKey := range c.CheckpointItems[0].ClusterInfo[clusterID].MaxVersion { - scanRange, ok := result[schemaTableKey] + for _, versionKey := range c.CheckpointItems[0].ClusterInfo[clusterID].MaxVersion { + scanRange, ok := result[versionKey.SchemaTableKey] if !ok { - return nil, errors.Errorf("schema table key %s.%s not found in result", schemaTableKey.Schema, schemaTableKey.Table) + return nil, errors.Errorf("schema table key %s.%s not found in result", versionKey.Schema, versionKey.Table) } scanRange.StartVersionKey = versionKey.VersionPath scanRange.StartDataPath = versionKey.DataPath From 212b372bbb529bb015f9c53e4919b953ce17da06 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 4 Feb 2026 15:08:59 +0800 Subject: [PATCH 15/23] fix bugs Signed-off-by: Jianjun Liao --- .../consumer/s3_consumer.go | 11 ++++++----- .../utils/decoder.go | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go index b04b6ba96a..dcb53ea1c3 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go @@ -144,10 +144,10 @@ func (t *TableDMLIdx) DiffNewTableDMLIdxMap( if _, ok := resMap[newDMLPathKey]; !ok { resMap[newDMLPathKey] = make(fileIndexRange) } - } - resMap[newDMLPathKey][indexKey] = indexRange{ - start: origEndVal + 1, - end: newEndVal, + resMap[newDMLPathKey][indexKey] = indexRange{ + start: origEndVal + 1, + end: newEndVal, + } } } } @@ -262,6 +262,7 @@ func (c *S3Consumer) downloadSchemaFilesWithScanRange( Version: startSchemaKey.TableVersion, VersionPath: startVersionKey, }) + newVersionPaths[startSchemaKey] = startVersionKey if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { if endVersionKey < filePath { return ErrWalkDirEnd @@ -482,8 +483,8 @@ func 
(c *S3Consumer) getNewFilesForSchemaPathKey( return nil } updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) + maxFilePath = filePath } - maxFilePath = filePath return nil }); err != nil { return nil, errors.Trace(err) diff --git a/cmd/multi-cluster-consistency-checker/utils/decoder.go b/cmd/multi-cluster-consistency-checker/utils/decoder.go index 522102c3bb..bb1aa9b64e 100644 --- a/cmd/multi-cluster-consistency-checker/utils/decoder.go +++ b/cmd/multi-cluster-consistency-checker/utils/decoder.go @@ -58,6 +58,7 @@ func defaultCanalJSONCodecConfig(protocol config.Protocol) *codecCommon.Config { // Always enable tidb extension for canal-json protocol // because we need to get the commit ts from the extension field. codecConfig.EnableTiDBExtension = true + codecConfig.Terminator = config.CRLF return codecConfig } From c9774fd4789ec0e14196e31276aac1d0325fbb93 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 4 Feb 2026 16:13:10 +0800 Subject: [PATCH 16/23] fix bugs Signed-off-by: Jianjun Liao --- cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go index dcb53ea1c3..eb8b2df069 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go @@ -288,7 +288,7 @@ func (c *S3Consumer) downloadSchemaFilesWithScanRange( newVersionPaths[schemaKey] = filePath } return nil - }); err != nil && !ErrWalkDirEnd.Is(err) { + }); err != nil && !errors.Is(err, ErrWalkDirEnd) { return nil, errors.Trace(err) } @@ -369,7 +369,7 @@ func (c *S3Consumer) getNewFilesForSchemaPathKeyWithEndPath( updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) } return nil - }); err != nil && !ErrWalkDirEnd.Is(err) { + }); err != nil && !errors.Is(err, ErrWalkDirEnd) { return nil, errors.Trace(err) } return 
c.tableDMLIdx.DiffNewTableDMLIdxMap(newTableDMLIdxMap), nil From ec5f5b9a7068e651fa85a4a5cb1594da92e6d05f Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 4 Feb 2026 23:56:29 +0800 Subject: [PATCH 17/23] fix bugs Signed-off-by: Jianjun Liao --- .../utils/decoder.go | 4 +--- .../utils/parser.go | 4 ---- pkg/sink/codec/canal/canal_json_txn_decoder.go | 18 +++++++++++++++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/utils/decoder.go b/cmd/multi-cluster-consistency-checker/utils/decoder.go index bb1aa9b64e..1a33718810 100644 --- a/cmd/multi-cluster-consistency-checker/utils/decoder.go +++ b/cmd/multi-cluster-consistency-checker/utils/decoder.go @@ -74,9 +74,7 @@ func NewCanalJSONDecoder() *canalJSONDecoder { } func (d *canalJSONDecoder) NewDecoder(ctx context.Context, tableInfo *commonType.TableInfo, content []byte) (codecCommon.Decoder, error) { - // For S3 sink with canal-json format, use NewTxnDecoder - // which is designed for batch decoding from storage - decoder := canal.NewTxnDecoder(d.codecConfig) + decoder := canal.NewTxnDecoderWithTableInfo(d.codecConfig, tableInfo) decoder.AddKeyValue(nil, content) return decoder, nil } diff --git a/cmd/multi-cluster-consistency-checker/utils/parser.go b/cmd/multi-cluster-consistency-checker/utils/parser.go index d77f2f89bb..ce61f4cfd3 100644 --- a/cmd/multi-cluster-consistency-checker/utils/parser.go +++ b/cmd/multi-cluster-consistency-checker/utils/parser.go @@ -69,10 +69,6 @@ type TableParser struct { decoderFactory decoderFactory } -func NewTableParser(tableKey string, content []byte) (*TableParser, error) { - return NewTableParserWithFormat(tableKey, content, config.ProtocolCsv) -} - func NewTableParserWithFormat(tableKey string, content []byte, protocol config.Protocol) (*TableParser, error) { tableParser := &TableParser{} if err := tableParser.parseTableInfo(tableKey, content); err != nil { diff --git a/pkg/sink/codec/canal/canal_json_txn_decoder.go 
b/pkg/sink/codec/canal/canal_json_txn_decoder.go index 9c41be58e0..7bbdbe111e 100644 --- a/pkg/sink/codec/canal/canal_json_txn_decoder.go +++ b/pkg/sink/codec/canal/canal_json_txn_decoder.go @@ -33,6 +33,19 @@ type txnDecoder struct { config *common.Config msg canalJSONMessageInterface + + cachedTableInfo *commonType.TableInfo +} + +// NewTxnDecoderWithTableInfo return a new txn decoder with a cached table info. +func NewTxnDecoderWithTableInfo( + codecConfig *common.Config, + tableInfo *commonType.TableInfo, +) *txnDecoder { + return &txnDecoder{ + config: codecConfig, + cachedTableInfo: tableInfo, + } } // NewTxnDecoder return a new CanalJSONTxnEventDecoder. @@ -108,7 +121,10 @@ func (d *txnDecoder) NextDMLEvent() *commonEvent.DMLEvent { func (d *txnDecoder) canalJSONMessage2RowChange() *commonEvent.DMLEvent { msg := d.msg - tableInfo := newTableInfo(msg) + tableInfo := d.cachedTableInfo + if tableInfo == nil { + tableInfo = newTableInfo(msg) + } result := new(commonEvent.DMLEvent) result.Length++ // todo: set this field correctly result.StartTs = msg.getCommitTs() // todo: how to set this correctly? 
From 27164e2116803db58b98d5564c8763a15321a629 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Tue, 10 Feb 2026 15:24:30 +0800 Subject: [PATCH 18/23] add unit tests Signed-off-by: Jianjun Liao --- .../advancer/time_window_advancer.go | 45 +- .../advancer/time_window_advancer_test.go | 214 +++++- .../checker/checker.go | 79 +- .../checker/checker_test.go | 347 +++++++-- .../config/config.example.toml | 4 +- .../consumer/consumer.go | 695 +++++++++++++++++- .../consumer/consumer_test.go | 636 ++++++++++++++++ .../consumer/s3_consumer.go | 681 ----------------- .../decoder/decoder.go | 417 +++++++++++ .../decoder/decoder_test.go | 232 ++++++ cmd/multi-cluster-consistency-checker/main.go | 4 +- .../recorder/recorder.go | 29 +- .../recorder/recorder_test.go | 371 ++++++++++ .../recorder/types.go | 16 +- .../recorder/types_test.go | 548 ++++++++++++++ cmd/multi-cluster-consistency-checker/task.go | 44 +- .../{utils => types}/types.go | 38 +- .../types/types_test.go | 69 ++ .../utils/decoder.go | 80 -- .../utils/parser.go | 225 ------ .../watcher/checkpoint_watcher.go | 249 ++++++- .../watcher/checkpoint_watcher_test.go | 543 ++++++++++++++ .../watcher/s3_watcher.go | 14 +- .../watcher/s3_watcher_test.go | 202 +++++ pkg/common/table_info.go | 9 - .../codec/canal/canal_json_txn_decoder.go | 18 +- 26 files changed, 4562 insertions(+), 1247 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/consumer/consumer_test.go delete mode 100644 cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go create mode 100644 cmd/multi-cluster-consistency-checker/decoder/decoder.go create mode 100644 cmd/multi-cluster-consistency-checker/decoder/decoder_test.go create mode 100644 cmd/multi-cluster-consistency-checker/recorder/recorder_test.go create mode 100644 cmd/multi-cluster-consistency-checker/recorder/types_test.go rename cmd/multi-cluster-consistency-checker/{utils => types}/types.go (77%) create mode 100644 
cmd/multi-cluster-consistency-checker/types/types_test.go delete mode 100644 cmd/multi-cluster-consistency-checker/utils/decoder.go delete mode 100644 cmd/multi-cluster-consistency-checker/utils/parser.go create mode 100644 cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher_test.go create mode 100644 cmd/multi-cluster-consistency-checker/watcher/s3_watcher_test.go diff --git a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go index d2ed64a4cf..353ae40ff7 100644 --- a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go @@ -20,7 +20,7 @@ import ( "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" @@ -35,11 +35,11 @@ type TimeWindowAdvancer struct { round uint64 // timeWindowTriplet is the triplet of adjacent time windows, mapping from cluster ID to the triplet - timeWindowTriplet map[string][3]utils.TimeWindow + timeWindowTriplet map[string][3]types.TimeWindow // checkpointWatcher is the Active-Active checkpoint watcher for each cluster, // mapping from cluster ID to the downstream cluster ID to the checkpoint watcher - checkpointWatcher map[string]map[string]*watcher.CheckpointWatcher + checkpointWatcher map[string]map[string]watcher.Watcher // s3checkpointWatcher is the S3 checkpoint watcher for each cluster, mapping from cluster ID to the s3 checkpoint watcher s3Watcher map[string]*watcher.S3Watcher @@ -50,14 +50,14 @@ type TimeWindowAdvancer struct { func NewTimeWindowAdvancer( ctx context.Context, - 
checkpointWatchers map[string]map[string]*watcher.CheckpointWatcher, + checkpointWatchers map[string]map[string]watcher.Watcher, s3Watchers map[string]*watcher.S3Watcher, pdClients map[string]pd.Client, checkpoint *recorder.Checkpoint, -) (*TimeWindowAdvancer, map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, error) { - timeWindowTriplet := make(map[string][3]utils.TimeWindow) +) (*TimeWindowAdvancer, map[string]map[cloudstorage.DmlPathKey]types.IncrementalData, error) { + timeWindowTriplet := make(map[string][3]types.TimeWindow) for clusterID := range pdClients { - timeWindowTriplet[clusterID] = [3]utils.TimeWindow{} + timeWindowTriplet[clusterID] = [3]types.TimeWindow{} } advancer := &TimeWindowAdvancer{ round: 0, @@ -76,7 +76,7 @@ func NewTimeWindowAdvancer( func (t *TimeWindowAdvancer) initializeFromCheckpoint( ctx context.Context, checkpoint *recorder.Checkpoint, -) (map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, error) { +) (map[string]map[cloudstorage.DmlPathKey]types.IncrementalData, error) { if checkpoint == nil { return nil, nil } @@ -85,7 +85,7 @@ func (t *TimeWindowAdvancer) initializeFromCheckpoint( } t.round = checkpoint.CheckpointItems[2].Round + 1 for clusterID := range t.timeWindowTriplet { - newTimeWindows := [3]utils.TimeWindow{} + newTimeWindows := [3]types.TimeWindow{} newTimeWindows[2] = checkpoint.CheckpointItems[2].ClusterInfo[clusterID].TimeWindow if checkpoint.CheckpointItems[1] != nil { newTimeWindows[1] = checkpoint.CheckpointItems[1].ClusterInfo[clusterID].TimeWindow @@ -97,7 +97,7 @@ func (t *TimeWindowAdvancer) initializeFromCheckpoint( } var mu sync.Mutex - newDataMap := make(map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData) + newDataMap := make(map[string]map[cloudstorage.DmlPathKey]types.IncrementalData) eg, egCtx := errgroup.WithContext(ctx) for clusterID, s3Watcher := range t.s3Watcher { eg.Go(func() error { @@ -136,7 +136,7 @@ func (t *TimeWindowAdvancer) initializeFromCheckpoint( // For 
any cluster, the time window should be updated to the new time window. func (t *TimeWindowAdvancer) AdvanceTimeWindow( pctx context.Context, -) (map[string]utils.TimeWindowData, error) { +) (map[string]types.TimeWindowData, error) { log.Debug("advance time window", zap.Uint64("round", t.round)) // mapping from upstream cluster ID to the downstream cluster ID to the min checkpoint timestamp minCheckpointTsMap := make(map[string]map[string]uint64) @@ -152,7 +152,7 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( } var lock sync.Mutex - newTimeWindow := make(map[string]utils.TimeWindow) + newTimeWindow := make(map[string]types.TimeWindow) maxPDTimestampAfterCheckpointTs := make(map[string]uint64) // for cluster ID, the max checkpoint timestamp is maximum of checkpoint from cluster to other clusters and checkpoint from other clusters to cluster maxCheckpointTs := make(map[string]uint64) @@ -166,6 +166,7 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( if err != nil { return errors.Trace(err) } + // TODO: optimize this by getting pd ts in the end of all checkpoint ts advance pdtsos, err := t.getPDTsFromOtherClusters(ctx, upstreamClusterID) if err != nil { return errors.Trace(err) @@ -192,8 +193,8 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( } // Update the time window for each cluster - newDataMap := make(map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData) - maxVersionMap := make(map[string]map[utils.SchemaTableKey]utils.VersionKey) + newDataMap := make(map[string]map[cloudstorage.DmlPathKey]types.IncrementalData) + maxVersionMap := make(map[string]map[types.SchemaTableKey]types.VersionKey) eg, ctx = errgroup.WithContext(pctx) for clusterID, triplet := range t.timeWindowTriplet { minTimeWindowRightBoundary := max(maxCheckpointTs[clusterID], maxPDTimestampAfterCheckpointTs[clusterID], triplet[2].NextMinLeftBoundary) @@ -233,11 +234,11 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( return nil, errors.Annotate(err, "advance time window failed") } 
t.updateTimeWindow(newTimeWindow) - t.round += 1 + t.round++ return newTimeWindowData(newTimeWindow, newDataMap, maxVersionMap), nil } -func (t *TimeWindowAdvancer) updateTimeWindow(newTimeWindow map[string]utils.TimeWindow) { +func (t *TimeWindowAdvancer) updateTimeWindow(newTimeWindow map[string]types.TimeWindow) { for clusterID, timeWindow := range newTimeWindow { triplet := t.timeWindowTriplet[clusterID] triplet[0] = triplet[1] @@ -286,13 +287,13 @@ func (t *TimeWindowAdvancer) getPDTsFromOtherClusters(pctx context.Context, clus } func newTimeWindowData( - newTimeWindow map[string]utils.TimeWindow, - newDataMap map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, - maxVersionMap map[string]map[utils.SchemaTableKey]utils.VersionKey, -) map[string]utils.TimeWindowData { - timeWindowDatas := make(map[string]utils.TimeWindowData) + newTimeWindow map[string]types.TimeWindow, + newDataMap map[string]map[cloudstorage.DmlPathKey]types.IncrementalData, + maxVersionMap map[string]map[types.SchemaTableKey]types.VersionKey, +) map[string]types.TimeWindowData { + timeWindowDatas := make(map[string]types.TimeWindowData) for clusterID, timeWindow := range newTimeWindow { - timeWindowDatas[clusterID] = utils.TimeWindowData{ + timeWindowDatas[clusterID] = types.TimeWindowData{ TimeWindow: timeWindow, Data: newDataMap[clusterID], MaxVersion: maxVersionMap[clusterID], diff --git a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go index ed9fdb54cf..187cc6f7ca 100644 --- a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go @@ -15,37 +15,213 @@ package advancer import ( "context" + "sync" + "sync/atomic" "testing" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" + "github.com/pingcap/tidb/br/pkg/storage" "github.com/stretchr/testify/require" 
pd "github.com/tikv/pd/client" ) +// mockPDClient mocks pd.Client for testing. +// Each call to GetTS returns a monotonically increasing TSO (physical part increases by 1000ms per call). +type mockPDClient struct { + pd.Client + seq int64 // accessed atomically +} + +func (m *mockPDClient) GetTS(ctx context.Context) (int64, int64, error) { + n := atomic.AddInt64(&m.seq, 1) + // Physical timestamp starts at 11000ms and increases by 1000ms per call. + // oracle.ComposeTS(physical, 0) = physical << 18, so each step is ~262 million. + return 10000 + n*1000, 0, nil +} + +func (m *mockPDClient) Close() {} + +// mockAdvancerWatcher mocks watcher.Watcher for testing. +// Returns minCheckpointTs + delta, ensuring the result is always > minCheckpointTs and monotonically increasing. +type mockAdvancerWatcher struct { + mu sync.Mutex + delta uint64 + history []uint64 +} + +func (m *mockAdvancerWatcher) AdvanceCheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { + m.mu.Lock() + defer m.mu.Unlock() + result := minCheckpointTs + m.delta + m.history = append(m.history, result) + return result, nil +} + +func (m *mockAdvancerWatcher) Close() {} + +func (m *mockAdvancerWatcher) getHistory() []uint64 { + m.mu.Lock() + defer m.mu.Unlock() + out := make([]uint64, len(m.history)) + copy(out, m.history) + return out +} + func TestNewTimeWindowAdvancer(t *testing.T) { + checkpointWatchers := map[string]map[string]watcher.Watcher{ + "cluster1": {}, + "cluster2": {}, + } + s3Watchers := map[string]*watcher.S3Watcher{ + "cluster1": nil, + "cluster2": nil, + } + pdClients := map[string]pd.Client{ + "cluster1": nil, + "cluster2": nil, + } + + advancer, _, err := NewTimeWindowAdvancer(context.Background(), checkpointWatchers, s3Watchers, pdClients, nil) + require.NoError(t, err) + require.NotNil(t, advancer) + require.Equal(t, uint64(0), advancer.round) + require.Len(t, advancer.timeWindowTriplet, 2) + require.Contains(t, advancer.timeWindowTriplet, "cluster1") + 
require.Contains(t, advancer.timeWindowTriplet, "cluster2") +} + +// TestTimeWindowAdvancer_AdvanceMultipleRounds simulates 4 rounds of AdvanceTimeWindow +// with 2 clusters (c1, c2) performing bidirectional replication. +// +// The test verifies: +// - Time windows advance correctly (LeftBoundary == previous RightBoundary) +// - RightBoundary > LeftBoundary for each time window +// - Checkpoint timestamps are monotonically increasing across rounds +// - PD TSOs are always greater than checkpoint timestamps +// - NextMinLeftBoundary (PD TSO) > RightBoundary (S3 checkpoint) +// - Mock watcher checkpoint histories are strictly increasing +func TestTimeWindowAdvancer_AdvanceMultipleRounds(t *testing.T) { t.Parallel() + ctx := context.Background() - t.Run("create time window advancer", func(t *testing.T) { - t.Parallel() - checkpointWatchers := map[string]map[string]*watcher.CheckpointWatcher{ - "cluster1": {}, - "cluster2": {}, + // Create mock PD clients for each cluster (monotonically increasing TSO) + pdC1 := &mockPDClient{} + pdC2 := &mockPDClient{} + pdClients := map[string]pd.Client{ + "c1": pdC1, + "c2": pdC2, + } + + // Create mock checkpoint watchers for bidirectional replication (c1->c2, c2->c1) + // Each returns minCheckpointTs + 100 + cpWatcherC1C2 := &mockAdvancerWatcher{delta: 100} + cpWatcherC2C1 := &mockAdvancerWatcher{delta: 100} + checkpointWatchers := map[string]map[string]watcher.Watcher{ + "c1": {"c2": cpWatcherC1C2}, + "c2": {"c1": cpWatcherC2C1}, + } + + // Create S3 watchers with mock checkpoint watchers (returns minCheckpointTs + 50) + // and empty in-memory storage (no actual S3 data) + s3WatcherMockC1 := &mockAdvancerWatcher{delta: 50} + s3WatcherMockC2 := &mockAdvancerWatcher{delta: 50} + s3Watchers := map[string]*watcher.S3Watcher{ + "c1": watcher.NewS3Watcher(s3WatcherMockC1, storage.NewMemStorage(), nil), + "c2": watcher.NewS3Watcher(s3WatcherMockC2, storage.NewMemStorage(), nil), + } + + advancer, _, err := NewTimeWindowAdvancer(ctx, 
checkpointWatchers, s3Watchers, pdClients, nil) + require.NoError(t, err) + require.Equal(t, uint64(0), advancer.round) + + // Track previous round values for cross-round assertions + prevRightBoundaries := map[string]uint64{"c1": 0, "c2": 0} + prevCheckpointTs := map[string]map[string]uint64{ + "c1": {"c2": 0}, + "c2": {"c1": 0}, + } + prevRightBoundary := uint64(0) // max across all clusters + + for round := range 4 { + result, err := advancer.AdvanceTimeWindow(ctx) + require.NoError(t, err, "round %d", round) + require.Len(t, result, 2, "round %d: should have data for both clusters", round) + + for clusterID, twData := range result { + tw := twData.TimeWindow + + // 1. LeftBoundary == previous RightBoundary + require.Equal(t, prevRightBoundaries[clusterID], tw.LeftBoundary, + "round %d, cluster %s: LeftBoundary should equal previous RightBoundary", round, clusterID) + + // 2. RightBoundary > LeftBoundary (time window is non-empty) + require.Greater(t, tw.RightBoundary, tw.LeftBoundary, + "round %d, cluster %s: RightBoundary should be > LeftBoundary", round, clusterID) + + // 3. CheckpointTs should be populated and strictly increasing across rounds + require.NotEmpty(t, tw.CheckpointTs, + "round %d, cluster %s: CheckpointTs should be populated", round, clusterID) + for downstream, cpTs := range tw.CheckpointTs { + require.Greater(t, cpTs, prevCheckpointTs[clusterID][downstream], + "round %d, %s->%s: checkpoint should be strictly increasing", round, clusterID, downstream) + } + + // 4. PDTimestampAfterTimeWindow should be populated + require.NotEmpty(t, tw.PDTimestampAfterTimeWindow, + "round %d, cluster %s: PDTimestampAfterTimeWindow should be populated", round, clusterID) + + // 5. 
NextMinLeftBoundary > RightBoundary + // (PD TSO is obtained after S3 checkpoint, and PD TSO >> S3 checkpoint) + require.Greater(t, tw.NextMinLeftBoundary, tw.RightBoundary, + "round %d, cluster %s: NextMinLeftBoundary (PD TSO) should be > RightBoundary (S3 checkpoint)", round, clusterID) + + // 6. PD TSO values in PDTimestampAfterTimeWindow > all CheckpointTs values + // (PD TSOs are obtained after checkpoint advance) + for otherCluster, pdTs := range tw.PDTimestampAfterTimeWindow { + for downstream, cpTs := range tw.CheckpointTs { + require.Greater(t, pdTs, cpTs, + "round %d, cluster %s: PD TSO (from %s) should be > checkpoint (%s->%s)", + round, clusterID, otherCluster, clusterID, downstream) + } + } + + // 7. RightBoundary > previous round's max RightBoundary (time window advances) + require.Greater(t, tw.RightBoundary, prevRightBoundary, + "round %d, cluster %s: RightBoundary should be > previous max RightBoundary", round, clusterID) } - s3Watchers := map[string]*watcher.S3Watcher{ - "cluster1": nil, - "cluster2": nil, + + // Save current values for next round + maxRB := uint64(0) + for clusterID, twData := range result { + prevRightBoundaries[clusterID] = twData.TimeWindow.RightBoundary + if twData.TimeWindow.RightBoundary > maxRB { + maxRB = twData.TimeWindow.RightBoundary + } + for downstream, cpTs := range twData.TimeWindow.CheckpointTs { + prevCheckpointTs[clusterID][downstream] = cpTs + } } - pdClients := map[string]pd.Client{ - "cluster1": nil, - "cluster2": nil, + prevRightBoundary = maxRB + } + + // After 4 rounds, round counter should be 4 + require.Equal(t, uint64(4), advancer.round) + + // Verify all mock watcher checkpoint histories are strictly monotonically increasing + allWatchers := []*mockAdvancerWatcher{cpWatcherC1C2, cpWatcherC2C1, s3WatcherMockC1, s3WatcherMockC2} + watcherNames := []string{"cp c1->c2", "cp c2->c1", "s3 c1", "s3 c2"} + for idx, w := range allWatchers { + history := w.getHistory() + require.GreaterOrEqual(t, len(history), 
4, + "%s: should have at least 4 checkpoint values (one per round)", watcherNames[idx]) + for i := 1; i < len(history); i++ { + require.Greater(t, history[i], history[i-1], + "%s: checkpoint values should be strictly increasing (index %d: %d -> %d)", + watcherNames[idx], i, history[i-1], history[i]) } + } - advancer, _, err := NewTimeWindowAdvancer(context.Background(), checkpointWatchers, s3Watchers, pdClients, nil) - require.NoError(t, err) - require.NotNil(t, advancer) - require.Equal(t, uint64(0), advancer.round) - require.Len(t, advancer.timeWindowTriplet, 2) - require.Contains(t, advancer.timeWindowTriplet, "cluster1") - require.Contains(t, advancer.timeWindowTriplet, "cluster2") - }) + // Verify PD clients were called (monotonically increasing due to atomic counter) + require.Greater(t, atomic.LoadInt64(&pdC1.seq), int64(0), "pd-c1 should have been called") + require.Greater(t, atomic.LoadInt64(&pdC2.seq), int64(0), "pd-c2 should have been called") } diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index e55dc5022e..8bdcb9d055 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -19,8 +19,9 @@ import ( "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/decoder" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" "go.uber.org/zap" @@ -28,22 +29,22 @@ import ( type versionCacheEntry struct { previous int - cdcVersion utils.CdcVersion + cdcVersion types.CdcVersion } type clusterViolationChecker struct { clusterID string - 
twoPreviousTimeWindowKeyVersionCache map[utils.PkType]versionCacheEntry + twoPreviousTimeWindowKeyVersionCache map[types.PkType]versionCacheEntry } func newClusterViolationChecker(clusterID string) *clusterViolationChecker { return &clusterViolationChecker{ clusterID: clusterID, - twoPreviousTimeWindowKeyVersionCache: make(map[utils.PkType]versionCacheEntry), + twoPreviousTimeWindowKeyVersionCache: make(map[types.PkType]versionCacheEntry), } } -func (c *clusterViolationChecker) NewRecordFromCheckpoint(record *utils.Record, previous int) { +func (c *clusterViolationChecker) NewRecordFromCheckpoint(record *decoder.Record, previous int) { entry, exists := c.twoPreviousTimeWindowKeyVersionCache[record.Pk] if !exists { c.twoPreviousTimeWindowKeyVersionCache[record.Pk] = versionCacheEntry{ @@ -62,7 +63,7 @@ func (c *clusterViolationChecker) NewRecordFromCheckpoint(record *utils.Record, } } -func (c *clusterViolationChecker) Check(r *utils.Record, report *recorder.ClusterReport) { +func (c *clusterViolationChecker) Check(r *decoder.Record, report *recorder.ClusterReport) { entry, exists := c.twoPreviousTimeWindowKeyVersionCache[r.Pk] if !exists { c.twoPreviousTimeWindowKeyVersionCache[r.Pk] = versionCacheEntry{ @@ -93,7 +94,7 @@ func (c *clusterViolationChecker) Check(r *utils.Record, report *recorder.Cluste } func (c *clusterViolationChecker) UpdateCache() { - newTwoPreviousTimeWindowKeyVersionCache := make(map[utils.PkType]versionCacheEntry) + newTwoPreviousTimeWindowKeyVersionCache := make(map[types.PkType]versionCacheEntry) for primaryKey, entry := range c.twoPreviousTimeWindowKeyVersionCache { if entry.previous >= 2 { continue @@ -108,10 +109,10 @@ func (c *clusterViolationChecker) UpdateCache() { type timeWindowDataCache struct { // upstreamDataCache is a map of primary key to a map of commit ts to a record - upstreamDataCache map[utils.PkType]map[uint64]*utils.Record + upstreamDataCache map[types.PkType]map[uint64]*decoder.Record // downstreamDataCache is a map of 
primary key to a map of origin ts to a record - downstreamDataCache map[utils.PkType]map[uint64]*utils.Record + downstreamDataCache map[types.PkType]map[uint64]*decoder.Record leftBoundary uint64 rightBoundary uint64 @@ -120,33 +121,33 @@ type timeWindowDataCache struct { func newTimeWindowDataCache(leftBoundary, rightBoundary uint64, checkpointTs map[string]uint64) timeWindowDataCache { return timeWindowDataCache{ - upstreamDataCache: make(map[utils.PkType]map[uint64]*utils.Record), - downstreamDataCache: make(map[utils.PkType]map[uint64]*utils.Record), + upstreamDataCache: make(map[types.PkType]map[uint64]*decoder.Record), + downstreamDataCache: make(map[types.PkType]map[uint64]*decoder.Record), leftBoundary: leftBoundary, rightBoundary: rightBoundary, checkpointTs: checkpointTs, } } -func (twdc *timeWindowDataCache) newUpstreamRecord(record *utils.Record) { +func (twdc *timeWindowDataCache) newUpstreamRecord(record *decoder.Record) { recordsMap, exists := twdc.upstreamDataCache[record.Pk] if !exists { - recordsMap = make(map[uint64]*utils.Record) + recordsMap = make(map[uint64]*decoder.Record) twdc.upstreamDataCache[record.Pk] = recordsMap } recordsMap[record.CommitTs] = record } -func (twdc *timeWindowDataCache) newDownstreamRecord(record *utils.Record) { +func (twdc *timeWindowDataCache) newDownstreamRecord(record *decoder.Record) { recordsMap, exists := twdc.downstreamDataCache[record.Pk] if !exists { - recordsMap = make(map[uint64]*utils.Record) + recordsMap = make(map[uint64]*decoder.Record) twdc.downstreamDataCache[record.Pk] = recordsMap } recordsMap[record.OriginTs] = record } -func (twdc *timeWindowDataCache) NewRecord(record *utils.Record) { +func (twdc *timeWindowDataCache) NewRecord(record *decoder.Record) { if record.CommitTs <= twdc.leftBoundary { // record is before the left boundary, just skip it return @@ -165,7 +166,7 @@ type clusterDataChecker struct { rightBoundary uint64 - overDataCaches []*utils.Record + overDataCaches []*decoder.Record 
clusterViolationChecker *clusterViolationChecker @@ -177,14 +178,14 @@ func newClusterDataChecker(clusterID string) *clusterDataChecker { clusterID: clusterID, timeWindowDataCaches: [3]timeWindowDataCache{}, rightBoundary: 0, - overDataCaches: make([]*utils.Record, 0), + overDataCaches: make([]*decoder.Record, 0), clusterViolationChecker: newClusterViolationChecker(clusterID), } } func (cd *clusterDataChecker) InitializeFromCheckpoint( ctx context.Context, - checkpointDataMap map[cloudstorage.DmlPathKey]utils.IncrementalData, + checkpointDataMap map[cloudstorage.DmlPathKey]types.IncrementalData, checkpoint *recorder.Checkpoint, ) error { if checkpoint == nil { @@ -205,7 +206,7 @@ func (cd *clusterDataChecker) InitializeFromCheckpoint( for _, incrementalData := range checkpointDataMap { for _, contents := range incrementalData.DataContentSlices { for _, content := range contents { - records, err := incrementalData.Parser.DecodeFiles(ctx, content) + records, err := decoder.Decode(content) if err != nil { return errors.Trace(err) } @@ -218,7 +219,7 @@ func (cd *clusterDataChecker) InitializeFromCheckpoint( return nil } -func (cd *clusterDataChecker) newRecordFromCheckpoint(record *utils.Record) { +func (cd *clusterDataChecker) newRecordFromCheckpoint(record *decoder.Record) { if record.CommitTs > cd.rightBoundary { cd.overDataCaches = append(cd.overDataCaches, record) return @@ -233,7 +234,7 @@ func (cd *clusterDataChecker) newRecordFromCheckpoint(record *utils.Record) { } } -func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow utils.TimeWindow) error { +func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow types.TimeWindow) error { if timeWindow.LeftBoundary != cd.rightBoundary { return errors.Errorf("time window left boundary(%d) mismatch right boundary ts(%d)", timeWindow.LeftBoundary, cd.rightBoundary) } @@ -241,7 +242,7 @@ func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow utils.TimeWin cd.timeWindowDataCaches[1] = 
cd.timeWindowDataCaches[2] newTimeWindowDataCache := newTimeWindowDataCache(timeWindow.LeftBoundary, timeWindow.RightBoundary, timeWindow.CheckpointTs) cd.rightBoundary = timeWindow.RightBoundary - newOverDataCache := make([]*utils.Record, 0, len(cd.overDataCaches)) + newOverDataCache := make([]*decoder.Record, 0, len(cd.overDataCaches)) for _, overRecord := range cd.overDataCaches { if overRecord.CommitTs > timeWindow.RightBoundary { newOverDataCache = append(newOverDataCache, overRecord) @@ -254,7 +255,7 @@ func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow utils.TimeWin return nil } -func (cd *clusterDataChecker) NewRecord(record *utils.Record) { +func (cd *clusterDataChecker) NewRecord(record *decoder.Record) { if record.CommitTs > cd.rightBoundary { cd.overDataCaches = append(cd.overDataCaches, record) return @@ -262,7 +263,7 @@ func (cd *clusterDataChecker) NewRecord(record *utils.Record) { cd.timeWindowDataCaches[2].NewRecord(record) } -func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowIdx int, pk utils.PkType, originTs uint64) (*utils.Record, bool) { +func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowIdx int, pk types.PkType, originTs uint64) (*decoder.Record, bool) { records, exists := cd.timeWindowDataCaches[timeWindowIdx].downstreamDataCache[pk] if !exists { return nil, false @@ -278,7 +279,7 @@ func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowId return nil, false } -func (cd *clusterDataChecker) findClusterUpstreamDataInTimeWindow(timeWindowIdx int, pk utils.PkType, commitTs uint64) bool { +func (cd *clusterDataChecker) findClusterUpstreamDataInTimeWindow(timeWindowIdx int, pk types.PkType, commitTs uint64) bool { records, exists := cd.timeWindowDataCaches[timeWindowIdx].upstreamDataCache[pk] if !exists { return false @@ -371,7 +372,7 @@ func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) { func (cd *clusterDataChecker) 
lwwViolationDetection() { for pk, upstreamRecords := range cd.timeWindowDataCaches[2].upstreamDataCache { downstreamRecords := cd.timeWindowDataCaches[2].downstreamDataCache[pk] - pkRecords := make([]*utils.Record, 0, len(upstreamRecords)+len(downstreamRecords)) + pkRecords := make([]*decoder.Record, 0, len(upstreamRecords)+len(downstreamRecords)) for _, upstreamRecord := range upstreamRecords { pkRecords = append(pkRecords, upstreamRecord) } @@ -389,7 +390,7 @@ func (cd *clusterDataChecker) lwwViolationDetection() { if _, exists := cd.timeWindowDataCaches[2].upstreamDataCache[pk]; exists { continue } - pkRecords := make([]*utils.Record, 0, len(downstreamRecords)) + pkRecords := make([]*decoder.Record, 0, len(downstreamRecords)) for _, downstreamRecord := range downstreamRecords { pkRecords = append(pkRecords, downstreamRecord) } @@ -424,7 +425,7 @@ type DataChecker struct { clusterDataCheckers map[string]*clusterDataChecker } -func NewDataChecker(ctx context.Context, clusterConfig map[string]config.ClusterConfig, checkpointDataMap map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, checkpoint *recorder.Checkpoint) *DataChecker { +func NewDataChecker(ctx context.Context, clusterConfig map[string]config.ClusterConfig, checkpointDataMap map[string]map[cloudstorage.DmlPathKey]types.IncrementalData, checkpoint *recorder.Checkpoint) *DataChecker { clusterDataChecker := make(map[string]*clusterDataChecker) for clusterID := range clusterConfig { clusterDataChecker[clusterID] = newClusterDataChecker(clusterID) @@ -438,7 +439,7 @@ func NewDataChecker(ctx context.Context, clusterConfig map[string]config.Cluster return checker } -func (c *DataChecker) initializeFromCheckpoint(ctx context.Context, checkpointDataMap map[string]map[cloudstorage.DmlPathKey]utils.IncrementalData, checkpoint *recorder.Checkpoint) { +func (c *DataChecker) initializeFromCheckpoint(ctx context.Context, checkpointDataMap map[string]map[cloudstorage.DmlPathKey]types.IncrementalData, checkpoint 
*recorder.Checkpoint) { if checkpoint == nil { return } @@ -446,7 +447,7 @@ func (c *DataChecker) initializeFromCheckpoint(ctx context.Context, checkpointDa return } c.round = checkpoint.CheckpointItems[2].Round + 1 - c.checkableRound = checkpoint.CheckpointItems[2].Round + c.checkableRound = checkpoint.CheckpointItems[2].Round + 1 for _, clusterDataChecker := range c.clusterDataCheckers { clusterDataChecker.InitializeFromCheckpoint(ctx, checkpointDataMap[clusterDataChecker.clusterID], checkpoint) } @@ -454,7 +455,7 @@ func (c *DataChecker) initializeFromCheckpoint(ctx context.Context, checkpointDa // FindClusterDownstreamData checks whether the record is present in the downstream data // cache [1] or [2] or another new record is present in the downstream data cache [1] or [2]. -func (c *DataChecker) FindClusterDownstreamData(clusterID string, pk utils.PkType, originTs uint64) (*utils.Record, bool) { +func (c *DataChecker) FindClusterDownstreamData(clusterID string, pk types.PkType, originTs uint64) (*decoder.Record, bool) { clusterDataChecker, exists := c.clusterDataCheckers[clusterID] if !exists { return nil, false @@ -466,7 +467,7 @@ func (c *DataChecker) FindClusterDownstreamData(clusterID string, pk utils.PkTyp return clusterDataChecker.findClusterDownstreamDataInTimeWindow(2, pk, originTs) } -func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, pk utils.PkType, commitTs uint64) bool { +func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, pk types.PkType, commitTs uint64) bool { for _, clusterDataChecker := range c.clusterDataCheckers { if clusterDataChecker.clusterID == downstreamClusterID { continue @@ -484,25 +485,25 @@ func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, pk uti return false } -func (c *DataChecker) CheckInNextTimeWindow(ctx context.Context, newTimeWindowData map[string]utils.TimeWindowData) (*recorder.Report, error) { +func (c *DataChecker) CheckInNextTimeWindow(ctx 
context.Context, newTimeWindowData map[string]types.TimeWindowData) (*recorder.Report, error) { if err := c.decodeNewTimeWindowData(ctx, newTimeWindowData); err != nil { log.Error("failed to decode new time window data", zap.Error(err)) return nil, errors.Annotate(err, "failed to decode new time window data") } report := recorder.NewReport(c.round) - if c.checkableRound >= 2 { + if c.checkableRound >= 3 { for clusterID, clusterDataChecker := range c.clusterDataCheckers { clusterDataChecker.Check(c) report.AddClusterReport(clusterID, clusterDataChecker.GetReport()) } } else { - c.checkableRound += 1 + c.checkableRound++ } - c.round += 1 + c.round++ return report, nil } -func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindowData map[string]utils.TimeWindowData) error { +func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindowData map[string]types.TimeWindowData) error { if len(newTimeWindowData) != len(c.clusterDataCheckers) { return errors.Errorf("number of clusters mismatch, expected %d, got %d", len(c.clusterDataCheckers), len(newTimeWindowData)) } @@ -517,7 +518,7 @@ func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindow for _, incrementalData := range timeWindowData.Data { for _, contents := range incrementalData.DataContentSlices { for _, content := range contents { - records, err := incrementalData.Parser.DecodeFiles(ctx, content) + records, err := decoder.Decode(content) if err != nil { return errors.Trace(err) } diff --git a/cmd/multi-cluster-consistency-checker/checker/checker_test.go b/cmd/multi-cluster-consistency-checker/checker/checker_test.go index b6e02e5af6..312bee6c12 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker_test.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker_test.go @@ -15,11 +15,15 @@ package checker import ( "context" + "fmt" + "strings" "testing" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + 
"github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/decoder" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" "github.com/stretchr/testify/require" ) @@ -85,9 +89,9 @@ func TestClusterViolationChecker_Check(t *testing.T) { checker := newClusterViolationChecker("cluster1") report := recorder.NewClusterReport("cluster1") - record := &utils.Record{ + record := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 100, OriginTs: 0, }, @@ -103,16 +107,16 @@ func TestClusterViolationChecker_Check(t *testing.T) { checker := newClusterViolationChecker("cluster1") report := recorder.NewClusterReport("cluster1") - record1 := &utils.Record{ + record1 := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 100, OriginTs: 0, }, } - record2 := &utils.Record{ + record2 := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 50, OriginTs: 0, }, @@ -128,16 +132,16 @@ func TestClusterViolationChecker_Check(t *testing.T) { checker := newClusterViolationChecker("cluster1") report := recorder.NewClusterReport("cluster1") - record1 := &utils.Record{ + record1 := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 100, OriginTs: 0, }, } - record2 := &utils.Record{ + record2 := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 150, OriginTs: 50, // OriginTs is less than record1's CommitTs, causing violation }, @@ -162,9 +166,9 @@ func TestClusterViolationChecker_UpdateCache(t *testing.T) { checker := newClusterViolationChecker("cluster1") report := recorder.NewClusterReport("cluster1") - record := &utils.Record{ + 
record := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 100, OriginTs: 0, }, @@ -216,9 +220,9 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { t.Run("add upstream record", func(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) - record := &utils.Record{ + record := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 150, OriginTs: 0, }, @@ -232,9 +236,9 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { t.Run("add downstream record", func(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) - record := &utils.Record{ + record := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 150, OriginTs: 100, }, @@ -248,9 +252,9 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { t.Run("skip record before left boundary", func(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) - record := &utils.Record{ + record := &decoder.Record{ Pk: "pk1", - CdcVersion: utils.CdcVersion{ + CdcVersion: types.CdcVersion{ CommitTs: 50, OriginTs: 0, }, @@ -270,7 +274,7 @@ func TestClusterDataChecker_PrepareNextTimeWindowData(t *testing.T) { checker := newClusterDataChecker("cluster1") checker.rightBoundary = 100 - timeWindow := utils.TimeWindow{ + timeWindow := types.TimeWindow{ LeftBoundary: 100, RightBoundary: 200, CheckpointTs: map[string]uint64{"cluster2": 150}, @@ -286,7 +290,7 @@ func TestClusterDataChecker_PrepareNextTimeWindowData(t *testing.T) { checker := newClusterDataChecker("cluster1") checker.rightBoundary = 100 - timeWindow := utils.TimeWindow{ + timeWindow := types.TimeWindow{ LeftBoundary: 150, RightBoundary: 200, CheckpointTs: map[string]uint64{"cluster2": 150}, @@ -298,51 +302,282 @@ func TestClusterDataChecker_PrepareNextTimeWindowData(t *testing.T) { }) } -func 
TestDataChecker_FindClusterDownstreamData(t *testing.T) { - t.Parallel() +// makeCanalJSON builds a canal-JSON formatted record for testing. +// pkID is the primary key value, commitTs is the TiDB commit timestamp, +// originTs is the origin timestamp (0 for upstream records, non-zero for downstream), +// val is a varchar column value. +func makeCanalJSON(pkID int, commitTs uint64, originTs uint64, val string) string { + originTsVal := "null" + if originTs > 0 { + originTsVal = fmt.Sprintf(`"%d"`, originTs) + } + return fmt.Sprintf( + `{"id":0,"database":"test","table":"t1","pkNames":["id"],"isDdl":false,"type":"INSERT",`+ + `"es":0,"ts":0,"sql":"","sqlType":{"id":4,"val":12,"_tidb_origin_ts":-5},`+ + `"mysqlType":{"id":"int","val":"varchar","_tidb_origin_ts":"bigint"},`+ + `"old":null,"data":[{"id":"%d","val":"%s","_tidb_origin_ts":%s}],`+ + `"_tidb":{"commitTs":%d}}`, + pkID, val, originTsVal, commitTs) +} - t.Run("find downstream data", func(t *testing.T) { - t.Parallel() - clusterConfig := map[string]config.ClusterConfig{ - "cluster1": { - PDAddr: "127.0.0.1:2379", - S3SinkURI: "s3://bucket/cluster1/", - S3ChangefeedID: "s3-cf-1", +// makeContent combines canal-JSON records with CRLF terminator. +func makeContent(records ...string) []byte { + return []byte(strings.Join(records, "\r\n")) +} + +// makeTWData builds a TimeWindowData for testing. 
+func makeTWData(left, right uint64, checkpointTs map[string]uint64, content []byte) types.TimeWindowData { + data := map[cloudstorage.DmlPathKey]types.IncrementalData{} + if content != nil { + data[cloudstorage.DmlPathKey{}] = types.IncrementalData{ + DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ + {}: {content}, }, - "cluster2": { - PDAddr: "127.0.0.1:2479", - S3SinkURI: "s3://bucket/cluster2/", - S3ChangefeedID: "s3-cf-2", + } + } + return types.TimeWindowData{ + TimeWindow: types.TimeWindow{ + LeftBoundary: left, + RightBoundary: right, + CheckpointTs: checkpointTs, + }, + Data: data, + } +} + +// TestDataChecker_FourRoundsCheck simulates 4 rounds with increasing data and verifies check results. +// Setup: 2 clusters (c1 upstream, c2 downstream from c1). +// Rounds 0-2: accumulate data, check not yet active (checkableRound < 3). +// Round 3: first real check runs, detecting violations. +func TestDataChecker_FourRoundsCheck(t *testing.T) { + t.Parallel() + ctx := context.Background() + + clusterCfg := map[string]config.ClusterConfig{"c1": {}, "c2": {}} + + // makeBaseRounds creates shared rounds 0 and 1 data for all subtests. + // c1 produces upstream data, c2 receives matching downstream from c1. 
+ makeBaseRounds := func() [2]map[string]types.TimeWindowData { + return [2]map[string]types.TimeWindowData{ + // Round 0: [0, 100] + { + "c1": makeTWData(0, 100, map[string]uint64{"c2": 80}, + makeContent(makeCanalJSON(1, 50, 0, "a"))), + "c2": makeTWData(0, 100, nil, + makeContent(makeCanalJSON(1, 60, 50, "a"))), + }, + // Round 1: [100, 200] + { + "c1": makeTWData(100, 200, map[string]uint64{"c2": 180}, + makeContent(makeCanalJSON(2, 150, 0, "b"))), + "c2": makeTWData(100, 200, nil, + makeContent(makeCanalJSON(2, 160, 150, "b"))), }, } + } - checker := NewDataChecker(context.Background(), clusterConfig, nil, nil) - record, skipped := checker.FindClusterDownstreamData("cluster2", "pk1", 100) - require.Nil(t, record) - require.False(t, skipped) + t.Run("all consistent", func(t *testing.T) { + t.Parallel() + checker := NewDataChecker(ctx, clusterCfg, nil, nil) + base := makeBaseRounds() + + round2 := map[string]types.TimeWindowData{ + "c1": makeTWData(200, 300, map[string]uint64{"c2": 240}, + makeContent(makeCanalJSON(3, 250, 0, "c"))), + "c2": makeTWData(200, 300, nil, + makeContent(makeCanalJSON(3, 260, 250, "c"))), + } + round3 := map[string]types.TimeWindowData{ + "c1": makeTWData(300, 400, map[string]uint64{"c2": 380}, + makeContent(makeCanalJSON(4, 350, 0, "d"))), + "c2": makeTWData(300, 400, nil, + makeContent(makeCanalJSON(4, 360, 350, "d"))), + } + + rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} + for i, roundData := range rounds { + report, err := checker.CheckInNextTimeWindow(ctx, roundData) + require.NoError(t, err, "round %d", i) + require.Equal(t, uint64(i), report.Round) + if i < 3 { + require.Empty(t, report.ClusterReports, "round %d should have no cluster reports", i) + require.False(t, report.NeedFlush(), "round %d should not need flush", i) + } else { + require.Len(t, report.ClusterReports, 2) + require.False(t, report.NeedFlush(), "round 3 should not need flush (all consistent)") + for clusterID, cr := range 
report.ClusterReports { + require.Empty(t, cr.DataLossItems, "cluster %s should have no data loss", clusterID) + require.Empty(t, cr.DataRedundantItems, "cluster %s should have no data redundant", clusterID) + require.Empty(t, cr.LWWViolationItems, "cluster %s should have no LWW violation", clusterID) + } + } + } }) -} -func TestDataChecker_FindClusterUpstreamData(t *testing.T) { - t.Parallel() + t.Run("data loss detected", func(t *testing.T) { + t.Parallel() + checker := NewDataChecker(ctx, clusterCfg, nil, nil) + base := makeBaseRounds() + + // Round 2: c1 has upstream pk=3 but c2 has NO matching downstream + round2 := map[string]types.TimeWindowData{ + "c1": makeTWData(200, 300, map[string]uint64{"c2": 240}, + makeContent(makeCanalJSON(3, 250, 0, "c"))), + "c2": makeTWData(200, 300, nil, nil), + } + round3 := map[string]types.TimeWindowData{ + "c1": makeTWData(300, 400, map[string]uint64{"c2": 380}, + makeContent(makeCanalJSON(4, 350, 0, "d"))), + "c2": makeTWData(300, 400, nil, + makeContent(makeCanalJSON(4, 360, 350, "d"))), + } + + rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} + var lastReport *recorder.Report + for i, roundData := range rounds { + report, err := checker.CheckInNextTimeWindow(ctx, roundData) + require.NoError(t, err, "round %d", i) + lastReport = report + } - t.Run("find upstream data", func(t *testing.T) { + require.True(t, lastReport.NeedFlush()) + // c1 should detect data loss: pk=3 (commitTs=250) missing in c2's downstream + c1Report := lastReport.ClusterReports["c1"] + require.NotNil(t, c1Report) + require.Len(t, c1Report.DataLossItems, 1) + require.Equal(t, "c2", c1Report.DataLossItems[0].DownstreamClusterID) + require.Equal(t, uint64(0), c1Report.DataLossItems[0].OriginTS) + require.Equal(t, uint64(250), c1Report.DataLossItems[0].CommitTS) + require.False(t, c1Report.DataLossItems[0].Inconsistent) + // c2 should have no issues + c2Report := lastReport.ClusterReports["c2"] + require.Empty(t, 
c2Report.DataLossItems) + require.Empty(t, c2Report.DataRedundantItems) + }) + + t.Run("data inconsistent detected", func(t *testing.T) { t.Parallel() - clusterConfig := map[string]config.ClusterConfig{ - "cluster1": { - PDAddr: "127.0.0.1:2379", - S3SinkURI: "s3://bucket/cluster1/", - S3ChangefeedID: "s3-cf-1", - }, - "cluster2": { - PDAddr: "127.0.0.1:2479", - S3SinkURI: "s3://bucket/cluster2/", - S3ChangefeedID: "s3-cf-2", - }, + checker := NewDataChecker(ctx, clusterCfg, nil, nil) + base := makeBaseRounds() + + // Round 2: c2 has downstream for pk=3 but with wrong column value + round2 := map[string]types.TimeWindowData{ + "c1": makeTWData(200, 300, map[string]uint64{"c2": 240}, + makeContent(makeCanalJSON(3, 250, 0, "c"))), + "c2": makeTWData(200, 300, nil, + makeContent(makeCanalJSON(3, 260, 250, "WRONG"))), + } + round3 := map[string]types.TimeWindowData{ + "c1": makeTWData(300, 400, map[string]uint64{"c2": 380}, + makeContent(makeCanalJSON(4, 350, 0, "d"))), + "c2": makeTWData(300, 400, nil, + makeContent(makeCanalJSON(4, 360, 350, "d"))), } - checker := NewDataChecker(context.Background(), clusterConfig, nil, nil) - found := checker.FindClusterUpstreamData("cluster2", "pk1", 100) - require.False(t, found) + rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} + var lastReport *recorder.Report + for i, roundData := range rounds { + report, err := checker.CheckInNextTimeWindow(ctx, roundData) + require.NoError(t, err, "round %d", i) + lastReport = report + } + + require.True(t, lastReport.NeedFlush()) + c1Report := lastReport.ClusterReports["c1"] + require.Len(t, c1Report.DataLossItems, 1) + require.Equal(t, "c2", c1Report.DataLossItems[0].DownstreamClusterID) + require.Equal(t, uint64(250), c1Report.DataLossItems[0].CommitTS) + require.True(t, c1Report.DataLossItems[0].Inconsistent) // data inconsistent, not pure data loss + }) + + t.Run("data redundant detected", func(t *testing.T) { + t.Parallel() + checker := NewDataChecker(ctx, 
clusterCfg, nil, nil) + base := makeBaseRounds() + + round2 := map[string]types.TimeWindowData{ + "c1": makeTWData(200, 300, map[string]uint64{"c2": 240}, + makeContent(makeCanalJSON(3, 250, 0, "c"))), + "c2": makeTWData(200, 300, nil, + makeContent(makeCanalJSON(3, 260, 250, "c"))), + } + // Round 3: c2 has an extra downstream pk=99 (originTs=330) that doesn't match + // any upstream record in c1 + round3 := map[string]types.TimeWindowData{ + "c1": makeTWData(300, 400, map[string]uint64{"c2": 380}, + makeContent(makeCanalJSON(4, 350, 0, "d"))), + "c2": makeTWData(300, 400, nil, + makeContent( + makeCanalJSON(4, 360, 350, "d"), + makeCanalJSON(99, 340, 330, "x"), + )), + } + + rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} + var lastReport *recorder.Report + for i, roundData := range rounds { + report, err := checker.CheckInNextTimeWindow(ctx, roundData) + require.NoError(t, err, "round %d", i) + lastReport = report + } + + require.True(t, lastReport.NeedFlush()) + // c1 should have no data loss + c1Report := lastReport.ClusterReports["c1"] + require.Empty(t, c1Report.DataLossItems) + // c2 should detect data redundant: pk=99 has no matching upstream in c1 + c2Report := lastReport.ClusterReports["c2"] + require.Len(t, c2Report.DataRedundantItems, 1) + require.Equal(t, uint64(330), c2Report.DataRedundantItems[0].OriginTS) + require.Equal(t, uint64(340), c2Report.DataRedundantItems[0].CommitTS) + }) + + t.Run("lww violation detected", func(t *testing.T) { + t.Parallel() + checker := NewDataChecker(ctx, clusterCfg, nil, nil) + base := makeBaseRounds() + + round2 := map[string]types.TimeWindowData{ + "c1": makeTWData(200, 300, map[string]uint64{"c2": 240}, + makeContent(makeCanalJSON(3, 250, 0, "c"))), + "c2": makeTWData(200, 300, nil, + makeContent(makeCanalJSON(3, 260, 250, "c"))), + } + // Round 3: c1 has upstream pk=5 (commitTs=350, compareTs=350) and + // downstream pk=5 from c2 (commitTs=370, originTs=310, compareTs=310). 
+ // Since 350 >= 310 with commitTs 350 < 370, this is an LWW violation. + // c2 also has matching records to avoid data loss/redundant noise. + round3 := map[string]types.TimeWindowData{ + "c1": makeTWData(300, 400, map[string]uint64{"c2": 380}, + makeContent( + makeCanalJSON(5, 350, 0, "e"), + makeCanalJSON(5, 370, 310, "e"), + )), + "c2": makeTWData(300, 400, nil, + makeContent( + makeCanalJSON(5, 310, 0, "e"), + makeCanalJSON(5, 360, 350, "e"), + )), + } + + rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} + var lastReport *recorder.Report + for i, roundData := range rounds { + report, err := checker.CheckInNextTimeWindow(ctx, roundData) + require.NoError(t, err, "round %d", i) + lastReport = report + } + + require.True(t, lastReport.NeedFlush()) + c1Report := lastReport.ClusterReports["c1"] + require.Len(t, c1Report.LWWViolationItems, 1) + require.Equal(t, uint64(0), c1Report.LWWViolationItems[0].ExistingOriginTS) + require.Equal(t, uint64(350), c1Report.LWWViolationItems[0].ExistingCommitTS) + require.Equal(t, uint64(310), c1Report.LWWViolationItems[0].OriginTS) + require.Equal(t, uint64(370), c1Report.LWWViolationItems[0].CommitTS) + // c2 should have no LWW violation (its records are ordered correctly: + // upstream commitTs=310 compareTs=310, downstream commitTs=360 compareTs=350, 310 < 350) + c2Report := lastReport.ClusterReports["c2"] + require.Empty(t, c2Report.LWWViolationItems) }) } diff --git a/cmd/multi-cluster-consistency-checker/config/config.example.toml b/cmd/multi-cluster-consistency-checker/config/config.example.toml index 1684d7e43a..8073475ae2 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.example.toml +++ b/cmd/multi-cluster-consistency-checker/config/config.example.toml @@ -6,8 +6,8 @@ # Log level: debug, info, warn, error, fatal, panic log-level = "info" -# Report configuration -report-dir = "/tmp/multi-cluster-consistency-checker-reports" +# Data directory configuration, contains report and 
checkpoint data +data-dir = "/tmp/multi-cluster-consistency-checker-data" # Tables configuration [global.tables] diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer.go b/cmd/multi-cluster-consistency-checker/consumer/consumer.go index cbc300d9f3..e251a06813 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer.go @@ -14,8 +14,22 @@ package consumer import ( - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "context" + "fmt" + "path" + "strings" + "sync" + + perrors "github.com/pingcap/errors" + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + "github.com/pingcap/tidb/br/pkg/storage" + "go.uber.org/zap" + "golang.org/x/sync/errgroup" ) type ( @@ -45,10 +59,687 @@ func updateTableDMLIdxMap( type schemaParser struct { path string - parser *utils.TableParser + parser *TableParser } type schemaKey struct { schema string table string } + +type TableParser struct{} + +var ErrWalkDirEnd = perrors.Normalize("walk dir end", perrors.RFCCodeText("CDC:ErrWalkDirEnd")) + +type CurrentTableVersion struct { + mu sync.RWMutex + currentTableVersionMap map[schemaKey]types.VersionKey +} + +func NewCurrentTableVersion() *CurrentTableVersion { + return &CurrentTableVersion{ + currentTableVersionMap: make(map[schemaKey]types.VersionKey), + } +} + +// GetCurrentTableVersion returns the current table version for a given schema and table +func (cvt *CurrentTableVersion) GetCurrentTableVersion(schema, table string) types.VersionKey { + cvt.mu.RLock() + defer cvt.mu.RUnlock() + return cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] +} + +// UpdateCurrentTableVersion updates the current table version 
for a given schema and table +func (cvt *CurrentTableVersion) UpdateCurrentTableVersion(schema, table string, version types.VersionKey) { + cvt.mu.Lock() + defer cvt.mu.Unlock() + cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] = version +} + +type SchemaParsers struct { + mu sync.RWMutex + schemaParserMap map[cloudstorage.SchemaPathKey]schemaParser +} + +func NewSchemaParser() *SchemaParsers { + return &SchemaParsers{ + schemaParserMap: make(map[cloudstorage.SchemaPathKey]schemaParser), + } +} + +// GetSchemaParser returns the schema parser for a given schema and table version +func (sp *SchemaParsers) GetSchemaParser(schema, table string, version uint64) (*TableParser, error) { + schemaPathKey := cloudstorage.SchemaPathKey{ + Schema: schema, + Table: table, + TableVersion: version, + } + sp.mu.RLock() + schemaParser, ok := sp.schemaParserMap[schemaPathKey] + sp.mu.RUnlock() + if !ok { + return nil, errors.Errorf("schema parser not found for schema: %s, table: %s, version: %d", schema, table, version) + } + return schemaParser.parser, nil +} + +// SetSchemaParser sets the schema parser for a given schema and table version +func (sp *SchemaParsers) SetSchemaParser(schemaPathKey cloudstorage.SchemaPathKey, filePath string, parser *TableParser) { + sp.mu.Lock() + sp.schemaParserMap[schemaPathKey] = schemaParser{ + path: filePath, + parser: parser, + } + sp.mu.Unlock() +} + +// RemoveSchemaParserWithCondition removes the schema parser for a given condition +func (sp *SchemaParsers) RemoveSchemaParserWithCondition(condition func(schemaPathKey cloudstorage.SchemaPathKey) bool) { + sp.mu.Lock() + for schemaPathkey := range sp.schemaParserMap { + if condition(schemaPathkey) { + delete(sp.schemaParserMap, schemaPathkey) + } + } + sp.mu.Unlock() +} + +type TableDMLIdx struct { + mu sync.Mutex + tableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap +} + +func NewTableDMLIdx() *TableDMLIdx { + return &TableDMLIdx{ + tableDMLIdxMap: 
make(map[cloudstorage.DmlPathKey]fileIndexKeyMap), + } +} + +func (t *TableDMLIdx) UpdateDMLIdxMapByStartPath(dmlkey cloudstorage.DmlPathKey, fileIdx *cloudstorage.FileIndex) { + t.mu.Lock() + defer t.mu.Unlock() + if originalFileIndexKeyMap, ok := t.tableDMLIdxMap[dmlkey]; !ok { + t.tableDMLIdxMap[dmlkey] = fileIndexKeyMap{ + fileIdx.FileIndexKey: fileIdx.Idx, + } + } else { + if fileIdx.Idx > originalFileIndexKeyMap[fileIdx.FileIndexKey] { + originalFileIndexKeyMap[fileIdx.FileIndexKey] = fileIdx.Idx + } + } +} + +func (t *TableDMLIdx) DiffNewTableDMLIdxMap( + newTableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap, +) map[cloudstorage.DmlPathKey]fileIndexRange { + resMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) + t.mu.Lock() + defer t.mu.Unlock() + for newDMLPathKey, newFileIndexKeyMap := range newTableDMLIdxMap { + origFileIndexKeyMap, ok := t.tableDMLIdxMap[newDMLPathKey] + if !ok { + t.tableDMLIdxMap[newDMLPathKey] = newFileIndexKeyMap + resMap[newDMLPathKey] = make(fileIndexRange) + for indexKey, newEndVal := range newFileIndexKeyMap { + resMap[newDMLPathKey][indexKey] = indexRange{ + start: 1, + end: newEndVal, + } + } + continue + } + for indexKey, newEndVal := range newFileIndexKeyMap { + origEndVal := origFileIndexKeyMap[indexKey] + if newEndVal > origEndVal { + origFileIndexKeyMap[indexKey] = newEndVal + if _, ok := resMap[newDMLPathKey]; !ok { + resMap[newDMLPathKey] = make(fileIndexRange) + } + resMap[newDMLPathKey][indexKey] = indexRange{ + start: origEndVal + 1, + end: newEndVal, + } + } + } + } + return resMap +} + +type S3Consumer struct { + s3Storage storage.ExternalStorage + fileExtension string + dateSeparator string + fileIndexWidth int + tables map[string][]string + + // skip the first round data download + skipDownloadData bool + + currentTableVersion *CurrentTableVersion + tableDMLIdx *TableDMLIdx + schemaParser *SchemaParsers +} + +func NewS3Consumer( + s3Storage storage.ExternalStorage, + tables map[string][]string, +) 
*S3Consumer { + return &S3Consumer{ + s3Storage: s3Storage, + fileExtension: ".json", + dateSeparator: config.DateSeparatorDay.String(), + fileIndexWidth: config.DefaultFileIndexWidth, + tables: tables, + + skipDownloadData: true, + + currentTableVersion: NewCurrentTableVersion(), + tableDMLIdx: NewTableDMLIdx(), + schemaParser: NewSchemaParser(), + } +} + +func (c *S3Consumer) InitializeFromCheckpoint( + ctx context.Context, clusterID string, checkpoint *recorder.Checkpoint, +) (map[cloudstorage.DmlPathKey]types.IncrementalData, error) { + if checkpoint == nil { + return nil, nil + } + if checkpoint.CheckpointItems[2] == nil { + return nil, nil + } + c.skipDownloadData = false + scanRanges, err := checkpoint.ToScanRange(clusterID) + if err != nil { + return nil, errors.Trace(err) + } + var mu sync.Mutex + // Combine DML data and schema data into result + result := make(map[cloudstorage.DmlPathKey]types.IncrementalData) + eg, egCtx := errgroup.WithContext(ctx) + for schemaTableKey, scanRange := range scanRanges { + eg.Go(func() error { + scanVersions, err := c.downloadSchemaFilesWithScanRange( + egCtx, schemaTableKey.Schema, schemaTableKey.Table, scanRange.StartVersionKey, scanRange.EndVersionKey, scanRange.EndDataPath) + if err != nil { + return errors.Trace(err) + } + err = c.downloadDataFilesWithScanRange( + egCtx, schemaTableKey.Schema, schemaTableKey.Table, scanVersions, scanRange, + func( + dmlPathKey cloudstorage.DmlPathKey, + dmlSlices map[cloudstorage.FileIndexKey][][]byte, + parser *TableParser, + ) { + mu.Lock() + result[dmlPathKey] = types.IncrementalData{ + DataContentSlices: dmlSlices, + // Parser: parser, + } + mu.Unlock() + }, + ) + if err != nil { + return errors.Trace(err) + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + return result, nil +} + +func (c *S3Consumer) downloadSchemaFilesWithScanRange( + ctx context.Context, + schema, table string, + startVersionKey string, + endVersionKey string, + 
endDataPath string, +) ([]types.VersionKey, error) { + metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) + opt := &storage.WalkOption{ + SubDir: metaSubDir, + ObjPrefix: "schema_", + // TODO: StartAfter: startVersionKey, + } + + var startSchemaKey, endSchemaKey cloudstorage.SchemaPathKey + _, err := startSchemaKey.ParseSchemaFilePath(startVersionKey) + if err != nil { + return nil, errors.Trace(err) + } + _, err = endSchemaKey.ParseSchemaFilePath(endVersionKey) + if err != nil { + return nil, errors.Trace(err) + } + + var scanVersions []types.VersionKey + newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) + scanVersions = append(scanVersions, types.VersionKey{ + Version: startSchemaKey.TableVersion, + VersionPath: startVersionKey, + }) + newVersionPaths[startSchemaKey] = startVersionKey + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { + if endVersionKey < filePath { + return ErrWalkDirEnd + } + if !cloudstorage.IsSchemaFile(filePath) { + return nil + } + var schemaKey cloudstorage.SchemaPathKey + _, err := schemaKey.ParseSchemaFilePath(filePath) + if err != nil { + log.Error("failed to parse schema file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } + if schemaKey.TableVersion > startSchemaKey.TableVersion { + if _, exists := newVersionPaths[schemaKey]; !exists { + scanVersions = append(scanVersions, types.VersionKey{ + Version: schemaKey.TableVersion, + VersionPath: filePath, + }) + } + newVersionPaths[schemaKey] = filePath + } + return nil + }); err != nil && !errors.Is(err, ErrWalkDirEnd) { + return nil, errors.Trace(err) + } + + if err := c.downloadSchemaFiles(ctx, newVersionPaths); err != nil { + return nil, errors.Trace(err) + } + + c.currentTableVersion.UpdateCurrentTableVersion(schema, table, types.VersionKey{ + Version: endSchemaKey.TableVersion, + VersionPath: endVersionKey, + DataPath: endDataPath, + }) + + return scanVersions, nil +} + +func (c *S3Consumer) 
downloadDataFilesWithScanRange( + ctx context.Context, + schema, table string, + scanVersions []types.VersionKey, + scanRange *recorder.ScanRange, + consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *TableParser), +) error { + eg, egCtx := errgroup.WithContext(ctx) + for _, version := range scanVersions { + eg.Go(func() error { + newFiles, err := c.getNewFilesForSchemaPathKeyWithEndPath(egCtx, schema, table, version.Version, scanRange.StartDataPath, scanRange.EndDataPath) + if err != nil { + return errors.Trace(err) + } + dmlData, err := c.downloadDMLFiles(egCtx, newFiles) + if err != nil { + return errors.Trace(err) + } + parser, err := c.schemaParser.GetSchemaParser(schema, table, version.Version) + if err != nil { + return errors.Trace(err) + } + for dmlPathKey, dmlSlices := range dmlData { + consumeFunc(dmlPathKey, dmlSlices, parser) + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return errors.Trace(err) + } + return nil +} + +func (c *S3Consumer) getNewFilesForSchemaPathKeyWithEndPath( + ctx context.Context, + schema, table string, + version uint64, + startDataPath string, + endDataPath string, +) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { + schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version)) + opt := &storage.WalkOption{ + SubDir: schemaPrefix, + // TODO: StartAfter: startDataPath, + } + newTableDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { + if endDataPath < filePath { + return ErrWalkDirEnd + } + // Try to parse DML file path if it matches the expected extension + if strings.HasSuffix(filePath, c.fileExtension) { + var dmlkey cloudstorage.DmlPathKey + fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, filePath) + if err != nil { + log.Error("failed to parse dml file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } 
+ if filePath == startDataPath { + c.tableDMLIdx.UpdateDMLIdxMapByStartPath(dmlkey, fileIdx) + } else { + updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) + } + } + return nil + }); err != nil && !errors.Is(err, ErrWalkDirEnd) { + return nil, errors.Trace(err) + } + return c.tableDMLIdx.DiffNewTableDMLIdxMap(newTableDMLIdxMap), nil +} + +// downloadSchemaFiles downloads schema files concurrently for given schema path keys +func (c *S3Consumer) downloadSchemaFiles( + ctx context.Context, + newVersionPaths map[cloudstorage.SchemaPathKey]string, +) error { + eg, _ := errgroup.WithContext(ctx) + + log.Debug("starting concurrent schema file download", zap.Int("totalSchemas", len(newVersionPaths))) + for schemaPathKey, filePath := range newVersionPaths { + eg.Go(func() error { + // content, err := c.s3Storage.ReadFile(egCtx, filePath) + // if err != nil { + // return errors.Annotatef(err, "failed to read schema file: %s", filePath) + // } + // + // Use canal-json decoder for S3 sink with .json file extension + // parser, err := types.NewTableParserWithFormat(schemaPathKey.GetKey(), content, config.ProtocolCanalJSON) + // if err != nil { + // return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) + // } + + c.schemaParser.SetSchemaParser(schemaPathKey, filePath, nil) + return nil + }) + } + if err := eg.Wait(); err != nil { + return errors.Trace(err) + } + return nil +} + +func (c *S3Consumer) discoverAndDownloadNewTableVersions( + ctx context.Context, + schema, table string, +) ([]types.VersionKey, error) { + currentVersion := c.currentTableVersion.GetCurrentTableVersion(schema, table) + metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) + opt := &storage.WalkOption{ + SubDir: metaSubDir, + ObjPrefix: "schema_", + // TODO: StartAfter: currentVersion.versionPath, + } + + var scanVersions []types.VersionKey + newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath 
string, size int64) error { + if !cloudstorage.IsSchemaFile(filePath) { + return nil + } + var schemaKey cloudstorage.SchemaPathKey + _, err := schemaKey.ParseSchemaFilePath(filePath) + if err != nil { + log.Error("failed to parse schema file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } + version := schemaKey.TableVersion + if version > currentVersion.Version { + if _, exists := newVersionPaths[schemaKey]; !exists { + scanVersions = append(scanVersions, types.VersionKey{ + Version: version, + VersionPath: filePath, + }) + } + newVersionPaths[schemaKey] = filePath + } + return nil + }); err != nil { + return nil, errors.Trace(err) + } + + // download new version schema files concurrently + if err := c.downloadSchemaFiles(ctx, newVersionPaths); err != nil { + return nil, errors.Trace(err) + } + + if currentVersion.Version > 0 { + scanVersions = append(scanVersions, currentVersion) + } + return scanVersions, nil +} + +func (c *S3Consumer) getNewFilesForSchemaPathKey( + ctx context.Context, + schema, table string, + version *types.VersionKey, +) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { + schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version.Version)) + opt := &storage.WalkOption{ + SubDir: schemaPrefix, + // TODO: StartAfter: version.dataPath, + } + + newTableDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) + maxFilePath := "" + if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { + // Try to parse DML file path if it matches the expected extension + if strings.HasSuffix(filePath, c.fileExtension) { + var dmlkey cloudstorage.DmlPathKey + fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, filePath) + if err != nil { + log.Error("failed to parse dml file path, skipping", + zap.String("path", filePath), + zap.Error(err)) + return nil + } + updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) + maxFilePath = filePath + } + return nil + }); err != nil { + 
return nil, errors.Trace(err) + } + + version.DataPath = maxFilePath + return c.tableDMLIdx.DiffNewTableDMLIdxMap(newTableDMLIdxMap), nil +} + +func (c *S3Consumer) downloadDMLFiles( + ctx context.Context, + newFiles map[cloudstorage.DmlPathKey]fileIndexRange, +) (map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte, error) { + if len(newFiles) == 0 || c.skipDownloadData { + return nil, nil + } + + result := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte) + type downloadTask struct { + dmlPathKey cloudstorage.DmlPathKey + fileIndex cloudstorage.FileIndex + } + + var tasks []downloadTask + for dmlPathKey, fileRange := range newFiles { + for indexKey, indexRange := range fileRange { + log.Debug("prepare to download new dml file in index range", + zap.String("schema", dmlPathKey.Schema), + zap.String("table", dmlPathKey.Table), + zap.Uint64("version", dmlPathKey.TableVersion), + zap.Int64("partitionNum", dmlPathKey.PartitionNum), + zap.String("date", dmlPathKey.Date), + zap.String("dispatcherID", indexKey.DispatcherID), + zap.Bool("enableTableAcrossNodes", indexKey.EnableTableAcrossNodes), + zap.Uint64("startIndex", indexRange.start), + zap.Uint64("endIndex", indexRange.end)) + for i := indexRange.start; i <= indexRange.end; i++ { + tasks = append(tasks, downloadTask{ + dmlPathKey: dmlPathKey, + fileIndex: cloudstorage.FileIndex{ + FileIndexKey: indexKey, + Idx: i, + }, + }) + } + } + } + + log.Debug("starting concurrent DML file download", zap.Int("totalFiles", len(tasks))) + + // Concurrently download files + type fileContent struct { + dmlPathKey cloudstorage.DmlPathKey + indexKey cloudstorage.FileIndexKey + idx uint64 + content []byte + } + + fileContents := make(chan fileContent, len(tasks)) + eg, egCtx := errgroup.WithContext(ctx) + for _, task := range tasks { + eg.Go(func() error { + filePath := task.dmlPathKey.GenerateDMLFilePath( + &task.fileIndex, + c.fileExtension, + c.fileIndexWidth, + ) + + content, err := 
c.s3Storage.ReadFile(egCtx, filePath) + if err != nil { + return errors.Annotatef(err, "failed to read file: %s", filePath) + } + + // Channel writes are thread-safe, no mutex needed + fileContents <- fileContent{ + dmlPathKey: task.dmlPathKey, + indexKey: task.fileIndex.FileIndexKey, + idx: task.fileIndex.Idx, + content: content, + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + + // Close the channel to signal no more writes + close(fileContents) + + // Process the downloaded file contents + for fc := range fileContents { + if result[fc.dmlPathKey] == nil { + result[fc.dmlPathKey] = make(map[cloudstorage.FileIndexKey][][]byte) + } + result[fc.dmlPathKey][fc.indexKey] = append( + result[fc.dmlPathKey][fc.indexKey], + fc.content, + ) + } + + return result, nil +} + +func (c *S3Consumer) downloadNewFilesWithVersions( + ctx context.Context, + schema, table string, + scanVersions []types.VersionKey, + consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *TableParser), +) (*types.VersionKey, error) { + var maxVersion *types.VersionKey + eg, egCtx := errgroup.WithContext(ctx) + for _, version := range scanVersions { + versionp := &version + if maxVersion == nil || maxVersion.Version < version.Version { + maxVersion = versionp + } + eg.Go(func() error { + newFiles, err := c.getNewFilesForSchemaPathKey(egCtx, schema, table, versionp) + if err != nil { + return errors.Trace(err) + } + dmlData, err := c.downloadDMLFiles(egCtx, newFiles) + if err != nil { + return errors.Trace(err) + } + parser, err := c.schemaParser.GetSchemaParser(schema, table, versionp.Version) + if err != nil { + return errors.Trace(err) + } + for dmlPathKey, dmlSlices := range dmlData { + consumeFunc(dmlPathKey, dmlSlices, parser) + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return nil, errors.Trace(err) + } + if maxVersion != nil { + 
c.currentTableVersion.UpdateCurrentTableVersion(schema, table, *maxVersion) + } + return maxVersion, nil +} + +func (c *S3Consumer) ConsumeNewFiles( + ctx context.Context, +) (map[cloudstorage.DmlPathKey]types.IncrementalData, map[types.SchemaTableKey]types.VersionKey, error) { + var mu sync.Mutex + // Combine DML data and schema data into result + result := make(map[cloudstorage.DmlPathKey]types.IncrementalData) + var versionMu sync.Mutex + maxVersionMap := make(map[types.SchemaTableKey]types.VersionKey) + eg, egCtx := errgroup.WithContext(ctx) + for schema, tables := range c.tables { + for _, table := range tables { + eg.Go(func() error { + scanVersions, err := c.discoverAndDownloadNewTableVersions(egCtx, schema, table) + if err != nil { + return errors.Trace(err) + } + maxVersion, err := c.downloadNewFilesWithVersions( + egCtx, schema, table, scanVersions, + func( + dmlPathKey cloudstorage.DmlPathKey, + dmlSlices map[cloudstorage.FileIndexKey][][]byte, + parser *TableParser, + ) { + mu.Lock() + result[dmlPathKey] = types.IncrementalData{ + DataContentSlices: dmlSlices, + // Parser: parser, + } + mu.Unlock() + }, + ) + if err != nil { + return errors.Trace(err) + } + if maxVersion != nil { + versionMu.Lock() + maxVersionMap[types.SchemaTableKey{Schema: schema, Table: table}] = *maxVersion + versionMu.Unlock() + } + return nil + }) + } + } + + if err := eg.Wait(); err != nil { + return nil, nil, errors.Trace(err) + } + c.skipDownloadData = false + return result, maxVersionMap, nil +} diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go b/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go new file mode 100644 index 0000000000..5761d8845d --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go @@ -0,0 +1,636 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package consumer + +import ( + "bytes" + "context" + "fmt" + "path" + "slices" + "strings" + "testing" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + "github.com/pingcap/tidb/br/pkg/storage" + "github.com/stretchr/testify/require" +) + +// helper to build a DML file path for tests (day separator, no partition, no dispatcherID). +// Format: {schema}/{table}/{version}/{date}/CDC{idx:020d}.json +func buildDMLFilePath(schema, table string, version uint64, date string, idx uint64) string { + return fmt.Sprintf("%s/%s/%d/%s/CDC%020d.json", schema, table, version, date, idx) +} + +// helper to build a schema file path for tests. 
+// Format: {schema}/{table}/meta/schema_{version}_{checksum}.json +func buildSchemaFilePath(schema, table string, version uint64, checksum uint32) string { + return fmt.Sprintf("%s/%s/meta/schema_%d_%010d.json", schema, table, version, checksum) +} + +func TestUpdateTableDMLIdxMap(t *testing.T) { + t.Parallel() + + t.Run("insert new entry", func(t *testing.T) { + t.Parallel() + m := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) + dmlKey := cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl", TableVersion: 1}, + Date: "2026-01-01", + } + fileIdx := &cloudstorage.FileIndex{ + FileIndexKey: cloudstorage.FileIndexKey{DispatcherID: "", EnableTableAcrossNodes: false}, + Idx: 5, + } + + updateTableDMLIdxMap(m, dmlKey, fileIdx) + require.Len(t, m, 1) + require.Equal(t, uint64(5), m[dmlKey][fileIdx.FileIndexKey]) + }) + + t.Run("update with higher index", func(t *testing.T) { + t.Parallel() + m := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) + dmlKey := cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl", TableVersion: 1}, + Date: "2026-01-01", + } + indexKey := cloudstorage.FileIndexKey{DispatcherID: "", EnableTableAcrossNodes: false} + fileIdx1 := &cloudstorage.FileIndex{FileIndexKey: indexKey, Idx: 3} + fileIdx2 := &cloudstorage.FileIndex{FileIndexKey: indexKey, Idx: 7} + + updateTableDMLIdxMap(m, dmlKey, fileIdx1) + updateTableDMLIdxMap(m, dmlKey, fileIdx2) + require.Equal(t, uint64(7), m[dmlKey][indexKey]) + }) + + t.Run("skip lower index", func(t *testing.T) { + t.Parallel() + m := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) + dmlKey := cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl", TableVersion: 1}, + Date: "2026-01-01", + } + indexKey := cloudstorage.FileIndexKey{DispatcherID: "", EnableTableAcrossNodes: false} + fileIdx1 := &cloudstorage.FileIndex{FileIndexKey: indexKey, Idx: 10} + fileIdx2 := 
&cloudstorage.FileIndex{FileIndexKey: indexKey, Idx: 5} + + updateTableDMLIdxMap(m, dmlKey, fileIdx1) + updateTableDMLIdxMap(m, dmlKey, fileIdx2) + require.Equal(t, uint64(10), m[dmlKey][indexKey]) + }) +} + +func TestCurrentTableVersion(t *testing.T) { + t.Parallel() + + t.Run("get returns zero value for missing key", func(t *testing.T) { + t.Parallel() + cvt := NewCurrentTableVersion() + v := cvt.GetCurrentTableVersion("db", "tbl") + require.Equal(t, types.VersionKey{}, v) + }) + + t.Run("update and get", func(t *testing.T) { + t.Parallel() + cvt := NewCurrentTableVersion() + vk := types.VersionKey{Version: 100, VersionPath: "db/tbl/meta/schema_100_0000000000.json"} + cvt.UpdateCurrentTableVersion("db", "tbl", vk) + got := cvt.GetCurrentTableVersion("db", "tbl") + require.Equal(t, vk, got) + }) + + t.Run("update overwrites previous value", func(t *testing.T) { + t.Parallel() + cvt := NewCurrentTableVersion() + vk1 := types.VersionKey{Version: 1} + vk2 := types.VersionKey{Version: 2} + cvt.UpdateCurrentTableVersion("db", "tbl", vk1) + cvt.UpdateCurrentTableVersion("db", "tbl", vk2) + got := cvt.GetCurrentTableVersion("db", "tbl") + require.Equal(t, vk2, got) + }) + + t.Run("different tables are independent", func(t *testing.T) { + t.Parallel() + cvt := NewCurrentTableVersion() + vk1 := types.VersionKey{Version: 10} + vk2 := types.VersionKey{Version: 20} + cvt.UpdateCurrentTableVersion("db", "tbl1", vk1) + cvt.UpdateCurrentTableVersion("db", "tbl2", vk2) + require.Equal(t, vk1, cvt.GetCurrentTableVersion("db", "tbl1")) + require.Equal(t, vk2, cvt.GetCurrentTableVersion("db", "tbl2")) + }) +} + +func TestSchemaParser(t *testing.T) { + t.Parallel() + + t.Run("get returns error for missing key", func(t *testing.T) { + t.Parallel() + sp := NewSchemaParser() + _, err := sp.GetSchemaParser("db", "tbl", 1) + require.Error(t, err) + require.Contains(t, err.Error(), "schema parser not found") + }) + + t.Run("set and get", func(t *testing.T) { + t.Parallel() + sp := 
NewSchemaParser() + key := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl", TableVersion: 1} + parser := &TableParser{} + sp.SetSchemaParser(key, "/path/to/schema.json", parser) + + got, err := sp.GetSchemaParser("db", "tbl", 1) + require.NoError(t, err) + require.Equal(t, parser, got) + }) + + t.Run("remove with condition", func(t *testing.T) { + t.Parallel() + sp := NewSchemaParser() + key1 := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl1", TableVersion: 1} + key2 := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl2", TableVersion: 2} + sp.SetSchemaParser(key1, "/path1", nil) + sp.SetSchemaParser(key2, "/path2", nil) + + // Remove only entries for tbl1 + sp.RemoveSchemaParserWithCondition(func(k cloudstorage.SchemaPathKey) bool { + return k.Table == "tbl1" + }) + + _, err := sp.GetSchemaParser("db", "tbl1", 1) + require.Error(t, err) + + _, err = sp.GetSchemaParser("db", "tbl2", 2) + require.NoError(t, err) + }) + + t.Run("remove with condition matching all", func(t *testing.T) { + t.Parallel() + sp := NewSchemaParser() + key1 := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl1", TableVersion: 1} + key2 := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl2", TableVersion: 2} + sp.SetSchemaParser(key1, "/path1", nil) + sp.SetSchemaParser(key2, "/path2", nil) + + sp.RemoveSchemaParserWithCondition(func(k cloudstorage.SchemaPathKey) bool { + return true + }) + + _, err := sp.GetSchemaParser("db", "tbl1", 1) + require.Error(t, err) + _, err = sp.GetSchemaParser("db", "tbl2", 2) + require.Error(t, err) + }) +} + +func TestTableDMLIdx_DiffNewTableDMLIdxMap(t *testing.T) { + t.Parallel() + + indexKey := cloudstorage.FileIndexKey{DispatcherID: "", EnableTableAcrossNodes: false} + dmlKey := cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl", TableVersion: 1}, + Date: "2026-01-01", + } + + t.Run("new entry starts from 1", func(t *testing.T) { + t.Parallel() + idx := NewTableDMLIdx() + newMap := 
map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 5}, + } + + result := idx.DiffNewTableDMLIdxMap(newMap) + require.Len(t, result, 1) + require.Equal(t, indexRange{start: 1, end: 5}, result[dmlKey][indexKey]) + }) + + t.Run("existing entry increments from previous end + 1", func(t *testing.T) { + t.Parallel() + idx := NewTableDMLIdx() + + // First call: set initial state + firstMap := map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 3}, + } + idx.DiffNewTableDMLIdxMap(firstMap) + + // Second call: new end is 7, should get range [4, 7] + secondMap := map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 7}, + } + result := idx.DiffNewTableDMLIdxMap(secondMap) + require.Len(t, result, 1) + require.Equal(t, indexRange{start: 4, end: 7}, result[dmlKey][indexKey]) + }) + + t.Run("same end value returns no diff", func(t *testing.T) { + t.Parallel() + idx := NewTableDMLIdx() + + firstMap := map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 5}, + } + idx.DiffNewTableDMLIdxMap(firstMap) + + secondMap := map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 5}, + } + result := idx.DiffNewTableDMLIdxMap(secondMap) + require.Empty(t, result) + }) + + t.Run("lower end value returns no diff", func(t *testing.T) { + t.Parallel() + idx := NewTableDMLIdx() + + firstMap := map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 10}, + } + idx.DiffNewTableDMLIdxMap(firstMap) + + secondMap := map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 5}, + } + result := idx.DiffNewTableDMLIdxMap(secondMap) + require.Empty(t, result) + }) + + t.Run("empty new map returns empty result", func(t *testing.T) { + t.Parallel() + idx := NewTableDMLIdx() + result := idx.DiffNewTableDMLIdxMap(map[cloudstorage.DmlPathKey]fileIndexKeyMap{}) + require.Empty(t, result) + }) + + t.Run("multiple keys", func(t *testing.T) { + t.Parallel() + idx := NewTableDMLIdx() + dmlKey2 := cloudstorage.DmlPathKey{ + 
SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl2", TableVersion: 1}, + Date: "2026-01-02", + } + + newMap := map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 3}, + dmlKey2: {indexKey: 5}, + } + result := idx.DiffNewTableDMLIdxMap(newMap) + require.Len(t, result, 2) + require.Equal(t, indexRange{start: 1, end: 3}, result[dmlKey][indexKey]) + require.Equal(t, indexRange{start: 1, end: 5}, result[dmlKey2][indexKey]) + }) + + t.Run("multiple index keys for same dml path", func(t *testing.T) { + t.Parallel() + idx := NewTableDMLIdx() + indexKey2 := cloudstorage.FileIndexKey{DispatcherID: "dispatcher1", EnableTableAcrossNodes: true} + + newMap := map[cloudstorage.DmlPathKey]fileIndexKeyMap{ + dmlKey: {indexKey: 3, indexKey2: 5}, + } + result := idx.DiffNewTableDMLIdxMap(newMap) + require.Len(t, result, 1) + require.Equal(t, indexRange{start: 1, end: 3}, result[dmlKey][indexKey]) + require.Equal(t, indexRange{start: 1, end: 5}, result[dmlKey][indexKey2]) + }) +} + +type mockFile struct { + name string + content []byte +} + +type mockS3Storage struct { + storage.ExternalStorage + + fileOffset map[string]int + sortedFiles []mockFile +} + +func NewMockS3Storage(sortedFiles []mockFile) *mockS3Storage { + s3Storage := &mockS3Storage{} + s3Storage.UpdateFiles(sortedFiles) + return s3Storage +} + +func (m *mockS3Storage) ReadFile(ctx context.Context, name string) ([]byte, error) { + return m.sortedFiles[m.fileOffset[name]].content, nil +} + +func (m *mockS3Storage) WalkDir(ctx context.Context, opt *storage.WalkOption, fn func(path string, size int64) error) error { + filenamePrefix := path.Join(opt.SubDir, opt.ObjPrefix) + for _, file := range m.sortedFiles { + if strings.HasPrefix(file.name, filenamePrefix) { + if err := fn(file.name, 0); err != nil { + return err + } + } + } + return nil +} + +func (m *mockS3Storage) UpdateFiles(sortedFiles []mockFile) { + fileOffset := make(map[string]int) + for i, file := range sortedFiles { + 
fileOffset[file.name] = i + } + m.fileOffset = fileOffset + m.sortedFiles = sortedFiles +} + +func TestS3Consumer(t *testing.T) { + t.Parallel() + ctx := context.Background() + round1Files := []mockFile{ + {name: "test/t1/meta/schema_1_0000000001.json", content: []byte{}}, + {name: "test/t1/1/2026-01-01/CDC00000000000000000001.json", content: []byte("1_2026-01-01_1.json")}, + } + round1TimeWindowData := types.TimeWindowData{ + TimeWindow: types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}, + Data: map[cloudstorage.DmlPathKey]types.IncrementalData{}, + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + {Schema: "test", Table: "t1"}: { + Version: 1, + VersionPath: "test/t1/meta/schema_1_0000000001.json", + DataPath: "test/t1/1/2026-01-01/CDC00000000000000000001.json", + }, + }, + } + expectedMaxVersionMap1 := func(maxVersionMap map[types.SchemaTableKey]types.VersionKey) { + require.Len(t, maxVersionMap, 1) + require.Equal(t, types.VersionKey{ + Version: 1, VersionPath: "test/t1/meta/schema_1_0000000001.json", DataPath: "test/t1/1/2026-01-01/CDC00000000000000000001.json", + }, maxVersionMap[types.SchemaTableKey{Schema: "test", Table: "t1"}]) + } + round2Files := []mockFile{ + {name: "test/t1/meta/schema_1_0000000001.json", content: []byte{}}, + {name: "test/t1/1/2026-01-01/CDC00000000000000000001.json", content: []byte("1_2026-01-01_1.json")}, + {name: "test/t1/1/2026-01-01/CDC00000000000000000002.json", content: []byte("1_2026-01-01_2.json")}, + {name: "test/t1/1/2026-01-02/CDC00000000000000000001.json", content: []byte("1_2026-01-02_1.json")}, + } + round2TimeWindowData := types.TimeWindowData{ + TimeWindow: types.TimeWindow{LeftBoundary: 10, RightBoundary: 20}, + Data: map[cloudstorage.DmlPathKey]types.IncrementalData{}, + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + {Schema: "test", Table: "t1"}: { + Version: 1, + VersionPath: "test/t1/meta/schema_1_0000000001.json", + DataPath: "test/t1/1/2026-01-02/CDC00000000000000000001.json", + }, + }, + } 
+ expectedNewData2 := func(newData map[cloudstorage.DmlPathKey]types.IncrementalData) { + require.Len(t, newData, 2) + require.Equal(t, types.IncrementalData{ + DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ + {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("1_2026-01-01_2.json")}, + }, + }, newData[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, + PartitionNum: 0, + Date: "2026-01-01", + }]) + require.Equal(t, types.IncrementalData{ + DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ + {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("1_2026-01-02_1.json")}, + }, + }, newData[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, + PartitionNum: 0, + Date: "2026-01-02", + }]) + } + expectedMaxVersionMap2 := func(maxVersionMap map[types.SchemaTableKey]types.VersionKey) { + require.Len(t, maxVersionMap, 1) + require.Equal(t, types.VersionKey{ + Version: 1, VersionPath: "test/t1/meta/schema_1_0000000001.json", DataPath: "test/t1/1/2026-01-02/CDC00000000000000000001.json", + }, maxVersionMap[types.SchemaTableKey{Schema: "test", Table: "t1"}]) + } + round3Files := []mockFile{ + {name: "test/t1/meta/schema_1_0000000001.json", content: []byte{}}, + {name: "test/t1/meta/schema_2_0000000001.json", content: []byte{}}, + {name: "test/t1/1/2026-01-01/CDC00000000000000000001.json", content: []byte("1_2026-01-01_1.json")}, + {name: "test/t1/1/2026-01-01/CDC00000000000000000002.json", content: []byte("1_2026-01-01_2.json")}, + {name: "test/t1/1/2026-01-02/CDC00000000000000000001.json", content: []byte("1_2026-01-02_1.json")}, + {name: "test/t1/1/2026-01-02/CDC00000000000000000002.json", content: []byte("1_2026-01-02_2.json")}, + {name: "test/t1/2/2026-01-02/CDC00000000000000000001.json", content: []byte("2_2026-01-02_1.json")}, + {name: "test/t1/2/2026-01-03/CDC00000000000000000001.json", content: 
[]byte("2_2026-01-03_1.json")}, + {name: "test/t1/2/2026-01-03/CDC00000000000000000002.json", content: []byte("2_2026-01-03_2.json")}, + } + round3TimeWindowData := types.TimeWindowData{ + TimeWindow: types.TimeWindow{LeftBoundary: 20, RightBoundary: 30}, + Data: map[cloudstorage.DmlPathKey]types.IncrementalData{}, + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + {Schema: "test", Table: "t1"}: { + Version: 2, + VersionPath: "test/t1/meta/schema_2_0000000001.json", + DataPath: "test/t1/2/2026-01-03/CDC00000000000000000002.json", + }, + }, + } + expectedNewData3 := func(newData map[cloudstorage.DmlPathKey]types.IncrementalData) { + require.Len(t, newData, 3) + require.Equal(t, types.IncrementalData{ + DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ + {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("1_2026-01-02_2.json")}, + }, + }, newData[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, + PartitionNum: 0, + Date: "2026-01-02", + }]) + require.Equal(t, types.IncrementalData{ + DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ + {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("2_2026-01-02_1.json")}, + }, + }, newData[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 2}, + PartitionNum: 0, + Date: "2026-01-02", + }]) + newDataContent := newData[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 2}, + PartitionNum: 0, + Date: "2026-01-03", + }] + require.Len(t, newDataContent.DataContentSlices, 1) + contents := newDataContent.DataContentSlices[cloudstorage.FileIndexKey{DispatcherID: "", EnableTableAcrossNodes: false}] + require.Len(t, contents, 2) + slices.SortFunc(contents, func(a, b []byte) int { + return bytes.Compare(a, b) + }) + require.Equal(t, [][]byte{[]byte("2_2026-01-03_1.json"), []byte("2_2026-01-03_2.json")}, contents) 
+ } + expectedMaxVersionMap3 := func(maxVersionMap map[types.SchemaTableKey]types.VersionKey) { + require.Len(t, maxVersionMap, 1) + require.Equal(t, types.VersionKey{ + Version: 2, VersionPath: "test/t1/meta/schema_2_0000000001.json", DataPath: "test/t1/2/2026-01-03/CDC00000000000000000002.json", + }, maxVersionMap[types.SchemaTableKey{Schema: "test", Table: "t1"}]) + } + expectedCheckpoint23 := func(data map[cloudstorage.DmlPathKey]types.IncrementalData) { + require.Len(t, data, 4) + require.Equal(t, types.IncrementalData{ + DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ + {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("1_2026-01-01_2.json")}, + }, + }, data[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, + PartitionNum: 0, + Date: "2026-01-01", + }]) + dataContent := data[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, + PartitionNum: 0, + Date: "2026-01-02", + }] + require.Len(t, dataContent.DataContentSlices, 1) + contents := dataContent.DataContentSlices[cloudstorage.FileIndexKey{DispatcherID: "", EnableTableAcrossNodes: false}] + require.Len(t, contents, 2) + slices.SortFunc(contents, func(a, b []byte) int { + return bytes.Compare(a, b) + }) + require.Equal(t, [][]byte{[]byte("1_2026-01-02_1.json"), []byte("1_2026-01-02_2.json")}, contents) + require.Equal(t, types.IncrementalData{ + DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ + {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("2_2026-01-02_1.json")}, + }, + }, data[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 2}, + PartitionNum: 0, + Date: "2026-01-02", + }]) + dataContent = data[cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 2}, + PartitionNum: 0, + Date: "2026-01-03", + }] + require.Len(t, 
dataContent.DataContentSlices, 1) + contents = dataContent.DataContentSlices[cloudstorage.FileIndexKey{DispatcherID: "", EnableTableAcrossNodes: false}] + require.Len(t, contents, 2) + slices.SortFunc(contents, func(a, b []byte) int { + return bytes.Compare(a, b) + }) + require.Equal(t, [][]byte{[]byte("2_2026-01-03_1.json"), []byte("2_2026-01-03_2.json")}, contents) + } + + t.Run("checkpoint with nil items returns nil", func(t *testing.T) { + t.Parallel() + s3Storage := NewMockS3Storage(round1Files) + s3Consumer := NewS3Consumer(s3Storage, map[string][]string{"test": {"t1"}}) + data, err := s3Consumer.InitializeFromCheckpoint(ctx, "test", nil) + require.NoError(t, err) + require.Empty(t, data) + newData, maxVersionMap, err := s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + require.Empty(t, newData) + expectedMaxVersionMap1(maxVersionMap) + s3Storage.UpdateFiles(round2Files) + newData, maxVersionMap, err = s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + expectedNewData2(newData) + expectedMaxVersionMap2(maxVersionMap) + s3Storage.UpdateFiles(round3Files) + newData, maxVersionMap, err = s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + expectedNewData3(newData) + expectedMaxVersionMap3(maxVersionMap) + }) + t.Run("checkpoint with empty items returns nil", func(t *testing.T) { + t.Parallel() + checkpoint := recorder.NewCheckpoint() + s3Storage := NewMockS3Storage(round1Files) + s3Consumer := NewS3Consumer(s3Storage, map[string][]string{"test": {"t1"}}) + data, err := s3Consumer.InitializeFromCheckpoint(ctx, "test", checkpoint) + require.NoError(t, err) + require.Empty(t, data) + newData, maxVersionMap, err := s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + require.Empty(t, newData) + expectedMaxVersionMap1(maxVersionMap) + s3Storage.UpdateFiles(round2Files) + newData, maxVersionMap, err = s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + expectedNewData2(newData) + expectedMaxVersionMap2(maxVersionMap) + 
s3Storage.UpdateFiles(round3Files) + newData, maxVersionMap, err = s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + expectedNewData3(newData) + expectedMaxVersionMap3(maxVersionMap) + }) + t.Run("checkpoint with 1 item", func(t *testing.T) { + t.Parallel() + checkpoint := recorder.NewCheckpoint() + checkpoint.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "clusterX": round1TimeWindowData, + }) + s3Storage := NewMockS3Storage(round1Files) + s3Consumer := NewS3Consumer(s3Storage, map[string][]string{"test": {"t1"}}) + data, err := s3Consumer.InitializeFromCheckpoint(ctx, "clusterX", checkpoint) + require.NoError(t, err) + require.Empty(t, data) + s3Storage.UpdateFiles(round2Files) + newData, maxVersionMap, err := s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + expectedNewData2(newData) + expectedMaxVersionMap2(maxVersionMap) + s3Storage.UpdateFiles(round3Files) + newData, maxVersionMap, err = s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + expectedNewData3(newData) + expectedMaxVersionMap3(maxVersionMap) + }) + t.Run("checkpoint with 2 items", func(t *testing.T) { + t.Parallel() + checkpoint := recorder.NewCheckpoint() + checkpoint.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "clusterX": round1TimeWindowData, + }) + checkpoint.NewTimeWindowData(1, map[string]types.TimeWindowData{ + "clusterX": round2TimeWindowData, + }) + s3Storage := NewMockS3Storage(round2Files) + s3Consumer := NewS3Consumer(s3Storage, map[string][]string{"test": {"t1"}}) + data, err := s3Consumer.InitializeFromCheckpoint(ctx, "clusterX", checkpoint) + require.NoError(t, err) + expectedNewData2(data) + s3Storage.UpdateFiles(round3Files) + newData, maxVersionMap, err := s3Consumer.ConsumeNewFiles(ctx) + require.NoError(t, err) + expectedNewData3(newData) + expectedMaxVersionMap3(maxVersionMap) + }) + t.Run("checkpoint with 3 items", func(t *testing.T) { + t.Parallel() + checkpoint := recorder.NewCheckpoint() + checkpoint.NewTimeWindowData(0, 
map[string]types.TimeWindowData{ + "clusterX": round1TimeWindowData, + }) + checkpoint.NewTimeWindowData(1, map[string]types.TimeWindowData{ + "clusterX": round2TimeWindowData, + }) + checkpoint.NewTimeWindowData(2, map[string]types.TimeWindowData{ + "clusterX": round3TimeWindowData, + }) + s3Storage := NewMockS3Storage(round3Files) + s3Consumer := NewS3Consumer(s3Storage, map[string][]string{"test": {"t1"}}) + data, err := s3Consumer.InitializeFromCheckpoint(ctx, "clusterX", checkpoint) + require.NoError(t, err) + expectedCheckpoint23(data) + }) +} diff --git a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go b/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go deleted file mode 100644 index eb8b2df069..0000000000 --- a/cmd/multi-cluster-consistency-checker/consumer/s3_consumer.go +++ /dev/null @@ -1,681 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package consumer - -import ( - "context" - "fmt" - "path" - "strings" - "sync" - - perrors "github.com/pingcap/errors" - "github.com/pingcap/log" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" - "github.com/pingcap/ticdc/pkg/config" - "github.com/pingcap/ticdc/pkg/errors" - "github.com/pingcap/ticdc/pkg/sink/cloudstorage" - "github.com/pingcap/tidb/br/pkg/storage" - "go.uber.org/zap" - "golang.org/x/sync/errgroup" -) - -var ErrWalkDirEnd = perrors.Normalize("walk dir end", perrors.RFCCodeText("CDC:ErrWalkDirEnd")) - -type CurrentTableVersion struct { - mu sync.RWMutex - currentTableVersionMap map[schemaKey]utils.VersionKey -} - -func NewCurrentTableVersion() *CurrentTableVersion { - return &CurrentTableVersion{ - currentTableVersionMap: make(map[schemaKey]utils.VersionKey), - } -} - -// GetCurrentTableVersion returns the current table version for a given schema and table -func (cvt *CurrentTableVersion) GetCurrentTableVersion(schema, table string) utils.VersionKey { - cvt.mu.RLock() - defer cvt.mu.RUnlock() - return cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] -} - -// UpdateCurrentTableVersion updates the current table version for a given schema and table -func (cvt *CurrentTableVersion) UpdateCurrentTableVersion(schema, table string, version utils.VersionKey) { - cvt.mu.Lock() - defer cvt.mu.Unlock() - cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] = version -} - -type SchemaParser struct { - mu sync.RWMutex - schemaParserMap map[cloudstorage.SchemaPathKey]schemaParser -} - -func NewSchemaParser() *SchemaParser { - return &SchemaParser{ - schemaParserMap: make(map[cloudstorage.SchemaPathKey]schemaParser), - } -} - -// GetSchemaParser returns the schema parser for a given schema and table version -func (sp *SchemaParser) GetSchemaParser(schema, table string, version uint64) (*utils.TableParser, error) { - 
schemaPathKey := cloudstorage.SchemaPathKey{ - Schema: schema, - Table: table, - TableVersion: version, - } - sp.mu.RLock() - schemaParser, ok := sp.schemaParserMap[schemaPathKey] - sp.mu.RUnlock() - if !ok { - return nil, errors.Errorf("schema parser not found for schema: %s, table: %s, version: %d", schema, table, version) - } - return schemaParser.parser, nil -} - -// SetSchemaParser sets the schema parser for a given schema and table version -func (sp *SchemaParser) SetSchemaParser(schemaPathKey cloudstorage.SchemaPathKey, filePath string, parser *utils.TableParser) { - sp.mu.Lock() - sp.schemaParserMap[schemaPathKey] = schemaParser{ - path: filePath, - parser: parser, - } - sp.mu.Unlock() -} - -// RemoveSchemaParserWithCondition removes the schema parser for a given condition -func (sp *SchemaParser) RemoveSchemaParserWithCondition(condition func(schemaPathKey cloudstorage.SchemaPathKey) bool) { - sp.mu.Lock() - for schemaPathkey := range sp.schemaParserMap { - if condition(schemaPathkey) { - delete(sp.schemaParserMap, schemaPathkey) - } - } - sp.mu.Unlock() -} - -type TableDMLIdx struct { - mu sync.Mutex - tableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap -} - -func NewTableDMLIdex() *TableDMLIdx { - return &TableDMLIdx{ - tableDMLIdxMap: make(map[cloudstorage.DmlPathKey]fileIndexKeyMap), - } -} - -func (t *TableDMLIdx) DiffNewTableDMLIdxMap( - newTableDMLIdxMap map[cloudstorage.DmlPathKey]fileIndexKeyMap, -) map[cloudstorage.DmlPathKey]fileIndexRange { - resMap := make(map[cloudstorage.DmlPathKey]fileIndexRange) - t.mu.Lock() - defer t.mu.Unlock() - for newDMLPathKey, newFileIndexKeyMap := range newTableDMLIdxMap { - origFileIndexKeyMap, ok := t.tableDMLIdxMap[newDMLPathKey] - if !ok { - t.tableDMLIdxMap[newDMLPathKey] = newFileIndexKeyMap - resMap[newDMLPathKey] = make(fileIndexRange) - for indexKey, newEndVal := range newFileIndexKeyMap { - resMap[newDMLPathKey][indexKey] = indexRange{ - start: 1, - end: newEndVal, - } - } - continue - } - for 
indexKey, newEndVal := range newFileIndexKeyMap { - origEndVal := origFileIndexKeyMap[indexKey] - if newEndVal > origEndVal { - origFileIndexKeyMap[indexKey] = newEndVal - if _, ok := resMap[newDMLPathKey]; !ok { - resMap[newDMLPathKey] = make(fileIndexRange) - } - resMap[newDMLPathKey][indexKey] = indexRange{ - start: origEndVal + 1, - end: newEndVal, - } - } - } - } - return resMap -} - -type S3Consumer struct { - s3Storage storage.ExternalStorage - fileExtension string - dateSeparator string - fileIndexWidth int - tables map[string][]string - - currentTableVersion *CurrentTableVersion - tableDMLIdx *TableDMLIdx - schemaParser *SchemaParser -} - -func NewS3Consumer( - s3Storage storage.ExternalStorage, - tables map[string][]string, -) *S3Consumer { - return &S3Consumer{ - s3Storage: s3Storage, - fileExtension: ".json", - dateSeparator: config.DateSeparatorDay.String(), - fileIndexWidth: config.DefaultFileIndexWidth, - tables: tables, - - currentTableVersion: NewCurrentTableVersion(), - tableDMLIdx: NewTableDMLIdex(), - schemaParser: NewSchemaParser(), - } -} - -func (c *S3Consumer) InitializeFromCheckpoint(ctx context.Context, clusterID string, checkpoint *recorder.Checkpoint) (map[cloudstorage.DmlPathKey]utils.IncrementalData, error) { - if checkpoint == nil { - return nil, nil - } - if checkpoint.CheckpointItems[2] == nil { - return nil, nil - } - scanRanges, err := checkpoint.ToScanRange(clusterID) - if err != nil { - return nil, errors.Trace(err) - } - var mu sync.Mutex - // Combine DML data and schema data into result - result := make(map[cloudstorage.DmlPathKey]utils.IncrementalData) - eg, egCtx := errgroup.WithContext(ctx) - for schemaTableKey, scanRange := range scanRanges { - eg.Go(func() error { - scanVersions, err := c.downloadSchemaFilesWithScanRange( - egCtx, schemaTableKey.Schema, schemaTableKey.Table, scanRange.StartVersionKey, scanRange.EndVersionKey, scanRange.EndDataPath) - if err != nil { - return errors.Trace(err) - } - err = 
c.downloadDataFilesWithScanRange( - egCtx, schemaTableKey.Schema, schemaTableKey.Table, scanVersions, scanRange, - func( - dmlPathKey cloudstorage.DmlPathKey, - dmlSlices map[cloudstorage.FileIndexKey][][]byte, - parser *utils.TableParser, - ) { - mu.Lock() - result[dmlPathKey] = utils.IncrementalData{ - DataContentSlices: dmlSlices, - Parser: parser, - } - mu.Unlock() - }, - ) - if err != nil { - return errors.Trace(err) - } - return nil - }) - } - if err := eg.Wait(); err != nil { - return nil, errors.Trace(err) - } - return result, nil -} - -func (c *S3Consumer) downloadSchemaFilesWithScanRange( - ctx context.Context, - schema, table string, - startVersionKey string, - endVersionKey string, - endDataPath string, -) ([]utils.VersionKey, error) { - metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) - opt := &storage.WalkOption{ - SubDir: metaSubDir, - ObjPrefix: "schema_", - // TODO: StartAfter: startVersionKey, - } - - var startSchemaKey, endSchemaKey cloudstorage.SchemaPathKey - _, err := startSchemaKey.ParseSchemaFilePath(startVersionKey) - if err != nil { - return nil, errors.Trace(err) - } - _, err = endSchemaKey.ParseSchemaFilePath(endVersionKey) - if err != nil { - return nil, errors.Trace(err) - } - - var scanVersions []utils.VersionKey - newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) - scanVersions = append(scanVersions, utils.VersionKey{ - Version: startSchemaKey.TableVersion, - VersionPath: startVersionKey, - }) - newVersionPaths[startSchemaKey] = startVersionKey - if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { - if endVersionKey < filePath { - return ErrWalkDirEnd - } - if !cloudstorage.IsSchemaFile(filePath) { - return nil - } - var schemaKey cloudstorage.SchemaPathKey - _, err := schemaKey.ParseSchemaFilePath(filePath) - if err != nil { - log.Error("failed to parse schema file path, skipping", - zap.String("path", filePath), - zap.Error(err)) - return nil - } - if schemaKey.TableVersion > 
startSchemaKey.TableVersion { - if _, exists := newVersionPaths[schemaKey]; !exists { - scanVersions = append(scanVersions, utils.VersionKey{ - Version: schemaKey.TableVersion, - VersionPath: filePath, - }) - } - newVersionPaths[schemaKey] = filePath - } - return nil - }); err != nil && !errors.Is(err, ErrWalkDirEnd) { - return nil, errors.Trace(err) - } - - if err := c.downloadSchemaFiles(ctx, newVersionPaths); err != nil { - return nil, errors.Trace(err) - } - - c.currentTableVersion.UpdateCurrentTableVersion(schema, table, utils.VersionKey{ - Version: endSchemaKey.TableVersion, - VersionPath: endVersionKey, - DataPath: endDataPath, - }) - - return scanVersions, nil -} - -func (c *S3Consumer) downloadDataFilesWithScanRange( - ctx context.Context, - schema, table string, - scanVersions []utils.VersionKey, - scanRange *recorder.ScanRange, - consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *utils.TableParser), -) error { - eg, egCtx := errgroup.WithContext(ctx) - for _, version := range scanVersions { - eg.Go(func() error { - newFiles, err := c.getNewFilesForSchemaPathKeyWithEndPath(egCtx, schema, table, version.Version, scanRange.StartDataPath, scanRange.EndDataPath) - if err != nil { - return errors.Trace(err) - } - dmlData, err := c.downloadDMLFiles(egCtx, newFiles) - if err != nil { - return errors.Trace(err) - } - parser, err := c.schemaParser.GetSchemaParser(schema, table, version.Version) - if err != nil { - return errors.Trace(err) - } - for dmlPathKey, dmlSlices := range dmlData { - consumeFunc(dmlPathKey, dmlSlices, parser) - } - return nil - }) - } - if err := eg.Wait(); err != nil { - return errors.Trace(err) - } - return nil -} - -func (c *S3Consumer) getNewFilesForSchemaPathKeyWithEndPath( - ctx context.Context, - schema, table string, - version uint64, - startDataPath string, - endDataPath string, -) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { - schemaPrefix := path.Join(schema, 
table, fmt.Sprintf("%d", version)) - opt := &storage.WalkOption{ - SubDir: schemaPrefix, - // TODO: StartAfter: startDataPath, - } - newTableDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) - if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { - if endDataPath < filePath { - return ErrWalkDirEnd - } - // Try to parse DML file path if it matches the expected extension - if strings.HasSuffix(filePath, c.fileExtension) { - var dmlkey cloudstorage.DmlPathKey - fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, filePath) - if err != nil { - log.Error("failed to parse dml file path, skipping", - zap.String("path", filePath), - zap.Error(err)) - return nil - } - updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) - } - return nil - }); err != nil && !errors.Is(err, ErrWalkDirEnd) { - return nil, errors.Trace(err) - } - return c.tableDMLIdx.DiffNewTableDMLIdxMap(newTableDMLIdxMap), nil -} - -// downloadSchemaFiles downloads schema files concurrently for given schema path keys -func (c *S3Consumer) downloadSchemaFiles( - ctx context.Context, - newVersionPaths map[cloudstorage.SchemaPathKey]string, -) error { - eg, egCtx := errgroup.WithContext(ctx) - - log.Debug("starting concurrent schema file download", zap.Int("totalSchemas", len(newVersionPaths))) - for schemaPathKey, filePath := range newVersionPaths { - eg.Go(func() error { - content, err := c.s3Storage.ReadFile(egCtx, filePath) - if err != nil { - return errors.Annotatef(err, "failed to read schema file: %s", filePath) - } - - // Use canal-json decoder for S3 sink with .json file extension - parser, err := utils.NewTableParserWithFormat(schemaPathKey.GetKey(), content, config.ProtocolCanalJSON) - if err != nil { - return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) - } - - c.schemaParser.SetSchemaParser(schemaPathKey, filePath, parser) - return nil - }) - } - if err := eg.Wait(); err != nil { - return errors.Trace(err) - } - 
return nil -} - -func (c *S3Consumer) discoverAndDownloadNewTableVersions( - ctx context.Context, - schema, table string, -) ([]utils.VersionKey, error) { - currentVersion := c.currentTableVersion.GetCurrentTableVersion(schema, table) - metaSubDir := fmt.Sprintf("%s/%s/meta/", schema, table) - opt := &storage.WalkOption{ - SubDir: metaSubDir, - ObjPrefix: "schema_", - // TODO: StartAfter: currentVersion.versionPath, - } - - var scanVersions []utils.VersionKey - newVersionPaths := make(map[cloudstorage.SchemaPathKey]string) - if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { - if !cloudstorage.IsSchemaFile(filePath) { - return nil - } - var schemaKey cloudstorage.SchemaPathKey - _, err := schemaKey.ParseSchemaFilePath(filePath) - if err != nil { - log.Error("failed to parse schema file path, skipping", - zap.String("path", filePath), - zap.Error(err)) - return nil - } - version := schemaKey.TableVersion - if version > currentVersion.Version { - if _, exists := newVersionPaths[schemaKey]; !exists { - scanVersions = append(scanVersions, utils.VersionKey{ - Version: version, - VersionPath: filePath, - }) - } - newVersionPaths[schemaKey] = filePath - } - return nil - }); err != nil { - return nil, errors.Trace(err) - } - - // download new version schema files concurrently - if err := c.downloadSchemaFiles(ctx, newVersionPaths); err != nil { - return nil, errors.Trace(err) - } - - if currentVersion.Version > 0 { - scanVersions = append(scanVersions, currentVersion) - } - return scanVersions, nil -} - -func (c *S3Consumer) getNewFilesForSchemaPathKey( - ctx context.Context, - schema, table string, - version *utils.VersionKey, -) (map[cloudstorage.DmlPathKey]fileIndexRange, error) { - schemaPrefix := path.Join(schema, table, fmt.Sprintf("%d", version.Version)) - opt := &storage.WalkOption{ - SubDir: schemaPrefix, - // TODO: StartAfter: version.dataPath, - } - - newTableDMLIdxMap := make(map[cloudstorage.DmlPathKey]fileIndexKeyMap) - 
maxFilePath := "" - if err := c.s3Storage.WalkDir(ctx, opt, func(filePath string, size int64) error { - // Try to parse DML file path if it matches the expected extension - if strings.HasSuffix(filePath, c.fileExtension) { - var dmlkey cloudstorage.DmlPathKey - fileIdx, err := dmlkey.ParseDMLFilePath(c.dateSeparator, filePath) - if err != nil { - log.Error("failed to parse dml file path, skipping", - zap.String("path", filePath), - zap.Error(err)) - return nil - } - updateTableDMLIdxMap(newTableDMLIdxMap, dmlkey, fileIdx) - maxFilePath = filePath - } - return nil - }); err != nil { - return nil, errors.Trace(err) - } - - version.DataPath = maxFilePath - return c.tableDMLIdx.DiffNewTableDMLIdxMap(newTableDMLIdxMap), nil -} - -func (c *S3Consumer) downloadDMLFiles( - ctx context.Context, - newFiles map[cloudstorage.DmlPathKey]fileIndexRange, -) (map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte, error) { - if len(newFiles) == 0 { - return nil, nil - } - - result := make(map[cloudstorage.DmlPathKey]map[cloudstorage.FileIndexKey][][]byte) - type downloadTask struct { - dmlPathKey cloudstorage.DmlPathKey - fileIndex cloudstorage.FileIndex - } - - var tasks []downloadTask - for dmlPathKey, fileRange := range newFiles { - for indexKey, indexRange := range fileRange { - log.Debug("prepare to download new dml file in index range", - zap.String("schema", dmlPathKey.Schema), - zap.String("table", dmlPathKey.Table), - zap.Uint64("version", dmlPathKey.TableVersion), - zap.Int64("partitionNum", dmlPathKey.PartitionNum), - zap.String("date", dmlPathKey.Date), - zap.String("dispatcherID", indexKey.DispatcherID), - zap.Bool("enableTableAcrossNodes", indexKey.EnableTableAcrossNodes), - zap.Uint64("startIndex", indexRange.start), - zap.Uint64("endIndex", indexRange.end)) - for i := indexRange.start; i <= indexRange.end; i++ { - tasks = append(tasks, downloadTask{ - dmlPathKey: dmlPathKey, - fileIndex: cloudstorage.FileIndex{ - FileIndexKey: indexKey, - Idx: i, - }, - 
}) - } - } - } - - log.Debug("starting concurrent DML file download", zap.Int("totalFiles", len(tasks))) - - // Concurrently download files - type fileContent struct { - dmlPathKey cloudstorage.DmlPathKey - indexKey cloudstorage.FileIndexKey - idx uint64 - content []byte - } - - fileContents := make(chan fileContent, len(tasks)) - eg, egCtx := errgroup.WithContext(ctx) - for _, task := range tasks { - eg.Go(func() error { - filePath := task.dmlPathKey.GenerateDMLFilePath( - &task.fileIndex, - c.fileExtension, - c.fileIndexWidth, - ) - - content, err := c.s3Storage.ReadFile(egCtx, filePath) - if err != nil { - return errors.Annotatef(err, "failed to read file: %s", filePath) - } - - // Channel writes are thread-safe, no mutex needed - fileContents <- fileContent{ - dmlPathKey: task.dmlPathKey, - indexKey: task.fileIndex.FileIndexKey, - idx: task.fileIndex.Idx, - content: content, - } - return nil - }) - } - if err := eg.Wait(); err != nil { - return nil, errors.Trace(err) - } - - // Close the channel to signal no more writes - close(fileContents) - - // Process the downloaded file contents - for fc := range fileContents { - if result[fc.dmlPathKey] == nil { - result[fc.dmlPathKey] = make(map[cloudstorage.FileIndexKey][][]byte) - } - result[fc.dmlPathKey][fc.indexKey] = append( - result[fc.dmlPathKey][fc.indexKey], - fc.content, - ) - } - - return result, nil -} - -func (c *S3Consumer) downloadNewFilesWithVersions( - ctx context.Context, - schema, table string, - scanVersions []utils.VersionKey, - consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *utils.TableParser), -) (*utils.VersionKey, error) { - var maxVersion *utils.VersionKey - eg, egCtx := errgroup.WithContext(ctx) - for _, version := range scanVersions { - versionp := &version - if maxVersion == nil || maxVersion.Version < version.Version { - maxVersion = versionp - } - eg.Go(func() error { - newFiles, err := c.getNewFilesForSchemaPathKey(egCtx, 
schema, table, versionp) - if err != nil { - return errors.Trace(err) - } - dmlData, err := c.downloadDMLFiles(egCtx, newFiles) - if err != nil { - return errors.Trace(err) - } - parser, err := c.schemaParser.GetSchemaParser(schema, table, versionp.Version) - if err != nil { - return errors.Trace(err) - } - for dmlPathKey, dmlSlices := range dmlData { - consumeFunc(dmlPathKey, dmlSlices, parser) - } - return nil - }) - } - if err := eg.Wait(); err != nil { - return nil, errors.Trace(err) - } - if maxVersion != nil { - c.currentTableVersion.UpdateCurrentTableVersion(schema, table, *maxVersion) - } - return maxVersion, nil -} - -func (c *S3Consumer) ConsumeNewFiles( - ctx context.Context, -) (map[cloudstorage.DmlPathKey]utils.IncrementalData, map[utils.SchemaTableKey]utils.VersionKey, error) { - var mu sync.Mutex - // Combine DML data and schema data into result - result := make(map[cloudstorage.DmlPathKey]utils.IncrementalData) - var versionMu sync.Mutex - maxVersionMap := make(map[utils.SchemaTableKey]utils.VersionKey) - eg, egCtx := errgroup.WithContext(ctx) - for schema, tables := range c.tables { - for _, table := range tables { - eg.Go(func() error { - scanVersions, err := c.discoverAndDownloadNewTableVersions(egCtx, schema, table) - if err != nil { - return errors.Trace(err) - } - maxVersion, err := c.downloadNewFilesWithVersions( - egCtx, schema, table, scanVersions, - func( - dmlPathKey cloudstorage.DmlPathKey, - dmlSlices map[cloudstorage.FileIndexKey][][]byte, - parser *utils.TableParser, - ) { - mu.Lock() - result[dmlPathKey] = utils.IncrementalData{ - DataContentSlices: dmlSlices, - Parser: parser, - } - mu.Unlock() - }, - ) - if err != nil { - return errors.Trace(err) - } - if maxVersion != nil { - versionMu.Lock() - maxVersionMap[utils.SchemaTableKey{Schema: schema, Table: table}] = *maxVersion - versionMu.Unlock() - } - return nil - }) - } - } - - if err := eg.Wait(); err != nil { - return nil, nil, errors.Trace(err) - } - return result, 
maxVersionMap, nil -} diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder.go b/cmd/multi-cluster-consistency-checker/decoder/decoder.go new file mode 100644 index 0000000000..14f7fc4355 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder.go @@ -0,0 +1,417 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package decoder + +import ( + "bytes" + "encoding/hex" + "encoding/json" + "slices" + "strconv" + "strings" + "time" + + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/pingcap/ticdc/pkg/common/event" + "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/sink/codec/common" + codecCommon "github.com/pingcap/ticdc/pkg/sink/codec/common" + "github.com/pingcap/tidb/pkg/parser/mysql" + ptypes "github.com/pingcap/tidb/pkg/parser/types" + tiTypes "github.com/pingcap/tidb/pkg/types" + "github.com/pingcap/tidb/pkg/util/codec" + "go.uber.org/zap" + "golang.org/x/text/encoding/charmap" +) + +const tidbWaterMarkType = "TIDB_WATERMARK" + +type canalValueDecoderJSONMessage struct { + PkNames []string `json:"pkNames"` + IsDDL bool `json:"isDdl"` + EventType string `json:"type"` + MySQLType map[string]string `json:"mysqlType"` + Data []map[string]any `json:"data"` +} + +func (c *canalValueDecoderJSONMessage) messageType() common.MessageType { + if c.IsDDL { + return common.MessageTypeDDL + } + + if c.EventType == tidbWaterMarkType { + return 
common.MessageTypeResolved + } + + return common.MessageTypeRow +} + +type TiDBCommitTsExtension struct { + CommitTs uint64 `json:"commitTs"` +} + +type canalValueDecoderJSONMessageWithTiDBExtension struct { + canalValueDecoderJSONMessage + + TiDBCommitTsExtension *TiDBCommitTsExtension `json:"_tidb"` +} + +func defaultCanalJSONCodecConfig() *codecCommon.Config { + codecConfig := codecCommon.NewConfig(config.ProtocolCanalJSON) + // Always enable tidb extension for canal-json protocol + // because we need to get the commit ts from the extension field. + codecConfig.EnableTiDBExtension = true + codecConfig.Terminator = config.CRLF + return codecConfig +} + +type Record struct { + types.CdcVersion + Pk types.PkType + ColumnValues map[string]any +} + +func (r *Record) EqualDownstreamRecord(downstreamRecord *Record) bool { + if downstreamRecord == nil { + return false + } + if r.CommitTs != downstreamRecord.OriginTs { + return false + } + if r.Pk != downstreamRecord.Pk { + return false + } + if len(r.ColumnValues) != len(downstreamRecord.ColumnValues) { + return false + } + for columnName, columnValue := range r.ColumnValues { + downstreamColumnValue, ok := downstreamRecord.ColumnValues[columnName] + if !ok { + return false + } + if columnValue != downstreamColumnValue { + return false + } + } + return true +} + +type columnValueDecoder struct { + data []byte + config *common.Config + + msg *canalValueDecoderJSONMessageWithTiDBExtension +} + +func newColumnValueDecoder(data []byte) (*columnValueDecoder, error) { + config := defaultCanalJSONCodecConfig() + data, err := common.Decompress(config.LargeMessageHandle.LargeMessageHandleCompression, data) + if err != nil { + log.Error("decompress data failed", + zap.String("compression", config.LargeMessageHandle.LargeMessageHandleCompression), + zap.Any("data", data), + zap.Error(err)) + return nil, errors.Annotatef(err, "decompress data failed") + } + return &columnValueDecoder{ + config: config, + data: data, + }, nil +} + 
+func Decode(data []byte) ([]*Record, error) { + decoder, err := newColumnValueDecoder(data) + if err != nil { + return nil, errors.Trace(err) + } + + records := make([]*Record, 0) + for { + msgType, hasNext := decoder.tryNext() + if !hasNext { + break + } + if msgType == common.MessageTypeRow { + record, err := decoder.decodeNext() + if err != nil { + return nil, errors.Trace(err) + } + records = append(records, record) + } + } + + return records, nil +} + +func (d *columnValueDecoder) tryNext() (common.MessageType, bool) { + if d.data == nil { + return common.MessageTypeUnknown, false + } + var ( + msg = &canalValueDecoderJSONMessageWithTiDBExtension{} + encodedData []byte + ) + + idx := bytes.IndexAny(d.data, d.config.Terminator) + if idx >= 0 { + encodedData = d.data[:idx] + d.data = d.data[idx+len(d.config.Terminator):] + } else { + encodedData = d.data + d.data = nil + } + + if len(encodedData) == 0 { + return common.MessageTypeUnknown, false + } + + if err := json.Unmarshal(encodedData, msg); err != nil { + log.Panic("canal-json decoder unmarshal data failed", + zap.Error(err), zap.ByteString("data", encodedData)) + return common.MessageTypeUnknown, false + } + d.msg = msg + return d.msg.messageType(), true +} + +func (d *columnValueDecoder) decodeNext() (*Record, error) { + if d.msg == nil || len(d.msg.Data) == 0 || d.msg.messageType() != common.MessageTypeRow { + log.Error("invalid message", zap.Any("msg", d.msg)) + return nil, errors.New("invalid message") + } + + pkValues := make([]tiTypes.Datum, 0, len(d.msg.PkNames)) + slices.Sort(d.msg.PkNames) + for _, pkName := range d.msg.PkNames { + mysqlType, ok := d.msg.MySQLType[pkName] + if !ok { + log.Error("mysql type not found", zap.String("pkName", pkName), zap.Any("msg", d.msg)) + return nil, errors.Errorf("mysql type of column %s not found", pkName) + } + columnValue, ok := d.msg.Data[0][pkName] + if !ok { + log.Error("column value not found", zap.String("pkName", pkName), zap.Any("msg", d.msg)) + return 
nil, errors.Errorf("column value of column %s not found", pkName) + } + ft := newPKColumnFieldTypeFromMysqlType(mysqlType) + datum := valueToDatum(columnValue, ft) + if datum.IsNull() { + log.Error("column value is null", zap.String("pkName", pkName), zap.Any("msg", d.msg)) + return nil, errors.Errorf("column value of column %s is null", pkName) + } + pkValues = append(pkValues, *datum) + delete(d.msg.Data[0], pkName) + } + pkEncoded, err := codec.EncodeKey(time.UTC, nil, pkValues...) + if err != nil { + return nil, errors.Annotate(err, "failed to encode primary key") + } + pk := hex.EncodeToString(pkEncoded) + originTs := uint64(0) + columnValues := make(map[string]any) + for columnName, columnValue := range d.msg.Data[0] { + if columnName == event.OriginTsColumn { + if columnValue != nil { + originTs, err = strconv.ParseUint(columnValue.(string), 10, 64) + if err != nil { + return nil, errors.Trace(err) + } + } + } else { + columnValues[columnName] = columnValue + } + } + commitTs := d.msg.TiDBCommitTsExtension.CommitTs + d.msg = nil + return &Record{ + Pk: types.PkType(pk), + ColumnValues: columnValues, + CdcVersion: types.CdcVersion{ + CommitTs: commitTs, + OriginTs: originTs, + }, + }, nil +} + +func newPKColumnFieldTypeFromMysqlType(mysqlType string) *ptypes.FieldType { + tp := ptypes.NewFieldType(common.ExtractBasicMySQLType(mysqlType)) + if common.IsBinaryMySQLType(mysqlType) { + tp.AddFlag(mysql.BinaryFlag) + tp.SetCharset("binary") + tp.SetCollate("binary") + } + if strings.HasPrefix(mysqlType, "char") || + strings.HasPrefix(mysqlType, "varchar") || + strings.Contains(mysqlType, "text") || + strings.Contains(mysqlType, "enum") || + strings.Contains(mysqlType, "set") { + tp.SetCharset("utf8mb4") + tp.SetCollate("utf8mb4_bin") + } + + if common.IsUnsignedMySQLType(mysqlType) { + tp.AddFlag(mysql.UnsignedFlag) + } + + flen, decimal := common.ExtractFlenDecimal(mysqlType, tp.GetType()) + tp.SetFlen(flen) + tp.SetDecimal(decimal) + switch tp.GetType() { + case 
mysql.TypeEnum, mysql.TypeSet: + tp.SetElems(common.ExtractElements(mysqlType)) + case mysql.TypeDuration: + decimal = common.ExtractDecimal(mysqlType) + tp.SetDecimal(decimal) + default: + } + return tp +} + +func valueToDatum(value any, ft *ptypes.FieldType) *tiTypes.Datum { + d := &tiTypes.Datum{} + if value == nil { + d.SetNull() + return d + } + rawValue, ok := value.(string) + if !ok { + log.Panic("canal-json encoded message should have type in `string`") + } + if mysql.HasBinaryFlag(ft.GetFlag()) { + // when encoding the `JavaSQLTypeBLOB`, use `IS08859_1` decoder, now reverse it back. + result, err := charmap.ISO8859_1.NewEncoder().String(rawValue) + if err != nil { + log.Panic("invalid column value, please report a bug", zap.Any("rawValue", rawValue), zap.Error(err)) + } + rawValue = result + } + + switch ft.GetType() { + case mysql.TypeLonglong, mysql.TypeLong, mysql.TypeInt24, mysql.TypeShort, mysql.TypeTiny: + if mysql.HasUnsignedFlag(ft.GetFlag()) { + data, err := strconv.ParseUint(rawValue, 10, 64) + if err != nil { + log.Panic("invalid column value for unsigned integer", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetUint64(data) + return d + } + data, err := strconv.ParseInt(rawValue, 10, 64) + if err != nil { + log.Panic("invalid column value for integer", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetInt64(data) + return d + case mysql.TypeYear: + data, err := strconv.ParseInt(rawValue, 10, 64) + if err != nil { + log.Panic("invalid column value for year", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetInt64(data) + return d + case mysql.TypeFloat: + data, err := strconv.ParseFloat(rawValue, 32) + if err != nil { + log.Panic("invalid column value for float", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetFloat32(float32(data)) + return d + case mysql.TypeDouble: + data, err := strconv.ParseFloat(rawValue, 64) + if err != nil { + log.Panic("invalid column value for double", zap.Any("rawValue", rawValue), 
zap.Error(err)) + } + d.SetFloat64(data) + return d + case mysql.TypeVarString, mysql.TypeVarchar, mysql.TypeString, + mysql.TypeBlob, mysql.TypeTinyBlob, mysql.TypeMediumBlob, mysql.TypeLongBlob: + d.SetString(rawValue, ft.GetCollate()) + return d + case mysql.TypeNewDecimal: + data := new(tiTypes.MyDecimal) + err := data.FromString([]byte(rawValue)) + if err != nil { + log.Panic("invalid column value for decimal", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetMysqlDecimal(data) + d.SetLength(ft.GetFlen()) + if ft.GetDecimal() == tiTypes.UnspecifiedLength { + d.SetFrac(int(data.GetDigitsFrac())) + } else { + d.SetFrac(ft.GetDecimal()) + } + return d + case mysql.TypeDate, mysql.TypeDatetime, mysql.TypeTimestamp: + data, err := tiTypes.ParseTime(tiTypes.DefaultStmtNoWarningContext, rawValue, ft.GetType(), ft.GetDecimal()) + if err != nil { + log.Panic("invalid column value for time", zap.Any("rawValue", rawValue), + zap.Int("flen", ft.GetFlen()), zap.Int("decimal", ft.GetDecimal()), + zap.Error(err)) + } + d.SetMysqlTime(data) + return d + case mysql.TypeDuration: + data, _, err := tiTypes.ParseDuration(tiTypes.DefaultStmtNoWarningContext, rawValue, ft.GetDecimal()) + if err != nil { + log.Panic("invalid column value for duration", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetMysqlDuration(data) + return d + case mysql.TypeEnum: + enumValue, err := strconv.ParseUint(rawValue, 10, 64) + if err != nil { + log.Panic("invalid column value for enum", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetMysqlEnum(tiTypes.Enum{ + Name: "", + Value: enumValue, + }, ft.GetCollate()) + return d + case mysql.TypeSet: + setValue, err := strconv.ParseUint(rawValue, 10, 64) + if err != nil { + log.Panic("invalid column value for set", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetMysqlSet(tiTypes.Set{ + Name: "", + Value: setValue, + }, ft.GetCollate()) + return d + case mysql.TypeBit: + data, err := strconv.ParseUint(rawValue, 10, 64) + 
if err != nil { + log.Panic("invalid column value for bit", zap.Any("rawValue", rawValue), zap.Error(err)) + } + byteSize := (ft.GetFlen() + 7) >> 3 + d.SetMysqlBit(tiTypes.NewBinaryLiteralFromUint(data, byteSize)) + return d + case mysql.TypeJSON: + data, err := tiTypes.ParseBinaryJSONFromString(rawValue) + if err != nil { + log.Panic("invalid column value for json", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetMysqlJSON(data) + return d + case mysql.TypeTiDBVectorFloat32: + data, err := tiTypes.ParseVectorFloat32(rawValue) + if err != nil { + log.Panic("cannot parse vector32 value from string", zap.Any("rawValue", rawValue), zap.Error(err)) + } + d.SetVectorFloat32(data) + return d + } + return d +} diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go b/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go new file mode 100644 index 0000000000..152dfa35b3 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go @@ -0,0 +1,232 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package decoder_test + +import ( + "testing" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/decoder" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/stretchr/testify/require" +) + +// DataContent uses CRLF (\r\n) as line terminator to match the codec config +const DataContent1 string = "" + + `{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"INSERT","es":1770184540709,"ts":1770184542274,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp","id":"int","first_name":"varchar"},"old":null,"data":[{"id":"20","first_name":"t","last_name":"TT","_tidb_origin_ts":null,"_tidb_softdelete_time":null}],"_tidb":{"commitTs":464043256649875456}}` + "\r\n" + + `{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"INSERT","es":1770184540709,"ts":1770184542274,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":null,"data":[{"id":"21","first_name":"u","last_name":"UU","_tidb_origin_ts":null,"_tidb_softdelete_time":null}],"_tidb":{"commitTs":464043256649875456}}` + "\r\n" + + 
`{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"INSERT","es":1770301693150,"ts":1770301693833,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp","id":"int"},"old":null,"data":[{"id":"5","first_name":"e","last_name":"E","_tidb_origin_ts":"464073966942421014","_tidb_softdelete_time":null}],"_tidb":{"commitTs":464073967049113629}}` + "\r\n" + + `{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"INSERT","es":1770301693150,"ts":1770301693833,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"_tidb_softdelete_time":"timestamp","id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint"},"old":null,"data":[{"id":"6","first_name":"f","last_name":"F","_tidb_origin_ts":"464073966942421014","_tidb_softdelete_time":null}],"_tidb":{"commitTs":464073967049113629}}` + "\r\n" + + `{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"INSERT","es":1770303499850,"ts":1770303500498,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":null,"data":[{"id":"7","first_name":"g","last_name":"G","_tidb_origin_ts":"464074440387592202","_tidb_softdelete_time":null}],"_tidb":{"commitTs":464074440664678441}}` + "\r\n" + + 
`{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"UPDATE","es":1770303520951,"ts":1770303522531,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp","id":"int","first_name":"varchar"},"old":[{"id":"7","first_name":"g","last_name":"G","_tidb_origin_ts":"464074440387592202","_tidb_softdelete_time":null}],"data":[{"id":"7","first_name":"g","last_name":"G","_tidb_origin_ts":null,"_tidb_softdelete_time":"2026-02-05 22:58:40.992217"}],"_tidb":{"commitTs":464074446196178963}}` + "\r\n" + + `{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"INSERT","es":1770303498793,"ts":1770303499864,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":null,"data":[{"id":"8","first_name":"h","last_name":"H","_tidb_origin_ts":null,"_tidb_softdelete_time":null}],"_tidb":{"commitTs":464074440387592202}}` + "\r\n" + + `{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"UPDATE","es":1770303522494,"ts":1770303523900,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":[{"id":"8","first_name":"h","last_name":"H","_tidb_origin_ts":null,"_tidb_softdelete_time":null}],"data":[{"id":"8","first_name":"h","last_name":"H","_tidb_origin_ts":"464074446196178963","_tidb_softdelete_time":"2026-02-05 22:58:40.992217"}],"_tidb":{"commitTs":464074446600667164}}` + +var ExpectedRecords1 = []decoder.Record{ + {CdcVersion: types.CdcVersion{CommitTs: 
464043256649875456, OriginTs: 0}, Pk: "038000000000000014", ColumnValues: map[string]any{"first_name": "t", "last_name": "TT", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464043256649875456, OriginTs: 0}, Pk: "038000000000000015", ColumnValues: map[string]any{"first_name": "u", "last_name": "UU", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464073967049113629, OriginTs: 464073966942421014}, Pk: "038000000000000005", ColumnValues: map[string]any{"first_name": "e", "last_name": "E", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464073967049113629, OriginTs: 464073966942421014}, Pk: "038000000000000006", ColumnValues: map[string]any{"first_name": "f", "last_name": "F", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464074440664678441, OriginTs: 464074440387592202}, Pk: "038000000000000007", ColumnValues: map[string]any{"first_name": "g", "last_name": "G", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464074446196178963, OriginTs: 0}, Pk: "038000000000000007", ColumnValues: map[string]any{"first_name": "g", "last_name": "G", "_tidb_softdelete_time": "2026-02-05 22:58:40.992217"}}, + {CdcVersion: types.CdcVersion{CommitTs: 464074440387592202, OriginTs: 0}, Pk: "038000000000000008", ColumnValues: map[string]any{"first_name": "h", "last_name": "H", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464074446600667164, OriginTs: 464074446196178963}, Pk: "038000000000000008", ColumnValues: map[string]any{"first_name": "h", "last_name": "H", "_tidb_softdelete_time": "2026-02-05 22:58:40.992217"}}, +} + +func TestCanalJSONDecoder1(t *testing.T) { + records, err := decoder.Decode([]byte(DataContent1)) + require.NoError(t, err) + require.Len(t, records, 8) + for i, actualRecord := range records { + expectedRecord := ExpectedRecords1[i] + require.Equal(t, actualRecord.Pk, expectedRecord.Pk) + require.Equal(t, 
actualRecord.ColumnValues, expectedRecord.ColumnValues) + require.Equal(t, actualRecord.CdcVersion.CommitTs, expectedRecord.CdcVersion.CommitTs) + require.Equal(t, actualRecord.CdcVersion.OriginTs, expectedRecord.CdcVersion.OriginTs) + } +} + +const DataContent2 string = "" + + `{"id":0,"database":"test_active","table":"message2","pkNames":["id","first_name"],"isDdl":false,"type":"INSERT","es":1770344412751,"ts":1770344413749,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":null,"data":[{"id":"100","first_name":"a","last_name":"A","_tidb_origin_ts":"464085165262503958","_tidb_softdelete_time":null}],"_tidb":{"commitTs":464085165736198159}}` + "\r\n" + + `{"id":0,"database":"test_active","table":"message2","pkNames":["id","first_name"],"isDdl":false,"type":"INSERT","es":1770344427851,"ts":1770344429772,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":null,"data":[{"id":"101","first_name":"b","last_name":"B","_tidb_origin_ts":null,"_tidb_softdelete_time":null}],"_tidb":{"commitTs":464085169694572575}}` + "\r\n" + +var ExpectedRecords2 = []decoder.Record{ + {CdcVersion: types.CdcVersion{CommitTs: 464085165736198159, OriginTs: 464085165262503958}, Pk: "016100000000000000f8038000000000000064", ColumnValues: map[string]any{"last_name": "A", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464085169694572575, OriginTs: 0}, Pk: "016200000000000000f8038000000000000065", ColumnValues: map[string]any{"last_name": "B", "_tidb_softdelete_time": nil}}, +} + +func TestCanalJSONDecoder2(t *testing.T) { + records, err := decoder.Decode([]byte(DataContent2)) + 
require.NoError(t, err) + require.Len(t, records, 2) + for i, actualRecord := range records { + expectedRecord := ExpectedRecords2[i] + require.Equal(t, actualRecord.Pk, expectedRecord.Pk) + require.Equal(t, actualRecord.ColumnValues, expectedRecord.ColumnValues) + require.Equal(t, actualRecord.CdcVersion.CommitTs, expectedRecord.CdcVersion.CommitTs) + require.Equal(t, actualRecord.CdcVersion.OriginTs, expectedRecord.CdcVersion.OriginTs) + } +} + +func TestRecord_EqualDownstreamRecord(t *testing.T) { + tests := []struct { + name string + upstream *decoder.Record + downstream *decoder.Record + expectedEqual bool + }{ + { + name: "equal records", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + ColumnValues: map[string]any{ + "col1": "value1", + "col2": 42, + }, + }, + downstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, + Pk: "pk1", + ColumnValues: map[string]any{ + "col1": "value1", + "col2": 42, + }, + }, + expectedEqual: true, + }, + { + name: "downstream is nil", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + }, + downstream: nil, + expectedEqual: false, + }, + { + name: "different CommitTs and OriginTs", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + }, + downstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 200}, + Pk: "pk1", + }, + expectedEqual: false, + }, + { + name: "different primary keys", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + }, + downstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, + Pk: "pk2", + }, + expectedEqual: false, + }, + { + name: "different column count", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + ColumnValues: map[string]any{ + "col1": "value1", 
+ }, + }, + downstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, + Pk: "pk1", + ColumnValues: map[string]any{ + "col1": "value1", + "col2": "value2", + }, + }, + expectedEqual: false, + }, + { + name: "different column names", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + ColumnValues: map[string]any{ + "col1": "value1", + }, + }, + downstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, + Pk: "pk1", + ColumnValues: map[string]any{ + "col2": "value1", + }, + }, + expectedEqual: false, + }, + { + name: "different column values", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + ColumnValues: map[string]any{ + "col1": "value1", + }, + }, + downstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, + Pk: "pk1", + ColumnValues: map[string]any{ + "col1": "value2", + }, + }, + expectedEqual: false, + }, + { + name: "empty column values", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + ColumnValues: map[string]any{}, + }, + downstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, + Pk: "pk1", + ColumnValues: map[string]any{}, + }, + expectedEqual: true, + }, + { + name: "nil column values", + upstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, + Pk: "pk1", + ColumnValues: nil, + }, + downstream: &decoder.Record{ + CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, + Pk: "pk1", + ColumnValues: nil, + }, + expectedEqual: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.upstream.EqualDownstreamRecord(tt.downstream) + require.Equal(t, tt.expectedEqual, result) + }) + } +} diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go index 
90646e2b7c..087d5dfa4d 100644 --- a/cmd/multi-cluster-consistency-checker/main.go +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -104,7 +104,9 @@ func run(cmd *cobra.Command, args []string) { errChan := make(chan error, 1) go func() { err := runTask(ctx, cfg) - log.Error("task error", zap.Error(err)) + if err != nil { + log.Error("task error", zap.Error(err)) + } errChan <- err }() diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder.go b/cmd/multi-cluster-consistency-checker/recorder/recorder.go index 5b6bdffb4f..37f9a3f2e7 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder.go @@ -20,7 +20,8 @@ import ( "path/filepath" "github.com/pingcap/log" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" "github.com/pingcap/ticdc/pkg/errors" "go.uber.org/zap" ) @@ -32,7 +33,7 @@ type Recorder struct { checkpoint *Checkpoint } -func NewRecorder(dataDir string) (*Recorder, error) { +func NewRecorder(dataDir string, clusters map[string]config.ClusterConfig) (*Recorder, error) { if err := os.MkdirAll(filepath.Join(dataDir, "report"), 0755); err != nil { return nil, errors.Trace(err) } @@ -45,7 +46,24 @@ func NewRecorder(dataDir string) (*Recorder, error) { checkpoint: NewCheckpoint(), } - return r, r.initializeCheckpoint() + if err := r.initializeCheckpoint(); err != nil { + return nil, errors.Trace(err) + } + for _, item := range r.checkpoint.CheckpointItems { + if item == nil { + continue + } + if len(item.ClusterInfo) != len(clusters) { + return nil, errors.Errorf("checkpoint item (round %d) cluster info length mismatch, expected %d, got %d", item.Round, len(clusters), len(item.ClusterInfo)) + } + for clusterID := range clusters { + if _, ok := item.ClusterInfo[clusterID]; !ok { + return nil, 
errors.Errorf("checkpoint item (round %d) cluster info missing for cluster %s", item.Round, clusterID) + } + } + } + + return r, nil } func (r *Recorder) GetCheckpoint() *Checkpoint { @@ -67,10 +85,11 @@ func (r *Recorder) initializeCheckpoint() error { if err := json.Unmarshal(data, r.checkpoint); err != nil { return errors.Trace(err) } + return nil } -func (r *Recorder) RecordTimeWindow(timeWindowData map[string]utils.TimeWindowData, report *Report) error { +func (r *Recorder) RecordTimeWindow(timeWindowData map[string]types.TimeWindowData, report *Report) error { for clusterID, timeWindow := range timeWindowData { log.Info("time window advanced", zap.Uint64("round", report.Round), @@ -107,7 +126,7 @@ func (r *Recorder) flushReport(report *Report) error { return nil } -func (r *Recorder) flushCheckpoint(round uint64, timeWindowData map[string]utils.TimeWindowData) error { +func (r *Recorder) flushCheckpoint(round uint64, timeWindowData map[string]types.TimeWindowData) error { r.checkpoint.NewTimeWindowData(round, timeWindowData) filename := filepath.Join(r.checkpointDir, "checkpoint.json") data, err := json.Marshal(r.checkpoint) diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go new file mode 100644 index 0000000000..c0da03775b --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go @@ -0,0 +1,371 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
package recorder

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"testing"

	"github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config"
	"github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types"
	"github.com/stretchr/testify/require"
)

// TestNewRecorder covers recorder construction: directory creation, initial
// (empty) checkpoint state, loading a previously persisted checkpoint, and
// the validation of the configured cluster set against the loaded
// checkpoint's per-round cluster info.
func TestNewRecorder(t *testing.T) {
	t.Parallel()

	t.Run("creates directories", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()
		r, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)
		require.NotNil(t, r)

		// Verify directories exist
		info, err := os.Stat(filepath.Join(dataDir, "report"))
		require.NoError(t, err)
		require.True(t, info.IsDir())

		info, err = os.Stat(filepath.Join(dataDir, "checkpoint"))
		require.NoError(t, err)
		require.True(t, info.IsDir())
	})

	t.Run("checkpoint is initialized empty", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()
		r, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)

		// With no checkpoint file on disk, all three retained slots are nil.
		cp := r.GetCheckpoint()
		require.NotNil(t, cp)
		require.Nil(t, cp.CheckpointItems[0])
		require.Nil(t, cp.CheckpointItems[1])
		require.Nil(t, cp.CheckpointItems[2])
	})

	t.Run("loads existing checkpoint on startup", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()

		// First recorder: write a checkpoint
		r1, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)

		twData := map[string]types.TimeWindowData{
			"c1": {
				TimeWindow: types.TimeWindow{LeftBoundary: 1, RightBoundary: 10},
				MaxVersion: map[types.SchemaTableKey]types.VersionKey{
					{Schema: "db", Table: "tbl"}: {Version: 1, VersionPath: "vp1", DataPath: "dp1"},
				},
			},
		}
		report := NewReport(0)
		err = r1.RecordTimeWindow(twData, report)
		require.NoError(t, err)

		// Second recorder: should load the checkpoint
		r2, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)

		// The single recorded round lands in the newest slot, index 2.
		cp := r2.GetCheckpoint()
		require.NotNil(t, cp.CheckpointItems[2])
		require.Equal(t, uint64(0), cp.CheckpointItems[2].Round)
		info := cp.CheckpointItems[2].ClusterInfo["c1"]
		require.Equal(t, uint64(1), info.TimeWindow.LeftBoundary)
		require.Equal(t, uint64(10), info.TimeWindow.RightBoundary)
	})

	t.Run("cluster count mismatch rejects checkpoint", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()

		// Write a checkpoint with 2 clusters
		r1, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}, "c2": {}})
		require.NoError(t, err)
		twData := map[string]types.TimeWindowData{
			"c1": {TimeWindow: types.TimeWindow{LeftBoundary: 0, RightBoundary: 10}},
			"c2": {TimeWindow: types.TimeWindow{LeftBoundary: 0, RightBoundary: 10}},
		}
		err = r1.RecordTimeWindow(twData, NewReport(0))
		require.NoError(t, err)

		// Try to load with only 1 cluster — should fail
		_, err = NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.Error(t, err)
		require.Contains(t, err.Error(), "cluster info length mismatch")
	})

	t.Run("cluster ID missing rejects checkpoint", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()

		// Write a checkpoint with clusters c1 and c2
		r1, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}, "c2": {}})
		require.NoError(t, err)
		twData := map[string]types.TimeWindowData{
			"c1": {TimeWindow: types.TimeWindow{LeftBoundary: 0, RightBoundary: 10}},
			"c2": {TimeWindow: types.TimeWindow{LeftBoundary: 0, RightBoundary: 10}},
		}
		err = r1.RecordTimeWindow(twData, NewReport(0))
		require.NoError(t, err)

		// Try to load with c1 and c3 (same count, different ID) — should fail
		_, err = NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}, "c3": {}})
		require.Error(t, err)
		require.Contains(t, err.Error(), "cluster info missing for cluster c3")
	})

	t.Run("matching clusters loads checkpoint successfully", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()

		clusters := map[string]config.ClusterConfig{"c1": {}, "c2": {}}

		// Write checkpoint across 3 rounds so all 3 slots are filled
		r1, err := NewRecorder(dataDir, clusters)
		require.NoError(t, err)
		for i := range 3 {
			twData := map[string]types.TimeWindowData{
				"c1": {TimeWindow: types.TimeWindow{LeftBoundary: uint64(i * 10), RightBoundary: uint64((i + 1) * 10)}},
				"c2": {TimeWindow: types.TimeWindow{LeftBoundary: uint64(i * 10), RightBoundary: uint64((i + 1) * 10)}},
			}
			err = r1.RecordTimeWindow(twData, NewReport(uint64(i)))
			require.NoError(t, err)
		}

		// Reload with the same clusters — should succeed
		r2, err := NewRecorder(dataDir, clusters)
		require.NoError(t, err)
		cp := r2.GetCheckpoint()
		require.NotNil(t, cp.CheckpointItems[0])
		require.NotNil(t, cp.CheckpointItems[1])
		require.NotNil(t, cp.CheckpointItems[2])
	})

	t.Run("nil checkpoint items are skipped during validation", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()

		// Write only 1 round — items[0] and items[1] stay nil
		r1, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)
		twData := map[string]types.TimeWindowData{
			"c1": {TimeWindow: types.TimeWindow{LeftBoundary: 0, RightBoundary: 10}},
		}
		err = r1.RecordTimeWindow(twData, NewReport(0))
		require.NoError(t, err)

		// Reload with the same cluster set — should succeed; only the
		// non-nil item[2] is validated, the nil items[0]/items[1] are
		// skipped rather than failing the length/ID checks.
		_, err = NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)
	})

	t.Run("no checkpoint file skips validation", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()

		// Fresh start with any cluster config — should always succeed
		r, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}, "c2": {}, "c3": {}})
		require.NoError(t, err)
		require.NotNil(t, r)

		cp := r.GetCheckpoint()
		require.Nil(t, cp.CheckpointItems[0])
		require.Nil(t, cp.CheckpointItems[1])
		require.Nil(t, cp.CheckpointItems[2])
	})
}

// TestRecorder_RecordTimeWindow covers the write path: the checkpoint file is
// always flushed, while report files are written only when the report has
// something to flush; it also checks the 3-slot round-eviction behavior.
func TestRecorder_RecordTimeWindow(t *testing.T) {
	t.Parallel()

	t.Run("without report flush writes only checkpoint", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()
		r, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)

		twData := map[string]types.TimeWindowData{
			"c1": {TimeWindow: types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}},
		}
		report := NewReport(0) // needFlush = false
		err = r.RecordTimeWindow(twData, report)
		require.NoError(t, err)

		// checkpoint.json should exist
		_, err = os.Stat(filepath.Join(dataDir, "checkpoint", "checkpoint.json"))
		require.NoError(t, err)

		// No report files
		entries, err := os.ReadDir(filepath.Join(dataDir, "report"))
		require.NoError(t, err)
		require.Empty(t, entries)
	})

	t.Run("with report flush writes both checkpoint and report", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()
		r, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)

		twData := map[string]types.TimeWindowData{
			"c1": {TimeWindow: types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}},
		}
		// Adding a data-loss item makes the report flushable.
		report := NewReport(5)
		cr := NewClusterReport("c1")
		cr.AddDataLossItem("d1", "pk-1", 100, 200, false)
		report.AddClusterReport("c1", cr)
		require.True(t, report.NeedFlush())

		err = r.RecordTimeWindow(twData, report)
		require.NoError(t, err)

		// checkpoint.json should exist
		_, err = os.Stat(filepath.Join(dataDir, "checkpoint", "checkpoint.json"))
		require.NoError(t, err)

		// Report files should exist (human-readable .report and .json twin)
		_, err = os.Stat(filepath.Join(dataDir, "report", "report-5.report"))
		require.NoError(t, err)
		_, err = os.Stat(filepath.Join(dataDir, "report", "report-5.json"))
		require.NoError(t, err)

		// Verify report content
		reportData, err := os.ReadFile(filepath.Join(dataDir, "report", "report-5.report"))
		require.NoError(t, err)
		require.Contains(t, string(reportData), "round: 5")
		require.Contains(t, string(reportData), "pk-1")

		// Verify json report is valid JSON
		jsonData, err := os.ReadFile(filepath.Join(dataDir, "report", "report-5.json"))
		require.NoError(t, err)
		var parsed Report
		err = json.Unmarshal(jsonData, &parsed)
		require.NoError(t, err)
		require.Equal(t, uint64(5), parsed.Round)
	})

	t.Run("multiple rounds advance checkpoint", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()
		r, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)

		for i := uint64(0); i < 4; i++ {
			twData := map[string]types.TimeWindowData{
				"c1": {
					TimeWindow: types.TimeWindow{LeftBoundary: i * 10, RightBoundary: (i + 1) * 10},
					MaxVersion: map[types.SchemaTableKey]types.VersionKey{
						{Schema: "db", Table: "tbl"}: {Version: i + 1},
					},
				},
			}
			report := NewReport(i)
			err = r.RecordTimeWindow(twData, report)
			require.NoError(t, err)
		}

		// After 4 rounds, checkpoint should have rounds 1, 2, 3 (oldest evicted)
		cp := r.GetCheckpoint()
		require.NotNil(t, cp.CheckpointItems[0])
		require.NotNil(t, cp.CheckpointItems[1])
		require.NotNil(t, cp.CheckpointItems[2])
		require.Equal(t, uint64(1), cp.CheckpointItems[0].Round)
		require.Equal(t, uint64(2), cp.CheckpointItems[1].Round)
		require.Equal(t, uint64(3), cp.CheckpointItems[2].Round)
	})
}

// TestRecorder_CheckpointPersistence verifies that the on-disk checkpoint
// fully round-trips: a restarted recorder sees the same rounds, and derived
// views (ToScanRange) and the raw JSON stay consistent with what was
// recorded.
func TestRecorder_CheckpointPersistence(t *testing.T) {
	t.Parallel()

	t.Run("checkpoint survives restart", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()

		stk := types.SchemaTableKey{Schema: "db", Table: "tbl"}

		// Simulate 3 rounds
		r1, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)
		for i := uint64(0); i < 3; i++ {
			twData := map[string]types.TimeWindowData{
				"c1": {
					TimeWindow: types.TimeWindow{LeftBoundary: i * 10, RightBoundary: (i + 1) * 10},
					MaxVersion: map[types.SchemaTableKey]types.VersionKey{
						stk: {Version: i + 1, VersionPath: fmt.Sprintf("vp%d", i), DataPath: fmt.Sprintf("dp%d", i)},
					},
				},
			}
			report := NewReport(i)
			err = r1.RecordTimeWindow(twData, report)
			require.NoError(t, err)
		}

		// Restart: new recorder from same dir
		r2, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)

		cp := r2.GetCheckpoint()
		require.Equal(t, uint64(0), cp.CheckpointItems[0].Round)
		require.Equal(t, uint64(1), cp.CheckpointItems[1].Round)
		require.Equal(t, uint64(2), cp.CheckpointItems[2].Round)

		// Verify ToScanRange works after restart: the scan range spans from
		// the oldest retained round (vp0/dp0) to the newest (vp2/dp2).
		scanRange, err := cp.ToScanRange("c1")
		require.NoError(t, err)
		require.Len(t, scanRange, 1)
		sr := scanRange[stk]
		require.Equal(t, "vp0", sr.StartVersionKey)
		require.Equal(t, "vp2", sr.EndVersionKey)
		require.Equal(t, "dp0", sr.StartDataPath)
		require.Equal(t, "dp2", sr.EndDataPath)
	})

	t.Run("checkpoint json is valid", func(t *testing.T) {
		t.Parallel()
		dataDir := t.TempDir()
		r, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}})
		require.NoError(t, err)

		twData := map[string]types.TimeWindowData{
			"c1": {
				TimeWindow: types.TimeWindow{
					LeftBoundary:  100,
					RightBoundary: 200,
					CheckpointTs:  map[string]uint64{"c2": 150},
				},
			},
		}
		report := NewReport(0)
		err = r.RecordTimeWindow(twData, report)
		require.NoError(t, err)

		// Read and parse checkpoint.json
		data, err := os.ReadFile(filepath.Join(dataDir, "checkpoint", "checkpoint.json"))
		require.NoError(t, err)

		var cp Checkpoint
		err = json.Unmarshal(data, &cp)
		require.NoError(t, err)
		require.NotNil(t, cp.CheckpointItems[2])
		require.Equal(t, uint64(100), cp.CheckpointItems[2].ClusterInfo["c1"].TimeWindow.LeftBoundary)
		require.Equal(t, uint64(200), cp.CheckpointItems[2].ClusterInfo["c1"].TimeWindow.RightBoundary)
	})
}
b/cmd/multi-cluster-consistency-checker/recorder/types.go index 6c858eb99e..d0b38fb0d8 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go @@ -17,7 +17,7 @@ import ( "fmt" "strings" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" "github.com/pingcap/ticdc/pkg/errors" ) @@ -171,11 +171,11 @@ func (r *Report) NeedFlush() bool { } type SchemaTableVersionKey struct { - utils.SchemaTableKey - utils.VersionKey + types.SchemaTableKey + types.VersionKey } -func NewSchemaTableVersionKeyFromVersionKeyMap(versionKeyMap map[utils.SchemaTableKey]utils.VersionKey) []SchemaTableVersionKey { +func NewSchemaTableVersionKeyFromVersionKeyMap(versionKeyMap map[types.SchemaTableKey]types.VersionKey) []SchemaTableVersionKey { result := make([]SchemaTableVersionKey, 0, len(versionKeyMap)) for schemaTableKey, versionKey := range versionKeyMap { result = append(result, SchemaTableVersionKey{ @@ -187,7 +187,7 @@ func NewSchemaTableVersionKeyFromVersionKeyMap(versionKeyMap map[utils.SchemaTab } type CheckpointClusterInfo struct { - TimeWindow utils.TimeWindow `json:"time_window"` + TimeWindow types.TimeWindow `json:"time_window"` MaxVersion []SchemaTableVersionKey `json:"max_version"` } @@ -210,7 +210,7 @@ func NewCheckpoint() *Checkpoint { } } -func (c *Checkpoint) NewTimeWindowData(round uint64, timeWindowData map[string]utils.TimeWindowData) { +func (c *Checkpoint) NewTimeWindowData(round uint64, timeWindowData map[string]types.TimeWindowData) { newCheckpointItem := CheckpointItem{ Round: round, ClusterInfo: make(map[string]CheckpointClusterInfo), @@ -233,8 +233,8 @@ type ScanRange struct { EndDataPath string } -func (c *Checkpoint) ToScanRange(clusterID string) (map[utils.SchemaTableKey]*ScanRange, error) { - result := make(map[utils.SchemaTableKey]*ScanRange) +func (c *Checkpoint) ToScanRange(clusterID string) 
(map[types.SchemaTableKey]*ScanRange, error) { + result := make(map[types.SchemaTableKey]*ScanRange) if c.CheckpointItems[2] == nil { return result, nil } diff --git a/cmd/multi-cluster-consistency-checker/recorder/types_test.go b/cmd/multi-cluster-consistency-checker/recorder/types_test.go new file mode 100644 index 0000000000..b9139e6858 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/recorder/types_test.go @@ -0,0 +1,548 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package recorder + +import ( + "fmt" + "testing" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/stretchr/testify/require" +) + +func TestDataLossItem_String(t *testing.T) { + t.Parallel() + + t.Run("data loss", func(t *testing.T) { + t.Parallel() + item := &DataLossItem{ + DownstreamClusterID: "cluster-2", + PK: "pk-1", + OriginTS: 100, + CommitTS: 200, + Inconsistent: false, + } + s := item.String() + require.Equal(t, "downstream cluster: cluster-2, pk: pk-1, origin ts: 100, commit ts: 200, type: data loss", s) + }) + + t.Run("data inconsistent", func(t *testing.T) { + t.Parallel() + item := &DataLossItem{ + DownstreamClusterID: "cluster-3", + PK: "pk-2", + OriginTS: 300, + CommitTS: 400, + Inconsistent: true, + } + s := item.String() + require.Equal(t, "downstream cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, type: data inconsistent", s) + }) +} + +func TestDataRedundantItem_String(t *testing.T) { + t.Parallel() + item := &DataRedundantItem{PK: "pk-x", 
OriginTS: 10, CommitTS: 20} + s := item.String() + require.Equal(t, "pk: pk-x, origin ts: 10, commit ts: 20", s) +} + +func TestLWWViolationItem_String(t *testing.T) { + t.Parallel() + item := &LWWViolationItem{ + PK: "pk-y", + ExistingOriginTS: 1, + ExistingCommitTS: 2, + OriginTS: 3, + CommitTS: 4, + } + s := item.String() + require.Equal(t, "pk: pk-y, existing origin ts: 1, existing commit ts: 2, origin ts: 3, commit ts: 4", s) +} + +func TestClusterReport(t *testing.T) { + t.Parallel() + + t.Run("new cluster report is empty and does not need flush", func(t *testing.T) { + t.Parallel() + cr := NewClusterReport("c1") + require.Equal(t, "c1", cr.ClusterID) + require.Empty(t, cr.DataLossItems) + require.Empty(t, cr.DataRedundantItems) + require.Empty(t, cr.LWWViolationItems) + require.False(t, cr.needFlush) + }) + + t.Run("add data loss item sets needFlush", func(t *testing.T) { + t.Parallel() + cr := NewClusterReport("c1") + cr.AddDataLossItem("downstream-1", "pk-1", 100, 200, false) + require.Len(t, cr.DataLossItems, 1) + require.True(t, cr.needFlush) + require.Equal(t, "downstream-1", cr.DataLossItems[0].DownstreamClusterID) + require.Equal(t, "pk-1", cr.DataLossItems[0].PK) + require.Equal(t, uint64(100), cr.DataLossItems[0].OriginTS) + require.Equal(t, uint64(200), cr.DataLossItems[0].CommitTS) + require.False(t, cr.DataLossItems[0].Inconsistent) + }) + + t.Run("add data redundant item sets needFlush", func(t *testing.T) { + t.Parallel() + cr := NewClusterReport("c1") + cr.AddDataRedundantItem("pk-2", 300, 400) + require.Len(t, cr.DataRedundantItems, 1) + require.True(t, cr.needFlush) + }) + + t.Run("add lww violation item sets needFlush", func(t *testing.T) { + t.Parallel() + cr := NewClusterReport("c1") + cr.AddLWWViolationItem("pk-3", 1, 2, 3, 4) + require.Len(t, cr.LWWViolationItems, 1) + require.True(t, cr.needFlush) + require.Equal(t, uint64(1), cr.LWWViolationItems[0].ExistingOriginTS) + require.Equal(t, uint64(2), 
cr.LWWViolationItems[0].ExistingCommitTS) + require.Equal(t, uint64(3), cr.LWWViolationItems[0].OriginTS) + require.Equal(t, uint64(4), cr.LWWViolationItems[0].CommitTS) + }) + + t.Run("add multiple items", func(t *testing.T) { + t.Parallel() + cr := NewClusterReport("c1") + cr.AddDataLossItem("d1", "pk-1", 1, 2, false) + cr.AddDataLossItem("d2", "pk-2", 3, 4, true) + cr.AddDataRedundantItem("pk-3", 5, 6) + cr.AddLWWViolationItem("pk-4", 7, 8, 9, 10) + require.Len(t, cr.DataLossItems, 2) + require.Len(t, cr.DataRedundantItems, 1) + require.Len(t, cr.LWWViolationItems, 1) + }) +} + +func TestReport(t *testing.T) { + t.Parallel() + + t.Run("new report does not need flush", func(t *testing.T) { + t.Parallel() + r := NewReport(1) + require.Equal(t, uint64(1), r.Round) + require.Empty(t, r.ClusterReports) + require.False(t, r.NeedFlush()) + }) + + t.Run("add empty cluster report does not set needFlush", func(t *testing.T) { + t.Parallel() + r := NewReport(1) + cr := NewClusterReport("c1") + r.AddClusterReport("c1", cr) + require.Len(t, r.ClusterReports, 1) + require.False(t, r.NeedFlush()) + }) + + t.Run("add non-empty cluster report sets needFlush", func(t *testing.T) { + t.Parallel() + r := NewReport(1) + cr := NewClusterReport("c1") + cr.AddDataLossItem("d1", "pk-1", 1, 2, false) + r.AddClusterReport("c1", cr) + require.True(t, r.NeedFlush()) + }) + + t.Run("needFlush propagates from any cluster report", func(t *testing.T) { + t.Parallel() + r := NewReport(1) + cr1 := NewClusterReport("c1") + cr2 := NewClusterReport("c2") + cr2.AddDataRedundantItem("pk-1", 1, 2) + r.AddClusterReport("c1", cr1) + r.AddClusterReport("c2", cr2) + require.True(t, r.NeedFlush()) + }) +} + +func TestReport_MarshalReport(t *testing.T) { + t.Parallel() + + t.Run("empty report", func(t *testing.T) { + t.Parallel() + r := NewReport(5) + s := r.MarshalReport() + require.Equal(t, "round: 5\n\n", s) + }) + + t.Run("report with data loss items", func(t *testing.T) { + t.Parallel() + r := 
NewReport(1) + cr := NewClusterReport("c1") + cr.AddDataLossItem("d1", "pk-1", 100, 200, false) + r.AddClusterReport("c1", cr) + s := r.MarshalReport() + require.Equal(t, "round: 1\n\n"+ + "[cluster: c1]\n"+ + " - [data loss items: 1]\n"+ + " - [downstream cluster: d1, pk: pk-1, origin ts: 100, commit ts: 200, type: data loss]\n\n", + s) + }) + + t.Run("report with data redundant items", func(t *testing.T) { + t.Parallel() + r := NewReport(2) + cr := NewClusterReport("c2") + cr.AddDataRedundantItem("pk-r", 10, 20) + r.AddClusterReport("c2", cr) + s := r.MarshalReport() + require.Equal(t, "round: 2\n\n"+ + "[cluster: c2]\n"+ + " - [data redundant items: 1]\n"+ + " - [pk: pk-r, origin ts: 10, commit ts: 20]\n\n", + s) + }) + + t.Run("report with lww violation items", func(t *testing.T) { + t.Parallel() + r := NewReport(3) + cr := NewClusterReport("c3") + cr.AddLWWViolationItem("pk-v", 1, 2, 3, 4) + r.AddClusterReport("c3", cr) + s := r.MarshalReport() + require.Equal(t, "round: 3\n\n"+ + "[cluster: c3]\n"+ + " - [lww violation items: 1]\n"+ + " - [pk: pk-v, existing origin ts: 1, existing commit ts: 2, origin ts: 3, commit ts: 4]\n\n", + s) + }) + + t.Run("skips cluster reports that do not need flush", func(t *testing.T) { + t.Parallel() + r := NewReport(1) + crEmpty := NewClusterReport("empty-cluster") + crFull := NewClusterReport("full-cluster") + crFull.AddDataLossItem("d1", "pk-1", 1, 2, false) + r.AddClusterReport("empty-cluster", crEmpty) + r.AddClusterReport("full-cluster", crFull) + s := r.MarshalReport() + require.Equal(t, "round: 1\n\n"+ + "[cluster: full-cluster]\n"+ + " - [data loss items: 1]\n"+ + " - [downstream cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data loss]\n\n", + s) + }) + + t.Run("report with mixed items", func(t *testing.T) { + t.Parallel() + r := NewReport(10) + cr := NewClusterReport("c1") + cr.AddDataLossItem("d1", "pk-1", 1, 2, true) + cr.AddDataRedundantItem("pk-2", 3, 4) + cr.AddLWWViolationItem("pk-3", 5, 6, 7, 8) + 
r.AddClusterReport("c1", cr) + s := r.MarshalReport() + require.Equal(t, "round: 10\n\n"+ + "[cluster: c1]\n"+ + " - [data loss items: 1]\n"+ + " - [downstream cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data inconsistent]\n"+ + " - [data redundant items: 1]\n"+ + " - [pk: pk-2, origin ts: 3, commit ts: 4]\n"+ + " - [lww violation items: 1]\n"+ + " - [pk: pk-3, existing origin ts: 5, existing commit ts: 6, origin ts: 7, commit ts: 8]\n\n", + s) + }) +} + +func TestNewSchemaTableVersionKeyFromVersionKeyMap(t *testing.T) { + t.Parallel() + + t.Run("empty map", func(t *testing.T) { + t.Parallel() + result := NewSchemaTableVersionKeyFromVersionKeyMap(nil) + require.Empty(t, result) + }) + + t.Run("single entry", func(t *testing.T) { + t.Parallel() + m := map[types.SchemaTableKey]types.VersionKey{ + {Schema: "db", Table: "tbl"}: {Version: 1, VersionPath: "path1"}, + } + result := NewSchemaTableVersionKeyFromVersionKeyMap(m) + require.Len(t, result, 1) + require.Equal(t, "db", result[0].Schema) + require.Equal(t, "tbl", result[0].Table) + require.Equal(t, uint64(1), result[0].Version) + require.Equal(t, "path1", result[0].VersionPath) + }) + + t.Run("multiple entries", func(t *testing.T) { + t.Parallel() + m := map[types.SchemaTableKey]types.VersionKey{ + {Schema: "db1", Table: "t1"}: {Version: 1}, + {Schema: "db2", Table: "t2"}: {Version: 2}, + } + result := NewSchemaTableVersionKeyFromVersionKeyMap(m) + require.Len(t, result, 2) + }) +} + +func TestCheckpoint_NewTimeWindowData(t *testing.T) { + t.Parallel() + + t.Run("first call populates slot 2", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": {TimeWindow: types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}}, + }) + require.Nil(t, cp.CheckpointItems[0]) + require.Nil(t, cp.CheckpointItems[1]) + require.NotNil(t, cp.CheckpointItems[2]) + require.Equal(t, uint64(0), cp.CheckpointItems[2].Round) + }) + + t.Run("second call shifts 
slots", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": {TimeWindow: types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}}, + }) + cp.NewTimeWindowData(1, map[string]types.TimeWindowData{ + "c1": {TimeWindow: types.TimeWindow{LeftBoundary: 10, RightBoundary: 20}}, + }) + require.Nil(t, cp.CheckpointItems[0]) + require.NotNil(t, cp.CheckpointItems[1]) + require.NotNil(t, cp.CheckpointItems[2]) + require.Equal(t, uint64(0), cp.CheckpointItems[1].Round) + require.Equal(t, uint64(1), cp.CheckpointItems[2].Round) + }) + + t.Run("third call fills all slots", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + for i := uint64(0); i < 3; i++ { + cp.NewTimeWindowData(i, map[string]types.TimeWindowData{ + "c1": {TimeWindow: types.TimeWindow{LeftBoundary: i * 10, RightBoundary: (i + 1) * 10}}, + }) + } + require.NotNil(t, cp.CheckpointItems[0]) + require.NotNil(t, cp.CheckpointItems[1]) + require.NotNil(t, cp.CheckpointItems[2]) + require.Equal(t, uint64(0), cp.CheckpointItems[0].Round) + require.Equal(t, uint64(1), cp.CheckpointItems[1].Round) + require.Equal(t, uint64(2), cp.CheckpointItems[2].Round) + }) + + t.Run("fourth call evicts oldest", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + for i := uint64(0); i < 4; i++ { + cp.NewTimeWindowData(i, map[string]types.TimeWindowData{ + "c1": {TimeWindow: types.TimeWindow{LeftBoundary: i * 10, RightBoundary: (i + 1) * 10}}, + }) + } + require.Equal(t, uint64(1), cp.CheckpointItems[0].Round) + require.Equal(t, uint64(2), cp.CheckpointItems[1].Round) + require.Equal(t, uint64(3), cp.CheckpointItems[2].Round) + }) + + t.Run("stores max version from time window data", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": { + TimeWindow: types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}, + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + {Schema: 
"db", Table: "tbl"}: {Version: 5, VersionPath: "vp", DataPath: "dp"}, + }, + }, + }) + info := cp.CheckpointItems[2].ClusterInfo["c1"] + require.Len(t, info.MaxVersion, 1) + require.Equal(t, uint64(5), info.MaxVersion[0].Version) + require.Equal(t, "vp", info.MaxVersion[0].VersionPath) + require.Equal(t, "dp", info.MaxVersion[0].DataPath) + }) +} + +func TestCheckpoint_ToScanRange(t *testing.T) { + t.Parallel() + + stk := types.SchemaTableKey{Schema: "db", Table: "tbl"} + + t.Run("empty checkpoint returns empty", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + result, err := cp.ToScanRange("c1") + require.NoError(t, err) + require.Empty(t, result) + }) + + t.Run("only item[2] set", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: {Version: 2, VersionPath: "vp2", DataPath: "dp2"}, + }, + }, + }) + result, err := cp.ToScanRange("c1") + require.NoError(t, err) + require.Len(t, result, 1) + sr := result[stk] + // With only item[2], Start and End are both from item[2] + require.Equal(t, "vp2", sr.StartVersionKey) + require.Equal(t, "vp2", sr.EndVersionKey) + require.Equal(t, "dp2", sr.StartDataPath) + require.Equal(t, "dp2", sr.EndDataPath) + }) + + t.Run("items[1] and items[2] set", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: {Version: 1, VersionPath: "vp1", DataPath: "dp1"}, + }, + }, + }) + cp.NewTimeWindowData(1, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: {Version: 2, VersionPath: "vp2", DataPath: "dp2"}, + }, + }, + }) + result, err := cp.ToScanRange("c1") + require.NoError(t, err) + require.Len(t, result, 1) + sr := result[stk] + // End comes from item[2], Start overridden by item[1] + 
require.Equal(t, "vp1", sr.StartVersionKey) + require.Equal(t, "vp2", sr.EndVersionKey) + require.Equal(t, "dp1", sr.StartDataPath) + require.Equal(t, "dp2", sr.EndDataPath) + }) + + t.Run("all three items set", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + for i := uint64(0); i < 3; i++ { + cp.NewTimeWindowData(i, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: { + Version: i + 1, + VersionPath: fmt.Sprintf("vp%d", i), + DataPath: fmt.Sprintf("dp%d", i), + }, + }, + }, + }) + } + result, err := cp.ToScanRange("c1") + require.NoError(t, err) + require.Len(t, result, 1) + sr := result[stk] + // End from item[2], Start overridden by item[0] (oldest) + require.Equal(t, "vp0", sr.StartVersionKey) + require.Equal(t, "vp2", sr.EndVersionKey) + require.Equal(t, "dp0", sr.StartDataPath) + require.Equal(t, "dp2", sr.EndDataPath) + }) + + t.Run("missing key in item[1] returns error", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + {Schema: "other", Table: "other"}: {Version: 1, VersionPath: "vp1"}, + }, + }, + }) + cp.NewTimeWindowData(1, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: {Version: 2, VersionPath: "vp2"}, + }, + }, + }) + _, err := cp.ToScanRange("c1") + require.Error(t, err) + require.Contains(t, err.Error(), "not found") + }) + + t.Run("missing key in item[0] returns error", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + {Schema: "other", Table: "other"}: {Version: 1, VersionPath: "vp1"}, + }, + }, + }) + cp.NewTimeWindowData(1, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: {Version: 2, 
VersionPath: "vp2"}, + }, + }, + }) + cp.NewTimeWindowData(2, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: {Version: 3, VersionPath: "vp3"}, + }, + }, + }) + _, err := cp.ToScanRange("c1") + require.Error(t, err) + require.Contains(t, err.Error(), "not found") + }) + + t.Run("unknown cluster returns empty", func(t *testing.T) { + t.Parallel() + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: {Version: 1, VersionPath: "vp1"}, + }, + }, + }) + result, err := cp.ToScanRange("unknown-cluster") + require.NoError(t, err) + require.Empty(t, result) + }) + + t.Run("multiple tables", func(t *testing.T) { + t.Parallel() + stk2 := types.SchemaTableKey{Schema: "db2", Table: "tbl2"} + cp := NewCheckpoint() + cp.NewTimeWindowData(0, map[string]types.TimeWindowData{ + "c1": { + MaxVersion: map[types.SchemaTableKey]types.VersionKey{ + stk: {Version: 1, VersionPath: "vp1-t1", DataPath: "dp1-t1"}, + stk2: {Version: 1, VersionPath: "vp1-t2", DataPath: "dp1-t2"}, + }, + }, + }) + result, err := cp.ToScanRange("c1") + require.NoError(t, err) + require.Len(t, result, 2) + require.Contains(t, result, stk) + require.Contains(t, result, stk2) + }) +} diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index d39c2cc831..e5cdd222f7 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -39,9 +39,9 @@ func runTask(ctx context.Context, cfg *config.Config) error { return errors.Trace(err) } // Ensure cleanup happens even if there's an error - defer cleanupClients(pdClients, etcdClients) + defer cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) - recorder, err := recorder.NewRecorder(cfg.GlobalConfig.DataDir) + recorder, err := recorder.NewRecorder(cfg.GlobalConfig.DataDir, cfg.Clusters) if err != 
nil { return errors.Trace(err) } @@ -78,13 +78,13 @@ func runTask(ctx context.Context, cfg *config.Config) error { } func initClients(ctx context.Context, cfg *config.Config) ( - map[string]map[string]*watcher.CheckpointWatcher, + map[string]map[string]watcher.Watcher, map[string]*watcher.S3Watcher, map[string]pd.Client, map[string]*etcd.CDCEtcdClientImpl, error, ) { - checkpointWatchers := make(map[string]map[string]*watcher.CheckpointWatcher) + checkpointWatchers := make(map[string]map[string]watcher.Watcher) s3Watchers := make(map[string]*watcher.S3Watcher) pdClients := make(map[string]pd.Client) etcdClients := make(map[string]*etcd.CDCEtcdClientImpl) @@ -93,14 +93,14 @@ func initClients(ctx context.Context, cfg *config.Config) ( pdClient, etcdClient, err := newPDClient(ctx, clusterConfig.PDAddr, &clusterConfig.SecurityConfig) if err != nil { // Clean up already created clients before returning error - cleanupClients(pdClients, etcdClients) + cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) return nil, nil, nil, nil, errors.Trace(err) } etcdClients[clusterID] = etcdClient - upstreamCheckpointWatchers := make(map[string]*watcher.CheckpointWatcher) + upstreamCheckpointWatchers := make(map[string]watcher.Watcher) for downstreamClusterID, downstreamClusterChangefeedConfig := range clusterConfig.DownstreamClusterChangefeedConfig { - checkpointWatcher := watcher.NewCheckpointWatcher(clusterID, downstreamClusterID, downstreamClusterChangefeedConfig.ChangefeedID, etcdClient) + checkpointWatcher := watcher.NewCheckpointWatcher(ctx, clusterID, downstreamClusterID, downstreamClusterChangefeedConfig.ChangefeedID, etcdClient) upstreamCheckpointWatchers[downstreamClusterID] = checkpointWatcher } checkpointWatchers[clusterID] = upstreamCheckpointWatchers @@ -108,11 +108,11 @@ func initClients(ctx context.Context, cfg *config.Config) ( s3Storage, err := putil.GetExternalStorageWithDefaultTimeout(ctx, clusterConfig.S3SinkURI) if err != nil { // Clean up 
already created clients before returning error - cleanupClients(pdClients, etcdClients) + cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) return nil, nil, nil, nil, errors.Trace(err) } s3Watcher := watcher.NewS3Watcher( - watcher.NewCheckpointWatcher(clusterID, "s3", clusterConfig.S3ChangefeedID, etcdClient), + watcher.NewCheckpointWatcher(ctx, clusterID, "s3", clusterConfig.S3ChangefeedID, etcdClient), s3Storage, cfg.GlobalConfig.Tables, ) @@ -153,8 +153,18 @@ func newPDClient(ctx context.Context, pdAddr string, securityConfig *security.Cr } // cleanupClients closes all PD and etcd clients gracefully -func cleanupClients(pdClients map[string]pd.Client, etcdClients map[string]*etcd.CDCEtcdClientImpl) { - log.Info("Cleaning up clients", zap.Int("pdClients", len(pdClients)), zap.Int("etcdClients", len(etcdClients))) +func cleanupClients( + pdClients map[string]pd.Client, + etcdClients map[string]*etcd.CDCEtcdClientImpl, + checkpointWatchers map[string]map[string]watcher.Watcher, + s3Watchers map[string]*watcher.S3Watcher, +) { + log.Info("Cleaning up clients", + zap.Int("pdClients", len(pdClients)), + zap.Int("etcdClients", len(etcdClients)), + zap.Int("checkpointWatchers", len(checkpointWatchers)), + zap.Int("s3Watchers", len(s3Watchers)), + ) // Close PD clients for clusterID, pdClient := range pdClients { @@ -177,5 +187,17 @@ func cleanupClients(pdClients map[string]pd.Client, etcdClients map[string]*etcd } } + // Close checkpoint watchers + for _, clusterWatchers := range checkpointWatchers { + for _, watcher := range clusterWatchers { + watcher.Close() + } + } + + // Close s3 watchers + for _, s3Watcher := range s3Watchers { + s3Watcher.Close() + } + log.Info("Client cleanup completed") } diff --git a/cmd/multi-cluster-consistency-checker/utils/types.go b/cmd/multi-cluster-consistency-checker/types/types.go similarity index 77% rename from cmd/multi-cluster-consistency-checker/utils/types.go rename to 
cmd/multi-cluster-consistency-checker/types/types.go index 51ad1e3e2a..5be66475c8 100644 --- a/cmd/multi-cluster-consistency-checker/utils/types.go +++ b/cmd/multi-cluster-consistency-checker/types/types.go @@ -11,7 +11,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package utils +package types import ( "github.com/pingcap/ticdc/pkg/sink/cloudstorage" @@ -19,11 +19,6 @@ import ( type PkType string -type ColumnValue struct { - ColumnID int64 - Value any -} - type CdcVersion struct { CommitTs uint64 OriginTs uint64 @@ -36,36 +31,6 @@ func (v *CdcVersion) GetCompareTs() uint64 { return v.CommitTs } -type Record struct { - CdcVersion - Pk PkType - ColumnValues []ColumnValue -} - -func (r *Record) EqualDownstreamRecord(downstreamRecord *Record) bool { - if downstreamRecord == nil { - return false - } - if r.CommitTs != downstreamRecord.OriginTs { - return false - } - if r.Pk != downstreamRecord.Pk { - return false - } - if len(r.ColumnValues) != len(downstreamRecord.ColumnValues) { - return false - } - for i, columnValue := range r.ColumnValues { - if columnValue.ColumnID != downstreamRecord.ColumnValues[i].ColumnID { - return false - } - if columnValue.Value != downstreamRecord.ColumnValues[i].Value { - return false - } - } - return true -} - type SchemaTableKey struct { Schema string Table string @@ -105,5 +70,4 @@ type TimeWindowData struct { type IncrementalData struct { DataContentSlices map[cloudstorage.FileIndexKey][][]byte - Parser *TableParser } diff --git a/cmd/multi-cluster-consistency-checker/types/types_test.go b/cmd/multi-cluster-consistency-checker/types/types_test.go new file mode 100644 index 0000000000..7fe90a9739 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/types/types_test.go @@ -0,0 +1,69 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package types_test + +import ( + "testing" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/stretchr/testify/require" +) + +func TestCdcVersion_GetCompareTs(t *testing.T) { + tests := []struct { + name string + version types.CdcVersion + expected uint64 + }{ + { + name: "OriginTs is set", + version: types.CdcVersion{ + CommitTs: 100, + OriginTs: 200, + }, + expected: 200, + }, + { + name: "OriginTs is smaller than CommitTs", + version: types.CdcVersion{ + CommitTs: 200, + OriginTs: 100, + }, + expected: 100, + }, + { + name: "OriginTs is zero", + version: types.CdcVersion{ + CommitTs: 100, + OriginTs: 0, + }, + expected: 100, + }, + { + name: "Both are zero", + version: types.CdcVersion{ + CommitTs: 0, + OriginTs: 0, + }, + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.version.GetCompareTs() + require.Equal(t, tt.expected, result) + }) + } +} diff --git a/cmd/multi-cluster-consistency-checker/utils/decoder.go b/cmd/multi-cluster-consistency-checker/utils/decoder.go deleted file mode 100644 index 1a33718810..0000000000 --- a/cmd/multi-cluster-consistency-checker/utils/decoder.go +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// See the License for the specific language governing permissions and -// limitations under the License. - -package utils - -import ( - "context" - - commonType "github.com/pingcap/ticdc/pkg/common" - "github.com/pingcap/ticdc/pkg/config" - "github.com/pingcap/ticdc/pkg/errors" - "github.com/pingcap/ticdc/pkg/sink/codec/canal" - codecCommon "github.com/pingcap/ticdc/pkg/sink/codec/common" - "github.com/pingcap/ticdc/pkg/sink/codec/csv" -) - -func defaultCsvCodecConfig(protocol config.Protocol) *codecCommon.Config { - codecConfig := codecCommon.NewConfig(protocol) - codecConfig.Delimiter = config.Comma - codecConfig.Quote = string(config.DoubleQuoteChar) - codecConfig.NullString = config.NULL - codecConfig.IncludeCommitTs = true - codecConfig.Terminator = config.CRLF - return codecConfig -} - -type csvDecoder struct { - codecConfig *codecCommon.Config -} - -func NewCsvDecoder() *csvDecoder { - codecConfig := defaultCsvCodecConfig(config.ProtocolCsv) - return &csvDecoder{ - codecConfig: codecConfig, - } -} - -func (d *csvDecoder) NewDecoder(ctx context.Context, tableInfo *commonType.TableInfo, content []byte) (codecCommon.Decoder, error) { - decoder, err := csv.NewDecoder(ctx, d.codecConfig, tableInfo, content) - if err != nil { - return nil, errors.Trace(err) - } - return decoder, nil -} - -func defaultCanalJSONCodecConfig(protocol config.Protocol) *codecCommon.Config { - codecConfig := codecCommon.NewConfig(protocol) - // Always enable tidb extension for canal-json protocol - // because we need to get the commit ts from the extension field. 
- codecConfig.EnableTiDBExtension = true - codecConfig.Terminator = config.CRLF - return codecConfig -} - -type canalJSONDecoder struct { - codecConfig *codecCommon.Config -} - -func NewCanalJSONDecoder() *canalJSONDecoder { - codecConfig := defaultCanalJSONCodecConfig(config.ProtocolCanalJSON) - return &canalJSONDecoder{ - codecConfig: codecConfig, - } -} - -func (d *canalJSONDecoder) NewDecoder(ctx context.Context, tableInfo *commonType.TableInfo, content []byte) (codecCommon.Decoder, error) { - decoder := canal.NewTxnDecoderWithTableInfo(d.codecConfig, tableInfo) - decoder.AddKeyValue(nil, content) - return decoder, nil -} diff --git a/cmd/multi-cluster-consistency-checker/utils/parser.go b/cmd/multi-cluster-consistency-checker/utils/parser.go deleted file mode 100644 index ce61f4cfd3..0000000000 --- a/cmd/multi-cluster-consistency-checker/utils/parser.go +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright 2026 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package utils - -import ( - "context" - "encoding/hex" - "encoding/json" - "time" - - "github.com/pingcap/log" - "github.com/pingcap/ticdc/pkg/common" - commonType "github.com/pingcap/ticdc/pkg/common" - "github.com/pingcap/ticdc/pkg/common/event" - "github.com/pingcap/ticdc/pkg/config" - "github.com/pingcap/ticdc/pkg/errors" - "github.com/pingcap/ticdc/pkg/sink/cloudstorage" - codecCommon "github.com/pingcap/ticdc/pkg/sink/codec/common" - "github.com/pingcap/tidb/pkg/types" - "github.com/pingcap/tidb/pkg/util/chunk" - "github.com/pingcap/tidb/pkg/util/codec" - "go.uber.org/zap" -) - -func getPkColumnOffset(tableInfo *commonType.TableInfo) (map[int64]int, error) { - if tableInfo.PKIsHandle() { - pkColInfo := tableInfo.GetPkColInfo() - if pkColInfo == nil { - return nil, errors.Errorf("table %s has no primary key", tableInfo.GetTableName()) - } - return map[int64]int{pkColInfo.ID: 0}, nil - } - - pkColInfos := tableInfo.GetPrimaryKeyColumnInfos() - if len(pkColInfos) == 0 { - return nil, errors.Errorf("table %s has no primary key", tableInfo.GetTableName()) - } - - columns := tableInfo.GetColumns() - pkColumnOffsets := make(map[int64]int) - for i, pkColInfo := range pkColInfos { - if pkColInfo.Offset < 0 || pkColInfo.Offset >= len(columns) { - return nil, errors.Errorf("primary key column offset (%d) out of range for column (%d) in table %s", pkColInfo.Offset, len(columns), tableInfo.GetTableName()) - } - pkColumnOffsets[columns[pkColInfo.Offset].ID] = i - } - return pkColumnOffsets, nil -} - -type decoderFactory interface { - NewDecoder(ctx context.Context, tableInfo *commonType.TableInfo, content []byte) (codecCommon.Decoder, error) -} - -type TableParser struct { - tableKey string - tableInfo *common.TableInfo - pkColumnOffsets map[int64]int - decoderFactory decoderFactory -} - -func NewTableParserWithFormat(tableKey string, content []byte, protocol config.Protocol) (*TableParser, error) { - tableParser := &TableParser{} - if err := 
tableParser.parseTableInfo(tableKey, content); err != nil { - return nil, errors.Trace(err) - } - switch protocol { - case config.ProtocolCsv: - tableParser.decoderFactory = NewCsvDecoder() - case config.ProtocolCanalJSON: - tableParser.decoderFactory = NewCanalJSONDecoder() - default: - return nil, errors.Errorf("unsupported protocol: %s", protocol) - } - return tableParser, nil -} - -func (pt *TableParser) parseTableInfo(tableKey string, content []byte) error { - // Parse schema content to get tableInfo - var tableDef cloudstorage.TableDefinition - if err := json.Unmarshal(content, &tableDef); err != nil { - log.Error("failed to unmarshal schema content", - zap.String("tableKey", tableKey), - zap.ByteString("content", content), - zap.Error(err)) - return errors.Trace(err) - } - - tableInfo, err := tableDef.ToTableInfo() - if err != nil { - log.Error("failed to convert table definition to table info", - zap.String("tableKey", tableKey), - zap.ByteString("content", content), - zap.Error(err)) - return errors.Trace(err) - } - - pkColumnOffsets, err := getPkColumnOffset(tableInfo) - if err != nil { - log.Error("failed to get primary key column offsets", - zap.String("tableKey", tableKey), - zap.ByteString("content", content), - zap.Error(err)) - return errors.Annotate(err, "failed to get primary key column offsets") - } - - pt.tableKey = tableKey - pt.tableInfo = tableInfo - pt.pkColumnOffsets = pkColumnOffsets - return nil -} - -func (pt *TableParser) parseRecord(row *chunk.Row, commitTs uint64) (*Record, error) { - originTs := uint64(0) - pkCount := 0 - colInfos := pt.tableInfo.GetColInfosForRowChangedEvent() - columnValues := make([]ColumnValue, 0, len(colInfos)) - pkColumnValues := make([]types.Datum, len(pt.pkColumnOffsets)) - for _, colInfo := range colInfos { - col, ok := pt.tableInfo.GetColumnInfo(colInfo.ID) - if !ok { - log.Error("column info not found", - zap.String("tableKey", pt.tableKey), - zap.Int64("colID", colInfo.ID)) - return nil, 
errors.Errorf("column info not found for column %d in table %s", colInfo.ID, pt.tableKey) - } - rowColOffset, ok := pt.tableInfo.GetRowColumnsOffset()[colInfo.ID] - if !ok { - log.Error("row column offset not found", - zap.String("tableKey", pt.tableKey), - zap.Int64("colID", colInfo.ID)) - return nil, errors.Errorf("row column offset not found for column %d in table %s", colInfo.ID, pt.tableKey) - } - if offset, ok := pt.pkColumnOffsets[colInfo.ID]; ok { - dt := row.GetDatum(rowColOffset, &col.FieldType) - if !pkColumnValues[offset].IsNull() { - log.Error("duplicated primary key column value", - zap.String("tableKey", pt.tableKey), - zap.Int64("colID", colInfo.ID)) - return nil, errors.Errorf("duplicated primary key column value for column %d in table %s", colInfo.ID, pt.tableKey) - } - pkColumnValues[offset] = dt - pkCount += 1 - continue - } - if col.Name.O == event.OriginTsColumn { - if !row.IsNull(rowColOffset) { - d := row.GetDatum(rowColOffset, &col.FieldType) - if d.Kind() != types.KindInt64 && d.Kind() != types.KindUint64 { - log.Error("origin ts column value is not int64 or uint64", - zap.String("tableKey", pt.tableKey), - zap.String("datum", d.String())) - return nil, errors.Errorf("origin ts column value is not int64 or uint64 for column %d in table %s", colInfo.ID, pt.tableKey) - } - originTs = d.GetUint64() - } - } else { - colValue := commonType.ExtractColVal(row, col, rowColOffset) - columnValues = append(columnValues, ColumnValue{ - ColumnID: colInfo.ID, - Value: colValue, - }) - } - } - if pkCount != len(pt.pkColumnOffsets) { - log.Error("primary key column value missing", - zap.String("tableKey", pt.tableKey), - zap.Int("pkCount", pkCount), - zap.Int("len(pt.pkColumnOffsets)", len(pt.pkColumnOffsets))) - return nil, errors.Errorf("primary key column value is null for table %s", pt.tableKey) - } - pkEncoded, err := codec.EncodeKey(time.UTC, nil, pkColumnValues...) 
- if err != nil { - return nil, errors.Annotate(err, "failed to encode primary key") - } - pk := hex.EncodeToString(pkEncoded) - return &Record{ - Pk: PkType(pk), - ColumnValues: columnValues, - CdcVersion: CdcVersion{ - CommitTs: commitTs, - OriginTs: originTs, - }, - }, nil -} - -func (pt *TableParser) DecodeFiles(ctx context.Context, content []byte) ([]*Record, error) { - records := make([]*Record, 0) - - decoder, err := pt.decoderFactory.NewDecoder(ctx, pt.tableInfo, content) - if err != nil { - return nil, errors.Trace(err) - } - - for { - msgType, hasNext := decoder.HasNext() - if !hasNext { - break - } - if msgType != codecCommon.MessageTypeRow { - continue - } - dmlEvent := decoder.NextDMLEvent() - if dmlEvent == nil || dmlEvent.Rows == nil || dmlEvent.Rows.NumRows() == 0 { - continue - } - row := dmlEvent.Rows.GetRow(0) - record, err := pt.parseRecord(&row, dmlEvent.CommitTs) - if err != nil { - return nil, errors.Trace(err) - } - records = append(records, record) - } - return records, nil -} diff --git a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go index 02238fd3f7..eab3063b9f 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go @@ -15,6 +15,9 @@ package watcher import ( "context" + "strings" + "sync" + "time" "github.com/pingcap/log" "github.com/pingcap/ticdc/pkg/common" @@ -25,47 +28,206 @@ import ( "go.uber.org/zap" ) +const ( + // retryBackoffBase is the initial backoff duration for retries + retryBackoffBase = 500 * time.Millisecond + // retryBackoffMax is the maximum backoff duration for retries + retryBackoffMax = 30 * time.Second + // retryBackoffMultiplier is the multiplier for exponential backoff + retryBackoffMultiplier = 2.0 +) + +type Watcher interface { + AdvanceCheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) + Close() +} + 
+type waitCheckpointTask struct { + respCh chan uint64 + minCheckpointTs uint64 +} + type CheckpointWatcher struct { upstreamClusterID string downstreamClusterID string changefeedID common.ChangeFeedID etcdClient etcd.CDCEtcdClient + + ctx context.Context + cancel context.CancelFunc + + mu sync.Mutex + latestCheckpoint uint64 + pendingTasks []*waitCheckpointTask + watchErr error + closed bool } func NewCheckpointWatcher( + ctx context.Context, upstreamClusterID, downstreamClusterID, changefeedID string, etcdClient etcd.CDCEtcdClient, ) *CheckpointWatcher { - return &CheckpointWatcher{ + cctx, cancel := context.WithCancel(ctx) + watcher := &CheckpointWatcher{ upstreamClusterID: upstreamClusterID, downstreamClusterID: downstreamClusterID, changefeedID: common.NewChangeFeedIDWithName(changefeedID, "default"), etcdClient: etcdClient, + + ctx: cctx, + cancel: cancel, } + go watcher.run() + return watcher } -// advanceCheckpointTs waits for the checkpoint to exceed minCheckpointTs +// AdvanceCheckpointTs waits for the checkpoint to exceed minCheckpointTs func (cw *CheckpointWatcher) AdvanceCheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { - // First, get the current chceckpoint status from etcd - status, modRev, err := cw.etcdClient.GetChangeFeedStatus(ctx, cw.changefeedID) + cw.mu.Lock() + + // Check if watcher has encountered an error + if cw.watchErr != nil { + err := cw.watchErr + cw.mu.Unlock() + return 0, err + } + + // Check if watcher is closed + if cw.closed { + cw.mu.Unlock() + return 0, errors.Errorf("checkpoint watcher is closed") + } + + // Check if current checkpoint already exceeds minCheckpointTs + if cw.latestCheckpoint > minCheckpointTs { + checkpoint := cw.latestCheckpoint + cw.mu.Unlock() + return checkpoint, nil + } + + // Create a task and wait for the background goroutine to notify + task := &waitCheckpointTask{ + respCh: make(chan uint64, 1), + minCheckpointTs: minCheckpointTs, + } + cw.pendingTasks = 
append(cw.pendingTasks, task) + cw.mu.Unlock() + + // Wait for response or context cancellation + select { + case <-ctx.Done(): + // Remove the task from pending list + cw.mu.Lock() + for i, t := range cw.pendingTasks { + if t == task { + cw.pendingTasks = append(cw.pendingTasks[:i], cw.pendingTasks[i+1:]...) + break + } + } + cw.mu.Unlock() + return 0, errors.Annotate(ctx.Err(), "context canceled while waiting for checkpoint") + case <-cw.ctx.Done(): + return 0, errors.Annotate(cw.ctx.Err(), "watcher context canceled") + case checkpoint, ok := <-task.respCh: + if !ok { + return 0, errors.Errorf("checkpoint watcher is closed") + } + return checkpoint, nil + } +} + +// Close stops the watcher +func (cw *CheckpointWatcher) Close() { + cw.cancel() + cw.mu.Lock() + cw.closed = true + // Notify all pending tasks that watcher is closing + for _, task := range cw.pendingTasks { + close(task.respCh) + } + cw.pendingTasks = nil + cw.mu.Unlock() +} + +func (cw *CheckpointWatcher) run() { + backoff := retryBackoffBase + for { + select { + case <-cw.ctx.Done(): + cw.mu.Lock() + cw.watchErr = errors.Annotate(cw.ctx.Err(), "context canceled") + cw.mu.Unlock() + return + default: + } + + err := cw.watchOnce() + if err == nil { + // Normal exit (context canceled) + return + } + + // Check if this is a non-recoverable error + if isNonRecoverableError(err) { + cw.mu.Lock() + cw.watchErr = err + cw.mu.Unlock() + return + } + + // Log and retry with backoff + log.Warn("checkpoint watcher encountered error, will retry", + zap.String("changefeedID", cw.changefeedID.String()), + zap.Duration("backoff", backoff), + zap.Error(err)) + + select { + case <-cw.ctx.Done(): + cw.mu.Lock() + cw.watchErr = errors.Annotate(cw.ctx.Err(), "context canceled") + cw.mu.Unlock() + return + case <-time.After(backoff): + } + + // Increase backoff for next retry (exponential backoff with cap) + backoff = time.Duration(float64(backoff) * retryBackoffMultiplier) + backoff = min(backoff, retryBackoffMax) + } 
+} + +// watchOnce performs one watch cycle. Returns nil if context is canceled, +// returns error if watch fails and should be retried. +func (cw *CheckpointWatcher) watchOnce() error { + // First, get the current checkpoint status from etcd + status, modRev, err := cw.etcdClient.GetChangeFeedStatus(cw.ctx, cw.changefeedID) if err != nil { - return 0, errors.Annotate(err, "failed to get changefeed status") + // Check if context is canceled + if cw.ctx.Err() != nil { + return nil + } + return errors.Annotate(err, "failed to get changefeed status") } + + // Update latest checkpoint + cw.mu.Lock() + cw.latestCheckpoint = status.CheckpointTs + cw.notifyPendingTasksLocked() + cw.mu.Unlock() + statusKey := etcd.GetEtcdKeyJob(cw.etcdClient.GetClusterID(), cw.changefeedID.DisplayName) - // Watch for checkpoint updates - watchCtx, cancel := context.WithCancel(ctx) - defer cancel() + log.Debug("Starting to watch checkpoint", zap.String("changefeedID", cw.changefeedID.String()), zap.String("statusKey", statusKey), zap.String("upstreamClusterID", cw.upstreamClusterID), zap.String("downstreamClusterID", cw.downstreamClusterID), zap.Uint64("checkpoint", status.CheckpointTs), - zap.Int64("startRev", modRev+1), - zap.Uint64("minCheckpointTs", minCheckpointTs)) + zap.Int64("startRev", modRev+1)) watchCh := cw.etcdClient.GetEtcdClient().Watch( - watchCtx, + cw.ctx, statusKey, "checkpoint-watcher", clientv3.WithRev(modRev+1), @@ -73,42 +235,73 @@ func (cw *CheckpointWatcher) AdvanceCheckpointTs(ctx context.Context, minCheckpo for { select { - case <-ctx.Done(): - return 0, errors.Annotate(ctx.Err(), "context canceled") + case <-cw.ctx.Done(): + return nil case watchResp, ok := <-watchCh: if !ok { - return 0, errors.Errorf("[changefeedID: %s] watch channel closed", cw.changefeedID.String()) + return errors.Errorf("[changefeedID: %s] watch channel closed", cw.changefeedID.String()) } if err := watchResp.Err(); err != nil { - return 0, errors.Annotatef(err, "[changefeedID: %s] watch 
error", cw.changefeedID.String()) + return errors.Annotatef(err, "[changefeedID: %s] watch error", cw.changefeedID.String()) } for _, event := range watchResp.Events { if event.Type == clientv3.EventTypeDelete { - return 0, errors.Errorf("[changefeedID: %s] changefeed status key is deleted", cw.changefeedID.String()) + // Key deletion is a non-recoverable error + return errors.Errorf("[changefeedID: %s] changefeed status key is deleted", cw.changefeedID.String()) } // Parse the updated status - status := &config.ChangeFeedStatus{} - if err := status.Unmarshal(event.Kv.Value); err != nil { - return 0, errors.Annotatef(err, "[changefeedID: %s] failed to unmarshal changefeed status", cw.changefeedID.String()) + newStatus := &config.ChangeFeedStatus{} + if err := newStatus.Unmarshal(event.Kv.Value); err != nil { + log.Warn("failed to unmarshal changefeed status, skipping", + zap.String("changefeedID", cw.changefeedID.String()), + zap.Error(err)) + continue } - checkpointTs := status.CheckpointTs + checkpointTs := newStatus.CheckpointTs log.Debug("Checkpoint updated", zap.String("changefeedID", cw.changefeedID.String()), - zap.Uint64("checkpoint", checkpointTs), - zap.Uint64("minCheckpointTs", minCheckpointTs)) + zap.Uint64("checkpoint", checkpointTs)) - // Check if checkpoint exceeds minCheckpointTs - if checkpointTs > minCheckpointTs { - log.Debug("Checkpoint exceeds minCheckpointTs, getting TSO from downstream", - zap.String("changefeedID", cw.changefeedID.String()), - zap.Uint64("checkpoint", checkpointTs)) - return checkpointTs, nil + // Update latest checkpoint and notify pending tasks + cw.mu.Lock() + if checkpointTs > cw.latestCheckpoint { + cw.latestCheckpoint = checkpointTs + cw.notifyPendingTasksLocked() } + cw.mu.Unlock() + } + } + } +} + +// isNonRecoverableError checks if the error is non-recoverable and should not be retried +func isNonRecoverableError(err error) bool { + errMsg := err.Error() + // Key deletion is non-recoverable + if 
strings.Contains(errMsg, "deleted") { + return true + } + return false +} + +// notifyPendingTasksLocked notifies pending tasks whose minCheckpointTs has been exceeded +// Must be called with mu locked +func (cw *CheckpointWatcher) notifyPendingTasksLocked() { + remaining := cw.pendingTasks[:0] + for _, task := range cw.pendingTasks { + if cw.latestCheckpoint > task.minCheckpointTs { + // Non-blocking send since channel has buffer of 1 + select { + case task.respCh <- cw.latestCheckpoint: + default: } + } else { + remaining = append(remaining, task) } } + cw.pendingTasks = remaining } diff --git a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher_test.go b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher_test.go new file mode 100644 index 0000000000..5819dd73b2 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher_test.go @@ -0,0 +1,543 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package watcher + +import ( + "context" + "strings" + "sync" + "testing" + "time" + + "github.com/golang/mock/gomock" + "github.com/pingcap/errors" + "github.com/pingcap/ticdc/pkg/config" + "github.com/pingcap/ticdc/pkg/etcd" + "github.com/stretchr/testify/require" + "go.etcd.io/etcd/api/v3/mvccpb" + clientv3 "go.etcd.io/etcd/client/v3" +) + +func TestCheckpointWatcher_AdvanceCheckpointTs_AlreadyExceeds(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + initialCheckpoint := uint64(1000) + + // Setup mock expectations + mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: initialCheckpoint}, + int64(100), + nil, + ) + mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + // Create a watch channel that won't send anything during this test + watchCh := make(chan clientv3.WatchResponse) + mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + defer watcher.Close() + + // Wait for watcher to initialize + time.Sleep(50 * time.Millisecond) + + // Request checkpoint that's already exceeded + minCheckpointTs := uint64(500) + checkpoint, err := watcher.AdvanceCheckpointTs(t.Context(), minCheckpointTs) + require.NoError(t, err) + require.Equal(t, initialCheckpoint, checkpoint) +} + +func TestCheckpointWatcher_AdvanceCheckpointTs_WaitForUpdate(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + initialCheckpoint := uint64(1000) + updatedCheckpoint := uint64(2000) + + // 
Setup mock expectations + mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: initialCheckpoint}, + int64(100), + nil, + ) + mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + watchCh := make(chan clientv3.WatchResponse, 1) + mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + defer watcher.Close() + + // Wait for watcher to initialize + time.Sleep(50 * time.Millisecond) + + // Start waiting for checkpoint in a goroutine + var checkpoint uint64 + var advanceErr error + done := make(chan struct{}) + go func() { + checkpoint, advanceErr = watcher.AdvanceCheckpointTs(context.Background(), uint64(1500)) + close(done) + }() + + // Give some time for the task to be registered + time.Sleep(50 * time.Millisecond) + + // Simulate checkpoint update via watch channel + newStatus := &config.ChangeFeedStatus{CheckpointTs: updatedCheckpoint} + statusStr, err := newStatus.Marshal() + require.NoError(t, err) + + watchCh <- clientv3.WatchResponse{ + Events: []*clientv3.Event{ + { + Type: clientv3.EventTypePut, + Kv: &mvccpb.KeyValue{ + Value: []byte(statusStr), + }, + }, + }, + } + + select { + case <-done: + require.NoError(t, advanceErr) + require.Equal(t, updatedCheckpoint, checkpoint) + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for checkpoint advance") + } +} + +func TestCheckpointWatcher_AdvanceCheckpointTs_ContextCanceled(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + initialCheckpoint := uint64(1000) + + // Setup mock expectations + 
mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: initialCheckpoint}, + int64(100), + nil, + ) + mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + watchCh := make(chan clientv3.WatchResponse) + mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + defer watcher.Close() + + // Wait for watcher to initialize + time.Sleep(50 * time.Millisecond) + + // Create a context that will be canceled + advanceCtx, advanceCancel := context.WithCancel(t.Context()) + + var advanceErr error + done := make(chan struct{}) + go func() { + _, advanceErr = watcher.AdvanceCheckpointTs(advanceCtx, uint64(2000)) + close(done) + }() + + // Give some time for the task to be registered + time.Sleep(50 * time.Millisecond) + + // Cancel the context + advanceCancel() + + select { + case <-done: + require.Error(t, advanceErr) + require.Contains(t, advanceErr.Error(), "context canceled") + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for context cancellation") + } +} + +func TestCheckpointWatcher_Close(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + initialCheckpoint := uint64(1000) + + // Setup mock expectations + mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: initialCheckpoint}, + int64(100), + nil, + ) + mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + watchCh := make(chan clientv3.WatchResponse) + 
mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + + // Wait for watcher to initialize + time.Sleep(50 * time.Millisecond) + + // Start waiting for checkpoint in a goroutine + var advanceErr error + done := make(chan struct{}) + go func() { + _, advanceErr = watcher.AdvanceCheckpointTs(context.Background(), uint64(2000)) + close(done) + }() + + // Give some time for the task to be registered + time.Sleep(50 * time.Millisecond) + + // Close the watcher + watcher.Close() + + select { + case <-done: + require.Error(t, advanceErr) + // Error can be "closed" or "canceled" depending on timing + errMsg := advanceErr.Error() + require.True(t, + strings.Contains(errMsg, "closed") || strings.Contains(errMsg, "canceled"), + "expected error to contain 'closed' or 'canceled', got: %s", errMsg) + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for watcher close") + } + + // Verify that subsequent calls return error + _, err := watcher.AdvanceCheckpointTs(context.Background(), uint64(100)) + require.Error(t, err) + // Error can be "closed" or "canceled" depending on timing + errMsg := err.Error() + require.True(t, + strings.Contains(errMsg, "closed") || strings.Contains(errMsg, "canceled"), + "expected error to contain 'closed' or 'canceled', got: %s", errMsg) +} + +func TestCheckpointWatcher_MultiplePendingTasks(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + initialCheckpoint := uint64(1000) + + // Setup mock expectations + mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: initialCheckpoint}, + int64(100), + nil, + ) + 
mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + watchCh := make(chan clientv3.WatchResponse, 10) + mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + defer watcher.Close() + + // Wait for watcher to initialize + time.Sleep(50 * time.Millisecond) + + // Start multiple goroutines waiting for different checkpoints + var wg sync.WaitGroup + results := make([]struct { + checkpoint uint64 + err error + }, 3) + + for i := range 3 { + wg.Add(1) + go func(idx int) { + defer wg.Done() + minTs := uint64(1100 + idx*500) // 1100, 1600, 2100 + results[idx].checkpoint, results[idx].err = watcher.AdvanceCheckpointTs(context.Background(), minTs) + }(i) + } + + // Give some time for tasks to be registered + time.Sleep(50 * time.Millisecond) + + // Send checkpoint updates + checkpoints := []uint64{1500, 2000, 2500} + for _, cp := range checkpoints { + newStatus := &config.ChangeFeedStatus{CheckpointTs: cp} + statusStr, err := newStatus.Marshal() + require.NoError(t, err) + + watchCh <- clientv3.WatchResponse{ + Events: []*clientv3.Event{ + { + Type: clientv3.EventTypePut, + Kv: &mvccpb.KeyValue{ + Value: []byte(statusStr), + }, + }, + }, + } + // Give some time between updates + time.Sleep(20 * time.Millisecond) + } + + // Wait for all goroutines to complete + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + + select { + case <-done: + // Verify results + for i := range 3 { + require.NoError(t, results[i].err, "task %d should not have error", i) + minTs := uint64(1100 + i*500) + require.Greater(t, results[i].checkpoint, minTs, "task %d checkpoint should exceed minTs", i) + } + case <-time.After(5 * time.Second): + t.Fatal("timeout waiting for all 
tasks to complete") + } +} + +func TestCheckpointWatcher_NotifyPendingTasksLocked(t *testing.T) { + cw := &CheckpointWatcher{ + latestCheckpoint: 2000, + pendingTasks: []*waitCheckpointTask{ + {respCh: make(chan uint64, 1), minCheckpointTs: 1000}, + {respCh: make(chan uint64, 1), minCheckpointTs: 1500}, + {respCh: make(chan uint64, 1), minCheckpointTs: 2500}, + {respCh: make(chan uint64, 1), minCheckpointTs: 3000}, + }, + } + + cw.notifyPendingTasksLocked() + + // Tasks with minCheckpointTs < 2000 should be notified and removed + require.Len(t, cw.pendingTasks, 2) + require.Equal(t, uint64(2500), cw.pendingTasks[0].minCheckpointTs) + require.Equal(t, uint64(3000), cw.pendingTasks[1].minCheckpointTs) +} + +func TestCheckpointWatcher_InitialCheckpointNotifiesPendingTasks(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + initialCheckpoint := uint64(5000) + + // Setup mock expectations - initial checkpoint is high enough + mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: initialCheckpoint}, + int64(100), + nil, + ) + mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + watchCh := make(chan clientv3.WatchResponse) + mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + defer watcher.Close() + + // Wait for watcher to initialize and get the initial checkpoint + time.Sleep(100 * time.Millisecond) + + // Request checkpoint that's already exceeded by initial checkpoint + checkpoint, err := watcher.AdvanceCheckpointTs(context.Background(), uint64(1000)) + require.NoError(t, err) + 
require.Equal(t, initialCheckpoint, checkpoint) +} + +func TestCheckpointWatcher_WatchErrorRetry(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + initialCheckpoint := uint64(1000) + retryCheckpoint := uint64(2000) + + // First call succeeds, second call (retry) also succeeds with updated checkpoint + firstCall := mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: initialCheckpoint}, + int64(100), + nil, + ) + mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: retryCheckpoint}, + int64(101), + nil, + ).After(firstCall) + + mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + // First watch channel will be closed (simulating error), second watch channel will work + watchCh1 := make(chan clientv3.WatchResponse) + watchCh2 := make(chan clientv3.WatchResponse, 1) + firstWatch := mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh1) + mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh2).After(firstWatch) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + defer watcher.Close() + + // Wait for watcher to initialize + time.Sleep(50 * time.Millisecond) + + // Close the first watch channel to trigger retry + close(watchCh1) + + // Wait for retry to happen (backoff + processing time) + time.Sleep(700 * time.Millisecond) + + // After retry, checkpoint should be updated to retryCheckpoint + checkpoint, err := watcher.AdvanceCheckpointTs(t.Context(), uint64(1500)) + require.NoError(t, err) + require.Equal(t, retryCheckpoint, 
checkpoint) +} + +func TestCheckpointWatcher_GetStatusRetry(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + successCheckpoint := uint64(2000) + + // First call fails, second call succeeds + firstCall := mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + nil, + int64(0), + errors.Errorf("connection refused"), + ) + mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: successCheckpoint}, + int64(100), + nil, + ).After(firstCall) + + mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + watchCh := make(chan clientv3.WatchResponse) + mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + defer watcher.Close() + + // Wait for retry to happen (backoff + processing time) + time.Sleep(700 * time.Millisecond) + + // After retry, checkpoint should be available + checkpoint, err := watcher.AdvanceCheckpointTs(t.Context(), uint64(1000)) + require.NoError(t, err) + require.Equal(t, successCheckpoint, checkpoint) +} + +func TestCheckpointWatcher_KeyDeleted(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockEtcdClient := etcd.NewMockCDCEtcdClient(ctrl) + mockClient := etcd.NewMockClient(ctrl) + + initialCheckpoint := uint64(1000) + + // Setup mock expectations + mockEtcdClient.EXPECT().GetChangeFeedStatus(gomock.Any(), gomock.Any()).Return( + &config.ChangeFeedStatus{CheckpointTs: initialCheckpoint}, + int64(100), + nil, + ) + mockEtcdClient.EXPECT().GetClusterID().Return("test-cluster").AnyTimes() + 
mockEtcdClient.EXPECT().GetEtcdClient().Return(mockClient).AnyTimes() + + watchCh := make(chan clientv3.WatchResponse, 1) + mockClient.EXPECT().Watch(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(watchCh) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + defer watcher.Close() + + // Wait for watcher to initialize + time.Sleep(50 * time.Millisecond) + + // Send delete event + watchCh <- clientv3.WatchResponse{ + Events: []*clientv3.Event{ + { + Type: clientv3.EventTypeDelete, + }, + }, + } + + // Give time for the error to be processed + time.Sleep(50 * time.Millisecond) + + // Now trying to advance should return an error + _, err := watcher.AdvanceCheckpointTs(context.Background(), uint64(2000)) + require.Error(t, err) + require.Contains(t, err.Error(), "deleted") +} diff --git a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go index 3e8d0dda35..b6e7d92b62 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher.go @@ -18,19 +18,19 @@ import ( "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/consumer" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" - "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/utils" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" "github.com/pingcap/tidb/br/pkg/storage" ) type S3Watcher struct { - checkpointWatcher *CheckpointWatcher + checkpointWatcher Watcher consumer *consumer.S3Consumer } func NewS3Watcher( - checkpointWatcher *CheckpointWatcher, + checkpointWatcher Watcher, s3Storage storage.ExternalStorage, tables map[string][]string, ) *S3Watcher { @@ -41,7 +41,11 @@ func NewS3Watcher( } } 
-func (sw *S3Watcher) InitializeFromCheckpoint(ctx context.Context, clusterID string, checkpoint *recorder.Checkpoint) (map[cloudstorage.DmlPathKey]utils.IncrementalData, error) { +func (sw *S3Watcher) Close() { + sw.checkpointWatcher.Close() +} + +func (sw *S3Watcher) InitializeFromCheckpoint(ctx context.Context, clusterID string, checkpoint *recorder.Checkpoint) (map[cloudstorage.DmlPathKey]types.IncrementalData, error) { return sw.consumer.InitializeFromCheckpoint(ctx, clusterID, checkpoint) } @@ -56,7 +60,7 @@ func (sw *S3Watcher) AdvanceS3CheckpointTs(ctx context.Context, minCheckpointTs func (sw *S3Watcher) ConsumeNewFiles( ctx context.Context, -) (map[cloudstorage.DmlPathKey]utils.IncrementalData, map[utils.SchemaTableKey]utils.VersionKey, error) { +) (map[cloudstorage.DmlPathKey]types.IncrementalData, map[types.SchemaTableKey]types.VersionKey, error) { // TODO: get the index updated from the s3 newData, maxVersionMap, err := sw.consumer.ConsumeNewFiles(ctx) if err != nil { diff --git a/cmd/multi-cluster-consistency-checker/watcher/s3_watcher_test.go b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher_test.go new file mode 100644 index 0000000000..b3ecfa16d6 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/watcher/s3_watcher_test.go @@ -0,0 +1,202 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package watcher + +import ( + "context" + "testing" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/consumer" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" + "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/tidb/br/pkg/storage" + "github.com/stretchr/testify/require" +) + +// mockWatcher is a mock implementation of the Watcher interface for testing. +type mockWatcher struct { + advanceCheckpointTsFn func(ctx context.Context, minCheckpointTs uint64) (uint64, error) + closeFn func() + closed bool +} + +func (m *mockWatcher) AdvanceCheckpointTs(ctx context.Context, minCheckpointTs uint64) (uint64, error) { + if m.advanceCheckpointTsFn != nil { + return m.advanceCheckpointTsFn(ctx, minCheckpointTs) + } + return 0, nil +} + +func (m *mockWatcher) Close() { + m.closed = true + if m.closeFn != nil { + m.closeFn() + } +} + +func TestS3Watcher_Close(t *testing.T) { + t.Parallel() + + t.Run("close delegates to checkpoint watcher", func(t *testing.T) { + t.Parallel() + mock := &mockWatcher{} + sw := &S3Watcher{ + checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), nil), + } + + sw.Close() + require.True(t, mock.closed) + }) + + t.Run("close calls custom close function", func(t *testing.T) { + t.Parallel() + closeCalled := false + mock := &mockWatcher{ + closeFn: func() { + closeCalled = true + }, + } + sw := &S3Watcher{ + checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), nil), + } + + sw.Close() + require.True(t, closeCalled) + }) +} + +func TestS3Watcher_AdvanceS3CheckpointTs(t *testing.T) { + t.Parallel() + + t.Run("advance checkpoint ts success", func(t *testing.T) { + t.Parallel() + expectedCheckpoint := uint64(5000) + mock := &mockWatcher{ + advanceCheckpointTsFn: func(ctx context.Context, minCheckpointTs uint64) (uint64, error) { + require.Equal(t, uint64(3000), minCheckpointTs) + return expectedCheckpoint, nil + }, + } + sw := &S3Watcher{ 
+ checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), nil), + } + + checkpoint, err := sw.AdvanceS3CheckpointTs(context.Background(), uint64(3000)) + require.NoError(t, err) + require.Equal(t, expectedCheckpoint, checkpoint) + }) + + t.Run("advance checkpoint ts error", func(t *testing.T) { + t.Parallel() + mock := &mockWatcher{ + advanceCheckpointTsFn: func(ctx context.Context, minCheckpointTs uint64) (uint64, error) { + return 0, errors.Errorf("connection lost") + }, + } + sw := &S3Watcher{ + checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), nil), + } + + checkpoint, err := sw.AdvanceS3CheckpointTs(context.Background(), uint64(3000)) + require.Error(t, err) + require.Equal(t, uint64(0), checkpoint) + require.Contains(t, err.Error(), "advance s3 checkpoint timestamp failed") + require.Contains(t, err.Error(), "connection lost") + }) + + t.Run("advance checkpoint ts with context canceled", func(t *testing.T) { + t.Parallel() + mock := &mockWatcher{ + advanceCheckpointTsFn: func(ctx context.Context, minCheckpointTs uint64) (uint64, error) { + return 0, context.Canceled + }, + } + sw := &S3Watcher{ + checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), nil), + } + + checkpoint, err := sw.AdvanceS3CheckpointTs(context.Background(), uint64(3000)) + require.Error(t, err) + require.Equal(t, uint64(0), checkpoint) + require.Contains(t, err.Error(), "advance s3 checkpoint timestamp failed") + }) +} + +func TestS3Watcher_InitializeFromCheckpoint(t *testing.T) { + t.Parallel() + + t.Run("nil checkpoint returns nil", func(t *testing.T) { + t.Parallel() + mock := &mockWatcher{} + sw := &S3Watcher{ + checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), nil), + } + + result, err := sw.InitializeFromCheckpoint(context.Background(), "cluster1", nil) + require.NoError(t, err) + require.Nil(t, result) + }) + + t.Run("empty checkpoint returns nil", 
func(t *testing.T) { + t.Parallel() + mock := &mockWatcher{} + sw := &S3Watcher{ + checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), nil), + } + + checkpoint := recorder.NewCheckpoint() + result, err := sw.InitializeFromCheckpoint(context.Background(), "cluster1", checkpoint) + require.NoError(t, err) + require.Nil(t, result) + }) +} + +func TestS3Watcher_ConsumeNewFiles(t *testing.T) { + t.Parallel() + + t.Run("consume new files with empty tables", func(t *testing.T) { + t.Parallel() + mock := &mockWatcher{} + sw := &S3Watcher{ + checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), map[string][]string{}), + } + + newData, maxVersionMap, err := sw.ConsumeNewFiles(context.Background()) + require.NoError(t, err) + require.Empty(t, newData) + require.Empty(t, maxVersionMap) + }) + + t.Run("consume new files with nil tables", func(t *testing.T) { + t.Parallel() + mock := &mockWatcher{} + sw := &S3Watcher{ + checkpointWatcher: mock, + consumer: consumer.NewS3Consumer(storage.NewMemStorage(), nil), + } + + newData, maxVersionMap, err := sw.ConsumeNewFiles(context.Background()) + require.NoError(t, err) + require.Empty(t, newData) + require.Empty(t, maxVersionMap) + }) +} diff --git a/pkg/common/table_info.go b/pkg/common/table_info.go index c8a8c58b3b..3b1d3d7cf4 100644 --- a/pkg/common/table_info.go +++ b/pkg/common/table_info.go @@ -525,15 +525,6 @@ func (ti *TableInfo) GetPkColInfo() *model.ColumnInfo { return ti.columnSchema.GetPkColInfo() } -func (ti *TableInfo) GetPrimaryKeyColumnInfos() []*model.IndexColumn { - for _, idx := range ti.columnSchema.Indices { - if idx.Primary { - return idx.Columns - } - } - return nil -} - // GetPrimaryKeyColumnNames returns the primary key column names func (ti *TableInfo) GetPrimaryKeyColumnNames() []string { var result []string diff --git a/pkg/sink/codec/canal/canal_json_txn_decoder.go b/pkg/sink/codec/canal/canal_json_txn_decoder.go index 7bbdbe111e..9c41be58e0 
100644 --- a/pkg/sink/codec/canal/canal_json_txn_decoder.go +++ b/pkg/sink/codec/canal/canal_json_txn_decoder.go @@ -33,19 +33,6 @@ type txnDecoder struct { config *common.Config msg canalJSONMessageInterface - - cachedTableInfo *commonType.TableInfo -} - -// NewTxnDecoderWithTableInfo return a new txn decoder with a cached table info. -func NewTxnDecoderWithTableInfo( - codecConfig *common.Config, - tableInfo *commonType.TableInfo, -) *txnDecoder { - return &txnDecoder{ - config: codecConfig, - cachedTableInfo: tableInfo, - } } // NewTxnDecoder return a new CanalJSONTxnEventDecoder. @@ -121,10 +108,7 @@ func (d *txnDecoder) NextDMLEvent() *commonEvent.DMLEvent { func (d *txnDecoder) canalJSONMessage2RowChange() *commonEvent.DMLEvent { msg := d.msg - tableInfo := d.cachedTableInfo - if tableInfo == nil { - tableInfo = newTableInfo(msg) - } + tableInfo := newTableInfo(msg) result := new(commonEvent.DMLEvent) result.Length++ // todo: set this field correctly result.StartTs = msg.getCommitTs() // todo: how to set this correctly? 
From 81b12e8fab3d25c807d53700b396a01a203b3582 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 12 Feb 2026 15:03:27 +0800 Subject: [PATCH 19/23] add integration test Signed-off-by: Jianjun Liao --- .../checker/checker.go | 373 +++++--- .../checker/checker_test.go | 153 +-- .../config/config.example.toml | 6 +- .../config/config.go | 8 +- .../config/config_test.go | 22 +- .../decoder/value_to_datum_test.go | 898 ++++++++++++++++++ .../integration/integration_test.go | 735 ++++++++++++++ .../integration/mock_cluster.go | 205 ++++ cmd/multi-cluster-consistency-checker/main.go | 7 - .../recorder/recorder_test.go | 4 +- .../recorder/types.go | 89 +- .../recorder/types_test.go | 120 ++- cmd/multi-cluster-consistency-checker/task.go | 10 +- .../types/types.go | 12 + 14 files changed, 2309 insertions(+), 333 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/decoder/value_to_datum_test.go create mode 100644 cmd/multi-cluster-consistency-checker/integration/integration_test.go create mode 100644 cmd/multi-cluster-consistency-checker/integration/mock_cluster.go diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index 8bdcb9d055..43565adcfe 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -34,20 +34,25 @@ type versionCacheEntry struct { type clusterViolationChecker struct { clusterID string - twoPreviousTimeWindowKeyVersionCache map[types.PkType]versionCacheEntry + twoPreviousTimeWindowKeyVersionCache map[string]map[types.PkType]versionCacheEntry } func newClusterViolationChecker(clusterID string) *clusterViolationChecker { return &clusterViolationChecker{ clusterID: clusterID, - twoPreviousTimeWindowKeyVersionCache: make(map[types.PkType]versionCacheEntry), + twoPreviousTimeWindowKeyVersionCache: make(map[string]map[types.PkType]versionCacheEntry), } } -func (c *clusterViolationChecker) 
NewRecordFromCheckpoint(record *decoder.Record, previous int) { - entry, exists := c.twoPreviousTimeWindowKeyVersionCache[record.Pk] +func (c *clusterViolationChecker) NewRecordFromCheckpoint(schemaKey string, record *decoder.Record, previous int) { + tableSchemaKeyVersionCache, exists := c.twoPreviousTimeWindowKeyVersionCache[schemaKey] if !exists { - c.twoPreviousTimeWindowKeyVersionCache[record.Pk] = versionCacheEntry{ + tableSchemaKeyVersionCache = make(map[types.PkType]versionCacheEntry) + c.twoPreviousTimeWindowKeyVersionCache[schemaKey] = tableSchemaKeyVersionCache + } + entry, exists := tableSchemaKeyVersionCache[record.Pk] + if !exists { + tableSchemaKeyVersionCache[record.Pk] = versionCacheEntry{ previous: previous, cdcVersion: record.CdcVersion, } @@ -56,17 +61,22 @@ func (c *clusterViolationChecker) NewRecordFromCheckpoint(record *decoder.Record entryCompareTs := entry.cdcVersion.GetCompareTs() recordCompareTs := record.GetCompareTs() if entryCompareTs < recordCompareTs { - c.twoPreviousTimeWindowKeyVersionCache[record.Pk] = versionCacheEntry{ + tableSchemaKeyVersionCache[record.Pk] = versionCacheEntry{ previous: previous, cdcVersion: record.CdcVersion, } } } -func (c *clusterViolationChecker) Check(r *decoder.Record, report *recorder.ClusterReport) { - entry, exists := c.twoPreviousTimeWindowKeyVersionCache[r.Pk] +func (c *clusterViolationChecker) Check(schemaKey string, r *decoder.Record, report *recorder.ClusterReport) { + tableSchemaKeyVersionCache, exists := c.twoPreviousTimeWindowKeyVersionCache[schemaKey] if !exists { - c.twoPreviousTimeWindowKeyVersionCache[r.Pk] = versionCacheEntry{ + tableSchemaKeyVersionCache = make(map[types.PkType]versionCacheEntry) + c.twoPreviousTimeWindowKeyVersionCache[schemaKey] = tableSchemaKeyVersionCache + } + entry, exists := tableSchemaKeyVersionCache[r.Pk] + if !exists { + tableSchemaKeyVersionCache[r.Pk] = versionCacheEntry{ previous: 0, cdcVersion: r.CdcVersion, } @@ -84,89 +94,112 @@ func (c 
*clusterViolationChecker) Check(r *decoder.Record, report *recorder.Clus zap.String("clusterID", c.clusterID), zap.Any("entry", entry), zap.Any("record", r)) - report.AddLWWViolationItem(string(r.Pk), entry.cdcVersion.OriginTs, entry.cdcVersion.CommitTs, r.OriginTs, r.CommitTs) + report.AddLWWViolationItem(schemaKey, string(r.Pk), entry.cdcVersion.OriginTs, entry.cdcVersion.CommitTs, r.OriginTs, r.CommitTs) return } - c.twoPreviousTimeWindowKeyVersionCache[r.Pk] = versionCacheEntry{ + tableSchemaKeyVersionCache[r.Pk] = versionCacheEntry{ previous: 0, cdcVersion: r.CdcVersion, } } func (c *clusterViolationChecker) UpdateCache() { - newTwoPreviousTimeWindowKeyVersionCache := make(map[types.PkType]versionCacheEntry) - for primaryKey, entry := range c.twoPreviousTimeWindowKeyVersionCache { - if entry.previous >= 2 { - continue + newTwoPreviousTimeWindowKeyVersionCache := make(map[string]map[types.PkType]versionCacheEntry) + for schemaKey, tableSchemaKeyVersionCache := range c.twoPreviousTimeWindowKeyVersionCache { + newTableSchemaKeyVersionCache := make(map[types.PkType]versionCacheEntry) + for primaryKey, entry := range tableSchemaKeyVersionCache { + if entry.previous >= 2 { + continue + } + newTableSchemaKeyVersionCache[primaryKey] = versionCacheEntry{ + previous: entry.previous + 1, + cdcVersion: entry.cdcVersion, + } } - newTwoPreviousTimeWindowKeyVersionCache[primaryKey] = versionCacheEntry{ - previous: entry.previous + 1, - cdcVersion: entry.cdcVersion, + if len(newTableSchemaKeyVersionCache) > 0 { + newTwoPreviousTimeWindowKeyVersionCache[schemaKey] = newTableSchemaKeyVersionCache } } c.twoPreviousTimeWindowKeyVersionCache = newTwoPreviousTimeWindowKeyVersionCache } -type timeWindowDataCache struct { +type tableDataCache struct { // upstreamDataCache is a map of primary key to a map of commit ts to a record upstreamDataCache map[types.PkType]map[uint64]*decoder.Record // downstreamDataCache is a map of primary key to a map of origin ts to a record 
downstreamDataCache map[types.PkType]map[uint64]*decoder.Record - - leftBoundary uint64 - rightBoundary uint64 - checkpointTs map[string]uint64 } -func newTimeWindowDataCache(leftBoundary, rightBoundary uint64, checkpointTs map[string]uint64) timeWindowDataCache { - return timeWindowDataCache{ +func newTableDataCache() *tableDataCache { + return &tableDataCache{ upstreamDataCache: make(map[types.PkType]map[uint64]*decoder.Record), downstreamDataCache: make(map[types.PkType]map[uint64]*decoder.Record), - leftBoundary: leftBoundary, - rightBoundary: rightBoundary, - checkpointTs: checkpointTs, } } -func (twdc *timeWindowDataCache) newUpstreamRecord(record *decoder.Record) { - recordsMap, exists := twdc.upstreamDataCache[record.Pk] +func (tdc *tableDataCache) newUpstreamRecord(record *decoder.Record) { + recordsMap, exists := tdc.upstreamDataCache[record.Pk] if !exists { recordsMap = make(map[uint64]*decoder.Record) - twdc.upstreamDataCache[record.Pk] = recordsMap + tdc.upstreamDataCache[record.Pk] = recordsMap } recordsMap[record.CommitTs] = record } -func (twdc *timeWindowDataCache) newDownstreamRecord(record *decoder.Record) { - recordsMap, exists := twdc.downstreamDataCache[record.Pk] +func (tdc *tableDataCache) newDownstreamRecord(record *decoder.Record) { + recordsMap, exists := tdc.downstreamDataCache[record.Pk] if !exists { recordsMap = make(map[uint64]*decoder.Record) - twdc.downstreamDataCache[record.Pk] = recordsMap + tdc.downstreamDataCache[record.Pk] = recordsMap } recordsMap[record.OriginTs] = record } -func (twdc *timeWindowDataCache) NewRecord(record *decoder.Record) { +type timeWindowDataCache struct { + tableDataCaches map[string]*tableDataCache + + leftBoundary uint64 + rightBoundary uint64 + checkpointTs map[string]uint64 +} + +func newTimeWindowDataCache(leftBoundary, rightBoundary uint64, checkpointTs map[string]uint64) timeWindowDataCache { + return timeWindowDataCache{ + tableDataCaches: make(map[string]*tableDataCache), + leftBoundary: 
leftBoundary, + rightBoundary: rightBoundary, + checkpointTs: checkpointTs, + } +} + +func (twdc *timeWindowDataCache) NewRecord(schemaKey string, record *decoder.Record) { if record.CommitTs <= twdc.leftBoundary { // record is before the left boundary, just skip it return } + tableDataCache, exists := twdc.tableDataCaches[schemaKey] + if !exists { + tableDataCache = newTableDataCache() + twdc.tableDataCaches[schemaKey] = tableDataCache + } if record.OriginTs == 0 { - twdc.newUpstreamRecord(record) + tableDataCache.newUpstreamRecord(record) } else { - twdc.newDownstreamRecord(record) + tableDataCache.newDownstreamRecord(record) } } type clusterDataChecker struct { clusterID string + thisRoundTimeWindow types.TimeWindow + timeWindowDataCaches [3]timeWindowDataCache rightBoundary uint64 - overDataCaches []*decoder.Record + overDataCaches map[string][]*decoder.Record clusterViolationChecker *clusterViolationChecker @@ -178,7 +211,7 @@ func newClusterDataChecker(clusterID string) *clusterDataChecker { clusterID: clusterID, timeWindowDataCaches: [3]timeWindowDataCache{}, rightBoundary: 0, - overDataCaches: make([]*decoder.Record, 0), + overDataCaches: make(map[string][]*decoder.Record), clusterViolationChecker: newClusterViolationChecker(clusterID), } } @@ -203,7 +236,8 @@ func (cd *clusterDataChecker) InitializeFromCheckpoint( cd.timeWindowDataCaches[1] = newTimeWindowDataCache( clusterInfo.TimeWindow.LeftBoundary, clusterInfo.TimeWindow.RightBoundary, clusterInfo.TimeWindow.CheckpointTs) } - for _, incrementalData := range checkpointDataMap { + for schemaPathKey, incrementalData := range checkpointDataMap { + schemaKey := schemaPathKey.GetKey() for _, contents := range incrementalData.DataContentSlices { for _, content := range contents { records, err := decoder.Decode(content) @@ -211,7 +245,7 @@ func (cd *clusterDataChecker) InitializeFromCheckpoint( return errors.Trace(err) } for _, record := range records { - cd.newRecordFromCheckpoint(record) + 
cd.newRecordFromCheckpoint(schemaKey, record) } } } @@ -219,18 +253,18 @@ func (cd *clusterDataChecker) InitializeFromCheckpoint( return nil } -func (cd *clusterDataChecker) newRecordFromCheckpoint(record *decoder.Record) { +func (cd *clusterDataChecker) newRecordFromCheckpoint(schemaKey string, record *decoder.Record) { if record.CommitTs > cd.rightBoundary { - cd.overDataCaches = append(cd.overDataCaches, record) + cd.overDataCaches[schemaKey] = append(cd.overDataCaches[schemaKey], record) return } if cd.timeWindowDataCaches[2].leftBoundary < record.CommitTs { - cd.timeWindowDataCaches[2].NewRecord(record) - cd.clusterViolationChecker.NewRecordFromCheckpoint(record, 1) + cd.timeWindowDataCaches[2].NewRecord(schemaKey, record) + cd.clusterViolationChecker.NewRecordFromCheckpoint(schemaKey, record, 1) } else if cd.timeWindowDataCaches[1].leftBoundary < record.CommitTs { - cd.timeWindowDataCaches[1].NewRecord(record) - cd.clusterViolationChecker.NewRecordFromCheckpoint(record, 2) + cd.timeWindowDataCaches[1].NewRecord(schemaKey, record) + cd.clusterViolationChecker.NewRecordFromCheckpoint(schemaKey, record, 2) } } @@ -242,29 +276,37 @@ func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow types.TimeWin cd.timeWindowDataCaches[1] = cd.timeWindowDataCaches[2] newTimeWindowDataCache := newTimeWindowDataCache(timeWindow.LeftBoundary, timeWindow.RightBoundary, timeWindow.CheckpointTs) cd.rightBoundary = timeWindow.RightBoundary - newOverDataCache := make([]*decoder.Record, 0, len(cd.overDataCaches)) - for _, overRecord := range cd.overDataCaches { - if overRecord.CommitTs > timeWindow.RightBoundary { - newOverDataCache = append(newOverDataCache, overRecord) - } else { - newTimeWindowDataCache.NewRecord(overRecord) + newOverDataCache := make(map[string][]*decoder.Record) + for schemaKey, overRecords := range cd.overDataCaches { + newTableOverDataCache := make([]*decoder.Record, 0, len(overRecords)) + for _, overRecord := range overRecords { + if 
overRecord.CommitTs > timeWindow.RightBoundary { + newTableOverDataCache = append(newTableOverDataCache, overRecord) + } else { + newTimeWindowDataCache.NewRecord(schemaKey, overRecord) + } } + newOverDataCache[schemaKey] = newTableOverDataCache } cd.timeWindowDataCaches[2] = newTimeWindowDataCache cd.overDataCaches = newOverDataCache return nil } -func (cd *clusterDataChecker) NewRecord(record *decoder.Record) { +func (cd *clusterDataChecker) NewRecord(schemaKey string, record *decoder.Record) { if record.CommitTs > cd.rightBoundary { - cd.overDataCaches = append(cd.overDataCaches, record) + cd.overDataCaches[schemaKey] = append(cd.overDataCaches[schemaKey], record) return } - cd.timeWindowDataCaches[2].NewRecord(record) + cd.timeWindowDataCaches[2].NewRecord(schemaKey, record) } -func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowIdx int, pk types.PkType, originTs uint64) (*decoder.Record, bool) { - records, exists := cd.timeWindowDataCaches[timeWindowIdx].downstreamDataCache[pk] +func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowIdx int, schemaKey string, pk types.PkType, originTs uint64) (*decoder.Record, bool) { + tableDataCache, exists := cd.timeWindowDataCaches[timeWindowIdx].tableDataCaches[schemaKey] + if !exists { + return nil, false + } + records, exists := tableDataCache.downstreamDataCache[pk] if !exists { return nil, false } @@ -279,8 +321,12 @@ func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowId return nil, false } -func (cd *clusterDataChecker) findClusterUpstreamDataInTimeWindow(timeWindowIdx int, pk types.PkType, commitTs uint64) bool { - records, exists := cd.timeWindowDataCaches[timeWindowIdx].upstreamDataCache[pk] +func (cd *clusterDataChecker) findClusterUpstreamDataInTimeWindow(timeWindowIdx int, schemaKey string, pk types.PkType, commitTs uint64) bool { + tableDataCache, exists := cd.timeWindowDataCaches[timeWindowIdx].tableDataCaches[schemaKey] + if !exists 
{ + return false + } + records, exists := tableDataCache.upstreamDataCache[pk] if !exists { return false } @@ -293,58 +339,62 @@ func (cd *clusterDataChecker) findClusterUpstreamDataInTimeWindow(timeWindowIdx // in the downstream data cache [1] or [2] or another new record is present in the downstream data // cache [1] or [2]. func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { - for _, upstreamDataCache := range cd.timeWindowDataCaches[1].upstreamDataCache { - for _, record := range upstreamDataCache { - for downstreamClusterID, checkpointTs := range cd.timeWindowDataCaches[1].checkpointTs { - if record.CommitTs <= checkpointTs { - continue - } - downstreamRecord, skipped := checker.FindClusterDownstreamData(downstreamClusterID, record.Pk, record.CommitTs) - if skipped { - continue - } - if downstreamRecord == nil { - // data loss detected - log.Error("data loss detected", - zap.String("upstreamClusterID", cd.clusterID), - zap.String("downstreamClusterID", downstreamClusterID), - zap.Any("record", record)) - cd.report.AddDataLossItem(downstreamClusterID, string(record.Pk), record.OriginTs, record.CommitTs, false) - } else if !record.EqualDownstreamRecord(downstreamRecord) { - // data inconsistent detected - log.Error("data inconsistent detected", - zap.String("upstreamClusterID", cd.clusterID), - zap.String("downstreamClusterID", downstreamClusterID), - zap.Any("record", record)) - cd.report.AddDataLossItem(downstreamClusterID, string(record.Pk), record.OriginTs, record.CommitTs, true) + for schemaKey, tableDataCache := range cd.timeWindowDataCaches[1].tableDataCaches { + for _, upstreamDataCache := range tableDataCache.upstreamDataCache { + for _, record := range upstreamDataCache { + for downstreamClusterID, checkpointTs := range cd.timeWindowDataCaches[1].checkpointTs { + if record.CommitTs <= checkpointTs { + continue + } + downstreamRecord, skipped := checker.FindClusterDownstreamData(downstreamClusterID, schemaKey, record.Pk, 
record.CommitTs) + if skipped { + continue + } + if downstreamRecord == nil { + // data loss detected + log.Error("data loss detected", + zap.String("upstreamClusterID", cd.clusterID), + zap.String("downstreamClusterID", downstreamClusterID), + zap.Any("record", record)) + cd.report.AddDataLossItem(downstreamClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, false) + } else if !record.EqualDownstreamRecord(downstreamRecord) { + // data inconsistent detected + log.Error("data inconsistent detected", + zap.String("upstreamClusterID", cd.clusterID), + zap.String("downstreamClusterID", downstreamClusterID), + zap.Any("record", record)) + cd.report.AddDataLossItem(downstreamClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, true) + } } } } } - for _, upstreamDataCache := range cd.timeWindowDataCaches[2].upstreamDataCache { - for _, record := range upstreamDataCache { - for downstreamClusterID, checkpointTs := range cd.timeWindowDataCaches[2].checkpointTs { - if record.CommitTs > checkpointTs { - continue - } - downstreamRecord, skipped := checker.FindClusterDownstreamData(downstreamClusterID, record.Pk, record.CommitTs) - if skipped { - continue - } - if downstreamRecord == nil { - // data loss detected - log.Error("data loss detected", - zap.String("upstreamClusterID", cd.clusterID), - zap.String("downstreamClusterID", downstreamClusterID), - zap.Any("record", record)) - cd.report.AddDataLossItem(downstreamClusterID, string(record.Pk), record.OriginTs, record.CommitTs, false) - } else if !record.EqualDownstreamRecord(downstreamRecord) { - // data inconsistent detected - log.Error("data inconsistent detected", - zap.String("upstreamClusterID", cd.clusterID), - zap.String("downstreamClusterID", downstreamClusterID), - zap.Any("record", record)) - cd.report.AddDataLossItem(downstreamClusterID, string(record.Pk), record.OriginTs, record.CommitTs, true) + for schemaKey, tableDataCache := range 
cd.timeWindowDataCaches[2].tableDataCaches { + for _, upstreamDataCache := range tableDataCache.upstreamDataCache { + for _, record := range upstreamDataCache { + for downstreamClusterID, checkpointTs := range cd.timeWindowDataCaches[2].checkpointTs { + if record.CommitTs > checkpointTs { + continue + } + downstreamRecord, skipped := checker.FindClusterDownstreamData(downstreamClusterID, schemaKey, record.Pk, record.CommitTs) + if skipped { + continue + } + if downstreamRecord == nil { + // data loss detected + log.Error("data loss detected", + zap.String("upstreamClusterID", cd.clusterID), + zap.String("downstreamClusterID", downstreamClusterID), + zap.Any("record", record)) + cd.report.AddDataLossItem(downstreamClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, false) + } else if !record.EqualDownstreamRecord(downstreamRecord) { + // data inconsistent detected + log.Error("data inconsistent detected", + zap.String("upstreamClusterID", cd.clusterID), + zap.String("downstreamClusterID", downstreamClusterID), + zap.Any("record", record)) + cd.report.AddDataLossItem(downstreamClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, true) + } } } } @@ -354,15 +404,17 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { // dataRedundantDetection iterates through the downstream data cache [2]. The record must be present // in the upstream data cache [1] [2] or [3]. 
func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) { - for _, downstreamDataCache := range cd.timeWindowDataCaches[2].downstreamDataCache { - for _, record := range downstreamDataCache { - // For downstream records, OriginTs is the upstream commit ts - if !checker.FindClusterUpstreamData(cd.clusterID, record.Pk, record.OriginTs) { - // data redundant detected - log.Error("data redundant detected", - zap.String("downstreamClusterID", cd.clusterID), - zap.Any("record", record)) - cd.report.AddDataRedundantItem(string(record.Pk), record.OriginTs, record.CommitTs) + for schemaKey, tableDataCache := range cd.timeWindowDataCaches[2].tableDataCaches { + for _, downstreamDataCache := range tableDataCache.downstreamDataCache { + for _, record := range downstreamDataCache { + // For downstream records, OriginTs is the upstream commit ts + if !checker.FindClusterUpstreamData(cd.clusterID, schemaKey, record.Pk, record.OriginTs) { + // data redundant detected + log.Error("data redundant detected", + zap.String("downstreamClusterID", cd.clusterID), + zap.Any("record", record)) + cd.report.AddDataRedundantItem(schemaKey, string(record.Pk), record.OriginTs, record.CommitTs) + } } } } @@ -370,35 +422,38 @@ func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) { // lwwViolationDetection check the orderliness of the records func (cd *clusterDataChecker) lwwViolationDetection() { - for pk, upstreamRecords := range cd.timeWindowDataCaches[2].upstreamDataCache { - downstreamRecords := cd.timeWindowDataCaches[2].downstreamDataCache[pk] - pkRecords := make([]*decoder.Record, 0, len(upstreamRecords)+len(downstreamRecords)) - for _, upstreamRecord := range upstreamRecords { - pkRecords = append(pkRecords, upstreamRecord) - } - for _, downstreamRecord := range downstreamRecords { - pkRecords = append(pkRecords, downstreamRecord) - } - sort.Slice(pkRecords, func(i, j int) bool { - return pkRecords[i].CommitTs < pkRecords[j].CommitTs - }) - for _, 
record := range pkRecords { - cd.clusterViolationChecker.Check(record, cd.report) - } - } - for pk, downstreamRecords := range cd.timeWindowDataCaches[2].downstreamDataCache { - if _, exists := cd.timeWindowDataCaches[2].upstreamDataCache[pk]; exists { - continue - } - pkRecords := make([]*decoder.Record, 0, len(downstreamRecords)) - for _, downstreamRecord := range downstreamRecords { - pkRecords = append(pkRecords, downstreamRecord) + for schemaKey, tableDataCache := range cd.timeWindowDataCaches[2].tableDataCaches { + for pk, upstreamRecords := range tableDataCache.upstreamDataCache { + downstreamRecords := tableDataCache.downstreamDataCache[pk] + pkRecords := make([]*decoder.Record, 0, len(upstreamRecords)+len(downstreamRecords)) + for _, upstreamRecord := range upstreamRecords { + pkRecords = append(pkRecords, upstreamRecord) + } + for _, downstreamRecord := range downstreamRecords { + pkRecords = append(pkRecords, downstreamRecord) + } + sort.Slice(pkRecords, func(i, j int) bool { + return pkRecords[i].CommitTs < pkRecords[j].CommitTs + }) + for _, record := range pkRecords { + cd.clusterViolationChecker.Check(schemaKey, record, cd.report) + } } - sort.Slice(pkRecords, func(i, j int) bool { - return pkRecords[i].CommitTs < pkRecords[j].CommitTs - }) - for _, record := range pkRecords { - cd.clusterViolationChecker.Check(record, cd.report) + + for pk, downstreamRecords := range tableDataCache.downstreamDataCache { + if _, exists := tableDataCache.upstreamDataCache[pk]; exists { + continue + } + pkRecords := make([]*decoder.Record, 0, len(downstreamRecords)) + for _, downstreamRecord := range downstreamRecords { + pkRecords = append(pkRecords, downstreamRecord) + } + sort.Slice(pkRecords, func(i, j int) bool { + return pkRecords[i].CommitTs < pkRecords[j].CommitTs + }) + for _, record := range pkRecords { + cd.clusterViolationChecker.Check(schemaKey, record, cd.report) + } } } @@ -406,7 +461,7 @@ func (cd *clusterDataChecker) lwwViolationDetection() { } func 
(cd *clusterDataChecker) Check(checker *DataChecker) { - cd.report = recorder.NewClusterReport(cd.clusterID) + cd.report = recorder.NewClusterReport(cd.clusterID, cd.thisRoundTimeWindow) // CHECK 1 - Data Loss Detection cd.dataLossDetection(checker) // CHECK 2 - Data Redundant Detection @@ -455,38 +510,38 @@ func (c *DataChecker) initializeFromCheckpoint(ctx context.Context, checkpointDa // FindClusterDownstreamData checks whether the record is present in the downstream data // cache [1] or [2] or another new record is present in the downstream data cache [1] or [2]. -func (c *DataChecker) FindClusterDownstreamData(clusterID string, pk types.PkType, originTs uint64) (*decoder.Record, bool) { +func (c *DataChecker) FindClusterDownstreamData(clusterID string, schemaKey string, pk types.PkType, originTs uint64) (*decoder.Record, bool) { clusterDataChecker, exists := c.clusterDataCheckers[clusterID] if !exists { return nil, false } - record, skipped := clusterDataChecker.findClusterDownstreamDataInTimeWindow(1, pk, originTs) + record, skipped := clusterDataChecker.findClusterDownstreamDataInTimeWindow(1, schemaKey, pk, originTs) if skipped || record != nil { return record, skipped } - return clusterDataChecker.findClusterDownstreamDataInTimeWindow(2, pk, originTs) + return clusterDataChecker.findClusterDownstreamDataInTimeWindow(2, schemaKey, pk, originTs) } -func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, pk types.PkType, commitTs uint64) bool { +func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, schemaKey string, pk types.PkType, commitTs uint64) bool { for _, clusterDataChecker := range c.clusterDataCheckers { if clusterDataChecker.clusterID == downstreamClusterID { continue } - if clusterDataChecker.findClusterUpstreamDataInTimeWindow(0, pk, commitTs) { + if clusterDataChecker.findClusterUpstreamDataInTimeWindow(0, schemaKey, pk, commitTs) { return true } - if clusterDataChecker.findClusterUpstreamDataInTimeWindow(1, 
pk, commitTs) { + if clusterDataChecker.findClusterUpstreamDataInTimeWindow(1, schemaKey, pk, commitTs) { return true } - if clusterDataChecker.findClusterUpstreamDataInTimeWindow(2, pk, commitTs) { + if clusterDataChecker.findClusterUpstreamDataInTimeWindow(2, schemaKey, pk, commitTs) { return true } } return false } -func (c *DataChecker) CheckInNextTimeWindow(ctx context.Context, newTimeWindowData map[string]types.TimeWindowData) (*recorder.Report, error) { - if err := c.decodeNewTimeWindowData(ctx, newTimeWindowData); err != nil { +func (c *DataChecker) CheckInNextTimeWindow(newTimeWindowData map[string]types.TimeWindowData) (*recorder.Report, error) { + if err := c.decodeNewTimeWindowData(newTimeWindowData); err != nil { log.Error("failed to decode new time window data", zap.Error(err)) return nil, errors.Annotate(err, "failed to decode new time window data") } @@ -503,7 +558,7 @@ func (c *DataChecker) CheckInNextTimeWindow(ctx context.Context, newTimeWindowDa return report, nil } -func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindowData map[string]types.TimeWindowData) error { +func (c *DataChecker) decodeNewTimeWindowData(newTimeWindowData map[string]types.TimeWindowData) error { if len(newTimeWindowData) != len(c.clusterDataCheckers) { return errors.Errorf("number of clusters mismatch, expected %d, got %d", len(c.clusterDataCheckers), len(newTimeWindowData)) } @@ -512,10 +567,12 @@ func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindow if !exists { return errors.Errorf("cluster %s not found", clusterID) } + clusterDataChecker.thisRoundTimeWindow = timeWindowData.TimeWindow if err := clusterDataChecker.PrepareNextTimeWindowData(timeWindowData.TimeWindow); err != nil { return errors.Trace(err) } - for _, incrementalData := range timeWindowData.Data { + for schemaPathKey, incrementalData := range timeWindowData.Data { + schemaKey := schemaPathKey.GetKey() for _, contents := range 
incrementalData.DataContentSlices { for _, content := range contents { records, err := decoder.Decode(content) @@ -523,7 +580,7 @@ func (c *DataChecker) decodeNewTimeWindowData(ctx context.Context, newTimeWindow return errors.Trace(err) } for _, record := range records { - clusterDataChecker.NewRecord(record) + clusterDataChecker.NewRecord(schemaKey, record) } } } diff --git a/cmd/multi-cluster-consistency-checker/checker/checker_test.go b/cmd/multi-cluster-consistency-checker/checker/checker_test.go index 312bee6c12..9818ee2c03 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker_test.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker_test.go @@ -34,12 +34,12 @@ func TestNewDataChecker(t *testing.T) { t.Parallel() clusterConfig := map[string]config.ClusterConfig{ "cluster1": { - PDAddr: "127.0.0.1:2379", + PDAddrs: []string{"127.0.0.1:2379"}, S3SinkURI: "s3://bucket/cluster1/", S3ChangefeedID: "s3-cf-1", }, "cluster2": { - PDAddr: "127.0.0.1:2479", + PDAddrs: []string{"127.0.0.1:2479"}, S3SinkURI: "s3://bucket/cluster2/", S3ChangefeedID: "s3-cf-2", }, @@ -84,10 +84,12 @@ func TestNewClusterViolationChecker(t *testing.T) { func TestClusterViolationChecker_Check(t *testing.T) { t.Parallel() + const schemaKey = "test_schema" + t.Run("check new record", func(t *testing.T) { t.Parallel() checker := newClusterViolationChecker("cluster1") - report := recorder.NewClusterReport("cluster1") + report := recorder.NewClusterReport("cluster1", types.TimeWindow{}) record := &decoder.Record{ Pk: "pk1", @@ -97,15 +99,16 @@ func TestClusterViolationChecker_Check(t *testing.T) { }, } - checker.Check(record, report) - require.Len(t, report.LWWViolationItems, 0) - require.Contains(t, checker.twoPreviousTimeWindowKeyVersionCache, record.Pk) + checker.Check(schemaKey, record, report) + require.Empty(t, report.TableFailureItems) + require.Contains(t, checker.twoPreviousTimeWindowKeyVersionCache, schemaKey) + require.Contains(t, 
checker.twoPreviousTimeWindowKeyVersionCache[schemaKey], record.Pk) }) t.Run("check duplicate old version", func(t *testing.T) { t.Parallel() checker := newClusterViolationChecker("cluster1") - report := recorder.NewClusterReport("cluster1") + report := recorder.NewClusterReport("cluster1", types.TimeWindow{}) record1 := &decoder.Record{ Pk: "pk1", @@ -122,15 +125,15 @@ func TestClusterViolationChecker_Check(t *testing.T) { }, } - checker.Check(record1, report) - checker.Check(record2, report) - require.Len(t, report.LWWViolationItems, 0) // Should skip duplicate old version + checker.Check(schemaKey, record1, report) + checker.Check(schemaKey, record2, report) + require.Empty(t, report.TableFailureItems) // Should skip duplicate old version }) t.Run("check lww violation", func(t *testing.T) { t.Parallel() checker := newClusterViolationChecker("cluster1") - report := recorder.NewClusterReport("cluster1") + report := recorder.NewClusterReport("cluster1", types.TimeWindow{}) record1 := &decoder.Record{ Pk: "pk1", @@ -147,24 +150,29 @@ func TestClusterViolationChecker_Check(t *testing.T) { }, } - checker.Check(record1, report) - checker.Check(record2, report) - require.Len(t, report.LWWViolationItems, 1) - require.Equal(t, "pk1", report.LWWViolationItems[0].PK) - require.Equal(t, uint64(0), report.LWWViolationItems[0].ExistingOriginTS) - require.Equal(t, uint64(100), report.LWWViolationItems[0].ExistingCommitTS) - require.Equal(t, uint64(50), report.LWWViolationItems[0].OriginTS) - require.Equal(t, uint64(150), report.LWWViolationItems[0].CommitTS) + checker.Check(schemaKey, record1, report) + checker.Check(schemaKey, record2, report) + require.Len(t, report.TableFailureItems, 1) + require.Contains(t, report.TableFailureItems, schemaKey) + tableItems := report.TableFailureItems[schemaKey] + require.Len(t, tableItems.LWWViolationItems, 1) + require.Equal(t, "pk1", tableItems.LWWViolationItems[0].PK) + require.Equal(t, uint64(0), 
tableItems.LWWViolationItems[0].ExistingOriginTS) + require.Equal(t, uint64(100), tableItems.LWWViolationItems[0].ExistingCommitTS) + require.Equal(t, uint64(50), tableItems.LWWViolationItems[0].OriginTS) + require.Equal(t, uint64(150), tableItems.LWWViolationItems[0].CommitTS) }) } func TestClusterViolationChecker_UpdateCache(t *testing.T) { t.Parallel() + const schemaKey = "test_schema" + t.Run("update cache", func(t *testing.T) { t.Parallel() checker := newClusterViolationChecker("cluster1") - report := recorder.NewClusterReport("cluster1") + report := recorder.NewClusterReport("cluster1", types.TimeWindow{}) record := &decoder.Record{ Pk: "pk1", @@ -174,22 +182,22 @@ func TestClusterViolationChecker_UpdateCache(t *testing.T) { }, } - checker.Check(record, report) - require.Contains(t, checker.twoPreviousTimeWindowKeyVersionCache, record.Pk) - entry := checker.twoPreviousTimeWindowKeyVersionCache[record.Pk] + checker.Check(schemaKey, record, report) + require.Contains(t, checker.twoPreviousTimeWindowKeyVersionCache, schemaKey) + entry := checker.twoPreviousTimeWindowKeyVersionCache[schemaKey][record.Pk] require.Equal(t, 0, entry.previous) checker.UpdateCache() - entry = checker.twoPreviousTimeWindowKeyVersionCache[record.Pk] + entry = checker.twoPreviousTimeWindowKeyVersionCache[schemaKey][record.Pk] require.Equal(t, 1, entry.previous) checker.UpdateCache() - entry = checker.twoPreviousTimeWindowKeyVersionCache[record.Pk] + entry = checker.twoPreviousTimeWindowKeyVersionCache[schemaKey][record.Pk] require.Equal(t, 2, entry.previous) checker.UpdateCache() // Entry should be removed after 2 updates - _, exists := checker.twoPreviousTimeWindowKeyVersionCache[record.Pk] + _, exists := checker.twoPreviousTimeWindowKeyVersionCache[schemaKey] require.False(t, exists) }) } @@ -209,14 +217,15 @@ func TestNewTimeWindowDataCache(t *testing.T) { require.Equal(t, leftBoundary, cache.leftBoundary) require.Equal(t, rightBoundary, cache.rightBoundary) require.Equal(t, 
checkpointTs, cache.checkpointTs) - require.NotNil(t, cache.upstreamDataCache) - require.NotNil(t, cache.downstreamDataCache) + require.NotNil(t, cache.tableDataCaches) }) } func TestTimeWindowDataCache_NewRecord(t *testing.T) { t.Parallel() + const schemaKey = "test_schema" + t.Run("add upstream record", func(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) @@ -228,9 +237,10 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { }, } - cache.NewRecord(record) - require.Contains(t, cache.upstreamDataCache, record.Pk) - require.Contains(t, cache.upstreamDataCache[record.Pk], record.CommitTs) + cache.NewRecord(schemaKey, record) + require.Contains(t, cache.tableDataCaches, schemaKey) + require.Contains(t, cache.tableDataCaches[schemaKey].upstreamDataCache, record.Pk) + require.Contains(t, cache.tableDataCaches[schemaKey].upstreamDataCache[record.Pk], record.CommitTs) }) t.Run("add downstream record", func(t *testing.T) { @@ -244,9 +254,10 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { }, } - cache.NewRecord(record) - require.Contains(t, cache.downstreamDataCache, record.Pk) - require.Contains(t, cache.downstreamDataCache[record.Pk], record.OriginTs) + cache.NewRecord(schemaKey, record) + require.Contains(t, cache.tableDataCaches, schemaKey) + require.Contains(t, cache.tableDataCaches[schemaKey].downstreamDataCache, record.Pk) + require.Contains(t, cache.tableDataCaches[schemaKey].downstreamDataCache[record.Pk], record.OriginTs) }) t.Run("skip record before left boundary", func(t *testing.T) { @@ -260,9 +271,8 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { }, } - cache.NewRecord(record) - require.NotContains(t, cache.upstreamDataCache, record.Pk) - require.NotContains(t, cache.downstreamDataCache, record.Pk) + cache.NewRecord(schemaKey, record) + require.NotContains(t, cache.tableDataCaches, schemaKey) }) } @@ -345,6 +355,10 @@ func makeTWData(left, right uint64, checkpointTs map[string]uint64, 
content []by } } +// defaultSchemaKey is the schema key produced by DmlPathKey{}.GetKey() +// which is QuoteSchema("", "") = "“.“" +var defaultSchemaKey = (&cloudstorage.DmlPathKey{}).GetKey() + // TestDataChecker_FourRoundsCheck simulates 4 rounds with increasing data and verifies check results. // Setup: 2 clusters (c1 upstream, c2 downstream from c1). // Rounds 0-2: accumulate data, check not yet active (checkableRound < 3). @@ -396,7 +410,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} for i, roundData := range rounds { - report, err := checker.CheckInNextTimeWindow(ctx, roundData) + report, err := checker.CheckInNextTimeWindow(roundData) require.NoError(t, err, "round %d", i) require.Equal(t, uint64(i), report.Round) if i < 3 { @@ -406,9 +420,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { require.Len(t, report.ClusterReports, 2) require.False(t, report.NeedFlush(), "round 3 should not need flush (all consistent)") for clusterID, cr := range report.ClusterReports { - require.Empty(t, cr.DataLossItems, "cluster %s should have no data loss", clusterID) - require.Empty(t, cr.DataRedundantItems, "cluster %s should have no data redundant", clusterID) - require.Empty(t, cr.LWWViolationItems, "cluster %s should have no LWW violation", clusterID) + require.Empty(t, cr.TableFailureItems, "cluster %s should have no table failure items", clusterID) } } } @@ -435,7 +447,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} var lastReport *recorder.Report for i, roundData := range rounds { - report, err := checker.CheckInNextTimeWindow(ctx, roundData) + report, err := checker.CheckInNextTimeWindow(roundData) require.NoError(t, err, "round %d", i) lastReport = report } @@ -444,15 +456,16 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { // c1 should detect data loss: pk=3 (commitTs=250) 
missing in c2's downstream c1Report := lastReport.ClusterReports["c1"] require.NotNil(t, c1Report) - require.Len(t, c1Report.DataLossItems, 1) - require.Equal(t, "c2", c1Report.DataLossItems[0].DownstreamClusterID) - require.Equal(t, uint64(0), c1Report.DataLossItems[0].OriginTS) - require.Equal(t, uint64(250), c1Report.DataLossItems[0].CommitTS) - require.False(t, c1Report.DataLossItems[0].Inconsistent) + require.Contains(t, c1Report.TableFailureItems, defaultSchemaKey) + tableItems := c1Report.TableFailureItems[defaultSchemaKey] + require.Len(t, tableItems.DataLossItems, 1) + require.Equal(t, "c2", tableItems.DataLossItems[0].DownstreamClusterID) + require.Equal(t, uint64(0), tableItems.DataLossItems[0].OriginTS) + require.Equal(t, uint64(250), tableItems.DataLossItems[0].CommitTS) + require.False(t, tableItems.DataLossItems[0].Inconsistent) // c2 should have no issues c2Report := lastReport.ClusterReports["c2"] - require.Empty(t, c2Report.DataLossItems) - require.Empty(t, c2Report.DataRedundantItems) + require.Empty(t, c2Report.TableFailureItems) }) t.Run("data inconsistent detected", func(t *testing.T) { @@ -477,17 +490,19 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} var lastReport *recorder.Report for i, roundData := range rounds { - report, err := checker.CheckInNextTimeWindow(ctx, roundData) + report, err := checker.CheckInNextTimeWindow(roundData) require.NoError(t, err, "round %d", i) lastReport = report } require.True(t, lastReport.NeedFlush()) c1Report := lastReport.ClusterReports["c1"] - require.Len(t, c1Report.DataLossItems, 1) - require.Equal(t, "c2", c1Report.DataLossItems[0].DownstreamClusterID) - require.Equal(t, uint64(250), c1Report.DataLossItems[0].CommitTS) - require.True(t, c1Report.DataLossItems[0].Inconsistent) // data inconsistent, not pure data loss + require.Contains(t, c1Report.TableFailureItems, defaultSchemaKey) + tableItems := 
c1Report.TableFailureItems[defaultSchemaKey] + require.Len(t, tableItems.DataLossItems, 1) + require.Equal(t, "c2", tableItems.DataLossItems[0].DownstreamClusterID) + require.Equal(t, uint64(250), tableItems.DataLossItems[0].CommitTS) + require.True(t, tableItems.DataLossItems[0].Inconsistent) // data inconsistent, not pure data loss }) t.Run("data redundant detected", func(t *testing.T) { @@ -516,7 +531,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} var lastReport *recorder.Report for i, roundData := range rounds { - report, err := checker.CheckInNextTimeWindow(ctx, roundData) + report, err := checker.CheckInNextTimeWindow(roundData) require.NoError(t, err, "round %d", i) lastReport = report } @@ -524,12 +539,14 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { require.True(t, lastReport.NeedFlush()) // c1 should have no data loss c1Report := lastReport.ClusterReports["c1"] - require.Empty(t, c1Report.DataLossItems) + require.Empty(t, c1Report.TableFailureItems) // c2 should detect data redundant: pk=99 has no matching upstream in c1 c2Report := lastReport.ClusterReports["c2"] - require.Len(t, c2Report.DataRedundantItems, 1) - require.Equal(t, uint64(330), c2Report.DataRedundantItems[0].OriginTS) - require.Equal(t, uint64(340), c2Report.DataRedundantItems[0].CommitTS) + require.Contains(t, c2Report.TableFailureItems, defaultSchemaKey) + tableItems := c2Report.TableFailureItems[defaultSchemaKey] + require.Len(t, tableItems.DataRedundantItems, 1) + require.Equal(t, uint64(330), tableItems.DataRedundantItems[0].OriginTS) + require.Equal(t, uint64(340), tableItems.DataRedundantItems[0].CommitTS) }) t.Run("lww violation detected", func(t *testing.T) { @@ -563,21 +580,25 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { rounds := [4]map[string]types.TimeWindowData{base[0], base[1], round2, round3} var lastReport *recorder.Report for i, roundData := range rounds { - 
report, err := checker.CheckInNextTimeWindow(ctx, roundData) + report, err := checker.CheckInNextTimeWindow(roundData) require.NoError(t, err, "round %d", i) lastReport = report } require.True(t, lastReport.NeedFlush()) c1Report := lastReport.ClusterReports["c1"] - require.Len(t, c1Report.LWWViolationItems, 1) - require.Equal(t, uint64(0), c1Report.LWWViolationItems[0].ExistingOriginTS) - require.Equal(t, uint64(350), c1Report.LWWViolationItems[0].ExistingCommitTS) - require.Equal(t, uint64(310), c1Report.LWWViolationItems[0].OriginTS) - require.Equal(t, uint64(370), c1Report.LWWViolationItems[0].CommitTS) + require.Contains(t, c1Report.TableFailureItems, defaultSchemaKey) + c1TableItems := c1Report.TableFailureItems[defaultSchemaKey] + require.Len(t, c1TableItems.LWWViolationItems, 1) + require.Equal(t, uint64(0), c1TableItems.LWWViolationItems[0].ExistingOriginTS) + require.Equal(t, uint64(350), c1TableItems.LWWViolationItems[0].ExistingCommitTS) + require.Equal(t, uint64(310), c1TableItems.LWWViolationItems[0].OriginTS) + require.Equal(t, uint64(370), c1TableItems.LWWViolationItems[0].CommitTS) // c2 should have no LWW violation (its records are ordered correctly: // upstream commitTs=310 compareTs=310, downstream commitTs=360 compareTs=350, 310 < 350) c2Report := lastReport.ClusterReports["c2"] - require.Empty(t, c2Report.LWWViolationItems) + if c2TableItems, ok := c2Report.TableFailureItems[defaultSchemaKey]; ok { + require.Empty(t, c2TableItems.LWWViolationItems) + } }) } diff --git a/cmd/multi-cluster-consistency-checker/config/config.example.toml b/cmd/multi-cluster-consistency-checker/config/config.example.toml index 8073475ae2..794aaae2f8 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.example.toml +++ b/cmd/multi-cluster-consistency-checker/config/config.example.toml @@ -18,7 +18,7 @@ data-dir = "/tmp/multi-cluster-consistency-checker-data" [clusters] # First cluster configuration [clusters.cluster1] - pd-addr = "127.0.0.1:2379" + 
pd-addrs = ["127.0.0.1:2379"] s3-sink-uri = "s3://bucket-name/cluster1/" s3-changefeed-id = "s3-changefeed-id-1" # security-config = { ca-path = "ca.crt", cert-path = "cert.crt", key-path = "key.crt" } @@ -27,7 +27,7 @@ data-dir = "/tmp/multi-cluster-consistency-checker-data" # Second cluster configuration [clusters.cluster2] - pd-addr = "127.0.0.1:2479" + pd-addrs = ["127.0.0.1:2479"] s3-sink-uri = "s3://bucket-name/cluster2/" s3-changefeed-id = "s3-changefeed-id-2" # security-config = { ca-path = "ca.crt", cert-path = "cert.crt", key-path = "key.crt" } @@ -36,7 +36,7 @@ data-dir = "/tmp/multi-cluster-consistency-checker-data" # Third cluster configuration (optional) # [clusters.cluster3] - # pd-addr = "127.0.0.1:2579" + # pd-addrs = ["127.0.0.1:2579"] # cdc-addr = "127.0.0.1:8500" # s3-sink-uri = "s3://bucket-name/cluster3/" # s3-changefeed-id = "s3-changefeed-id-3" diff --git a/cmd/multi-cluster-consistency-checker/config/config.go b/cmd/multi-cluster-consistency-checker/config/config.go index 05f6da5ed2..b76d223201 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.go +++ b/cmd/multi-cluster-consistency-checker/config/config.go @@ -44,8 +44,8 @@ type DownstreamClusterChangefeedConfig struct { // ClusterConfig represents configuration for a single cluster type ClusterConfig struct { - // PDAddr is the address of the PD (Placement Driver) server - PDAddr string `toml:"pd-addr" json:"pd-addr"` + // PDAddrs is the addresses of the PD (Placement Driver) servers + PDAddrs []string `toml:"pd-addrs" json:"pd-addrs"` // S3SinkURI is the S3 sink URI for this cluster S3SinkURI string `toml:"s3-sink-uri" json:"s3-sink-uri"` @@ -84,8 +84,8 @@ func LoadConfig(path string) (*Config, error) { // Validate cluster configurations for name, cluster := range cfg.Clusters { - if cluster.PDAddr == "" { - return nil, fmt.Errorf("cluster '%s': pd-addr is required", name) + if len(cluster.PDAddrs) == 0 { + return nil, fmt.Errorf("cluster '%s': pd-addrs is required", name) 
} if cluster.S3SinkURI == "" { return nil, fmt.Errorf("cluster '%s': s3-sink-uri is required", name) diff --git a/cmd/multi-cluster-consistency-checker/config/config_test.go b/cmd/multi-cluster-consistency-checker/config/config_test.go index 3b048d4a65..99dd3222e4 100644 --- a/cmd/multi-cluster-consistency-checker/config/config_test.go +++ b/cmd/multi-cluster-consistency-checker/config/config_test.go @@ -38,14 +38,14 @@ data-dir = "/tmp/data" [clusters] [clusters.cluster1] - pd-addr = "127.0.0.1:2379" + pd-addrs = ["127.0.0.1:2379"] s3-sink-uri = "s3://bucket/cluster1/" s3-changefeed-id = "s3-cf-1" [clusters.cluster1.downstream-cluster-changefeed-config] cluster2 = { changefeed-id = "cf-1-to-2" } [clusters.cluster2] - pd-addr = "127.0.0.1:2479" + pd-addrs = ["127.0.0.1:2479"] s3-sink-uri = "s3://bucket/cluster2/" s3-changefeed-id = "s3-cf-2" [clusters.cluster2.downstream-cluster-changefeed-config] @@ -62,7 +62,7 @@ data-dir = "/tmp/data" require.Len(t, cfg.Clusters, 2) require.Contains(t, cfg.Clusters, "cluster1") require.Contains(t, cfg.Clusters, "cluster2") - require.Equal(t, "127.0.0.1:2379", cfg.Clusters["cluster1"].PDAddr) + require.Equal(t, []string{"127.0.0.1:2379"}, cfg.Clusters["cluster1"].PDAddrs) require.Equal(t, "s3://bucket/cluster1/", cfg.Clusters["cluster1"].S3SinkURI) require.Equal(t, "s3-cf-1", cfg.Clusters["cluster1"].S3ChangefeedID) require.Len(t, cfg.Clusters["cluster1"].DownstreamClusterChangefeedConfig, 1) @@ -109,7 +109,7 @@ report-dir = "/tmp/reports" require.Contains(t, err.Error(), "at least one cluster must be configured") }) - t.Run("missing pd-addr", func(t *testing.T) { + t.Run("missing pd-addrs", func(t *testing.T) { t.Parallel() tmpDir := t.TempDir() configPath := filepath.Join(tmpDir, "config.toml") @@ -129,7 +129,7 @@ report-dir = "/tmp/reports" cfg, err := LoadConfig(configPath) require.Error(t, err) require.Nil(t, cfg) - require.Contains(t, err.Error(), "pd-addr is required") + require.Contains(t, err.Error(), "pd-addrs is 
required") }) t.Run("missing s3-sink-uri", func(t *testing.T) { @@ -143,7 +143,7 @@ report-dir = "/tmp/reports" [clusters] [clusters.cluster1] - pd-addr = "127.0.0.1:2379" + pd-addrs = ["127.0.0.1:2379"] s3-changefeed-id = "s3-cf-1" ` err := os.WriteFile(configPath, []byte(configContent), 0644) @@ -166,7 +166,7 @@ report-dir = "/tmp/reports" [clusters] [clusters.cluster1] - pd-addr = "127.0.0.1:2379" + pd-addrs = ["127.0.0.1:2379"] s3-sink-uri = "s3://bucket/cluster1/" ` err := os.WriteFile(configPath, []byte(configContent), 0644) @@ -189,14 +189,14 @@ report-dir = "/tmp/reports" [clusters] [clusters.cluster1] - pd-addr = "127.0.0.1:2379" + pd-addrs = ["127.0.0.1:2379"] s3-sink-uri = "s3://bucket/cluster1/" s3-changefeed-id = "s3-cf-1" [clusters.cluster1.downstream-cluster-changefeed-config] cluster2 = { changefeed-id = "cf-1-to-2" } [clusters.cluster2] - pd-addr = "127.0.0.1:2479" + pd-addrs = ["127.0.0.1:2479"] s3-sink-uri = "s3://bucket/cluster2/" s3-changefeed-id = "s3-cf-2" ` @@ -220,14 +220,14 @@ report-dir = "/tmp/reports" [clusters] [clusters.cluster1] - pd-addr = "127.0.0.1:2379" + pd-addrs = ["127.0.0.1:2379"] s3-sink-uri = "s3://bucket/cluster1/" s3-changefeed-id = "s3-cf-1" [clusters.cluster1.downstream-cluster-changefeed-config] cluster2 = {} [clusters.cluster2] - pd-addr = "127.0.0.1:2479" + pd-addrs = ["127.0.0.1:2479"] s3-sink-uri = "s3://bucket/cluster2/" s3-changefeed-id = "s3-cf-2" [clusters.cluster2.downstream-cluster-changefeed-config] diff --git a/cmd/multi-cluster-consistency-checker/decoder/value_to_datum_test.go b/cmd/multi-cluster-consistency-checker/decoder/value_to_datum_test.go new file mode 100644 index 0000000000..95bc8d43e9 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/decoder/value_to_datum_test.go @@ -0,0 +1,898 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package decoder + +import ( + "math" + "testing" + + "github.com/pingcap/tidb/pkg/parser/mysql" + ptypes "github.com/pingcap/tidb/pkg/parser/types" + tiTypes "github.com/pingcap/tidb/pkg/types" + "github.com/stretchr/testify/require" +) + +func TestValueToDatum_NilValue(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeLong) + d := valueToDatum(nil, ft) + require.True(t, d.IsNull()) +} + +func TestValueToDatum_NonStringPanics(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeLong) + require.Panics(t, func() { + valueToDatum(123, ft) + }) +} + +func TestValueToDatum_SignedIntegers(t *testing.T) { + t.Parallel() + + intTypes := []struct { + name string + tp byte + }{ + {"TypeTiny", mysql.TypeTiny}, + {"TypeShort", mysql.TypeShort}, + {"TypeInt24", mysql.TypeInt24}, + {"TypeLong", mysql.TypeLong}, + {"TypeLonglong", mysql.TypeLonglong}, + } + + for _, it := range intTypes { + t.Run(it.name, func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(it.tp) + + t.Run("positive", func(t *testing.T) { + t.Parallel() + d := valueToDatum("42", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(42), d.GetInt64()) + }) + + t.Run("zero", func(t *testing.T) { + t.Parallel() + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(0), d.GetInt64()) + }) + + t.Run("negative", func(t *testing.T) { + t.Parallel() + d := valueToDatum("-100", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(-100), d.GetInt64()) + }) + + t.Run("max int64", func(t *testing.T) { + t.Parallel() + d := 
valueToDatum("9223372036854775807", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(math.MaxInt64), d.GetInt64()) + }) + + t.Run("min int64", func(t *testing.T) { + t.Parallel() + d := valueToDatum("-9223372036854775808", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(math.MinInt64), d.GetInt64()) + }) + }) + } +} + +func TestValueToDatum_UnsignedIntegers(t *testing.T) { + t.Parallel() + + intTypes := []struct { + name string + tp byte + }{ + {"TypeTiny", mysql.TypeTiny}, + {"TypeShort", mysql.TypeShort}, + {"TypeInt24", mysql.TypeInt24}, + {"TypeLong", mysql.TypeLong}, + {"TypeLonglong", mysql.TypeLonglong}, + } + + for _, it := range intTypes { + t.Run(it.name, func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(it.tp) + ft.AddFlag(mysql.UnsignedFlag) + + t.Run("positive", func(t *testing.T) { + t.Parallel() + d := valueToDatum("42", ft) + require.Equal(t, tiTypes.KindUint64, d.Kind()) + require.Equal(t, uint64(42), d.GetUint64()) + }) + + t.Run("zero", func(t *testing.T) { + t.Parallel() + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindUint64, d.Kind()) + require.Equal(t, uint64(0), d.GetUint64()) + }) + + t.Run("max uint64", func(t *testing.T) { + t.Parallel() + d := valueToDatum("18446744073709551615", ft) + require.Equal(t, tiTypes.KindUint64, d.Kind()) + require.Equal(t, uint64(math.MaxUint64), d.GetUint64()) + }) + }) + } +} + +func TestValueToDatum_InvalidIntegerPanics(t *testing.T) { + t.Parallel() + + t.Run("signed invalid", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeLong) + require.Panics(t, func() { + valueToDatum("not_a_number", ft) + }) + }) + + t.Run("unsigned invalid", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeLong) + ft.AddFlag(mysql.UnsignedFlag) + require.Panics(t, func() { + valueToDatum("not_a_number", ft) + }) + }) +} + +func TestValueToDatum_Year(t *testing.T) { + t.Parallel() + + ft := 
ptypes.NewFieldType(mysql.TypeYear) + + t.Run("normal year", func(t *testing.T) { + t.Parallel() + d := valueToDatum("2026", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(2026), d.GetInt64()) + }) + + t.Run("zero year", func(t *testing.T) { + t.Parallel() + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(0), d.GetInt64()) + }) + + t.Run("invalid year panics", func(t *testing.T) { + t.Parallel() + require.Panics(t, func() { + valueToDatum("abc", ft) + }) + }) +} + +func TestValueToDatum_Float(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeFloat) + + t.Run("positive float", func(t *testing.T) { + t.Parallel() + d := valueToDatum("3.14", ft) + require.Equal(t, tiTypes.KindFloat32, d.Kind()) + require.InDelta(t, float32(3.14), d.GetFloat32(), 0.001) + }) + + t.Run("negative float", func(t *testing.T) { + t.Parallel() + d := valueToDatum("-2.5", ft) + require.Equal(t, tiTypes.KindFloat32, d.Kind()) + require.InDelta(t, float32(-2.5), d.GetFloat32(), 0.001) + }) + + t.Run("zero float", func(t *testing.T) { + t.Parallel() + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindFloat32, d.Kind()) + require.Equal(t, float32(0), d.GetFloat32()) + }) + + t.Run("invalid float panics", func(t *testing.T) { + t.Parallel() + require.Panics(t, func() { + valueToDatum("not_a_float", ft) + }) + }) +} + +func TestValueToDatum_Double(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeDouble) + + t.Run("positive double", func(t *testing.T) { + t.Parallel() + d := valueToDatum("3.141592653589793", ft) + require.Equal(t, tiTypes.KindFloat64, d.Kind()) + require.InDelta(t, 3.141592653589793, d.GetFloat64(), 1e-15) + }) + + t.Run("negative double", func(t *testing.T) { + t.Parallel() + d := valueToDatum("-1.23456789", ft) + require.Equal(t, tiTypes.KindFloat64, d.Kind()) + require.InDelta(t, -1.23456789, d.GetFloat64(), 1e-9) + }) + + t.Run("zero double", 
func(t *testing.T) { + t.Parallel() + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindFloat64, d.Kind()) + require.Equal(t, float64(0), d.GetFloat64()) + }) + + t.Run("very large double", func(t *testing.T) { + t.Parallel() + d := valueToDatum("1.7976931348623157e+308", ft) + require.Equal(t, tiTypes.KindFloat64, d.Kind()) + require.InDelta(t, math.MaxFloat64, d.GetFloat64(), 1e+293) + }) + + t.Run("invalid double panics", func(t *testing.T) { + t.Parallel() + require.Panics(t, func() { + valueToDatum("not_a_double", ft) + }) + }) +} + +func TestValueToDatum_StringTypes(t *testing.T) { + t.Parallel() + + stringTypes := []struct { + name string + tp byte + }{ + {"TypeVarString", mysql.TypeVarString}, + {"TypeVarchar", mysql.TypeVarchar}, + {"TypeString", mysql.TypeString}, + {"TypeBlob", mysql.TypeBlob}, + {"TypeTinyBlob", mysql.TypeTinyBlob}, + {"TypeMediumBlob", mysql.TypeMediumBlob}, + {"TypeLongBlob", mysql.TypeLongBlob}, + } + + for _, st := range stringTypes { + t.Run(st.name, func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(st.tp) + ft.SetCollate("utf8mb4_bin") + + t.Run("normal string", func(t *testing.T) { + t.Parallel() + d := valueToDatum("hello world", ft) + require.Equal(t, tiTypes.KindString, d.Kind()) + require.Equal(t, "hello world", d.GetString()) + }) + + t.Run("empty string", func(t *testing.T) { + t.Parallel() + d := valueToDatum("", ft) + require.Equal(t, tiTypes.KindString, d.Kind()) + require.Equal(t, "", d.GetString()) + }) + + t.Run("unicode string", func(t *testing.T) { + t.Parallel() + d := valueToDatum("你好世界🌍", ft) + require.Equal(t, tiTypes.KindString, d.Kind()) + require.Equal(t, "你好世界🌍", d.GetString()) + }) + }) + } +} + +func TestValueToDatum_BinaryFlag(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeString) + ft.AddFlag(mysql.BinaryFlag) + ft.SetCharset("binary") + ft.SetCollate("binary") + + t.Run("ascii content", func(t *testing.T) { + t.Parallel() + d := valueToDatum("abc", ft) + 
require.Equal(t, tiTypes.KindString, d.Kind()) + require.Equal(t, "abc", d.GetString()) + }) + + t.Run("empty binary", func(t *testing.T) { + t.Parallel() + d := valueToDatum("", ft) + require.Equal(t, tiTypes.KindString, d.Kind()) + require.Equal(t, "", d.GetString()) + }) +} + +func TestValueToDatum_Decimal(t *testing.T) { + t.Parallel() + + t.Run("simple decimal", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeNewDecimal) + ft.SetFlen(10) + ft.SetDecimal(2) + + d := valueToDatum("123.45", ft) + require.Equal(t, tiTypes.KindMysqlDecimal, d.Kind()) + require.Equal(t, "123.45", d.GetMysqlDecimal().String()) + require.Equal(t, 10, d.Length()) + require.Equal(t, 2, d.Frac()) + }) + + t.Run("negative decimal", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeNewDecimal) + ft.SetFlen(10) + ft.SetDecimal(3) + + d := valueToDatum("-99.999", ft) + require.Equal(t, tiTypes.KindMysqlDecimal, d.Kind()) + require.Equal(t, "-99.999", d.GetMysqlDecimal().String()) + require.Equal(t, 3, d.Frac()) + }) + + t.Run("zero decimal", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeNewDecimal) + ft.SetFlen(10) + ft.SetDecimal(0) + + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindMysqlDecimal, d.Kind()) + require.Equal(t, "0", d.GetMysqlDecimal().String()) + }) + + t.Run("large decimal", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeNewDecimal) + ft.SetFlen(65) + ft.SetDecimal(30) + + d := valueToDatum("12345678901234567890.123456789012345678", ft) + require.Equal(t, tiTypes.KindMysqlDecimal, d.Kind()) + require.Equal(t, 65, d.Length()) + require.Equal(t, 30, d.Frac()) + }) + + t.Run("unspecified decimal uses actual frac", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeNewDecimal) + ft.SetFlen(10) + ft.SetDecimal(tiTypes.UnspecifiedLength) + + d := valueToDatum("12.345", ft) + require.Equal(t, tiTypes.KindMysqlDecimal, d.Kind()) + 
require.Equal(t, 3, d.Frac()) // actual digits frac from the value + }) + + t.Run("invalid decimal panics", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeNewDecimal) + ft.SetFlen(10) + ft.SetDecimal(2) + require.Panics(t, func() { + valueToDatum("not_decimal", ft) + }) + }) +} + +func TestValueToDatum_Date(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeDate) + ft.SetDecimal(0) + + t.Run("normal date", func(t *testing.T) { + t.Parallel() + d := valueToDatum("2026-02-11", ft) + require.Equal(t, tiTypes.KindMysqlTime, d.Kind()) + require.Equal(t, "2026-02-11", d.GetMysqlTime().String()) + }) + + t.Run("zero date", func(t *testing.T) { + t.Parallel() + d := valueToDatum("0000-00-00", ft) + require.Equal(t, tiTypes.KindMysqlTime, d.Kind()) + require.Equal(t, "0000-00-00", d.GetMysqlTime().String()) + }) +} + +func TestValueToDatum_Datetime(t *testing.T) { + t.Parallel() + + t.Run("datetime without fractional seconds", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeDatetime) + ft.SetDecimal(0) + + d := valueToDatum("2026-02-11 10:30:00", ft) + require.Equal(t, tiTypes.KindMysqlTime, d.Kind()) + require.Equal(t, "2026-02-11 10:30:00", d.GetMysqlTime().String()) + }) + + t.Run("datetime with fractional seconds", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeDatetime) + ft.SetDecimal(6) + + d := valueToDatum("2026-02-11 10:30:00.123456", ft) + require.Equal(t, tiTypes.KindMysqlTime, d.Kind()) + require.Equal(t, "2026-02-11 10:30:00.123456", d.GetMysqlTime().String()) + }) +} + +func TestValueToDatum_Timestamp(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeTimestamp) + ft.SetDecimal(0) + + d := valueToDatum("2026-02-11 10:30:00", ft) + require.Equal(t, tiTypes.KindMysqlTime, d.Kind()) + require.Equal(t, "2026-02-11 10:30:00", d.GetMysqlTime().String()) +} + +func TestValueToDatum_Duration(t *testing.T) { + t.Parallel() + + t.Run("positive 
duration", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeDuration) + ft.SetDecimal(0) + + d := valueToDatum("12:30:45", ft) + require.Equal(t, tiTypes.KindMysqlDuration, d.Kind()) + require.Equal(t, "12:30:45", d.GetMysqlDuration().String()) + }) + + t.Run("negative duration", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeDuration) + ft.SetDecimal(0) + + d := valueToDatum("-01:00:00", ft) + require.Equal(t, tiTypes.KindMysqlDuration, d.Kind()) + require.Equal(t, "-01:00:00", d.GetMysqlDuration().String()) + }) + + t.Run("duration with fractional seconds", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeDuration) + ft.SetDecimal(3) + + d := valueToDatum("10:20:30.123", ft) + require.Equal(t, tiTypes.KindMysqlDuration, d.Kind()) + require.Equal(t, "10:20:30.123", d.GetMysqlDuration().String()) + }) + + t.Run("zero duration", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeDuration) + ft.SetDecimal(0) + + d := valueToDatum("00:00:00", ft) + require.Equal(t, tiTypes.KindMysqlDuration, d.Kind()) + require.Equal(t, "00:00:00", d.GetMysqlDuration().String()) + }) +} + +func TestValueToDatum_Enum(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeEnum) + ft.SetCharset("utf8mb4") + ft.SetCollate("utf8mb4_bin") + ft.SetElems([]string{"a", "b", "c"}) + + t.Run("valid enum value", func(t *testing.T) { + t.Parallel() + d := valueToDatum("1", ft) + require.Equal(t, tiTypes.KindMysqlEnum, d.Kind()) + require.Equal(t, uint64(1), d.GetMysqlEnum().Value) + }) + + t.Run("enum value 2", func(t *testing.T) { + t.Parallel() + d := valueToDatum("2", ft) + require.Equal(t, tiTypes.KindMysqlEnum, d.Kind()) + require.Equal(t, uint64(2), d.GetMysqlEnum().Value) + }) + + t.Run("enum value 0", func(t *testing.T) { + t.Parallel() + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindMysqlEnum, d.Kind()) + require.Equal(t, uint64(0), d.GetMysqlEnum().Value) + }) + 
+ t.Run("invalid enum panics", func(t *testing.T) { + t.Parallel() + require.Panics(t, func() { + valueToDatum("abc", ft) + }) + }) +} + +func TestValueToDatum_Set(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeSet) + ft.SetCharset("utf8mb4") + ft.SetCollate("utf8mb4_bin") + ft.SetElems([]string{"a", "b", "c"}) + + t.Run("single set value", func(t *testing.T) { + t.Parallel() + d := valueToDatum("1", ft) + require.Equal(t, tiTypes.KindMysqlSet, d.Kind()) + require.Equal(t, uint64(1), d.GetMysqlSet().Value) + }) + + t.Run("combined set value", func(t *testing.T) { + t.Parallel() + d := valueToDatum("3", ft) // a,b + require.Equal(t, tiTypes.KindMysqlSet, d.Kind()) + require.Equal(t, uint64(3), d.GetMysqlSet().Value) + }) + + t.Run("zero set value", func(t *testing.T) { + t.Parallel() + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindMysqlSet, d.Kind()) + require.Equal(t, uint64(0), d.GetMysqlSet().Value) + }) + + t.Run("invalid set panics", func(t *testing.T) { + t.Parallel() + require.Panics(t, func() { + valueToDatum("xyz", ft) + }) + }) +} + +func TestValueToDatum_Bit(t *testing.T) { + t.Parallel() + + t.Run("bit(1) value 1", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeBit) + ft.SetFlen(1) + + d := valueToDatum("1", ft) + require.Equal(t, tiTypes.KindMysqlBit, d.Kind()) + }) + + t.Run("bit(8) value 255", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeBit) + ft.SetFlen(8) + + d := valueToDatum("255", ft) + require.Equal(t, tiTypes.KindMysqlBit, d.Kind()) + }) + + t.Run("bit(64) large value", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeBit) + ft.SetFlen(64) + + d := valueToDatum("18446744073709551615", ft) + require.Equal(t, tiTypes.KindMysqlBit, d.Kind()) + }) + + t.Run("bit zero", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeBit) + ft.SetFlen(8) + + d := valueToDatum("0", ft) + require.Equal(t, tiTypes.KindMysqlBit, 
d.Kind()) + }) + + t.Run("invalid bit panics", func(t *testing.T) { + t.Parallel() + ft := ptypes.NewFieldType(mysql.TypeBit) + ft.SetFlen(8) + require.Panics(t, func() { + valueToDatum("not_a_bit", ft) + }) + }) +} + +func TestValueToDatum_JSON(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeJSON) + + t.Run("json object", func(t *testing.T) { + t.Parallel() + d := valueToDatum(`{"key": "value"}`, ft) + require.Equal(t, tiTypes.KindMysqlJSON, d.Kind()) + require.Contains(t, d.GetMysqlJSON().String(), "key") + require.Contains(t, d.GetMysqlJSON().String(), "value") + }) + + t.Run("json array", func(t *testing.T) { + t.Parallel() + d := valueToDatum(`[1, 2, 3]`, ft) + require.Equal(t, tiTypes.KindMysqlJSON, d.Kind()) + }) + + t.Run("json string", func(t *testing.T) { + t.Parallel() + d := valueToDatum(`"hello"`, ft) + require.Equal(t, tiTypes.KindMysqlJSON, d.Kind()) + }) + + t.Run("json number", func(t *testing.T) { + t.Parallel() + d := valueToDatum(`42`, ft) + require.Equal(t, tiTypes.KindMysqlJSON, d.Kind()) + }) + + t.Run("json null", func(t *testing.T) { + t.Parallel() + d := valueToDatum(`null`, ft) + require.Equal(t, tiTypes.KindMysqlJSON, d.Kind()) + }) + + t.Run("json boolean", func(t *testing.T) { + t.Parallel() + d := valueToDatum(`true`, ft) + require.Equal(t, tiTypes.KindMysqlJSON, d.Kind()) + }) + + t.Run("nested json", func(t *testing.T) { + t.Parallel() + d := valueToDatum(`{"a": [1, {"b": "c"}], "d": null}`, ft) + require.Equal(t, tiTypes.KindMysqlJSON, d.Kind()) + }) + + t.Run("invalid json panics", func(t *testing.T) { + t.Parallel() + require.Panics(t, func() { + valueToDatum(`{invalid`, ft) + }) + }) +} + +func TestValueToDatum_VectorFloat32(t *testing.T) { + t.Parallel() + + ft := ptypes.NewFieldType(mysql.TypeTiDBVectorFloat32) + + t.Run("simple vector", func(t *testing.T) { + t.Parallel() + d := valueToDatum("[1,2,3]", ft) + require.False(t, d.IsNull()) + }) + + t.Run("single element vector", func(t *testing.T) { + 
t.Parallel() + d := valueToDatum("[0.5]", ft) + require.False(t, d.IsNull()) + }) + + t.Run("invalid vector panics", func(t *testing.T) { + t.Parallel() + require.Panics(t, func() { + valueToDatum("not_a_vector", ft) + }) + }) +} + +func TestValueToDatum_UnknownType(t *testing.T) { + t.Parallel() + // Use a type that doesn't match any case in the switch (TypeGeometry). + // The default datum returned is a zero-value datum, which is null. + ft := ptypes.NewFieldType(mysql.TypeGeometry) + d := valueToDatum("some_value", ft) + require.True(t, d.IsNull()) +} + +func TestValueToDatum_ViaNewPKColumnFieldType(t *testing.T) { + t.Parallel() + // Test valueToDatum using FieldTypes produced by newPKColumnFieldTypeFromMysqlType, + // which is the real caller in production code. + + t.Run("int", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("int") + d := valueToDatum("42", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(42), d.GetInt64()) + }) + + t.Run("int unsigned", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("int unsigned") + d := valueToDatum("42", ft) + require.Equal(t, tiTypes.KindUint64, d.Kind()) + require.Equal(t, uint64(42), d.GetUint64()) + }) + + t.Run("bigint", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("bigint") + d := valueToDatum("9223372036854775807", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(math.MaxInt64), d.GetInt64()) + }) + + t.Run("bigint unsigned", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("bigint unsigned") + d := valueToDatum("18446744073709551615", ft) + require.Equal(t, tiTypes.KindUint64, d.Kind()) + require.Equal(t, uint64(math.MaxUint64), d.GetUint64()) + }) + + t.Run("varchar", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("varchar") + d := valueToDatum("hello", ft) + require.Equal(t, tiTypes.KindString, d.Kind()) + 
require.Equal(t, "hello", d.GetString()) + }) + + t.Run("char", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("char") + d := valueToDatum("x", ft) + require.Equal(t, tiTypes.KindString, d.Kind()) + require.Equal(t, "x", d.GetString()) + }) + + t.Run("decimal(10,2)", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("decimal(10,2)") + d := valueToDatum("123.45", ft) + require.Equal(t, tiTypes.KindMysqlDecimal, d.Kind()) + require.Equal(t, "123.45", d.GetMysqlDecimal().String()) + require.Equal(t, 10, d.Length()) + require.Equal(t, 2, d.Frac()) + }) + + t.Run("float", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("float") + d := valueToDatum("3.14", ft) + require.Equal(t, tiTypes.KindFloat32, d.Kind()) + require.InDelta(t, float32(3.14), d.GetFloat32(), 0.001) + }) + + t.Run("double", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("double") + d := valueToDatum("3.141592653589793", ft) + require.Equal(t, tiTypes.KindFloat64, d.Kind()) + require.InDelta(t, 3.141592653589793, d.GetFloat64(), 1e-15) + }) + + t.Run("binary", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("binary") + require.True(t, mysql.HasBinaryFlag(ft.GetFlag())) + d := valueToDatum("abc", ft) + require.Equal(t, tiTypes.KindString, d.Kind()) + }) + + t.Run("varbinary", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("varbinary") + require.True(t, mysql.HasBinaryFlag(ft.GetFlag())) + d := valueToDatum("abc", ft) + require.Equal(t, tiTypes.KindString, d.Kind()) + }) + + t.Run("tinyint", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("tinyint") + d := valueToDatum("127", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(127), d.GetInt64()) + }) + + t.Run("smallint unsigned", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("smallint 
unsigned") + d := valueToDatum("65535", ft) + require.Equal(t, tiTypes.KindUint64, d.Kind()) + require.Equal(t, uint64(65535), d.GetUint64()) + }) + + t.Run("date", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("date") + d := valueToDatum("2026-02-11", ft) + require.Equal(t, tiTypes.KindMysqlTime, d.Kind()) + require.Equal(t, "2026-02-11", d.GetMysqlTime().String()) + }) + + t.Run("datetime", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("datetime") + d := valueToDatum("2026-02-11 10:30:00", ft) + require.Equal(t, tiTypes.KindMysqlTime, d.Kind()) + }) + + t.Run("timestamp", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("timestamp") + d := valueToDatum("2026-02-11 10:30:00", ft) + require.Equal(t, tiTypes.KindMysqlTime, d.Kind()) + }) + + t.Run("time", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("time") + d := valueToDatum("12:30:45", ft) + require.Equal(t, tiTypes.KindMysqlDuration, d.Kind()) + }) + + t.Run("year", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("year") + d := valueToDatum("2026", ft) + require.Equal(t, tiTypes.KindInt64, d.Kind()) + require.Equal(t, int64(2026), d.GetInt64()) + }) + + t.Run("enum('a','b','c')", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("enum('a','b','c')") + d := valueToDatum("2", ft) + require.Equal(t, tiTypes.KindMysqlEnum, d.Kind()) + require.Equal(t, uint64(2), d.GetMysqlEnum().Value) + }) + + t.Run("set('x','y','z')", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("set('x','y','z')") + d := valueToDatum("5", ft) + require.Equal(t, tiTypes.KindMysqlSet, d.Kind()) + require.Equal(t, uint64(5), d.GetMysqlSet().Value) + }) + + t.Run("bit(8)", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("bit(8)") + d := valueToDatum("255", ft) + require.Equal(t, tiTypes.KindMysqlBit, 
d.Kind()) + }) + + t.Run("json", func(t *testing.T) { + t.Parallel() + ft := newPKColumnFieldTypeFromMysqlType("json") + d := valueToDatum(`{"key":"value"}`, ft) + require.Equal(t, tiTypes.KindMysqlJSON, d.Kind()) + }) +} diff --git a/cmd/multi-cluster-consistency-checker/integration/integration_test.go b/cmd/multi-cluster-consistency-checker/integration/integration_test.go new file mode 100644 index 0000000000..dc8eede3a5 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/integration/integration_test.go @@ -0,0 +1,735 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package integration + +import ( + "context" + "fmt" + "testing" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/advancer" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/checker" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + "github.com/stretchr/testify/require" +) + +// schemaKey is the schema key for data stored via S3 path "test/t1/1/...". +// It equals QuoteSchema("test", "t1") = "`test`.`t1`". +var schemaKey = (&cloudstorage.DmlPathKey{ + SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, +}).GetKey() + +// testEnv holds the initialized test environment. 
+type testEnv struct { + ctx context.Context + mc *MockMultiCluster + advancer *advancer.TimeWindowAdvancer + checker *checker.DataChecker +} + +// setupEnv creates a test environment with 2 clusters (c1, c2), both +// replicating to each other, with in-memory S3 storage and mock PD/watchers. +func setupEnv(t *testing.T) *testEnv { + t.Helper() + ctx := context.Background() + tables := map[string][]string{"test": {"t1"}} + + mc := NewMockMultiCluster( + []string{"c1", "c2"}, + tables, + 0, // pdBase: start physical time at 0ms + 100, // pdStep: 100ms per PD GetTS call + 100, // cpDelta: checkpoint = minCheckpointTs + 100 + 50, // s3Delta: s3 checkpoint = minCheckpointTs + 50 + ) + + require.NoError(t, mc.InitSchemaFiles(ctx)) + + twa, _, err := advancer.NewTimeWindowAdvancer( + ctx, mc.CPWatchers, mc.S3Watchers, mc.GetPDClients(), nil, + ) + require.NoError(t, err) + + clusterCfg := map[string]config.ClusterConfig{"c1": {}, "c2": {}} + dc := checker.NewDataChecker(ctx, clusterCfg, nil, nil) + + return &testEnv{ctx: ctx, mc: mc, advancer: twa, checker: dc} +} + +// roundResult holds the output of a single round. +type roundResult struct { + report *recorder.Report + twData map[string]types.TimeWindowData +} + +// executeRound writes data to clusters' S3 storage, advances the time window, +// and runs the checker for one round. +func (e *testEnv) executeRound(t *testing.T, c1Content, c2Content []byte) roundResult { + t.Helper() + if c1Content != nil { + require.NoError(t, e.mc.WriteDMLFile(e.ctx, "c1", c1Content)) + } + if c2Content != nil { + require.NoError(t, e.mc.WriteDMLFile(e.ctx, "c2", c2Content)) + } + + twData, err := e.advancer.AdvanceTimeWindow(e.ctx) + require.NoError(t, err) + + report, err := e.checker.CheckInNextTimeWindow(twData) + require.NoError(t, err) + + return roundResult{report: report, twData: twData} +} + +// maxRightBoundary returns the maximum RightBoundary across all clusters. 
+func maxRightBoundary(twData map[string]types.TimeWindowData) uint64 { + maxRB := uint64(0) + for _, tw := range twData { + if tw.TimeWindow.RightBoundary > maxRB { + maxRB = tw.TimeWindow.RightBoundary + } + } + return maxRB +} + +// The test architecture simulates a 2-cluster active-active setup: +// +// c1 (upstream) ──CDC──> c2 (downstream) +// c2 (upstream) ──CDC──> c1 (downstream) +// +// Each cluster writes upstream data (originTs=0) and receives downstream +// replicated data from the other cluster (originTs>0). +// +// The checker needs 3 warm-up rounds before it starts checking (checkableRound >= 3). +// Data written in round 0 is tracked by the S3 consumer but not downloaded +// (skipDownloadData=true for the first round). From round 1 onwards, only +// NEW files (with higher indices) are downloaded. +// +// Data commitTs is set to prevMaxRightBoundary+1 to ensure records fall +// within the current time window (leftBoundary, rightBoundary]. +// +// TestIntegration_AllConsistent verifies that no errors are reported +// when all upstream data has matching downstream records. 
+func TestIntegration_AllConsistent(t *testing.T) { + t.Parallel() + env := setupEnv(t) + defer env.mc.Close() + + prevMaxRB := uint64(0) + + for round := 0; round < 6; round++ { + cts := prevMaxRB + 1 + // c1: upstream write (originTs=0) + c1 := MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + // c2: downstream replicated from c1 (originTs = c1's commitTs) + c2 := MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: c1 TW=[%d, %d], c2 TW=[%d, %d], commitTs=%d", + round, + result.twData["c1"].TimeWindow.LeftBoundary, result.twData["c1"].TimeWindow.RightBoundary, + result.twData["c2"].TimeWindow.LeftBoundary, result.twData["c2"].TimeWindow.RightBoundary, + cts) + + if round >= 3 { + require.Len(t, result.report.ClusterReports, 2, "round %d", round) + require.False(t, result.report.NeedFlush(), + "round %d: all data should be consistent, no report needed", round) + for clusterID, cr := range result.report.ClusterReports { + require.Empty(t, cr.TableFailureItems, + "round %d, cluster %s: should have no failures", round, clusterID) + } + } + } +} + +// TestIntegration_AllConsistent_CrossRoundDownstream verifies that the checker +// treats data as consistent when an upstream record's commitTs exceeds the +// round's checkpointTs, and the matching downstream only appears in the next +// round. +// +// This occurs when upstream commits happen late in the time window, after +// the checkpoint has already been determined. For TW[2], records with +// commitTs > checkpointTs are deferred (skipped). In the next round they +// become TW[1], where the check condition is commitTs > checkpointTs (checked), +// and the downstream is searched in TW[1] + TW[2] — finding the match in +// the current round's TW[2]. 
+func TestIntegration_AllConsistent_CrossRoundDownstream(t *testing.T) { + t.Parallel() + env := setupEnv(t) + defer env.mc.Close() + + prevMaxRB := uint64(0) + + // Offset to place commitTs between checkpointTs and rightBoundary. + // With pdStep=100 and 2 clusters, each round's time window spans + // approximately ComposeTS(300, 0) = 78643200, and checkpointTs sits + // at roughly ComposeTS(200, 0) from leftBoundary. + // Using ComposeTS(250, 0) = 65536000 lands safely between them. + crossRoundOffset := uint64(250 << 18) // ComposeTS(250, 0) = 65536000 + + var lateUpstreamCommitTs uint64 + + for round := 0; round < 7; round++ { + cts := prevMaxRB + 1 + + var c1, c2 []byte + + switch round { + case 4: + // Round N: c1 upstream has two records: + // pk=round+1 normal commitTs (checked in this round's TW[2]) + // pk=100 large commitTs > checkpointTs + // (deferred in TW[2], checked via TW[1] next round) + // c2 downstream only matches pk=round+1. + lateUpstreamCommitTs = prevMaxRB + crossRoundOffset + c1 = MakeContent( + MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round)), + MakeCanalJSON(100, lateUpstreamCommitTs, 0, "late"), + ) + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + + case 5: + // Round N+1: c2 now includes the downstream for pk=100. + // The checker evaluates TW[1] (= round 4), finds pk=100 with + // commitTs > checkpointTs, and searches c2's TW[1] + TW[2]. + // pk=100's matching downstream is in c2's TW[2] (this round). 
+ c1 = MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + c2 = MakeContent( + MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round)), + MakeCanalJSON(100, cts+2, lateUpstreamCommitTs, "late"), + ) + + default: + c1 = MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + } + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: c1 TW=[%d, %d], cpTs=%v, commitTs=%d", + round, + result.twData["c1"].TimeWindow.LeftBoundary, + result.twData["c1"].TimeWindow.RightBoundary, + result.twData["c1"].TimeWindow.CheckpointTs, + cts) + + if round == 4 { + // Verify the late commitTs falls between checkpointTs and rightBoundary. + c1TW := result.twData["c1"].TimeWindow + cpTs := c1TW.CheckpointTs["c2"] + require.Greater(t, lateUpstreamCommitTs, cpTs, + "lateUpstreamCommitTs must be > checkpointTs for cross-round detection") + require.LessOrEqual(t, lateUpstreamCommitTs, c1TW.RightBoundary, + "lateUpstreamCommitTs must be <= rightBoundary to stay in this time window") + t.Logf("Round 4 verification: lateCommitTs=%d, checkpointTs=%d, rightBoundary=%d", + lateUpstreamCommitTs, cpTs, c1TW.RightBoundary) + } + + if round >= 3 { + require.Len(t, result.report.ClusterReports, 2, "round %d", round) + require.False(t, result.report.NeedFlush(), + "round %d: data should be consistent (cross-round matching should work)", round) + for clusterID, cr := range result.report.ClusterReports { + require.Empty(t, cr.TableFailureItems, + "round %d, cluster %s: should have no failures", round, clusterID) + } + } + } +} + +// TestIntegration_AllConsistent_LWWSkippedDownstream verifies that no errors +// are reported when a downstream record is "LWW-skipped" during data-loss +// detection, combined with cross-time-window matching. 
+// +// pk=100: single-cluster overwrite (c1 writes old+new, c2 only has newer downstream) +// +// Round N: c1 upstream pk=100 × 2 (commitTs=A, B; both > checkpointTs) +// c2 has NO downstream for pk=100 +// Round N+1: c2 downstream pk=100 (originTs=B, matches newer upstream only) +// → old upstream LWW-skipped (c2 downstream compareTs=B >= A) +// +// pk=200: bidirectional write (c1 and c2 both write the same pk) +// +// Round N: c1 upstream pk=200 (commitTs=A, deferred) +// Round N+1: c1 upstream pk=200 (commitTs=E, newer), c1 downstream pk=200 (originTs=D, from c2) +// c2 upstream pk=200 (commitTs=D, D < E), c2 downstream pk=200 (originTs=E, from c1) +// +// Key constraint: c1 upstream commitTs (E) > c2 upstream commitTs (D). +// This ensures that on c2, the downstream (compareTs=E) > upstream (compareTs=D), +// so the LWW violation checker sees monotonically increasing compareTs. +// +// c1 data loss for old pk=200 (commitTs=A): +// → c2 downstream has originTs=E, compareTs=E >= A → LWW-skipped ✓ +// c1 data loss for new pk=200 (commitTs=E): +// → c2 downstream has originTs=E → exact match ✓ +// c2 data loss for c2 upstream pk=200 (commitTs=D): +// → c1 downstream has originTs=D → exact match ✓ +func TestIntegration_AllConsistent_LWWSkippedDownstream(t *testing.T) { + t.Parallel() + env := setupEnv(t) + defer env.mc.Close() + + prevMaxRB := uint64(0) + + // Place both commitTs values between checkpointTs and rightBoundary. + // With pdStep=100 and 2 clusters: + // window width ≈ ComposeTS(300, 0), checkpointTs ≈ leftBoundary + ComposeTS(200, 0) + // Using ComposeTS(250, 0) puts us safely in the gap. + crossRoundOffset := uint64(250 << 18) // ComposeTS(250, 0) = 65536000 + + var oldCommitTs, newCommitTs uint64 + + for round := 0; round < 7; round++ { + cts := prevMaxRB + 1 + + var c1, c2 []byte + + switch round { + case 4: + // Round N: c1 upstream writes pk=100 twice + pk=200 once, all > checkpointTs. 
+ // c2 has NO downstream for pk=100 or pk=200; they arrive next round. + oldCommitTs = prevMaxRB + crossRoundOffset + newCommitTs = oldCommitTs + 5 + c1 = MakeContent( + MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round)), + MakeCanalJSON(100, oldCommitTs, 0, "old_write"), + MakeCanalJSON(100, newCommitTs, 0, "new_write"), + MakeCanalJSON(200, oldCommitTs, 0, "old_write"), + ) + c2 = MakeContent( + MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round)), + ) + + case 5: + // Round N+1: downstream data arrives for both pk=100 and pk=200. + // + // pk=200 bidirectional: c1 upstream at cts+5 (> c2 upstream at cts+2) + // ensures c2's LWW check sees increasing compareTs. + // c1: downstream(commitTs=cts+4, originTs=cts+2) then upstream(commitTs=cts+5) + // → compareTs order: cts+2 < cts+5 ✓ + // c2: upstream(commitTs=cts+2) then downstream(commitTs=cts+6, originTs=cts+5) + // → compareTs order: cts+2 < cts+5 ✓ + c1 = MakeContent( + MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round)), + MakeCanalJSON(200, cts+4, cts+2, "pk200_c2"), // c1 downstream pk=200 from c2 + MakeCanalJSON(200, cts+5, 0, "pk200_c1"), // c1 upstream pk=200 (newer) + ) + c2 = MakeContent( + MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round)), + MakeCanalJSON(100, cts+2, newCommitTs, "new_write"), + MakeCanalJSON(200, cts+2, 0, "pk200_c2"), // c2 upstream pk=200 + MakeCanalJSON(200, cts+6, cts+5, "pk200_c1"), // c2 downstream pk=200 from c1 + ) + + default: + c1 = MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + } + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: NeedFlush=%v, commitTs=%d", round, result.report.NeedFlush(), cts) + + if round == 4 { + // Verify both commitTs fall between checkpointTs and rightBoundary. 
+ c1TW := result.twData["c1"].TimeWindow + cpTs := c1TW.CheckpointTs["c2"] + require.Greater(t, oldCommitTs, cpTs, + "oldCommitTs must be > checkpointTs for cross-round deferral") + require.LessOrEqual(t, newCommitTs, c1TW.RightBoundary, + "newCommitTs must be <= rightBoundary to stay in this time window") + t.Logf("Round 4 verification: oldCommitTs=%d, newCommitTs=%d, checkpointTs=%d, rightBoundary=%d", + oldCommitTs, newCommitTs, cpTs, c1TW.RightBoundary) + } + + if round >= 3 { + require.Len(t, result.report.ClusterReports, 2, "round %d", round) + require.False(t, result.report.NeedFlush(), + "round %d: cross-round LWW-skipped downstream should not cause errors", round) + for clusterID, cr := range result.report.ClusterReports { + require.Empty(t, cr.TableFailureItems, + "round %d, cluster %s: should have no failures", round, clusterID) + } + } + } +} + +// TestIntegration_DataLoss verifies that the checker detects data loss +// when an upstream record has no matching downstream in the other cluster. 
+func TestIntegration_DataLoss(t *testing.T) { + t.Parallel() + env := setupEnv(t) + defer env.mc.Close() + + prevMaxRB := uint64(0) + dataLossDetected := false + + for round := 0; round < 6; round++ { + cts := prevMaxRB + 1 + + // c1 always produces upstream data + c1 := MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + + var c2 []byte + if round == 4 { + // Round 4: c2 has NO matching downstream → data loss expected + // (round 4's data is checked in the same round since checkableRound >= 3) + c2 = nil + } else { + // Normal: c2 has matching downstream + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + } + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: NeedFlush=%v, commitTs=%d", round, result.report.NeedFlush(), cts) + + if round >= 3 && result.report.NeedFlush() { + c1Report := result.report.ClusterReports["c1"] + if c1Report != nil { + if items, ok := c1Report.TableFailureItems[schemaKey]; ok { + if len(items.DataLossItems) > 0 { + t.Logf("Round %d: detected data loss: %+v", round, items.DataLossItems) + dataLossDetected = true + // Verify the data loss item + for _, item := range items.DataLossItems { + require.Equal(t, "c2", item.DownstreamClusterID) + require.False(t, item.Inconsistent, "should be pure data loss, not inconsistency") + } + } + } + } + } + } + + require.True(t, dataLossDetected, "data loss should have been detected") +} + +// TestIntegration_DataInconsistent verifies that the checker detects data +// inconsistency when a downstream record has different column values +// from the upstream record. 
+func TestIntegration_DataInconsistent(t *testing.T) { + t.Parallel() + env := setupEnv(t) + defer env.mc.Close() + + prevMaxRB := uint64(0) + inconsistentDetected := false + + for round := 0; round < 6; round++ { + cts := prevMaxRB + 1 + + c1 := MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + + var c2 []byte + if round == 4 { + // Round 4: c2 has downstream with WRONG column value + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, "WRONG_VALUE")) + } else { + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + } + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: NeedFlush=%v, commitTs=%d", round, result.report.NeedFlush(), cts) + + if round >= 3 && result.report.NeedFlush() { + c1Report := result.report.ClusterReports["c1"] + if c1Report != nil { + if items, ok := c1Report.TableFailureItems[schemaKey]; ok { + for _, item := range items.DataLossItems { + if item.Inconsistent { + t.Logf("Round %d: detected data inconsistency: %+v", round, item) + inconsistentDetected = true + require.Equal(t, "c2", item.DownstreamClusterID) + } + } + } + } + } + } + + require.True(t, inconsistentDetected, "data inconsistency should have been detected") +} + +// TestIntegration_DataRedundant verifies that the checker detects redundant +// downstream data that has no matching upstream record. +func TestIntegration_DataRedundant(t *testing.T) { + t.Parallel() + env := setupEnv(t) + defer env.mc.Close() + + prevMaxRB := uint64(0) + redundantDetected := false + + for round := 0; round < 6; round++ { + cts := prevMaxRB + 1 + + c1 := MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + c2 := MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + + if round == 4 { + // Round 4: c2 has an EXTRA downstream record (pk=999) with a fake + // originTs that doesn't match any upstream commitTs in c1. 
+ fakeOriginTs := cts - 5 // Doesn't match any c1 upstream commitTs + c2 = MakeContent( + MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round)), + MakeCanalJSON(999, cts+2, fakeOriginTs, "extra"), + ) + } + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: NeedFlush=%v, commitTs=%d", round, result.report.NeedFlush(), cts) + + if round >= 3 && result.report.NeedFlush() { + c2Report := result.report.ClusterReports["c2"] + if c2Report != nil { + if items, ok := c2Report.TableFailureItems[schemaKey]; ok { + if len(items.DataRedundantItems) > 0 { + t.Logf("Round %d: detected data redundant: %+v", round, items.DataRedundantItems) + redundantDetected = true + } + } + } + } + } + + require.True(t, redundantDetected, "data redundancy should have been detected") +} + +// TestIntegration_LWWViolation verifies that the checker detects Last Write Wins +// violations when records for the same primary key have non-monotonic origin timestamps. +func TestIntegration_LWWViolation(t *testing.T) { + t.Parallel() + env := setupEnv(t) + defer env.mc.Close() + + prevMaxRB := uint64(0) + lwwViolationDetected := false + + for round := 0; round < 6; round++ { + cts := prevMaxRB + 1 + + var c1, c2 []byte + + if round == 4 { + // Round 4: inject LWW violation in c1. + // Record A: pk=5, commitTs=cts, originTs=0 → compareTs = cts + // Record B: pk=5, commitTs=cts+2, originTs=cts-10 → compareTs = cts-10 + // Since A's compareTs (cts) >= B's compareTs (cts-10) and A's commitTs < B's commitTs, + // this is a Last Write Wins violation. 
+ c1 = MakeContent( + MakeCanalJSON(5, cts, 0, "original"), + MakeCanalJSON(5, cts+2, cts-10, "replicated"), + ) + // c2: provide matching downstream to avoid data loss noise + c2 = MakeContent( + MakeCanalJSON(5, cts+1, cts, "original"), + ) + } else { + c1 = MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + } + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: NeedFlush=%v, commitTs=%d", round, result.report.NeedFlush(), cts) + + if round >= 3 && result.report.NeedFlush() { + c1Report := result.report.ClusterReports["c1"] + if c1Report != nil { + if items, ok := c1Report.TableFailureItems[schemaKey]; ok { + if len(items.LWWViolationItems) > 0 { + t.Logf("Round %d: detected LWW violation: %+v", round, items.LWWViolationItems) + lwwViolationDetected = true + } + } + } + } + } + + require.True(t, lwwViolationDetected, "LWW violation should have been detected") +} + +// TestIntegration_LWWViolation_AcrossRounds verifies that the checker detects +// LWW violations when conflicting records for the same pk appear in rounds N +// and N+2, with no data for that pk in round N+1. +// +// The clusterViolationChecker keeps cache entries for up to 3 rounds +// (previous: 0 → 1 → 2). Since Check runs before UpdateCache, an entry +// created in round N (previous=0) is still available at previous=2 when +// round N+2 runs. +// +// Timeline: +// +// Round N: c1 upstream pk=50 (originTs=0, compareTs=A) → cached +// Round N+1: no pk=50 data → cache ages (prev 1→2) +// Round N+2: c1 downstream pk=50 (originTs=B= new.compareTs + // → LWW violation across 2-round gap. 
+ violatingOriginTs := firstRecordCommitTs - 10 + c1 = MakeContent( + MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round)), + MakeCanalJSON(50, cts+2, violatingOriginTs, "second"), + ) + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + + default: + c1 = MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + } + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: NeedFlush=%v, commitTs=%d", round, result.report.NeedFlush(), cts) + + if round >= 3 && result.report.NeedFlush() { + c1Report := result.report.ClusterReports["c1"] + if c1Report != nil { + if items, ok := c1Report.TableFailureItems[schemaKey]; ok { + if len(items.LWWViolationItems) > 0 { + t.Logf("Round %d: LWW violation across rounds: %+v", + round, items.LWWViolationItems) + lwwViolationDetected = true + // Verify the violation details + item := items.LWWViolationItems[0] + require.Equal(t, uint64(0), item.ExistingOriginTS, + "existing record should be upstream (originTs=0)") + require.Equal(t, firstRecordCommitTs, item.ExistingCommitTS, + "existing record should be from round N") + } + } + } + } + } + + require.True(t, lwwViolationDetected, + "LWW violation across round N and N+2 should have been detected") +} + +// TestIntegration_MultipleErrorTypes verifies that the checker can detect +// multiple error types simultaneously across different clusters and rounds. 
+func TestIntegration_MultipleErrorTypes(t *testing.T) { + t.Parallel() + env := setupEnv(t) + defer env.mc.Close() + + prevMaxRB := uint64(0) + dataLossDetected := false + redundantDetected := false + + for round := 0; round < 7; round++ { + cts := prevMaxRB + 1 + + var c1, c2 []byte + + switch round { + case 4: + // Data loss: c1 upstream pk=5, c2 has NO downstream + c1 = MakeContent(MakeCanalJSON(5, cts, 0, "lost")) + c2 = nil + case 5: + // Data redundant: c2 has extra downstream pk=888 + c1 = MakeContent(MakeCanalJSON(6, cts, 0, "normal")) + fakeOriginTs := cts - 3 + c2 = MakeContent( + MakeCanalJSON(6, cts+1, cts, "normal"), + MakeCanalJSON(888, cts+2, fakeOriginTs, "ghost"), + ) + default: + c1 = MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) + c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) + } + + result := env.executeRound(t, c1, c2) + prevMaxRB = maxRightBoundary(result.twData) + + t.Logf("Round %d: NeedFlush=%v, commitTs=%d, ClusterReports=%d", + round, result.report.NeedFlush(), cts, len(result.report.ClusterReports)) + + if round >= 3 && result.report.NeedFlush() { + // Check c1 for data loss + if c1Report := result.report.ClusterReports["c1"]; c1Report != nil { + if items, ok := c1Report.TableFailureItems[schemaKey]; ok { + if len(items.DataLossItems) > 0 { + dataLossDetected = true + t.Logf("Round %d: data loss detected in c1: %d items", + round, len(items.DataLossItems)) + } + } + } + // Check c2 for data redundant + if c2Report := result.report.ClusterReports["c2"]; c2Report != nil { + if items, ok := c2Report.TableFailureItems[schemaKey]; ok { + if len(items.DataRedundantItems) > 0 { + redundantDetected = true + t.Logf("Round %d: data redundant detected in c2: %d items", + round, len(items.DataRedundantItems)) + } + } + } + } + } + + require.True(t, dataLossDetected, "data loss should have been detected") + require.True(t, redundantDetected, "data redundancy should have been detected") +} 
diff --git a/cmd/multi-cluster-consistency-checker/integration/mock_cluster.go b/cmd/multi-cluster-consistency-checker/integration/mock_cluster.go new file mode 100644 index 0000000000..40a3e620e3 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/integration/mock_cluster.go @@ -0,0 +1,205 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package integration + +import ( + "context" + "fmt" + "strings" + "sync/atomic" + + "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" + "github.com/pingcap/tidb/br/pkg/storage" + pd "github.com/tikv/pd/client" +) + +// mockPDClient simulates a PD server's TSO service. +// Each GetTS call returns a monotonically increasing physical timestamp. +type mockPDClient struct { + pd.Client + seq atomic.Int64 + base int64 // base physical time in milliseconds + step int64 // increment per call in milliseconds +} + +func (m *mockPDClient) GetTS(_ context.Context) (int64, int64, error) { + n := m.seq.Add(1) + return m.base + n*m.step, 0, nil +} + +func (m *mockPDClient) Close() {} + +// ---------- Mock Checkpoint Watcher ---------- + +// mockWatcher simulates a checkpoint watcher. +// It always returns minCheckpointTs + delta, ensuring the result exceeds the minimum. 
+type mockWatcher struct { + delta uint64 +} + +func (m *mockWatcher) AdvanceCheckpointTs(_ context.Context, minCheckpointTs uint64) (uint64, error) { + return minCheckpointTs + m.delta, nil +} + +func (m *mockWatcher) Close() {} + +// MockMultiCluster manages the mock infrastructure for simulating multiple +// TiCDC clusters. It provides: +// - Mock PD clients for TSO generation +// - Mock checkpoint watchers for inter-cluster replication checkpoints +// - Mock S3 checkpoint watchers and S3 watchers for cloud storage +// - In-memory S3 storage for each cluster +// - Helpers to write canal-JSON formatted data files +type MockMultiCluster struct { + ClusterIDs []string + Tables map[string][]string // schema -> table names + + S3Storages map[string]storage.ExternalStorage + pdClients map[string]*mockPDClient + CPWatchers map[string]map[string]watcher.Watcher + S3Watchers map[string]*watcher.S3Watcher + + // fileCounters tracks the next DML file index per cluster. + // Files are written with monotonically increasing indices so the + // S3Consumer discovers only new files in each round. + fileCounters map[string]uint64 + + date string // fixed date used in all DML file paths +} + +// NewMockMultiCluster creates a new mock multi-cluster environment. +// +// Parameters: +// - clusterIDs: identifiers for the clusters (e.g. ["c1", "c2"]) +// - tables: schema -> table names mapping (e.g. 
{"test": ["t1"]}) +// - pdBase: base physical time (ms) for mock PD TSO generation +// - pdStep: physical time increment (ms) per GetTS call +// - cpDelta: checkpoint watcher returns minCheckpointTs + cpDelta +// - s3Delta: S3 checkpoint watcher returns minCheckpointTs + s3Delta +func NewMockMultiCluster( + clusterIDs []string, + tables map[string][]string, + pdBase, pdStep int64, + cpDelta, s3Delta uint64, +) *MockMultiCluster { + mc := &MockMultiCluster{ + ClusterIDs: clusterIDs, + Tables: tables, + S3Storages: make(map[string]storage.ExternalStorage), + pdClients: make(map[string]*mockPDClient), + CPWatchers: make(map[string]map[string]watcher.Watcher), + S3Watchers: make(map[string]*watcher.S3Watcher), + fileCounters: make(map[string]uint64), + date: "2026-02-11", + } + + for _, id := range clusterIDs { + mc.S3Storages[id] = storage.NewMemStorage() + mc.pdClients[id] = &mockPDClient{base: pdBase, step: pdStep} + + // Checkpoint watchers: one per downstream cluster + watchers := make(map[string]watcher.Watcher) + for _, other := range clusterIDs { + if other != id { + watchers[other] = &mockWatcher{delta: cpDelta} + } + } + mc.CPWatchers[id] = watchers + + // S3 watcher: uses in-memory storage + mock checkpoint watcher + s3CpWatcher := &mockWatcher{delta: s3Delta} + mc.S3Watchers[id] = watcher.NewS3Watcher( + s3CpWatcher, + mc.S3Storages[id], + tables, + ) + } + + return mc +} + +// InitSchemaFiles writes initial schema files for all tables in all clusters. +// The schema file content is empty (parser is nil in the current implementation). 
+func (mc *MockMultiCluster) InitSchemaFiles(ctx context.Context) error { + for _, s3 := range mc.S3Storages { + for schema, tableList := range mc.Tables { + for _, table := range tableList { + path := fmt.Sprintf("%s/%s/meta/schema_1_0000000000.json", schema, table) + if err := s3.WriteFile(ctx, path, []byte("{}")); err != nil { + return err + } + } + } + } + return nil +} + +// WriteDMLFile writes a canal-JSON DML file to a cluster's S3 storage. +// Each call increments the file index for that cluster, ensuring the +// S3Consumer discovers it as a new file. +func (mc *MockMultiCluster) WriteDMLFile(ctx context.Context, clusterID string, content []byte) error { + mc.fileCounters[clusterID]++ + idx := mc.fileCounters[clusterID] + for schema, tableList := range mc.Tables { + for _, table := range tableList { + path := fmt.Sprintf("%s/%s/1/%s/CDC%020d.json", schema, table, mc.date, idx) + if err := mc.S3Storages[clusterID].WriteFile(ctx, path, content); err != nil { + return err + } + } + } + return nil +} + +// GetPDClients returns mock PD clients as the pd.Client interface. +func (mc *MockMultiCluster) GetPDClients() map[string]pd.Client { + clients := make(map[string]pd.Client) + for id, c := range mc.pdClients { + clients[id] = c + } + return clients +} + +// Close closes all S3 watchers. +func (mc *MockMultiCluster) Close() { + for _, sw := range mc.S3Watchers { + sw.Close() + } +} + +// MakeCanalJSON builds a canal-JSON formatted record for testing. 
+// +// Parameters: +// - pkID: primary key value (int column "id") +// - commitTs: TiDB commit timestamp +// - originTs: origin timestamp (0 for upstream, non-zero for downstream) +// - val: value for the "val" varchar column +func MakeCanalJSON(pkID int, commitTs uint64, originTs uint64, val string) string { + originTsVal := "null" + if originTs > 0 { + originTsVal = fmt.Sprintf(`"%d"`, originTs) + } + return fmt.Sprintf( + `{"id":0,"database":"test","table":"t1","pkNames":["id"],"isDdl":false,"type":"INSERT",`+ + `"es":0,"ts":0,"sql":"","sqlType":{"id":4,"val":12,"_tidb_origin_ts":-5},`+ + `"mysqlType":{"id":"int","val":"varchar","_tidb_origin_ts":"bigint"},`+ + `"old":null,"data":[{"id":"%d","val":"%s","_tidb_origin_ts":%s}],`+ + `"_tidb":{"commitTs":%d}}`, + pkID, val, originTsVal, commitTs) +} + +// MakeContent combines canal-JSON records with CRLF terminator (matching codec config). +func MakeContent(records ...string) []byte { + return []byte(strings.Join(records, "\r\n")) +} diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go index 087d5dfa4d..f24a4223ae 100644 --- a/cmd/multi-cluster-consistency-checker/main.go +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -85,13 +85,6 @@ func run(cmd *cobra.Command, args []string) { } log.Info("Logger initialized", zap.String("level", logLevel)) - fmt.Printf("Loaded configuration with %d cluster(s)\n", len(cfg.Clusters)) - for name, cluster := range cfg.Clusters { - fmt.Printf(" Cluster: %s\n", name) - fmt.Printf(" PD Address: %s\n", cluster.PDAddr) - fmt.Printf(" S3 Sink URI: %s\n", cluster.S3SinkURI) - } - // Create a context that can be cancelled by signals ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go index c0da03775b..11300ea384 100644 --- 
a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go @@ -228,8 +228,8 @@ func TestRecorder_RecordTimeWindow(t *testing.T) { "c1": {TimeWindow: types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}}, } report := NewReport(5) - cr := NewClusterReport("c1") - cr.AddDataLossItem("d1", "pk-1", 100, 200, false) + cr := NewClusterReport("c1", types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}) + cr.AddDataLossItem("d1", "test_table", "pk-1", 100, 200, false) report.AddClusterReport("c1", cr) require.True(t, report.NeedFlush()) diff --git a/cmd/multi-cluster-consistency-checker/recorder/types.go b/cmd/multi-cluster-consistency-checker/recorder/types.go index d0b38fb0d8..2314c5faa2 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go @@ -61,28 +61,46 @@ func (item *LWWViolationItem) String() string { item.PK, item.ExistingOriginTS, item.ExistingCommitTS, item.OriginTS, item.CommitTS) } -type ClusterReport struct { - ClusterID string `json:"cluster_id"` - +type TableFailureItems struct { DataLossItems []DataLossItem `json:"data_loss_items"` // data loss items DataRedundantItems []DataRedundantItem `json:"data_redundant_items"` // data redundant items LWWViolationItems []LWWViolationItem `json:"lww_violation_items"` // lww violation items - - needFlush bool `json:"-"` } -func NewClusterReport(clusterID string) *ClusterReport { - return &ClusterReport{ - ClusterID: clusterID, +func NewTableFailureItems() *TableFailureItems { + return &TableFailureItems{ DataLossItems: make([]DataLossItem, 0), DataRedundantItems: make([]DataRedundantItem, 0), LWWViolationItems: make([]LWWViolationItem, 0), - needFlush: false, } } -func (r *ClusterReport) AddDataLossItem(downstreamClusterID, pk string, originTS, commitTS uint64, inconsistent bool) { - r.DataLossItems = append(r.DataLossItems, DataLossItem{ +type ClusterReport struct { + 
ClusterID string `json:"cluster_id"` + + TimeWindow types.TimeWindow `json:"time_window"` + + TableFailureItems map[string]*TableFailureItems `json:"table_failure_items"` // table failure items + + needFlush bool `json:"-"` +} + +func NewClusterReport(clusterID string, timeWindow types.TimeWindow) *ClusterReport { + return &ClusterReport{ + ClusterID: clusterID, + TimeWindow: timeWindow, + TableFailureItems: make(map[string]*TableFailureItems), + needFlush: false, + } +} + +func (r *ClusterReport) AddDataLossItem(downstreamClusterID, schemaKey, pk string, originTS, commitTS uint64, inconsistent bool) { + tableFailureItems, exists := r.TableFailureItems[schemaKey] + if !exists { + tableFailureItems = NewTableFailureItems() + r.TableFailureItems[schemaKey] = tableFailureItems + } + tableFailureItems.DataLossItems = append(tableFailureItems.DataLossItems, DataLossItem{ DownstreamClusterID: downstreamClusterID, PK: pk, OriginTS: originTS, @@ -92,8 +110,13 @@ func (r *ClusterReport) AddDataLossItem(downstreamClusterID, pk string, originTS r.needFlush = true } -func (r *ClusterReport) AddDataRedundantItem(pk string, originTS, commitTS uint64) { - r.DataRedundantItems = append(r.DataRedundantItems, DataRedundantItem{ +func (r *ClusterReport) AddDataRedundantItem(schemaKey, pk string, originTS, commitTS uint64) { + tableFailureItems, exists := r.TableFailureItems[schemaKey] + if !exists { + tableFailureItems = NewTableFailureItems() + r.TableFailureItems[schemaKey] = tableFailureItems + } + tableFailureItems.DataRedundantItems = append(tableFailureItems.DataRedundantItems, DataRedundantItem{ PK: pk, OriginTS: originTS, CommitTS: commitTS, @@ -102,11 +125,17 @@ func (r *ClusterReport) AddDataRedundantItem(pk string, originTS, commitTS uint6 } func (r *ClusterReport) AddLWWViolationItem( + schemaKey string, pk string, existingOriginTS, existingCommitTS uint64, originTS, commitTS uint64, ) { - r.LWWViolationItems = append(r.LWWViolationItems, LWWViolationItem{ + 
tableFailureItems, exists := r.TableFailureItems[schemaKey] + if !exists { + tableFailureItems = NewTableFailureItems() + r.TableFailureItems[schemaKey] = tableFailureItems + } + tableFailureItems.LWWViolationItems = append(tableFailureItems.LWWViolationItems, LWWViolationItem{ PK: pk, ExistingOriginTS: existingOriginTS, ExistingCommitTS: existingCommitTS, @@ -143,22 +172,26 @@ func (r *Report) MarshalReport() string { continue } fmt.Fprintf(&reportMsg, "\n[cluster: %s]\n", clusterID) - if len(clusterReport.DataLossItems) > 0 { - fmt.Fprintf(&reportMsg, " - [data loss items: %d]\n", len(clusterReport.DataLossItems)) - for _, dataLossItem := range clusterReport.DataLossItems { - fmt.Fprintf(&reportMsg, " - [%s]\n", dataLossItem.String()) + fmt.Fprintf(&reportMsg, "time window: %s\n", clusterReport.TimeWindow.String()) + for schemaKey, tableFailureItems := range clusterReport.TableFailureItems { + fmt.Fprintf(&reportMsg, " - [table name: %s]\n", schemaKey) + if len(tableFailureItems.DataLossItems) > 0 { + fmt.Fprintf(&reportMsg, " - [data loss items: %d]\n", len(tableFailureItems.DataLossItems)) + for _, dataLossItem := range tableFailureItems.DataLossItems { + fmt.Fprintf(&reportMsg, " - [%s]\n", dataLossItem.String()) + } } - } - if len(clusterReport.DataRedundantItems) > 0 { - fmt.Fprintf(&reportMsg, " - [data redundant items: %d]\n", len(clusterReport.DataRedundantItems)) - for _, dataRedundantItem := range clusterReport.DataRedundantItems { - fmt.Fprintf(&reportMsg, " - [%s]\n", dataRedundantItem.String()) + if len(tableFailureItems.DataRedundantItems) > 0 { + fmt.Fprintf(&reportMsg, " - [data redundant items: %d]\n", len(tableFailureItems.DataRedundantItems)) + for _, dataRedundantItem := range tableFailureItems.DataRedundantItems { + fmt.Fprintf(&reportMsg, " - [%s]\n", dataRedundantItem.String()) + } } - } - if len(clusterReport.LWWViolationItems) > 0 { - fmt.Fprintf(&reportMsg, " - [lww violation items: %d]\n", len(clusterReport.LWWViolationItems)) - for _, 
lwwViolationItem := range clusterReport.LWWViolationItems { - fmt.Fprintf(&reportMsg, " - [%s]\n", lwwViolationItem.String()) + if len(tableFailureItems.LWWViolationItems) > 0 { + fmt.Fprintf(&reportMsg, " - [lww violation items: %d]\n", len(tableFailureItems.LWWViolationItems)) + for _, lwwViolationItem := range tableFailureItems.LWWViolationItems { + fmt.Fprintf(&reportMsg, " - [%s]\n", lwwViolationItem.String()) + } } } } diff --git a/cmd/multi-cluster-consistency-checker/recorder/types_test.go b/cmd/multi-cluster-consistency-checker/recorder/types_test.go index b9139e6858..503d322f60 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types_test.go @@ -71,62 +71,71 @@ func TestLWWViolationItem_String(t *testing.T) { require.Equal(t, "pk: pk-y, existing origin ts: 1, existing commit ts: 2, origin ts: 3, commit ts: 4", s) } +const testSchemaKey = "test_table" + func TestClusterReport(t *testing.T) { t.Parallel() t.Run("new cluster report is empty and does not need flush", func(t *testing.T) { t.Parallel() - cr := NewClusterReport("c1") + cr := NewClusterReport("c1", types.TimeWindow{}) require.Equal(t, "c1", cr.ClusterID) - require.Empty(t, cr.DataLossItems) - require.Empty(t, cr.DataRedundantItems) - require.Empty(t, cr.LWWViolationItems) + require.Empty(t, cr.TableFailureItems) require.False(t, cr.needFlush) }) t.Run("add data loss item sets needFlush", func(t *testing.T) { t.Parallel() - cr := NewClusterReport("c1") - cr.AddDataLossItem("downstream-1", "pk-1", 100, 200, false) - require.Len(t, cr.DataLossItems, 1) + cr := NewClusterReport("c1", types.TimeWindow{}) + cr.AddDataLossItem("downstream-1", testSchemaKey, "pk-1", 100, 200, false) + require.Len(t, cr.TableFailureItems, 1) + require.Contains(t, cr.TableFailureItems, testSchemaKey) + tableItems := cr.TableFailureItems[testSchemaKey] + require.Len(t, tableItems.DataLossItems, 1) require.True(t, cr.needFlush) - require.Equal(t, 
"downstream-1", cr.DataLossItems[0].DownstreamClusterID) - require.Equal(t, "pk-1", cr.DataLossItems[0].PK) - require.Equal(t, uint64(100), cr.DataLossItems[0].OriginTS) - require.Equal(t, uint64(200), cr.DataLossItems[0].CommitTS) - require.False(t, cr.DataLossItems[0].Inconsistent) + require.Equal(t, "downstream-1", tableItems.DataLossItems[0].DownstreamClusterID) + require.Equal(t, "pk-1", tableItems.DataLossItems[0].PK) + require.Equal(t, uint64(100), tableItems.DataLossItems[0].OriginTS) + require.Equal(t, uint64(200), tableItems.DataLossItems[0].CommitTS) + require.False(t, tableItems.DataLossItems[0].Inconsistent) }) t.Run("add data redundant item sets needFlush", func(t *testing.T) { t.Parallel() - cr := NewClusterReport("c1") - cr.AddDataRedundantItem("pk-2", 300, 400) - require.Len(t, cr.DataRedundantItems, 1) + cr := NewClusterReport("c1", types.TimeWindow{}) + cr.AddDataRedundantItem(testSchemaKey, "pk-2", 300, 400) + require.Len(t, cr.TableFailureItems, 1) + tableItems := cr.TableFailureItems[testSchemaKey] + require.Len(t, tableItems.DataRedundantItems, 1) require.True(t, cr.needFlush) }) t.Run("add lww violation item sets needFlush", func(t *testing.T) { t.Parallel() - cr := NewClusterReport("c1") - cr.AddLWWViolationItem("pk-3", 1, 2, 3, 4) - require.Len(t, cr.LWWViolationItems, 1) + cr := NewClusterReport("c1", types.TimeWindow{}) + cr.AddLWWViolationItem(testSchemaKey, "pk-3", 1, 2, 3, 4) + require.Len(t, cr.TableFailureItems, 1) + tableItems := cr.TableFailureItems[testSchemaKey] + require.Len(t, tableItems.LWWViolationItems, 1) require.True(t, cr.needFlush) - require.Equal(t, uint64(1), cr.LWWViolationItems[0].ExistingOriginTS) - require.Equal(t, uint64(2), cr.LWWViolationItems[0].ExistingCommitTS) - require.Equal(t, uint64(3), cr.LWWViolationItems[0].OriginTS) - require.Equal(t, uint64(4), cr.LWWViolationItems[0].CommitTS) + require.Equal(t, uint64(1), tableItems.LWWViolationItems[0].ExistingOriginTS) + require.Equal(t, uint64(2), 
tableItems.LWWViolationItems[0].ExistingCommitTS) + require.Equal(t, uint64(3), tableItems.LWWViolationItems[0].OriginTS) + require.Equal(t, uint64(4), tableItems.LWWViolationItems[0].CommitTS) }) t.Run("add multiple items", func(t *testing.T) { t.Parallel() - cr := NewClusterReport("c1") - cr.AddDataLossItem("d1", "pk-1", 1, 2, false) - cr.AddDataLossItem("d2", "pk-2", 3, 4, true) - cr.AddDataRedundantItem("pk-3", 5, 6) - cr.AddLWWViolationItem("pk-4", 7, 8, 9, 10) - require.Len(t, cr.DataLossItems, 2) - require.Len(t, cr.DataRedundantItems, 1) - require.Len(t, cr.LWWViolationItems, 1) + cr := NewClusterReport("c1", types.TimeWindow{}) + cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2, false) + cr.AddDataLossItem("d2", testSchemaKey, "pk-2", 3, 4, true) + cr.AddDataRedundantItem(testSchemaKey, "pk-3", 5, 6) + cr.AddLWWViolationItem(testSchemaKey, "pk-4", 7, 8, 9, 10) + require.Len(t, cr.TableFailureItems, 1) + tableItems := cr.TableFailureItems[testSchemaKey] + require.Len(t, tableItems.DataLossItems, 2) + require.Len(t, tableItems.DataRedundantItems, 1) + require.Len(t, tableItems.LWWViolationItems, 1) }) } @@ -144,7 +153,7 @@ func TestReport(t *testing.T) { t.Run("add empty cluster report does not set needFlush", func(t *testing.T) { t.Parallel() r := NewReport(1) - cr := NewClusterReport("c1") + cr := NewClusterReport("c1", types.TimeWindow{}) r.AddClusterReport("c1", cr) require.Len(t, r.ClusterReports, 1) require.False(t, r.NeedFlush()) @@ -153,8 +162,8 @@ func TestReport(t *testing.T) { t.Run("add non-empty cluster report sets needFlush", func(t *testing.T) { t.Parallel() r := NewReport(1) - cr := NewClusterReport("c1") - cr.AddDataLossItem("d1", "pk-1", 1, 2, false) + cr := NewClusterReport("c1", types.TimeWindow{}) + cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2, false) r.AddClusterReport("c1", cr) require.True(t, r.NeedFlush()) }) @@ -162,9 +171,9 @@ func TestReport(t *testing.T) { t.Run("needFlush propagates from any cluster report", func(t 
*testing.T) { t.Parallel() r := NewReport(1) - cr1 := NewClusterReport("c1") - cr2 := NewClusterReport("c2") - cr2.AddDataRedundantItem("pk-1", 1, 2) + cr1 := NewClusterReport("c1", types.TimeWindow{}) + cr2 := NewClusterReport("c2", types.TimeWindow{}) + cr2.AddDataRedundantItem(testSchemaKey, "pk-1", 1, 2) r.AddClusterReport("c1", cr1) r.AddClusterReport("c2", cr2) require.True(t, r.NeedFlush()) @@ -174,6 +183,9 @@ func TestReport(t *testing.T) { func TestReport_MarshalReport(t *testing.T) { t.Parallel() + tw := types.TimeWindow{LeftBoundary: 0, RightBoundary: 0} + twStr := tw.String() + t.Run("empty report", func(t *testing.T) { t.Parallel() r := NewReport(5) @@ -184,12 +196,14 @@ func TestReport_MarshalReport(t *testing.T) { t.Run("report with data loss items", func(t *testing.T) { t.Parallel() r := NewReport(1) - cr := NewClusterReport("c1") - cr.AddDataLossItem("d1", "pk-1", 100, 200, false) + cr := NewClusterReport("c1", tw) + cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 100, 200, false) r.AddClusterReport("c1", cr) s := r.MarshalReport() require.Equal(t, "round: 1\n\n"+ "[cluster: c1]\n"+ + "time window: "+twStr+"\n"+ + " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ " - [downstream cluster: d1, pk: pk-1, origin ts: 100, commit ts: 200, type: data loss]\n\n", s) @@ -198,12 +212,14 @@ func TestReport_MarshalReport(t *testing.T) { t.Run("report with data redundant items", func(t *testing.T) { t.Parallel() r := NewReport(2) - cr := NewClusterReport("c2") - cr.AddDataRedundantItem("pk-r", 10, 20) + cr := NewClusterReport("c2", tw) + cr.AddDataRedundantItem(testSchemaKey, "pk-r", 10, 20) r.AddClusterReport("c2", cr) s := r.MarshalReport() require.Equal(t, "round: 2\n\n"+ "[cluster: c2]\n"+ + "time window: "+twStr+"\n"+ + " - [table name: "+testSchemaKey+"]\n"+ " - [data redundant items: 1]\n"+ " - [pk: pk-r, origin ts: 10, commit ts: 20]\n\n", s) @@ -212,12 +228,14 @@ func TestReport_MarshalReport(t *testing.T) { t.Run("report with lww 
violation items", func(t *testing.T) { t.Parallel() r := NewReport(3) - cr := NewClusterReport("c3") - cr.AddLWWViolationItem("pk-v", 1, 2, 3, 4) + cr := NewClusterReport("c3", tw) + cr.AddLWWViolationItem(testSchemaKey, "pk-v", 1, 2, 3, 4) r.AddClusterReport("c3", cr) s := r.MarshalReport() require.Equal(t, "round: 3\n\n"+ "[cluster: c3]\n"+ + "time window: "+twStr+"\n"+ + " - [table name: "+testSchemaKey+"]\n"+ " - [lww violation items: 1]\n"+ " - [pk: pk-v, existing origin ts: 1, existing commit ts: 2, origin ts: 3, commit ts: 4]\n\n", s) @@ -226,14 +244,16 @@ func TestReport_MarshalReport(t *testing.T) { t.Run("skips cluster reports that do not need flush", func(t *testing.T) { t.Parallel() r := NewReport(1) - crEmpty := NewClusterReport("empty-cluster") - crFull := NewClusterReport("full-cluster") - crFull.AddDataLossItem("d1", "pk-1", 1, 2, false) + crEmpty := NewClusterReport("empty-cluster", tw) + crFull := NewClusterReport("full-cluster", tw) + crFull.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2, false) r.AddClusterReport("empty-cluster", crEmpty) r.AddClusterReport("full-cluster", crFull) s := r.MarshalReport() require.Equal(t, "round: 1\n\n"+ "[cluster: full-cluster]\n"+ + "time window: "+twStr+"\n"+ + " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ " - [downstream cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data loss]\n\n", s) @@ -242,14 +262,16 @@ func TestReport_MarshalReport(t *testing.T) { t.Run("report with mixed items", func(t *testing.T) { t.Parallel() r := NewReport(10) - cr := NewClusterReport("c1") - cr.AddDataLossItem("d1", "pk-1", 1, 2, true) - cr.AddDataRedundantItem("pk-2", 3, 4) - cr.AddLWWViolationItem("pk-3", 5, 6, 7, 8) + cr := NewClusterReport("c1", tw) + cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2, true) + cr.AddDataRedundantItem(testSchemaKey, "pk-2", 3, 4) + cr.AddLWWViolationItem(testSchemaKey, "pk-3", 5, 6, 7, 8) r.AddClusterReport("c1", cr) s := r.MarshalReport() require.Equal(t, 
"round: 10\n\n"+ "[cluster: c1]\n"+ + "time window: "+twStr+"\n"+ + " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ " - [downstream cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data inconsistent]\n"+ " - [data redundant items: 1]\n"+ diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index e5cdd222f7..b118d8187c 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -66,7 +66,7 @@ func runTask(ctx context.Context, cfg *config.Config) error { return errors.Trace(err) } - report, err := dataChecker.CheckInNextTimeWindow(ctx, newTimeWindowData) + report, err := dataChecker.CheckInNextTimeWindow(newTimeWindowData) if err != nil { return errors.Trace(err) } @@ -90,7 +90,7 @@ func initClients(ctx context.Context, cfg *config.Config) ( etcdClients := make(map[string]*etcd.CDCEtcdClientImpl) for clusterID, clusterConfig := range cfg.Clusters { - pdClient, etcdClient, err := newPDClient(ctx, clusterConfig.PDAddr, &clusterConfig.SecurityConfig) + pdClient, etcdClient, err := newPDClient(ctx, clusterConfig.PDAddrs, &clusterConfig.SecurityConfig) if err != nil { // Clean up already created clients before returning error cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) @@ -123,16 +123,16 @@ func initClients(ctx context.Context, cfg *config.Config) ( return checkpointWatchers, s3Watchers, pdClients, etcdClients, nil } -func newPDClient(ctx context.Context, pdAddr string, securityConfig *security.Credential) (pd.Client, *etcd.CDCEtcdClientImpl, error) { +func newPDClient(ctx context.Context, pdAddrs []string, securityConfig *security.Credential) (pd.Client, *etcd.CDCEtcdClientImpl, error) { pdClient, err := pd.NewClientWithContext( - ctx, "consistency-checker", []string{pdAddr}, securityConfig.PDSecurityOption(), + ctx, "consistency-checker", pdAddrs, securityConfig.PDSecurityOption(), 
pdopt.WithCustomTimeoutOption(10*time.Second), ) if err != nil { return nil, nil, errors.Trace(err) } - etcdCli, err := etcd.CreateRawEtcdClient(securityConfig, grpc.EmptyDialOption{}, pdAddr) + etcdCli, err := etcd.CreateRawEtcdClient(securityConfig, grpc.EmptyDialOption{}, pdAddrs...) if err != nil { // Clean up PD client if etcd client creation fails if pdClient != nil { diff --git a/cmd/multi-cluster-consistency-checker/types/types.go b/cmd/multi-cluster-consistency-checker/types/types.go index 5be66475c8..85aaf197ab 100644 --- a/cmd/multi-cluster-consistency-checker/types/types.go +++ b/cmd/multi-cluster-consistency-checker/types/types.go @@ -14,6 +14,9 @@ package types import ( + "fmt" + "strings" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" ) @@ -62,6 +65,15 @@ type TimeWindow struct { NextMinLeftBoundary uint64 `json:"next_min_left_boundary"` } +func (t *TimeWindow) String() string { + var builder strings.Builder + fmt.Fprintf(&builder, "time window boundary: (%d, %d]\n", t.LeftBoundary, t.RightBoundary) + for downstreamClusterID, checkpointTs := range t.CheckpointTs { + fmt.Fprintf(&builder, "checkpoint ts [to cluster %s]: %d\n", downstreamClusterID, checkpointTs) + } + return builder.String() +} + type TimeWindowData struct { TimeWindow Data map[cloudstorage.DmlPathKey]IncrementalData From 17d5831314119ea45687538e9d736accbdaaa016 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Sat, 14 Feb 2026 00:31:28 +0800 Subject: [PATCH 20/23] meet the spec Signed-off-by: Jianjun Liao --- .../advancer/time_window_advancer.go | 47 ++-- .../advancer/time_window_advancer_test.go | 15 +- .../checker/checker.go | 212 ++++++++++++------ .../checker/checker_test.go | 51 +++-- .../config/config.example.toml | 6 +- .../config/config.go | 47 +++- .../config/config_test.go | 181 +++++++++++++-- .../consumer/consumer_test.go | 13 -- .../decoder/decoder.go | 14 +- .../decoder/decoder_test.go | 46 ++-- .../integration/integration_test.go | 159 +++++++------ 
.../integration/mock_cluster.go | 4 +- cmd/multi-cluster-consistency-checker/main.go | 5 +- .../recorder/recorder.go | 133 +++++++++-- .../recorder/recorder_test.go | 109 +++++++-- .../recorder/types.go | 93 ++++++-- .../recorder/types_test.go | 120 +++++++--- cmd/multi-cluster-consistency-checker/task.go | 96 +++++++- .../types/types.go | 12 +- .../watcher/checkpoint_watcher.go | 16 +- .../watcher/checkpoint_watcher_test.go | 18 +- 21 files changed, 989 insertions(+), 408 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go index 353ae40ff7..afa93bec49 100644 --- a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer.go @@ -38,7 +38,7 @@ type TimeWindowAdvancer struct { timeWindowTriplet map[string][3]types.TimeWindow // checkpointWatcher is the Active-Active checkpoint watcher for each cluster, - // mapping from cluster ID to the downstream cluster ID to the checkpoint watcher + // mapping from local cluster ID to replicated cluster ID to the checkpoint watcher checkpointWatcher map[string]map[string]watcher.Watcher // s3checkpointWatcher is the S3 checkpoint watcher for each cluster, mapping from cluster ID to the s3 checkpoint watcher @@ -118,18 +118,17 @@ func (t *TimeWindowAdvancer) initializeFromCheckpoint( } // AdvanceTimeWindow advances the time window for each cluster. Here is the steps: -// 1. Advance the checkpoint ts for each upstream-downstream cluster changefeed. +// 1. Advance the checkpoint ts for each local-to-replicated changefeed. // -// For any upstream-downstream cluster changefeed, the checkpoint ts should be advanced to -// the maximum of pd timestamp after previouds time window of downstream advanced and -// the right boundary of previouds time window of every clusters. 
+// For any local-to-replicated changefeed, the checkpoint ts should be advanced to +// the maximum of pd timestamp after previous time window of the replicated cluster +// advanced and the right boundary of previous time window of every clusters. // // 2. Advance the right boundary for each cluster. // // For any cluster, the right boundary should be advanced to the maximum of pd timestamp of -// the cluster after the checkpoint ts of its upstream cluster advanced and the previous -// timewindow's checkpoint ts of changefeed where the cluster is the upstream cluster or -// the downstream cluster. +// the cluster after the checkpoint ts of its local cluster advanced and the previous +// timewindow's checkpoint ts of changefeed where the cluster is the local or the replicated. // // 3. Update the time window for each cluster. // @@ -138,15 +137,15 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( pctx context.Context, ) (map[string]types.TimeWindowData, error) { log.Debug("advance time window", zap.Uint64("round", t.round)) - // mapping from upstream cluster ID to the downstream cluster ID to the min checkpoint timestamp + // mapping from local cluster ID to replicated cluster ID to the min checkpoint timestamp minCheckpointTsMap := make(map[string]map[string]uint64) maxTimeWindowRightBoundary := uint64(0) - for downstreamClusterID, triplet := range t.timeWindowTriplet { - for upstreamClusterID, pdTimestampAfterTimeWindow := range triplet[2].PDTimestampAfterTimeWindow { - if _, ok := minCheckpointTsMap[upstreamClusterID]; !ok { - minCheckpointTsMap[upstreamClusterID] = make(map[string]uint64) + for replicatedClusterID, triplet := range t.timeWindowTriplet { + for localClusterID, pdTimestampAfterTimeWindow := range triplet[2].PDTimestampAfterTimeWindow { + if _, ok := minCheckpointTsMap[localClusterID]; !ok { + minCheckpointTsMap[localClusterID] = make(map[string]uint64) } - minCheckpointTsMap[upstreamClusterID][downstreamClusterID] = 
max(minCheckpointTsMap[upstreamClusterID][downstreamClusterID], pdTimestampAfterTimeWindow) + minCheckpointTsMap[localClusterID][replicatedClusterID] = max(minCheckpointTsMap[localClusterID][replicatedClusterID], pdTimestampAfterTimeWindow) } maxTimeWindowRightBoundary = max(maxTimeWindowRightBoundary, triplet[2].RightBoundary) } @@ -158,31 +157,31 @@ func (t *TimeWindowAdvancer) AdvanceTimeWindow( maxCheckpointTs := make(map[string]uint64) // Advance the checkpoint ts for each cluster eg, ctx := errgroup.WithContext(pctx) - for upstreamClusterID, downstreamCheckpointWatcherMap := range t.checkpointWatcher { - for downstreamClusterID, checkpointWatcher := range downstreamCheckpointWatcherMap { - mincheckpointTs := max(minCheckpointTsMap[upstreamClusterID][downstreamClusterID], maxTimeWindowRightBoundary) + for localClusterID, replicatedCheckpointWatcherMap := range t.checkpointWatcher { + for replicatedClusterID, checkpointWatcher := range replicatedCheckpointWatcherMap { + minCheckpointTs := max(minCheckpointTsMap[localClusterID][replicatedClusterID], maxTimeWindowRightBoundary) eg.Go(func() error { - checkpointTs, err := checkpointWatcher.AdvanceCheckpointTs(ctx, mincheckpointTs) + checkpointTs, err := checkpointWatcher.AdvanceCheckpointTs(ctx, minCheckpointTs) if err != nil { return errors.Trace(err) } // TODO: optimize this by getting pd ts in the end of all checkpoint ts advance - pdtsos, err := t.getPDTsFromOtherClusters(ctx, upstreamClusterID) + pdtsos, err := t.getPDTsFromOtherClusters(ctx, localClusterID) if err != nil { return errors.Trace(err) } lock.Lock() - timeWindow := newTimeWindow[upstreamClusterID] + timeWindow := newTimeWindow[localClusterID] if timeWindow.CheckpointTs == nil { timeWindow.CheckpointTs = make(map[string]uint64) } - timeWindow.CheckpointTs[downstreamClusterID] = checkpointTs - newTimeWindow[upstreamClusterID] = timeWindow + timeWindow.CheckpointTs[replicatedClusterID] = checkpointTs + newTimeWindow[localClusterID] = timeWindow for 
otherClusterID, pdtso := range pdtsos { maxPDTimestampAfterCheckpointTs[otherClusterID] = max(maxPDTimestampAfterCheckpointTs[otherClusterID], pdtso) } - maxCheckpointTs[upstreamClusterID] = max(maxCheckpointTs[upstreamClusterID], checkpointTs) - maxCheckpointTs[downstreamClusterID] = max(maxCheckpointTs[downstreamClusterID], checkpointTs) + maxCheckpointTs[localClusterID] = max(maxCheckpointTs[localClusterID], checkpointTs) + maxCheckpointTs[replicatedClusterID] = max(maxCheckpointTs[replicatedClusterID], checkpointTs) lock.Unlock() return nil }) diff --git a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go index 187cc6f7ca..d327f85a7d 100644 --- a/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go +++ b/cmd/multi-cluster-consistency-checker/advancer/time_window_advancer_test.go @@ -15,6 +15,7 @@ package advancer import ( "context" + "maps" "sync" "sync/atomic" "testing" @@ -161,9 +162,9 @@ func TestTimeWindowAdvancer_AdvanceMultipleRounds(t *testing.T) { // 3. CheckpointTs should be populated and strictly increasing across rounds require.NotEmpty(t, tw.CheckpointTs, "round %d, cluster %s: CheckpointTs should be populated", round, clusterID) - for downstream, cpTs := range tw.CheckpointTs { - require.Greater(t, cpTs, prevCheckpointTs[clusterID][downstream], - "round %d, %s->%s: checkpoint should be strictly increasing", round, clusterID, downstream) + for replicatedCluster, cpTs := range tw.CheckpointTs { + require.Greater(t, cpTs, prevCheckpointTs[clusterID][replicatedCluster], + "round %d, %s->%s: checkpoint should be strictly increasing", round, clusterID, replicatedCluster) } // 4. PDTimestampAfterTimeWindow should be populated @@ -178,10 +179,10 @@ func TestTimeWindowAdvancer_AdvanceMultipleRounds(t *testing.T) { // 6. 
PD TSO values in PDTimestampAfterTimeWindow > all CheckpointTs values // (PD TSOs are obtained after checkpoint advance) for otherCluster, pdTs := range tw.PDTimestampAfterTimeWindow { - for downstream, cpTs := range tw.CheckpointTs { + for replicatedCluster, cpTs := range tw.CheckpointTs { require.Greater(t, pdTs, cpTs, "round %d, cluster %s: PD TSO (from %s) should be > checkpoint (%s->%s)", - round, clusterID, otherCluster, clusterID, downstream) + round, clusterID, otherCluster, clusterID, replicatedCluster) } } @@ -197,9 +198,7 @@ func TestTimeWindowAdvancer_AdvanceMultipleRounds(t *testing.T) { if twData.TimeWindow.RightBoundary > maxRB { maxRB = twData.TimeWindow.RightBoundary } - for downstream, cpTs := range twData.TimeWindow.CheckpointTs { - prevCheckpointTs[clusterID][downstream] = cpTs - } + maps.Copy(prevCheckpointTs[clusterID], twData.TimeWindow.CheckpointTs) } prevRightBoundary = maxRB } diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index 43565adcfe..c3df9088fd 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -124,34 +124,34 @@ func (c *clusterViolationChecker) UpdateCache() { } type tableDataCache struct { - // upstreamDataCache is a map of primary key to a map of commit ts to a record - upstreamDataCache map[types.PkType]map[uint64]*decoder.Record + // localDataCache is a map of primary key to a map of commit ts to a record + localDataCache map[types.PkType]map[uint64]*decoder.Record - // downstreamDataCache is a map of primary key to a map of origin ts to a record - downstreamDataCache map[types.PkType]map[uint64]*decoder.Record + // replicatedDataCache is a map of primary key to a map of origin ts to a record + replicatedDataCache map[types.PkType]map[uint64]*decoder.Record } func newTableDataCache() *tableDataCache { return &tableDataCache{ - upstreamDataCache: 
make(map[types.PkType]map[uint64]*decoder.Record), - downstreamDataCache: make(map[types.PkType]map[uint64]*decoder.Record), + localDataCache: make(map[types.PkType]map[uint64]*decoder.Record), + replicatedDataCache: make(map[types.PkType]map[uint64]*decoder.Record), } } -func (tdc *tableDataCache) newUpstreamRecord(record *decoder.Record) { - recordsMap, exists := tdc.upstreamDataCache[record.Pk] +func (tdc *tableDataCache) newLocalRecord(record *decoder.Record) { + recordsMap, exists := tdc.localDataCache[record.Pk] if !exists { recordsMap = make(map[uint64]*decoder.Record) - tdc.upstreamDataCache[record.Pk] = recordsMap + tdc.localDataCache[record.Pk] = recordsMap } recordsMap[record.CommitTs] = record } -func (tdc *tableDataCache) newDownstreamRecord(record *decoder.Record) { - recordsMap, exists := tdc.downstreamDataCache[record.Pk] +func (tdc *tableDataCache) newReplicatedRecord(record *decoder.Record) { + recordsMap, exists := tdc.replicatedDataCache[record.Pk] if !exists { recordsMap = make(map[uint64]*decoder.Record) - tdc.downstreamDataCache[record.Pk] = recordsMap + tdc.replicatedDataCache[record.Pk] = recordsMap } recordsMap[record.OriginTs] = record } @@ -184,9 +184,9 @@ func (twdc *timeWindowDataCache) NewRecord(schemaKey string, record *decoder.Rec twdc.tableDataCaches[schemaKey] = tableDataCache } if record.OriginTs == 0 { - tableDataCache.newUpstreamRecord(record) + tableDataCache.newLocalRecord(record) } else { - tableDataCache.newDownstreamRecord(record) + tableDataCache.newReplicatedRecord(record) } } @@ -204,6 +204,10 @@ type clusterDataChecker struct { clusterViolationChecker *clusterViolationChecker report *recorder.ClusterReport + + lwwSkippedRecordsCount int + checkedRecordsCount int + newTimeWindowRecordsCount int } func newClusterDataChecker(clusterID string) *clusterDataChecker { @@ -290,6 +294,9 @@ func (cd *clusterDataChecker) PrepareNextTimeWindowData(timeWindow types.TimeWin } cd.timeWindowDataCaches[2] = newTimeWindowDataCache 
cd.overDataCaches = newOverDataCache + cd.lwwSkippedRecordsCount = 0 + cd.checkedRecordsCount = 0 + cd.newTimeWindowRecordsCount = 0 return nil } @@ -301,12 +308,12 @@ func (cd *clusterDataChecker) NewRecord(schemaKey string, record *decoder.Record cd.timeWindowDataCaches[2].NewRecord(schemaKey, record) } -func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowIdx int, schemaKey string, pk types.PkType, originTs uint64) (*decoder.Record, bool) { +func (cd *clusterDataChecker) findClusterReplicatedDataInTimeWindow(timeWindowIdx int, schemaKey string, pk types.PkType, originTs uint64) (*decoder.Record, bool) { tableDataCache, exists := cd.timeWindowDataCaches[timeWindowIdx].tableDataCaches[schemaKey] if !exists { return nil, false } - records, exists := tableDataCache.downstreamDataCache[pk] + records, exists := tableDataCache.replicatedDataCache[pk] if !exists { return nil, false } @@ -321,12 +328,12 @@ func (cd *clusterDataChecker) findClusterDownstreamDataInTimeWindow(timeWindowId return nil, false } -func (cd *clusterDataChecker) findClusterUpstreamDataInTimeWindow(timeWindowIdx int, schemaKey string, pk types.PkType, commitTs uint64) bool { +func (cd *clusterDataChecker) findClusterLocalDataInTimeWindow(timeWindowIdx int, schemaKey string, pk types.PkType, commitTs uint64) bool { tableDataCache, exists := cd.timeWindowDataCaches[timeWindowIdx].tableDataCaches[schemaKey] if !exists { return false } - records, exists := tableDataCache.upstreamDataCache[pk] + records, exists := tableDataCache.localDataCache[pk] if !exists { return false } @@ -334,66 +341,117 @@ func (cd *clusterDataChecker) findClusterUpstreamDataInTimeWindow(timeWindowIdx return exists } -// datalossDetection iterates through the upstream data cache [1] and [2] and filter out the records +// diffColumns compares column values between local written and replicated records +// and returns the list of inconsistent columns. 
+func diffColumns(local, replicated *decoder.Record) []recorder.InconsistentColumn { + var result []recorder.InconsistentColumn + for colName, localVal := range local.ColumnValues { + replicatedVal, ok := replicated.ColumnValues[colName] + if !ok { + result = append(result, recorder.InconsistentColumn{ + Column: colName, + Local: localVal, + Replicated: nil, + }) + } else if localVal != replicatedVal { + result = append(result, recorder.InconsistentColumn{ + Column: colName, + Local: localVal, + Replicated: replicatedVal, + }) + } + } + for colName, replicatedVal := range replicated.ColumnValues { + if _, ok := local.ColumnValues[colName]; !ok { + result = append(result, recorder.InconsistentColumn{ + Column: colName, + Local: nil, + Replicated: replicatedVal, + }) + } + } + sort.Slice(result, func(i, j int) bool { + return result[i].Column < result[j].Column + }) + return result +} + +// datalossDetection iterates through the local-written data cache [1] and [2] and filter out the records // whose checkpoint ts falls within the (checkpoint[1], checkpoint[2]]. The record must be present -// in the downstream data cache [1] or [2] or another new record is present in the downstream data +// in the replicated data cache [1] or [2] or another new record is present in the replicated data // cache [1] or [2]. 
func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { for schemaKey, tableDataCache := range cd.timeWindowDataCaches[1].tableDataCaches { - for _, upstreamDataCache := range tableDataCache.upstreamDataCache { - for _, record := range upstreamDataCache { - for downstreamClusterID, checkpointTs := range cd.timeWindowDataCaches[1].checkpointTs { + for _, localDataCache := range tableDataCache.localDataCache { + for _, record := range localDataCache { + for replicatedClusterID, checkpointTs := range cd.timeWindowDataCaches[1].checkpointTs { if record.CommitTs <= checkpointTs { continue } - downstreamRecord, skipped := checker.FindClusterDownstreamData(downstreamClusterID, schemaKey, record.Pk, record.CommitTs) + cd.checkedRecordsCount++ + replicatedRecord, skipped := checker.FindClusterReplicatedData(replicatedClusterID, schemaKey, record.Pk, record.CommitTs) if skipped { + log.Debug("replicated record skipped by LWW", + zap.String("local cluster ID", cd.clusterID), + zap.String("replicated cluster ID", replicatedClusterID), + zap.String("schemaKey", schemaKey), + zap.String("pk", string(record.Pk)), + zap.Uint64("commitTs", record.CommitTs)) + cd.lwwSkippedRecordsCount++ continue } - if downstreamRecord == nil { + if replicatedRecord == nil { // data loss detected log.Error("data loss detected", - zap.String("upstreamClusterID", cd.clusterID), - zap.String("downstreamClusterID", downstreamClusterID), + zap.String("local cluster ID", cd.clusterID), + zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataLossItem(downstreamClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, false) - } else if !record.EqualDownstreamRecord(downstreamRecord) { + cd.report.AddDataLossItem(replicatedClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs) + } else if !record.EqualReplicatedRecord(replicatedRecord) { // data inconsistent detected log.Error("data inconsistent 
detected", - zap.String("upstreamClusterID", cd.clusterID), - zap.String("downstreamClusterID", downstreamClusterID), + zap.String("local cluster ID", cd.clusterID), + zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataLossItem(downstreamClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, true) + cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) } } } } } for schemaKey, tableDataCache := range cd.timeWindowDataCaches[2].tableDataCaches { - for _, upstreamDataCache := range tableDataCache.upstreamDataCache { - for _, record := range upstreamDataCache { - for downstreamClusterID, checkpointTs := range cd.timeWindowDataCaches[2].checkpointTs { + for _, localDataCache := range tableDataCache.localDataCache { + for _, record := range localDataCache { + for replicatedClusterID, checkpointTs := range cd.timeWindowDataCaches[2].checkpointTs { if record.CommitTs > checkpointTs { continue } - downstreamRecord, skipped := checker.FindClusterDownstreamData(downstreamClusterID, schemaKey, record.Pk, record.CommitTs) + cd.checkedRecordsCount++ + replicatedRecord, skipped := checker.FindClusterReplicatedData(replicatedClusterID, schemaKey, record.Pk, record.CommitTs) if skipped { + log.Debug("replicated record skipped by LWW", + zap.String("local cluster ID", cd.clusterID), + zap.String("replicated cluster ID", replicatedClusterID), + zap.String("schemaKey", schemaKey), + zap.String("pk", string(record.Pk)), + zap.Uint64("commitTs", record.CommitTs)) + cd.lwwSkippedRecordsCount++ continue } - if downstreamRecord == nil { + if replicatedRecord == nil { // data loss detected log.Error("data loss detected", - zap.String("upstreamClusterID", cd.clusterID), - zap.String("downstreamClusterID", downstreamClusterID), + zap.String("local cluster ID", cd.clusterID), + zap.String("replicated cluster ID", 
replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataLossItem(downstreamClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, false) - } else if !record.EqualDownstreamRecord(downstreamRecord) { + cd.report.AddDataLossItem(replicatedClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs) + } else if !record.EqualReplicatedRecord(replicatedRecord) { // data inconsistent detected log.Error("data inconsistent detected", - zap.String("upstreamClusterID", cd.clusterID), - zap.String("downstreamClusterID", downstreamClusterID), + zap.String("local cluster ID", cd.clusterID), + zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataLossItem(downstreamClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, true) + cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) } } } @@ -401,17 +459,18 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { } } -// dataRedundantDetection iterates through the downstream data cache [2]. The record must be present -// in the upstream data cache [1] [2] or [3]. +// dataRedundantDetection iterates through the replicated data cache [2]. The record must be present +// in the local data cache [1] [2] or [3]. 
func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) { for schemaKey, tableDataCache := range cd.timeWindowDataCaches[2].tableDataCaches { - for _, downstreamDataCache := range tableDataCache.downstreamDataCache { - for _, record := range downstreamDataCache { - // For downstream records, OriginTs is the upstream commit ts - if !checker.FindClusterUpstreamData(cd.clusterID, schemaKey, record.Pk, record.OriginTs) { + for _, replicatedDataCache := range tableDataCache.replicatedDataCache { + for _, record := range replicatedDataCache { + cd.checkedRecordsCount++ + // For replicated records, OriginTs is the local commit ts + if !checker.FindSourceLocalData(cd.clusterID, schemaKey, record.Pk, record.OriginTs) { // data redundant detected log.Error("data redundant detected", - zap.String("downstreamClusterID", cd.clusterID), + zap.String("replicated cluster ID", cd.clusterID), zap.Any("record", record)) cd.report.AddDataRedundantItem(schemaKey, string(record.Pk), record.OriginTs, record.CommitTs) } @@ -423,35 +482,37 @@ func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) { // lwwViolationDetection check the orderliness of the records func (cd *clusterDataChecker) lwwViolationDetection() { for schemaKey, tableDataCache := range cd.timeWindowDataCaches[2].tableDataCaches { - for pk, upstreamRecords := range tableDataCache.upstreamDataCache { - downstreamRecords := tableDataCache.downstreamDataCache[pk] - pkRecords := make([]*decoder.Record, 0, len(upstreamRecords)+len(downstreamRecords)) - for _, upstreamRecord := range upstreamRecords { - pkRecords = append(pkRecords, upstreamRecord) + for pk, localRecords := range tableDataCache.localDataCache { + replicatedRecords := tableDataCache.replicatedDataCache[pk] + pkRecords := make([]*decoder.Record, 0, len(localRecords)+len(replicatedRecords)) + for _, localRecord := range localRecords { + pkRecords = append(pkRecords, localRecord) } - for _, downstreamRecord := range 
downstreamRecords { - pkRecords = append(pkRecords, downstreamRecord) + for _, replicatedRecord := range replicatedRecords { + pkRecords = append(pkRecords, replicatedRecord) } sort.Slice(pkRecords, func(i, j int) bool { return pkRecords[i].CommitTs < pkRecords[j].CommitTs }) for _, record := range pkRecords { + cd.newTimeWindowRecordsCount++ cd.clusterViolationChecker.Check(schemaKey, record, cd.report) } } - for pk, downstreamRecords := range tableDataCache.downstreamDataCache { - if _, exists := tableDataCache.upstreamDataCache[pk]; exists { + for pk, replicatedRecords := range tableDataCache.replicatedDataCache { + if _, exists := tableDataCache.localDataCache[pk]; exists { continue } - pkRecords := make([]*decoder.Record, 0, len(downstreamRecords)) - for _, downstreamRecord := range downstreamRecords { - pkRecords = append(pkRecords, downstreamRecord) + pkRecords := make([]*decoder.Record, 0, len(replicatedRecords)) + for _, replicatedRecord := range replicatedRecords { + pkRecords = append(pkRecords, replicatedRecord) } sort.Slice(pkRecords, func(i, j int) bool { return pkRecords[i].CommitTs < pkRecords[j].CommitTs }) for _, record := range pkRecords { + cd.newTimeWindowRecordsCount++ cd.clusterViolationChecker.Check(schemaKey, record, cd.report) } } @@ -508,32 +569,32 @@ func (c *DataChecker) initializeFromCheckpoint(ctx context.Context, checkpointDa } } -// FindClusterDownstreamData checks whether the record is present in the downstream data -// cache [1] or [2] or another new record is present in the downstream data cache [1] or [2]. -func (c *DataChecker) FindClusterDownstreamData(clusterID string, schemaKey string, pk types.PkType, originTs uint64) (*decoder.Record, bool) { +// FindClusterReplicatedData checks whether the record is present in the replicated data +// cache [1] or [2] or another new record is present in the replicated data cache [1] or [2]. 
+func (c *DataChecker) FindClusterReplicatedData(clusterID string, schemaKey string, pk types.PkType, originTs uint64) (*decoder.Record, bool) { clusterDataChecker, exists := c.clusterDataCheckers[clusterID] if !exists { return nil, false } - record, skipped := clusterDataChecker.findClusterDownstreamDataInTimeWindow(1, schemaKey, pk, originTs) + record, skipped := clusterDataChecker.findClusterReplicatedDataInTimeWindow(1, schemaKey, pk, originTs) if skipped || record != nil { return record, skipped } - return clusterDataChecker.findClusterDownstreamDataInTimeWindow(2, schemaKey, pk, originTs) + return clusterDataChecker.findClusterReplicatedDataInTimeWindow(2, schemaKey, pk, originTs) } -func (c *DataChecker) FindClusterUpstreamData(downstreamClusterID string, schemaKey string, pk types.PkType, commitTs uint64) bool { +func (c *DataChecker) FindSourceLocalData(localClusterID string, schemaKey string, pk types.PkType, commitTs uint64) bool { for _, clusterDataChecker := range c.clusterDataCheckers { - if clusterDataChecker.clusterID == downstreamClusterID { + if clusterDataChecker.clusterID == localClusterID { continue } - if clusterDataChecker.findClusterUpstreamDataInTimeWindow(0, schemaKey, pk, commitTs) { + if clusterDataChecker.findClusterLocalDataInTimeWindow(0, schemaKey, pk, commitTs) { return true } - if clusterDataChecker.findClusterUpstreamDataInTimeWindow(1, schemaKey, pk, commitTs) { + if clusterDataChecker.findClusterLocalDataInTimeWindow(1, schemaKey, pk, commitTs) { return true } - if clusterDataChecker.findClusterUpstreamDataInTimeWindow(2, schemaKey, pk, commitTs) { + if clusterDataChecker.findClusterLocalDataInTimeWindow(2, schemaKey, pk, commitTs) { return true } } @@ -549,6 +610,11 @@ func (c *DataChecker) CheckInNextTimeWindow(newTimeWindowData map[string]types.T if c.checkableRound >= 3 { for clusterID, clusterDataChecker := range c.clusterDataCheckers { clusterDataChecker.Check(c) + log.Info("checked records count", + 
zap.String("clusterID", clusterID), + zap.Int("checked records count", clusterDataChecker.checkedRecordsCount), + zap.Int("new time window records count", clusterDataChecker.newTimeWindowRecordsCount), + zap.Int("lww skipped records count", clusterDataChecker.lwwSkippedRecordsCount)) report.AddClusterReport(clusterID, clusterDataChecker.GetReport()) } } else { diff --git a/cmd/multi-cluster-consistency-checker/checker/checker_test.go b/cmd/multi-cluster-consistency-checker/checker/checker_test.go index 9818ee2c03..11638f8817 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker_test.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker_test.go @@ -226,7 +226,7 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { const schemaKey = "test_schema" - t.Run("add upstream record", func(t *testing.T) { + t.Run("add local record", func(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) record := &decoder.Record{ @@ -239,11 +239,11 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { cache.NewRecord(schemaKey, record) require.Contains(t, cache.tableDataCaches, schemaKey) - require.Contains(t, cache.tableDataCaches[schemaKey].upstreamDataCache, record.Pk) - require.Contains(t, cache.tableDataCaches[schemaKey].upstreamDataCache[record.Pk], record.CommitTs) + require.Contains(t, cache.tableDataCaches[schemaKey].localDataCache, record.Pk) + require.Contains(t, cache.tableDataCaches[schemaKey].localDataCache[record.Pk], record.CommitTs) }) - t.Run("add downstream record", func(t *testing.T) { + t.Run("add replicated record", func(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) record := &decoder.Record{ @@ -256,8 +256,8 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { cache.NewRecord(schemaKey, record) require.Contains(t, cache.tableDataCaches, schemaKey) - require.Contains(t, cache.tableDataCaches[schemaKey].downstreamDataCache, record.Pk) - 
require.Contains(t, cache.tableDataCaches[schemaKey].downstreamDataCache[record.Pk], record.OriginTs) + require.Contains(t, cache.tableDataCaches[schemaKey].replicatedDataCache, record.Pk) + require.Contains(t, cache.tableDataCaches[schemaKey].replicatedDataCache[record.Pk], record.OriginTs) }) t.Run("skip record before left boundary", func(t *testing.T) { @@ -314,7 +314,7 @@ func TestClusterDataChecker_PrepareNextTimeWindowData(t *testing.T) { // makeCanalJSON builds a canal-JSON formatted record for testing. // pkID is the primary key value, commitTs is the TiDB commit timestamp, -// originTs is the origin timestamp (0 for upstream records, non-zero for downstream), +// originTs is the origin timestamp (0 for locally-written records, non-zero for replicated records), // val is a varchar column value. func makeCanalJSON(pkID int, commitTs uint64, originTs uint64, val string) string { originTsVal := "null" @@ -360,7 +360,7 @@ func makeTWData(left, right uint64, checkpointTs map[string]uint64, content []by var defaultSchemaKey = (&cloudstorage.DmlPathKey{}).GetKey() // TestDataChecker_FourRoundsCheck simulates 4 rounds with increasing data and verifies check results. -// Setup: 2 clusters (c1 upstream, c2 downstream from c1). +// Setup: 2 clusters (c1 locally-written, c2 replicated from c1). // Rounds 0-2: accumulate data, check not yet active (checkableRound < 3). // Round 3: first real check runs, detecting violations. func TestDataChecker_FourRoundsCheck(t *testing.T) { @@ -370,7 +370,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { clusterCfg := map[string]config.ClusterConfig{"c1": {}, "c2": {}} // makeBaseRounds creates shared rounds 0 and 1 data for all subtests. - // c1 produces upstream data, c2 receives matching downstream from c1. + // c1 produces locally-written data, c2 receives matching replicated data from c1. 
makeBaseRounds := func() [2]map[string]types.TimeWindowData { return [2]map[string]types.TimeWindowData{ // Round 0: [0, 100] @@ -431,7 +431,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { checker := NewDataChecker(ctx, clusterCfg, nil, nil) base := makeBaseRounds() - // Round 2: c1 has upstream pk=3 but c2 has NO matching downstream + // Round 2: c1 has locally-written pk=3 but c2 has NO matching replicated data round2 := map[string]types.TimeWindowData{ "c1": makeTWData(200, 300, map[string]uint64{"c2": 240}, makeContent(makeCanalJSON(3, 250, 0, "c"))), @@ -453,16 +453,15 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { } require.True(t, lastReport.NeedFlush()) - // c1 should detect data loss: pk=3 (commitTs=250) missing in c2's downstream + // c1 should detect data loss: pk=3 (commitTs=250) missing in c2's replicated data c1Report := lastReport.ClusterReports["c1"] require.NotNil(t, c1Report) require.Contains(t, c1Report.TableFailureItems, defaultSchemaKey) tableItems := c1Report.TableFailureItems[defaultSchemaKey] require.Len(t, tableItems.DataLossItems, 1) - require.Equal(t, "c2", tableItems.DataLossItems[0].DownstreamClusterID) + require.Equal(t, "c2", tableItems.DataLossItems[0].PeerClusterID) require.Equal(t, uint64(0), tableItems.DataLossItems[0].OriginTS) require.Equal(t, uint64(250), tableItems.DataLossItems[0].CommitTS) - require.False(t, tableItems.DataLossItems[0].Inconsistent) // c2 should have no issues c2Report := lastReport.ClusterReports["c2"] require.Empty(t, c2Report.TableFailureItems) @@ -473,7 +472,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { checker := NewDataChecker(ctx, clusterCfg, nil, nil) base := makeBaseRounds() - // Round 2: c2 has downstream for pk=3 but with wrong column value + // Round 2: c2 has replicated data for pk=3 but with wrong column value round2 := map[string]types.TimeWindowData{ "c1": makeTWData(200, 300, map[string]uint64{"c2": 240}, makeContent(makeCanalJSON(3, 250, 0, "c"))), @@ -499,10 
+498,14 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { c1Report := lastReport.ClusterReports["c1"] require.Contains(t, c1Report.TableFailureItems, defaultSchemaKey) tableItems := c1Report.TableFailureItems[defaultSchemaKey] - require.Len(t, tableItems.DataLossItems, 1) - require.Equal(t, "c2", tableItems.DataLossItems[0].DownstreamClusterID) - require.Equal(t, uint64(250), tableItems.DataLossItems[0].CommitTS) - require.True(t, tableItems.DataLossItems[0].Inconsistent) // data inconsistent, not pure data loss + require.Empty(t, tableItems.DataLossItems) + require.Len(t, tableItems.DataInconsistentItems, 1) + require.Equal(t, "c2", tableItems.DataInconsistentItems[0].PeerClusterID) + require.Equal(t, uint64(250), tableItems.DataInconsistentItems[0].CommitTS) + require.Len(t, tableItems.DataInconsistentItems[0].InconsistentColumns, 1) + require.Equal(t, "val", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Column) + require.Equal(t, "c", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Local) + require.Equal(t, "WRONG", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Replicated) }) t.Run("data redundant detected", func(t *testing.T) { @@ -516,8 +519,8 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { "c2": makeTWData(200, 300, nil, makeContent(makeCanalJSON(3, 260, 250, "c"))), } - // Round 3: c2 has an extra downstream pk=99 (originTs=330) that doesn't match - // any upstream record in c1 + // Round 3: c2 has an extra replicated pk=99 (originTs=330) that doesn't match + // any locally-written record in c1 round3 := map[string]types.TimeWindowData{ "c1": makeTWData(300, 400, map[string]uint64{"c2": 380}, makeContent(makeCanalJSON(4, 350, 0, "d"))), @@ -540,7 +543,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { // c1 should have no data loss c1Report := lastReport.ClusterReports["c1"] require.Empty(t, c1Report.TableFailureItems) - // c2 should detect data redundant: pk=99 has no matching upstream in c1 + // c2 
should detect data redundant: pk=99 has no matching locally-written record in c1 c2Report := lastReport.ClusterReports["c2"] require.Contains(t, c2Report.TableFailureItems, defaultSchemaKey) tableItems := c2Report.TableFailureItems[defaultSchemaKey] @@ -560,8 +563,8 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { "c2": makeTWData(200, 300, nil, makeContent(makeCanalJSON(3, 260, 250, "c"))), } - // Round 3: c1 has upstream pk=5 (commitTs=350, compareTs=350) and - // downstream pk=5 from c2 (commitTs=370, originTs=310, compareTs=310). + // Round 3: c1 has locally-written pk=5 (commitTs=350, compareTs=350) and + // replicated pk=5 from c2 (commitTs=370, originTs=310, compareTs=310). // Since 350 >= 310 with commitTs 350 < 370, this is an LWW violation. // c2 also has matching records to avoid data loss/redundant noise. round3 := map[string]types.TimeWindowData{ @@ -595,7 +598,7 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { require.Equal(t, uint64(310), c1TableItems.LWWViolationItems[0].OriginTS) require.Equal(t, uint64(370), c1TableItems.LWWViolationItems[0].CommitTS) // c2 should have no LWW violation (its records are ordered correctly: - // upstream commitTs=310 compareTs=310, downstream commitTs=360 compareTs=350, 310 < 350) + // locally-written commitTs=310 compareTs=310, replicated commitTs=360 compareTs=350, 310 < 350) c2Report := lastReport.ClusterReports["c2"] if c2TableItems, ok := c2Report.TableFailureItems[defaultSchemaKey]; ok { require.Empty(t, c2TableItems.LWWViolationItems) diff --git a/cmd/multi-cluster-consistency-checker/config/config.example.toml b/cmd/multi-cluster-consistency-checker/config/config.example.toml index 794aaae2f8..b5430add8e 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.example.toml +++ b/cmd/multi-cluster-consistency-checker/config/config.example.toml @@ -22,7 +22,7 @@ data-dir = "/tmp/multi-cluster-consistency-checker-data" s3-sink-uri = "s3://bucket-name/cluster1/" s3-changefeed-id = 
"s3-changefeed-id-1" # security-config = { ca-path = "ca.crt", cert-path = "cert.crt", key-path = "key.crt" } - [clusters.cluster1.downstream-cluster-changefeed-config] + [clusters.cluster1.peer-cluster-changefeed-config] cluster2 = { changefeed-id = "active-active-changefeed-id-from-cluster1-to-cluster2" } # Second cluster configuration @@ -31,7 +31,7 @@ data-dir = "/tmp/multi-cluster-consistency-checker-data" s3-sink-uri = "s3://bucket-name/cluster2/" s3-changefeed-id = "s3-changefeed-id-2" # security-config = { ca-path = "ca.crt", cert-path = "cert.crt", key-path = "key.crt" } - [clusters.cluster2.downstream-cluster-changefeed-config] + [clusters.cluster2.peer-cluster-changefeed-config] cluster1 = { changefeed-id = "active-active-changefeed-id-from-cluster2-to-cluster1" } # Third cluster configuration (optional) @@ -41,6 +41,6 @@ data-dir = "/tmp/multi-cluster-consistency-checker-data" # s3-sink-uri = "s3://bucket-name/cluster3/" # s3-changefeed-id = "s3-changefeed-id-3" # security-config = { ca-path = "ca.crt", cert-path = "cert.crt", key-path = "key.crt" } - # [clusters.cluster3.downstream-cluster-changefeed-config] + # [clusters.cluster3.peer-cluster-changefeed-config] # cluster1 = { changefeed-id = "active-active-changefeed-id-from-cluster3-to-cluster1" } # cluster2 = { changefeed-id = "active-active-changefeed-id-from-cluster3-to-cluster2" } diff --git a/cmd/multi-cluster-consistency-checker/config/config.go b/cmd/multi-cluster-consistency-checker/config/config.go index b76d223201..ea648d756b 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.go +++ b/cmd/multi-cluster-consistency-checker/config/config.go @@ -30,14 +30,17 @@ type Config struct { Clusters map[string]ClusterConfig `toml:"clusters" json:"clusters"` } +const DefaultMaxReportFiles = 1000 + // GlobalConfig contains global configuration settings type GlobalConfig struct { - LogLevel string `toml:"log-level" json:"log-level"` - DataDir string `toml:"data-dir" json:"data-dir"` - Tables 
map[string][]string `toml:"tables" json:"tables"` + LogLevel string `toml:"log-level" json:"log-level"` + DataDir string `toml:"data-dir" json:"data-dir"` + MaxReportFiles int `toml:"max-report-files" json:"max-report-files"` + Tables map[string][]string `toml:"tables" json:"tables"` } -type DownstreamClusterChangefeedConfig struct { +type PeerClusterChangefeedConfig struct { // ChangefeedID is the changefeed ID for the changefeed ChangefeedID string `toml:"changefeed-id" json:"changefeed-id"` } @@ -56,9 +59,9 @@ type ClusterConfig struct { // SecurityConfig is the security configuration for the cluster SecurityConfig security.Credential `toml:"security-config" json:"security-config"` - // DownstreamClusterChangefeedConfig is the configuration for the changefeed of the downstream cluster - // mapping from downstream cluster ID to the changefeed configuration - DownstreamClusterChangefeedConfig map[string]DownstreamClusterChangefeedConfig `toml:"downstream-cluster-changefeed-config" json:"downstream-cluster-changefeed-config"` + // PeerClusterChangefeedConfig is the configuration for the changefeed of the peer cluster + // mapping from peer cluster ID to the changefeed configuration + PeerClusterChangefeedConfig map[string]PeerClusterChangefeedConfig `toml:"peer-cluster-changefeed-config" json:"peer-cluster-changefeed-config"` } // loadConfig loads the configuration from a TOML file @@ -77,6 +80,26 @@ func LoadConfig(path string) (*Config, error) { return nil, fmt.Errorf("failed to decode config file: %w", err) } + // Apply defaults + if cfg.GlobalConfig.MaxReportFiles <= 0 { + cfg.GlobalConfig.MaxReportFiles = DefaultMaxReportFiles + } + + // Validate DataDir + if cfg.GlobalConfig.DataDir == "" { + return nil, fmt.Errorf("global: data-dir is required") + } + + // Validate Tables + if len(cfg.GlobalConfig.Tables) == 0 { + return nil, fmt.Errorf("global: at least one schema must be configured in tables") + } + for schema, tables := range cfg.GlobalConfig.Tables { + 
if len(tables) == 0 { + return nil, fmt.Errorf("global: tables[%s]: at least one table must be configured", schema) + } + } + // Validate that at least one cluster is configured if len(cfg.Clusters) == 0 { return nil, fmt.Errorf("at least one cluster must be configured") @@ -93,12 +116,12 @@ func LoadConfig(path string) (*Config, error) { if cluster.S3ChangefeedID == "" { return nil, fmt.Errorf("cluster '%s': s3-changefeed-id is required", name) } - if len(cluster.DownstreamClusterChangefeedConfig) != len(cfg.Clusters)-1 { - return nil, fmt.Errorf("cluster '%s': downstream-cluster-changefeed-config is not entirely configured", name) + if len(cluster.PeerClusterChangefeedConfig) != len(cfg.Clusters)-1 { + return nil, fmt.Errorf("cluster '%s': peer-cluster-changefeed-config is not entirely configured", name) } - for downstreamClusterID, downstreamClusterChangefeedConfig := range cluster.DownstreamClusterChangefeedConfig { - if downstreamClusterChangefeedConfig.ChangefeedID == "" { - return nil, fmt.Errorf("cluster '%s': downstream-cluster-changefeed-config[%s]: changefeed-id is required", name, downstreamClusterID) + for peerClusterID, peerClusterChangefeedConfig := range cluster.PeerClusterChangefeedConfig { + if peerClusterChangefeedConfig.ChangefeedID == "" { + return nil, fmt.Errorf("cluster '%s': peer-cluster-changefeed-config[%s]: changefeed-id is required", name, peerClusterID) } } } diff --git a/cmd/multi-cluster-consistency-checker/config/config_test.go b/cmd/multi-cluster-consistency-checker/config/config_test.go index 99dd3222e4..e57f2e6a7b 100644 --- a/cmd/multi-cluster-consistency-checker/config/config_test.go +++ b/cmd/multi-cluster-consistency-checker/config/config_test.go @@ -41,14 +41,14 @@ data-dir = "/tmp/data" pd-addrs = ["127.0.0.1:2379"] s3-sink-uri = "s3://bucket/cluster1/" s3-changefeed-id = "s3-cf-1" - [clusters.cluster1.downstream-cluster-changefeed-config] + [clusters.cluster1.peer-cluster-changefeed-config] cluster2 = { changefeed-id = 
"cf-1-to-2" } [clusters.cluster2] pd-addrs = ["127.0.0.1:2479"] s3-sink-uri = "s3://bucket/cluster2/" s3-changefeed-id = "s3-cf-2" - [clusters.cluster2.downstream-cluster-changefeed-config] + [clusters.cluster2.peer-cluster-changefeed-config] cluster1 = { changefeed-id = "cf-2-to-1" } ` err := os.WriteFile(configPath, []byte(configContent), 0644) @@ -65,8 +65,45 @@ data-dir = "/tmp/data" require.Equal(t, []string{"127.0.0.1:2379"}, cfg.Clusters["cluster1"].PDAddrs) require.Equal(t, "s3://bucket/cluster1/", cfg.Clusters["cluster1"].S3SinkURI) require.Equal(t, "s3-cf-1", cfg.Clusters["cluster1"].S3ChangefeedID) - require.Len(t, cfg.Clusters["cluster1"].DownstreamClusterChangefeedConfig, 1) - require.Equal(t, "cf-1-to-2", cfg.Clusters["cluster1"].DownstreamClusterChangefeedConfig["cluster2"].ChangefeedID) + require.Len(t, cfg.Clusters["cluster1"].PeerClusterChangefeedConfig, 1) + require.Equal(t, "cf-1-to-2", cfg.Clusters["cluster1"].PeerClusterChangefeedConfig["cluster2"].ChangefeedID) + // max-report-files not set, should default to DefaultMaxReportFiles + require.Equal(t, DefaultMaxReportFiles, cfg.GlobalConfig.MaxReportFiles) + }) + + t.Run("custom max-report-files", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +data-dir = "/tmp/data" +max-report-files = 50 + [global.tables] + schema1 = ["table1"] + +[clusters] + [clusters.cluster1] + pd-addrs = ["127.0.0.1:2379"] + s3-sink-uri = "s3://bucket/cluster1/" + s3-changefeed-id = "s3-cf-1" + [clusters.cluster1.peer-cluster-changefeed-config] + cluster2 = { changefeed-id = "cf-1-to-2" } + + [clusters.cluster2] + pd-addrs = ["127.0.0.1:2479"] + s3-sink-uri = "s3://bucket/cluster2/" + s3-changefeed-id = "s3-cf-2" + [clusters.cluster2.peer-cluster-changefeed-config] + cluster1 = { changefeed-id = "cf-2-to-1" } +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + 
cfg, err := LoadConfig(configPath) + require.NoError(t, err) + require.Equal(t, 50, cfg.GlobalConfig.MaxReportFiles) }) t.Run("file not exists", func(t *testing.T) { @@ -91,6 +128,106 @@ data-dir = "/tmp/data" require.Contains(t, err.Error(), "failed to decode config file") }) + t.Run("missing data-dir", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" + +[clusters] + [clusters.cluster1] + pd-addrs = ["127.0.0.1:2379"] + s3-sink-uri = "s3://bucket/cluster1/" + s3-changefeed-id = "s3-cf-1" + [clusters.cluster1.peer-cluster-changefeed-config] + cluster2 = { changefeed-id = "cf-1-to-2" } + + [clusters.cluster2] + pd-addrs = ["127.0.0.1:2479"] + s3-sink-uri = "s3://bucket/cluster2/" + s3-changefeed-id = "s3-cf-2" + [clusters.cluster2.peer-cluster-changefeed-config] + cluster1 = { changefeed-id = "cf-2-to-1" } +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "data-dir is required") + }) + + t.Run("missing tables", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +data-dir = "/tmp/data" + +[clusters] + [clusters.cluster1] + pd-addrs = ["127.0.0.1:2379"] + s3-sink-uri = "s3://bucket/cluster1/" + s3-changefeed-id = "s3-cf-1" + [clusters.cluster1.peer-cluster-changefeed-config] + cluster2 = { changefeed-id = "cf-1-to-2" } + + [clusters.cluster2] + pd-addrs = ["127.0.0.1:2479"] + s3-sink-uri = "s3://bucket/cluster2/" + s3-changefeed-id = "s3-cf-2" + [clusters.cluster2.peer-cluster-changefeed-config] + cluster1 = { changefeed-id = "cf-2-to-1" } +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + 
require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "at least one schema must be configured in tables") + }) + + t.Run("empty table list in schema", func(t *testing.T) { + t.Parallel() + tmpDir := t.TempDir() + configPath := filepath.Join(tmpDir, "config.toml") + configContent := ` +[global] +log-level = "info" +data-dir = "/tmp/data" + [global.tables] + schema1 = [] + +[clusters] + [clusters.cluster1] + pd-addrs = ["127.0.0.1:2379"] + s3-sink-uri = "s3://bucket/cluster1/" + s3-changefeed-id = "s3-cf-1" + [clusters.cluster1.peer-cluster-changefeed-config] + cluster2 = { changefeed-id = "cf-1-to-2" } + + [clusters.cluster2] + pd-addrs = ["127.0.0.1:2479"] + s3-sink-uri = "s3://bucket/cluster2/" + s3-changefeed-id = "s3-cf-2" + [clusters.cluster2.peer-cluster-changefeed-config] + cluster1 = { changefeed-id = "cf-2-to-1" } +` + err := os.WriteFile(configPath, []byte(configContent), 0644) + require.NoError(t, err) + + cfg, err := LoadConfig(configPath) + require.Error(t, err) + require.Nil(t, cfg) + require.Contains(t, err.Error(), "at least one table must be configured") + }) + t.Run("no clusters", func(t *testing.T) { t.Parallel() tmpDir := t.TempDir() @@ -98,7 +235,9 @@ data-dir = "/tmp/data" configContent := ` [global] log-level = "info" -report-dir = "/tmp/reports" +data-dir = "/tmp/data" + [global.tables] + schema1 = ["table1"] ` err := os.WriteFile(configPath, []byte(configContent), 0644) require.NoError(t, err) @@ -116,7 +255,9 @@ report-dir = "/tmp/reports" configContent := ` [global] log-level = "info" -report-dir = "/tmp/reports" +data-dir = "/tmp/data" + [global.tables] + schema1 = ["table1"] [clusters] [clusters.cluster1] @@ -139,7 +280,9 @@ report-dir = "/tmp/reports" configContent := ` [global] log-level = "info" -report-dir = "/tmp/reports" +data-dir = "/tmp/data" + [global.tables] + schema1 = ["table1"] [clusters] [clusters.cluster1] @@ -162,7 +305,9 @@ report-dir = "/tmp/reports" configContent := ` [global] log-level = 
"info" -report-dir = "/tmp/reports" +data-dir = "/tmp/data" + [global.tables] + schema1 = ["table1"] [clusters] [clusters.cluster1] @@ -178,21 +323,23 @@ report-dir = "/tmp/reports" require.Contains(t, err.Error(), "s3-changefeed-id is required") }) - t.Run("incomplete downstream cluster changefeed config", func(t *testing.T) { + t.Run("incomplete replicated cluster changefeed config", func(t *testing.T) { t.Parallel() tmpDir := t.TempDir() configPath := filepath.Join(tmpDir, "config.toml") configContent := ` [global] log-level = "info" -report-dir = "/tmp/reports" +data-dir = "/tmp/data" + [global.tables] + schema1 = ["table1"] [clusters] [clusters.cluster1] pd-addrs = ["127.0.0.1:2379"] s3-sink-uri = "s3://bucket/cluster1/" s3-changefeed-id = "s3-cf-1" - [clusters.cluster1.downstream-cluster-changefeed-config] + [clusters.cluster1.peer-cluster-changefeed-config] cluster2 = { changefeed-id = "cf-1-to-2" } [clusters.cluster2] @@ -206,31 +353,33 @@ report-dir = "/tmp/reports" cfg, err := LoadConfig(configPath) require.Error(t, err) require.Nil(t, cfg) - require.Contains(t, err.Error(), "downstream-cluster-changefeed-config is not entirely configured") + require.Contains(t, err.Error(), "peer-cluster-changefeed-config is not entirely configured") }) - t.Run("missing changefeed-id in downstream config", func(t *testing.T) { + t.Run("missing changefeed-id in peer cluster config", func(t *testing.T) { t.Parallel() tmpDir := t.TempDir() configPath := filepath.Join(tmpDir, "config.toml") configContent := ` [global] log-level = "info" -report-dir = "/tmp/reports" +data-dir = "/tmp/data" + [global.tables] + schema1 = ["table1"] [clusters] [clusters.cluster1] pd-addrs = ["127.0.0.1:2379"] s3-sink-uri = "s3://bucket/cluster1/" s3-changefeed-id = "s3-cf-1" - [clusters.cluster1.downstream-cluster-changefeed-config] + [clusters.cluster1.peer-cluster-changefeed-config] cluster2 = {} [clusters.cluster2] pd-addrs = ["127.0.0.1:2479"] s3-sink-uri = "s3://bucket/cluster2/" 
s3-changefeed-id = "s3-cf-2" - [clusters.cluster2.downstream-cluster-changefeed-config] + [clusters.cluster2.peer-cluster-changefeed-config] cluster1 = { changefeed-id = "cf-2-to-1" } ` err := os.WriteFile(configPath, []byte(configContent), 0644) diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go b/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go index 5761d8845d..05900fda81 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go @@ -16,7 +16,6 @@ package consumer import ( "bytes" "context" - "fmt" "path" "slices" "strings" @@ -29,18 +28,6 @@ import ( "github.com/stretchr/testify/require" ) -// helper to build a DML file path for tests (day separator, no partition, no dispatcherID). -// Format: {schema}/{table}/{version}/{date}/CDC{idx:020d}.json -func buildDMLFilePath(schema, table string, version uint64, date string, idx uint64) string { - return fmt.Sprintf("%s/%s/%d/%s/CDC%020d.json", schema, table, version, date, idx) -} - -// helper to build a schema file path for tests. 
-// Format: {schema}/{table}/meta/schema_{version}_{checksum}.json -func buildSchemaFilePath(schema, table string, version uint64, checksum uint32) string { - return fmt.Sprintf("%s/%s/meta/schema_%d_%010d.json", schema, table, version, checksum) -} - func TestUpdateTableDMLIdxMap(t *testing.T) { t.Parallel() diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder.go b/cmd/multi-cluster-consistency-checker/decoder/decoder.go index 14f7fc4355..64d6c6eaef 100644 --- a/cmd/multi-cluster-consistency-checker/decoder/decoder.go +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder.go @@ -84,25 +84,25 @@ type Record struct { ColumnValues map[string]any } -func (r *Record) EqualDownstreamRecord(downstreamRecord *Record) bool { - if downstreamRecord == nil { +func (r *Record) EqualReplicatedRecord(replicatedRecord *Record) bool { + if replicatedRecord == nil { return false } - if r.CommitTs != downstreamRecord.OriginTs { + if r.CommitTs != replicatedRecord.OriginTs { return false } - if r.Pk != downstreamRecord.Pk { + if r.Pk != replicatedRecord.Pk { return false } - if len(r.ColumnValues) != len(downstreamRecord.ColumnValues) { + if len(r.ColumnValues) != len(replicatedRecord.ColumnValues) { return false } for columnName, columnValue := range r.ColumnValues { - downstreamColumnValue, ok := downstreamRecord.ColumnValues[columnName] + replicatedColumnValue, ok := replicatedRecord.ColumnValues[columnName] if !ok { return false } - if columnValue != downstreamColumnValue { + if columnValue != replicatedColumnValue { return false } } diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go b/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go index 152dfa35b3..c2539b1b8e 100644 --- a/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go @@ -78,16 +78,16 @@ func TestCanalJSONDecoder2(t *testing.T) { } } -func TestRecord_EqualDownstreamRecord(t *testing.T) { 
+func TestRecord_EqualReplicatedRecord(t *testing.T) { tests := []struct { name string - upstream *decoder.Record - downstream *decoder.Record + local *decoder.Record + replicated *decoder.Record expectedEqual bool }{ { name: "equal records", - upstream: &decoder.Record{ + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", ColumnValues: map[string]any{ @@ -95,7 +95,7 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { "col2": 42, }, }, - downstream: &decoder.Record{ + replicated: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, Pk: "pk1", ColumnValues: map[string]any{ @@ -106,21 +106,21 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { expectedEqual: true, }, { - name: "downstream is nil", - upstream: &decoder.Record{ + name: "replicated is nil", + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", }, - downstream: nil, + replicated: nil, expectedEqual: false, }, { name: "different CommitTs and OriginTs", - upstream: &decoder.Record{ + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", }, - downstream: &decoder.Record{ + replicated: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 200}, Pk: "pk1", }, @@ -128,11 +128,11 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { }, { name: "different primary keys", - upstream: &decoder.Record{ + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", }, - downstream: &decoder.Record{ + replicated: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, Pk: "pk2", }, @@ -140,14 +140,14 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { }, { name: "different column count", - upstream: &decoder.Record{ + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", ColumnValues: map[string]any{ "col1": "value1", }, }, - downstream: 
&decoder.Record{ + replicated: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, Pk: "pk1", ColumnValues: map[string]any{ @@ -159,14 +159,14 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { }, { name: "different column names", - upstream: &decoder.Record{ + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", ColumnValues: map[string]any{ "col1": "value1", }, }, - downstream: &decoder.Record{ + replicated: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, Pk: "pk1", ColumnValues: map[string]any{ @@ -177,14 +177,14 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { }, { name: "different column values", - upstream: &decoder.Record{ + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", ColumnValues: map[string]any{ "col1": "value1", }, }, - downstream: &decoder.Record{ + replicated: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, Pk: "pk1", ColumnValues: map[string]any{ @@ -195,12 +195,12 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { }, { name: "empty column values", - upstream: &decoder.Record{ + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", ColumnValues: map[string]any{}, }, - downstream: &decoder.Record{ + replicated: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, Pk: "pk1", ColumnValues: map[string]any{}, @@ -209,12 +209,12 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { }, { name: "nil column values", - upstream: &decoder.Record{ + local: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 100, OriginTs: 0}, Pk: "pk1", ColumnValues: nil, }, - downstream: &decoder.Record{ + replicated: &decoder.Record{ CdcVersion: types.CdcVersion{CommitTs: 101, OriginTs: 100}, Pk: "pk1", ColumnValues: nil, @@ -225,7 +225,7 @@ func TestRecord_EqualDownstreamRecord(t *testing.T) { for _, tt := range tests { 
t.Run(tt.name, func(t *testing.T) { - result := tt.upstream.EqualDownstreamRecord(tt.downstream) + result := tt.local.EqualReplicatedRecord(tt.replicated) require.Equal(t, tt.expectedEqual, result) }) } diff --git a/cmd/multi-cluster-consistency-checker/integration/integration_test.go b/cmd/multi-cluster-consistency-checker/integration/integration_test.go index dc8eede3a5..236ebb2568 100644 --- a/cmd/multi-cluster-consistency-checker/integration/integration_test.go +++ b/cmd/multi-cluster-consistency-checker/integration/integration_test.go @@ -109,11 +109,11 @@ func maxRightBoundary(twData map[string]types.TimeWindowData) uint64 { // The test architecture simulates a 2-cluster active-active setup: // -// c1 (upstream) ──CDC──> c2 (downstream) -// c2 (upstream) ──CDC──> c1 (downstream) +// c1 (locally-written records) ──CDC──> c2 (replicated records) +// c2 (locally-written records) ──CDC──> c1 (replicated records) // -// Each cluster writes upstream data (originTs=0) and receives downstream -// replicated data from the other cluster (originTs>0). +// Each cluster writes locally-written records (originTs=0) and receives replicated +// records from the other cluster (originTs>0). // // The checker needs 3 warm-up rounds before it starts checking (checkableRound >= 3). // Data written in round 0 is tracked by the S3 consumer but not downloaded @@ -124,7 +124,7 @@ func maxRightBoundary(twData map[string]types.TimeWindowData) uint64 { // within the current time window (leftBoundary, rightBoundary]. // // TestIntegration_AllConsistent verifies that no errors are reported -// when all upstream data has matching downstream records. +// when all locally-written records have matching replicated records. 
func TestIntegration_AllConsistent(t *testing.T) { t.Parallel() env := setupEnv(t) @@ -134,9 +134,9 @@ func TestIntegration_AllConsistent(t *testing.T) { for round := 0; round < 6; round++ { cts := prevMaxRB + 1 - // c1: upstream write (originTs=0) + // c1: locally-written records write (originTs=0) c1 := MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) - // c2: downstream replicated from c1 (originTs = c1's commitTs) + // c2: replicated records replicated from c1 (originTs = c1's commitTs) c2 := MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) result := env.executeRound(t, c1, c2) @@ -160,18 +160,18 @@ func TestIntegration_AllConsistent(t *testing.T) { } } -// TestIntegration_AllConsistent_CrossRoundDownstream verifies that the checker -// treats data as consistent when an upstream record's commitTs exceeds the -// round's checkpointTs, and the matching downstream only appears in the next +// TestIntegration_AllConsistent_CrossRoundReplicatedRecords verifies that the checker +// treats data as consistent when a locally-written record's commitTs exceeds the +// round's checkpointTs, and the matching replicated records only appears in the next // round. // -// This occurs when upstream commits happen late in the time window, after +// This occurs when locally-written records commitTs happen late in the time window, after // the checkpoint has already been determined. For TW[2], records with // commitTs > checkpointTs are deferred (skipped). In the next round they // become TW[1], where the check condition is commitTs > checkpointTs (checked), -// and the downstream is searched in TW[1] + TW[2] — finding the match in +// and the replicated records are searched in TW[1] + TW[2] — finding the match in // the current round's TW[2]. 
-func TestIntegration_AllConsistent_CrossRoundDownstream(t *testing.T) { +func TestIntegration_AllConsistent_CrossRoundReplicatedRecords(t *testing.T) { t.Parallel() env := setupEnv(t) defer env.mc.Close() @@ -185,7 +185,7 @@ func TestIntegration_AllConsistent_CrossRoundDownstream(t *testing.T) { // Using ComposeTS(250, 0) = 65536000 lands safely between them. crossRoundOffset := uint64(250 << 18) // ComposeTS(250, 0) = 65536000 - var lateUpstreamCommitTs uint64 + var lateLocallyWrittenRecordsCommitTs uint64 for round := 0; round < 7; round++ { cts := prevMaxRB + 1 @@ -194,27 +194,27 @@ func TestIntegration_AllConsistent_CrossRoundDownstream(t *testing.T) { switch round { case 4: - // Round N: c1 upstream has two records: + // Round N: c1 local has two records: // pk=round+1 normal commitTs (checked in this round's TW[2]) // pk=100 large commitTs > checkpointTs // (deferred in TW[2], checked via TW[1] next round) - // c2 downstream only matches pk=round+1. - lateUpstreamCommitTs = prevMaxRB + crossRoundOffset + // c2 replicated only matches pk=round+1. + lateLocallyWrittenRecordsCommitTs = prevMaxRB + crossRoundOffset c1 = MakeContent( MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round)), - MakeCanalJSON(100, lateUpstreamCommitTs, 0, "late"), + MakeCanalJSON(100, lateLocallyWrittenRecordsCommitTs, 0, "late"), ) c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) case 5: - // Round N+1: c2 now includes the downstream for pk=100. + // Round N+1: c2 now includes the replicated record for pk=100. // The checker evaluates TW[1] (= round 4), finds pk=100 with // commitTs > checkpointTs, and searches c2's TW[1] + TW[2]. - // pk=100's matching downstream is in c2's TW[2] (this round). + // pk=100's matching replicated record is in c2's TW[2] (this round). 
c1 = MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) c2 = MakeContent( MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round)), - MakeCanalJSON(100, cts+2, lateUpstreamCommitTs, "late"), + MakeCanalJSON(100, cts+2, lateLocallyWrittenRecordsCommitTs, "late"), ) default: @@ -236,12 +236,12 @@ func TestIntegration_AllConsistent_CrossRoundDownstream(t *testing.T) { // Verify the late commitTs falls between checkpointTs and rightBoundary. c1TW := result.twData["c1"].TimeWindow cpTs := c1TW.CheckpointTs["c2"] - require.Greater(t, lateUpstreamCommitTs, cpTs, - "lateUpstreamCommitTs must be > checkpointTs for cross-round detection") - require.LessOrEqual(t, lateUpstreamCommitTs, c1TW.RightBoundary, - "lateUpstreamCommitTs must be <= rightBoundary to stay in this time window") + require.Greater(t, lateLocallyWrittenRecordsCommitTs, cpTs, + "lateLocallyWrittenRecordsCommitTs must be > checkpointTs for cross-round detection") + require.LessOrEqual(t, lateLocallyWrittenRecordsCommitTs, c1TW.RightBoundary, + "lateLocallyWrittenRecordsCommitTs must be <= rightBoundary to stay in this time window") t.Logf("Round 4 verification: lateCommitTs=%d, checkpointTs=%d, rightBoundary=%d", - lateUpstreamCommitTs, cpTs, c1TW.RightBoundary) + lateLocallyWrittenRecordsCommitTs, cpTs, c1TW.RightBoundary) } if round >= 3 { @@ -256,34 +256,34 @@ func TestIntegration_AllConsistent_CrossRoundDownstream(t *testing.T) { } } -// TestIntegration_AllConsistent_LWWSkippedDownstream verifies that no errors -// are reported when a downstream record is "LWW-skipped" during data-loss +// TestIntegration_AllConsistent_LWWSkippedReplicatedRecords verifies that no errors +// are reported when a replicated record is "LWW-skipped" during data-loss // detection, combined with cross-time-window matching. 
// -// pk=100: single-cluster overwrite (c1 writes old+new, c2 only has newer downstream) +// pk=100: single-cluster overwrite (c1 writes old+new, c2 only has newer replicated records) // -// Round N: c1 upstream pk=100 × 2 (commitTs=A, B; both > checkpointTs) -// c2 has NO downstream for pk=100 -// Round N+1: c2 downstream pk=100 (originTs=B, matches newer upstream only) -// → old upstream LWW-skipped (c2 downstream compareTs=B >= A) +// Round N: c1 locally-written records pk=100 × 2 (commitTs=A, B; both > checkpointTs) +// c2 has NO replicated records for pk=100 +// Round N+1: c2 replicated records pk=100 (originTs=B, matches newer locally-written records only) +// → old locally-written records LWW-skipped (c2 replicated records compareTs=B >= A) // // pk=200: bidirectional write (c1 and c2 both write the same pk) // -// Round N: c1 upstream pk=200 (commitTs=A, deferred) -// Round N+1: c1 upstream pk=200 (commitTs=E, newer), c1 downstream pk=200 (originTs=D, from c2) -// c2 upstream pk=200 (commitTs=D, D < E), c2 downstream pk=200 (originTs=E, from c1) +// Round N: c1 locally-written records pk=200 (commitTs=A, deferred) +// Round N+1: c1 locally-written records pk=200 (commitTs=E, newer), c1 replicated records pk=200 (originTs=D, from c2) +// c2 replicated records pk=200 (commitTs=D, D < E), c2 replicated records pk=200 (originTs=E, from c1) // -// Key constraint: c1 upstream commitTs (E) > c2 upstream commitTs (D). -// This ensures that on c2, the downstream (compareTs=E) > upstream (compareTs=D), +// Key constraint: c1 local commitTs (E) > c2 local commitTs (D). +// This ensures that on c2, the replicated (compareTs=E) > local (compareTs=D), // so the LWW violation checker sees monotonically increasing compareTs. 
// // c1 data loss for old pk=200 (commitTs=A): -// → c2 downstream has originTs=E, compareTs=E >= A → LWW-skipped ✓ +// → c2 replicated has originTs=E, compareTs=E >= A → LWW-skipped ✓ // c1 data loss for new pk=200 (commitTs=E): -// → c2 downstream has originTs=E → exact match ✓ -// c2 data loss for c2 upstream pk=200 (commitTs=D): -// → c1 downstream has originTs=D → exact match ✓ -func TestIntegration_AllConsistent_LWWSkippedDownstream(t *testing.T) { +// → c2 replicated has originTs=E → exact match ✓ +// c2 data loss for c2 local pk=200 (commitTs=D): +// → c1 replicated has originTs=D → exact match ✓ +func TestIntegration_AllConsistent_LWWSkippedReplicatedRecords(t *testing.T) { t.Parallel() env := setupEnv(t) defer env.mc.Close() @@ -305,8 +305,8 @@ func TestIntegration_AllConsistent_LWWSkippedDownstream(t *testing.T) { switch round { case 4: - // Round N: c1 upstream writes pk=100 twice + pk=200 once, all > checkpointTs. - // c2 has NO downstream for pk=100 or pk=200; they arrive next round. + // Round N: c1 local writes pk=100 twice + pk=200 once, all > checkpointTs. + // c2 has NO replicated record for pk=100 or pk=200; they arrive next round. oldCommitTs = prevMaxRB + crossRoundOffset newCommitTs = oldCommitTs + 5 c1 = MakeContent( @@ -320,24 +320,24 @@ func TestIntegration_AllConsistent_LWWSkippedDownstream(t *testing.T) { ) case 5: - // Round N+1: downstream data arrives for both pk=100 and pk=200. + // Round N+1: replicated data arrives for both pk=100 and pk=200. // - // pk=200 bidirectional: c1 upstream at cts+5 (> c2 upstream at cts+2) + // pk=200 bidirectional: c1 local at cts+5 (> c2 local at cts+2) // ensures c2's LWW check sees increasing compareTs. 
- // c1: downstream(commitTs=cts+4, originTs=cts+2) then upstream(commitTs=cts+5) + // c1: replicated(commitTs=cts+4, originTs=cts+2) then local(commitTs=cts+5) // → compareTs order: cts+2 < cts+5 ✓ - // c2: upstream(commitTs=cts+2) then downstream(commitTs=cts+6, originTs=cts+5) + // c2: local(commitTs=cts+2) then replicated(commitTs=cts+6, originTs=cts+5) // → compareTs order: cts+2 < cts+5 ✓ c1 = MakeContent( MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round)), - MakeCanalJSON(200, cts+4, cts+2, "pk200_c2"), // c1 downstream pk=200 from c2 - MakeCanalJSON(200, cts+5, 0, "pk200_c1"), // c1 upstream pk=200 (newer) + MakeCanalJSON(200, cts+4, cts+2, "pk200_c2"), // c1 replicated pk=200 from c2 + MakeCanalJSON(200, cts+5, 0, "pk200_c1"), // c1 local pk=200 (newer) ) c2 = MakeContent( MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round)), MakeCanalJSON(100, cts+2, newCommitTs, "new_write"), - MakeCanalJSON(200, cts+2, 0, "pk200_c2"), // c2 upstream pk=200 - MakeCanalJSON(200, cts+6, cts+5, "pk200_c1"), // c2 downstream pk=200 from c1 + MakeCanalJSON(200, cts+2, 0, "pk200_c2"), // c2 local pk=200 + MakeCanalJSON(200, cts+6, cts+5, "pk200_c1"), // c2 replicated pk=200 from c1 ) default: @@ -365,7 +365,7 @@ func TestIntegration_AllConsistent_LWWSkippedDownstream(t *testing.T) { if round >= 3 { require.Len(t, result.report.ClusterReports, 2, "round %d", round) require.False(t, result.report.NeedFlush(), - "round %d: cross-round LWW-skipped downstream should not cause errors", round) + "round %d: cross-round LWW-skipped replicated should not cause errors", round) for clusterID, cr := range result.report.ClusterReports { require.Empty(t, cr.TableFailureItems, "round %d, cluster %s: should have no failures", round, clusterID) @@ -375,7 +375,7 @@ func TestIntegration_AllConsistent_LWWSkippedDownstream(t *testing.T) { } // TestIntegration_DataLoss verifies that the checker detects data loss -// when an upstream record has no matching downstream in the other 
cluster. +// when a locally-written record has no matching replicated record in the other cluster. func TestIntegration_DataLoss(t *testing.T) { t.Parallel() env := setupEnv(t) @@ -387,16 +387,16 @@ func TestIntegration_DataLoss(t *testing.T) { for round := 0; round < 6; round++ { cts := prevMaxRB + 1 - // c1 always produces upstream data + // c1 always produces local data c1 := MakeContent(MakeCanalJSON(round+1, cts, 0, fmt.Sprintf("v%d", round))) var c2 []byte if round == 4 { - // Round 4: c2 has NO matching downstream → data loss expected + // Round 4: c2 has NO matching replicated record → data loss expected // (round 4's data is checked in the same round since checkableRound >= 3) c2 = nil } else { - // Normal: c2 has matching downstream + // Normal: c2 has matching replicated record c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) } @@ -414,8 +414,7 @@ func TestIntegration_DataLoss(t *testing.T) { dataLossDetected = true // Verify the data loss item for _, item := range items.DataLossItems { - require.Equal(t, "c2", item.DownstreamClusterID) - require.False(t, item.Inconsistent, "should be pure data loss, not inconsistency") + require.Equal(t, "c2", item.PeerClusterID) } } } @@ -427,8 +426,8 @@ func TestIntegration_DataLoss(t *testing.T) { } // TestIntegration_DataInconsistent verifies that the checker detects data -// inconsistency when a downstream record has different column values -// from the upstream record. +// inconsistency when a replicated record has different column values +// from the locally-written record. 
func TestIntegration_DataInconsistent(t *testing.T) { t.Parallel() env := setupEnv(t) @@ -444,7 +443,7 @@ func TestIntegration_DataInconsistent(t *testing.T) { var c2 []byte if round == 4 { - // Round 4: c2 has downstream with WRONG column value + // Round 4: c2 has replicated record with WRONG column value c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, "WRONG_VALUE")) } else { c2 = MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) @@ -459,12 +458,10 @@ func TestIntegration_DataInconsistent(t *testing.T) { c1Report := result.report.ClusterReports["c1"] if c1Report != nil { if items, ok := c1Report.TableFailureItems[schemaKey]; ok { - for _, item := range items.DataLossItems { - if item.Inconsistent { - t.Logf("Round %d: detected data inconsistency: %+v", round, item) - inconsistentDetected = true - require.Equal(t, "c2", item.DownstreamClusterID) - } + for _, item := range items.DataInconsistentItems { + t.Logf("Round %d: detected data inconsistency: %+v", round, item) + inconsistentDetected = true + require.Equal(t, "c2", item.PeerClusterID) } } } @@ -475,7 +472,7 @@ func TestIntegration_DataInconsistent(t *testing.T) { } // TestIntegration_DataRedundant verifies that the checker detects redundant -// downstream data that has no matching upstream record. +// replicated data that has no matching locally-written record. func TestIntegration_DataRedundant(t *testing.T) { t.Parallel() env := setupEnv(t) @@ -491,9 +488,9 @@ func TestIntegration_DataRedundant(t *testing.T) { c2 := MakeContent(MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round))) if round == 4 { - // Round 4: c2 has an EXTRA downstream record (pk=999) with a fake - // originTs that doesn't match any upstream commitTs in c1. - fakeOriginTs := cts - 5 // Doesn't match any c1 upstream commitTs + // Round 4: c2 has an EXTRA replicated record (pk=999) with a fake + // originTs that doesn't match any c1 local commitTs. 
+ fakeOriginTs := cts - 5 // Doesn't match any c1 local commitTs c2 = MakeContent( MakeCanalJSON(round+1, cts+1, cts, fmt.Sprintf("v%d", round)), MakeCanalJSON(999, cts+2, fakeOriginTs, "extra"), @@ -546,7 +543,7 @@ func TestIntegration_LWWViolation(t *testing.T) { MakeCanalJSON(5, cts, 0, "original"), MakeCanalJSON(5, cts+2, cts-10, "replicated"), ) - // c2: provide matching downstream to avoid data loss noise + // c2: provide matching replicated record to avoid data loss noise c2 = MakeContent( MakeCanalJSON(5, cts+1, cts, "original"), ) @@ -587,9 +584,9 @@ func TestIntegration_LWWViolation(t *testing.T) { // // Timeline: // -// Round N: c1 upstream pk=50 (originTs=0, compareTs=A) → cached +// Round N: c1 local pk=50 (originTs=0, compareTs=A) → cached // Round N+1: no pk=50 data → cache ages (prev 1→2) -// Round N+2: c1 downstream pk=50 (originTs=B 0 { + sb.WriteString(", inconsistent columns: [") + for i, col := range item.InconsistentColumns { + if i > 0 { + sb.WriteString("; ") + } + sb.WriteString(col.String()) + } + sb.WriteString("]") } - return fmt.Sprintf("downstream cluster: %s, pk: %s, origin ts: %d, commit ts: %d, type: %s", item.DownstreamClusterID, item.PK, item.OriginTS, item.CommitTS, errType) + return sb.String() } type DataRedundantItem struct { @@ -62,16 +92,18 @@ func (item *LWWViolationItem) String() string { } type TableFailureItems struct { - DataLossItems []DataLossItem `json:"data_loss_items"` // data loss items - DataRedundantItems []DataRedundantItem `json:"data_redundant_items"` // data redundant items - LWWViolationItems []LWWViolationItem `json:"lww_violation_items"` // lww violation items + DataLossItems []DataLossItem `json:"data_loss_items"` // data loss items + DataInconsistentItems []DataInconsistentItem `json:"data_inconsistent_items"` // data inconsistent items + DataRedundantItems []DataRedundantItem `json:"data_redundant_items"` // data redundant items + LWWViolationItems []LWWViolationItem `json:"lww_violation_items"` // lww 
violation items } func NewTableFailureItems() *TableFailureItems { return &TableFailureItems{ - DataLossItems: make([]DataLossItem, 0), - DataRedundantItems: make([]DataRedundantItem, 0), - LWWViolationItems: make([]LWWViolationItem, 0), + DataLossItems: make([]DataLossItem, 0), + DataInconsistentItems: make([]DataInconsistentItem, 0), + DataRedundantItems: make([]DataRedundantItem, 0), + LWWViolationItems: make([]LWWViolationItem, 0), } } @@ -94,18 +126,33 @@ func NewClusterReport(clusterID string, timeWindow types.TimeWindow) *ClusterRep } } -func (r *ClusterReport) AddDataLossItem(downstreamClusterID, schemaKey, pk string, originTS, commitTS uint64, inconsistent bool) { +func (r *ClusterReport) AddDataLossItem(peerClusterID, schemaKey, pk string, originTS, commitTS uint64) { tableFailureItems, exists := r.TableFailureItems[schemaKey] if !exists { tableFailureItems = NewTableFailureItems() r.TableFailureItems[schemaKey] = tableFailureItems } tableFailureItems.DataLossItems = append(tableFailureItems.DataLossItems, DataLossItem{ - DownstreamClusterID: downstreamClusterID, + PeerClusterID: peerClusterID, + PK: pk, + OriginTS: originTS, + CommitTS: commitTS, + }) + r.needFlush = true +} + +func (r *ClusterReport) AddDataInconsistentItem(peerClusterID, schemaKey, pk string, originTS, commitTS uint64, inconsistentColumns []InconsistentColumn) { + tableFailureItems, exists := r.TableFailureItems[schemaKey] + if !exists { + tableFailureItems = NewTableFailureItems() + r.TableFailureItems[schemaKey] = tableFailureItems + } + tableFailureItems.DataInconsistentItems = append(tableFailureItems.DataInconsistentItems, DataInconsistentItem{ + PeerClusterID: peerClusterID, PK: pk, OriginTS: originTS, CommitTS: commitTS, - Inconsistent: inconsistent, + InconsistentColumns: inconsistentColumns, }) r.needFlush = true } @@ -181,6 +228,12 @@ func (r *Report) MarshalReport() string { fmt.Fprintf(&reportMsg, " - [%s]\n", dataLossItem.String()) } } + if 
len(tableFailureItems.DataInconsistentItems) > 0 { + fmt.Fprintf(&reportMsg, " - [data inconsistent items: %d]\n", len(tableFailureItems.DataInconsistentItems)) + for _, dataInconsistentItem := range tableFailureItems.DataInconsistentItems { + fmt.Fprintf(&reportMsg, " - [%s]\n", dataInconsistentItem.String()) + } + } if len(tableFailureItems.DataRedundantItems) > 0 { fmt.Fprintf(&reportMsg, " - [data redundant items: %d]\n", len(tableFailureItems.DataRedundantItems)) for _, dataRedundantItem := range tableFailureItems.DataRedundantItems { @@ -248,8 +301,8 @@ func (c *Checkpoint) NewTimeWindowData(round uint64, timeWindowData map[string]t Round: round, ClusterInfo: make(map[string]CheckpointClusterInfo), } - for downstreamClusterID, timeWindow := range timeWindowData { - newCheckpointItem.ClusterInfo[downstreamClusterID] = CheckpointClusterInfo{ + for clusterID, timeWindow := range timeWindowData { + newCheckpointItem.ClusterInfo[clusterID] = CheckpointClusterInfo{ TimeWindow: timeWindow.TimeWindow, MaxVersion: NewSchemaTableVersionKeyFromVersionKeyMap(timeWindow.MaxVersion), } diff --git a/cmd/multi-cluster-consistency-checker/recorder/types_test.go b/cmd/multi-cluster-consistency-checker/recorder/types_test.go index 503d322f60..2ccd68f179 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types_test.go @@ -23,31 +23,66 @@ import ( func TestDataLossItem_String(t *testing.T) { t.Parallel() + item := &DataLossItem{ + PeerClusterID: "cluster-2", + PK: "pk-1", + OriginTS: 100, + CommitTS: 200, + } + s := item.String() + require.Equal(t, "peer cluster: cluster-2, pk: pk-1, origin ts: 100, commit ts: 200, type: data loss", s) +} - t.Run("data loss", func(t *testing.T) { +func TestDataInconsistentItem_String(t *testing.T) { + t.Parallel() + + t.Run("without inconsistent columns", func(t *testing.T) { t.Parallel() - item := &DataLossItem{ - DownstreamClusterID: "cluster-2", - PK: "pk-1", - 
OriginTS: 100, - CommitTS: 200, - Inconsistent: false, + item := &DataInconsistentItem{ + PeerClusterID: "cluster-3", + PK: "pk-2", + OriginTS: 300, + CommitTS: 400, } s := item.String() - require.Equal(t, "downstream cluster: cluster-2, pk: pk-1, origin ts: 100, commit ts: 200, type: data loss", s) + require.Equal(t, "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, type: data inconsistent", s) }) - t.Run("data inconsistent", func(t *testing.T) { + t.Run("with inconsistent columns", func(t *testing.T) { t.Parallel() - item := &DataLossItem{ - DownstreamClusterID: "cluster-3", - PK: "pk-2", - OriginTS: 300, - CommitTS: 400, - Inconsistent: true, + item := &DataInconsistentItem{ + PeerClusterID: "cluster-3", + PK: "pk-2", + OriginTS: 300, + CommitTS: 400, + InconsistentColumns: []InconsistentColumn{ + {Column: "col1", Local: "val_a", Replicated: "val_b"}, + {Column: "col2", Local: 100, Replicated: 200}, + }, } s := item.String() - require.Equal(t, "downstream cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, type: data inconsistent", s) + require.Equal(t, + "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, type: data inconsistent, "+ + "inconsistent columns: [column: col1, local: val_a, replicated: val_b; column: col2, local: 100, replicated: 200]", + s) + }) + + t.Run("with missing column in replicated", func(t *testing.T) { + t.Parallel() + item := &DataInconsistentItem{ + PeerClusterID: "cluster-3", + PK: "pk-2", + OriginTS: 300, + CommitTS: 400, + InconsistentColumns: []InconsistentColumn{ + {Column: "col1", Local: "val_a", Replicated: nil}, + }, + } + s := item.String() + require.Equal(t, + "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, type: data inconsistent, "+ + "inconsistent columns: [column: col1, local: val_a, replicated: ]", + s) }) } @@ -87,17 +122,38 @@ func TestClusterReport(t *testing.T) { t.Run("add data loss item sets needFlush", func(t *testing.T) { t.Parallel() cr := 
NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("downstream-1", testSchemaKey, "pk-1", 100, 200, false) + cr.AddDataLossItem("peer-cluster-1", testSchemaKey, "pk-1", 100, 200) require.Len(t, cr.TableFailureItems, 1) require.Contains(t, cr.TableFailureItems, testSchemaKey) tableItems := cr.TableFailureItems[testSchemaKey] require.Len(t, tableItems.DataLossItems, 1) require.True(t, cr.needFlush) - require.Equal(t, "downstream-1", tableItems.DataLossItems[0].DownstreamClusterID) + require.Equal(t, "peer-cluster-1", tableItems.DataLossItems[0].PeerClusterID) require.Equal(t, "pk-1", tableItems.DataLossItems[0].PK) require.Equal(t, uint64(100), tableItems.DataLossItems[0].OriginTS) require.Equal(t, uint64(200), tableItems.DataLossItems[0].CommitTS) - require.False(t, tableItems.DataLossItems[0].Inconsistent) + }) + + t.Run("add data inconsistent item sets needFlush", func(t *testing.T) { + t.Parallel() + cr := NewClusterReport("c1", types.TimeWindow{}) + cols := []InconsistentColumn{ + {Column: "val", Local: "a", Replicated: "b"}, + } + cr.AddDataInconsistentItem("peer-cluster-2", testSchemaKey, "pk-2", 300, 400, cols) + require.Len(t, cr.TableFailureItems, 1) + require.Contains(t, cr.TableFailureItems, testSchemaKey) + tableItems := cr.TableFailureItems[testSchemaKey] + require.Len(t, tableItems.DataInconsistentItems, 1) + require.True(t, cr.needFlush) + require.Equal(t, "peer-cluster-2", tableItems.DataInconsistentItems[0].PeerClusterID) + require.Equal(t, "pk-2", tableItems.DataInconsistentItems[0].PK) + require.Equal(t, uint64(300), tableItems.DataInconsistentItems[0].OriginTS) + require.Equal(t, uint64(400), tableItems.DataInconsistentItems[0].CommitTS) + require.Len(t, tableItems.DataInconsistentItems[0].InconsistentColumns, 1) + require.Equal(t, "val", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Column) + require.Equal(t, "a", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Local) + require.Equal(t, "b", 
tableItems.DataInconsistentItems[0].InconsistentColumns[0].Replicated) }) t.Run("add data redundant item sets needFlush", func(t *testing.T) { @@ -127,13 +183,14 @@ func TestClusterReport(t *testing.T) { t.Run("add multiple items", func(t *testing.T) { t.Parallel() cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2, false) - cr.AddDataLossItem("d2", testSchemaKey, "pk-2", 3, 4, true) + cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2) + cr.AddDataInconsistentItem("d2", testSchemaKey, "pk-2", 3, 4, nil) cr.AddDataRedundantItem(testSchemaKey, "pk-3", 5, 6) cr.AddLWWViolationItem(testSchemaKey, "pk-4", 7, 8, 9, 10) require.Len(t, cr.TableFailureItems, 1) tableItems := cr.TableFailureItems[testSchemaKey] - require.Len(t, tableItems.DataLossItems, 2) + require.Len(t, tableItems.DataLossItems, 1) + require.Len(t, tableItems.DataInconsistentItems, 1) require.Len(t, tableItems.DataRedundantItems, 1) require.Len(t, tableItems.LWWViolationItems, 1) }) @@ -163,7 +220,7 @@ func TestReport(t *testing.T) { t.Parallel() r := NewReport(1) cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2, false) + cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2) r.AddClusterReport("c1", cr) require.True(t, r.NeedFlush()) }) @@ -197,7 +254,7 @@ func TestReport_MarshalReport(t *testing.T) { t.Parallel() r := NewReport(1) cr := NewClusterReport("c1", tw) - cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 100, 200, false) + cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 100, 200) r.AddClusterReport("c1", cr) s := r.MarshalReport() require.Equal(t, "round: 1\n\n"+ @@ -205,7 +262,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - " - [downstream cluster: d1, pk: pk-1, origin ts: 100, commit ts: 200, type: data loss]\n\n", + " - [peer cluster: d1, pk: pk-1, origin ts: 100, commit ts: 200, 
type: data loss]\n\n", s) }) @@ -246,7 +303,7 @@ func TestReport_MarshalReport(t *testing.T) { r := NewReport(1) crEmpty := NewClusterReport("empty-cluster", tw) crFull := NewClusterReport("full-cluster", tw) - crFull.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2, false) + crFull.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2) r.AddClusterReport("empty-cluster", crEmpty) r.AddClusterReport("full-cluster", crFull) s := r.MarshalReport() @@ -255,7 +312,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - " - [downstream cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data loss]\n\n", + " - [peer cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data loss]\n\n", s) }) @@ -263,7 +320,10 @@ func TestReport_MarshalReport(t *testing.T) { t.Parallel() r := NewReport(10) cr := NewClusterReport("c1", tw) - cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2, true) + cr.AddDataLossItem("d0", testSchemaKey, "pk-0", 0, 1) + cr.AddDataInconsistentItem("d1", testSchemaKey, "pk-1", 1, 2, []InconsistentColumn{ + {Column: "val", Local: "x", Replicated: "y"}, + }) cr.AddDataRedundantItem(testSchemaKey, "pk-2", 3, 4) cr.AddLWWViolationItem(testSchemaKey, "pk-3", 5, 6, 7, 8) r.AddClusterReport("c1", cr) @@ -273,7 +333,9 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - " - [downstream cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data inconsistent]\n"+ + " - [peer cluster: d0, pk: pk-0, origin ts: 0, commit ts: 1, type: data loss]\n"+ + " - [data inconsistent items: 1]\n"+ + " - [peer cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data inconsistent, inconsistent columns: [column: val, local: x, replicated: y]]\n"+ " - [data redundant items: 1]\n"+ " - [pk: pk-2, origin ts: 3, commit ts: 4]\n"+ " - [lww violation items: 1]\n"+ diff --git 
a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index b118d8187c..30f1c33996 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -15,6 +15,8 @@ package main import ( "context" + "fmt" + "strings" "time" "github.com/pingcap/log" @@ -23,17 +25,19 @@ import ( "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/recorder" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/watcher" + "github.com/pingcap/ticdc/pkg/common" + cdcconfig "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/etcd" "github.com/pingcap/ticdc/pkg/security" - putil "github.com/pingcap/ticdc/pkg/util" + "github.com/pingcap/ticdc/pkg/util" pd "github.com/tikv/pd/client" pdopt "github.com/tikv/pd/client/opt" "go.uber.org/zap" "google.golang.org/grpc" ) -func runTask(ctx context.Context, cfg *config.Config) error { +func runTask(ctx context.Context, cfg *config.Config, dryRun bool) error { checkpointWatchers, s3Watchers, pdClients, etcdClients, err := initClients(ctx, cfg) if err != nil { return errors.Trace(err) @@ -41,7 +45,12 @@ func runTask(ctx context.Context, cfg *config.Config) error { // Ensure cleanup happens even if there's an error defer cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) - recorder, err := recorder.NewRecorder(cfg.GlobalConfig.DataDir, cfg.Clusters) + if dryRun { + log.Info("Dry-run mode: config validation and connectivity check passed, exiting") + return nil + } + + recorder, err := recorder.NewRecorder(cfg.GlobalConfig.DataDir, cfg.Clusters, cfg.GlobalConfig.MaxReportFiles) if err != nil { return errors.Trace(err) } @@ -98,14 +107,20 @@ func initClients(ctx context.Context, cfg *config.Config) ( } etcdClients[clusterID] = etcdClient - upstreamCheckpointWatchers := make(map[string]watcher.Watcher) - 
for downstreamClusterID, downstreamClusterChangefeedConfig := range clusterConfig.DownstreamClusterChangefeedConfig { - checkpointWatcher := watcher.NewCheckpointWatcher(ctx, clusterID, downstreamClusterID, downstreamClusterChangefeedConfig.ChangefeedID, etcdClient) - upstreamCheckpointWatchers[downstreamClusterID] = checkpointWatcher + clusterCheckpointWatchers := make(map[string]watcher.Watcher) + for peerClusterID, peerClusterChangefeedConfig := range clusterConfig.PeerClusterChangefeedConfig { + checkpointWatcher := watcher.NewCheckpointWatcher(ctx, clusterID, peerClusterID, peerClusterChangefeedConfig.ChangefeedID, etcdClient) + clusterCheckpointWatchers[peerClusterID] = checkpointWatcher } - checkpointWatchers[clusterID] = upstreamCheckpointWatchers + checkpointWatchers[clusterID] = clusterCheckpointWatchers - s3Storage, err := putil.GetExternalStorageWithDefaultTimeout(ctx, clusterConfig.S3SinkURI) + // Validate s3 changefeed sink config from etcd + if err := validateS3ChangefeedSinkConfig(ctx, etcdClient, clusterID, clusterConfig.S3ChangefeedID); err != nil { + cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) + return nil, nil, nil, nil, errors.Trace(err) + } + + s3Storage, err := util.GetExternalStorageWithDefaultTimeout(ctx, clusterConfig.S3SinkURI) if err != nil { // Clean up already created clients before returning error cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) @@ -123,6 +138,69 @@ func initClients(ctx context.Context, cfg *config.Config) ( return checkpointWatchers, s3Watchers, pdClients, etcdClients, nil } +// validateS3ChangefeedSinkConfig fetches the changefeed info from etcd and validates that: +// 1. The protocol must be canal-json +// 2. The date separator must be "day" +// 3. 
The file index width must be DefaultFileIndexWidth +func validateS3ChangefeedSinkConfig(ctx context.Context, etcdClient *etcd.CDCEtcdClientImpl, clusterID string, s3ChangefeedID string) error { + displayName := common.NewChangeFeedDisplayName(s3ChangefeedID, "default") + cfInfo, err := etcdClient.GetChangeFeedInfo(ctx, displayName) + if err != nil { + return errors.Annotate(err, fmt.Sprintf("failed to get changefeed info for s3 changefeed %s in cluster %s", s3ChangefeedID, clusterID)) + } + + if cfInfo.Config == nil || cfInfo.Config.Sink == nil { + return fmt.Errorf("cluster %s: s3 changefeed %s has no sink configuration", clusterID, s3ChangefeedID) + } + + sinkConfig := cfInfo.Config.Sink + + // 1. Validate protocol must be canal-json + protocolStr := strings.ToLower(util.GetOrZero(sinkConfig.Protocol)) + if protocolStr == "" { + return fmt.Errorf("cluster %s: s3 changefeed %s has no protocol configured in sink config", clusterID, s3ChangefeedID) + } + protocol, err := cdcconfig.ParseSinkProtocolFromString(protocolStr) + if err != nil { + return errors.Annotate(err, fmt.Sprintf("cluster %s: s3 changefeed %s has invalid protocol", clusterID, s3ChangefeedID)) + } + if protocol != cdcconfig.ProtocolCanalJSON { + return fmt.Errorf("cluster %s: s3 changefeed %s protocol is %q, but only %q is supported", + clusterID, s3ChangefeedID, protocolStr, cdcconfig.ProtocolCanalJSON.String()) + } + + // 2. 
Validate date separator must be "day" + dateSeparatorStr := util.GetOrZero(sinkConfig.DateSeparator) + if dateSeparatorStr == "" { + dateSeparatorStr = cdcconfig.DateSeparatorNone.String() + } + var dateSep cdcconfig.DateSeparator + if err := dateSep.FromString(dateSeparatorStr); err != nil { + return errors.Annotate(err, fmt.Sprintf("cluster %s: s3 changefeed %s has invalid date-separator %q", clusterID, s3ChangefeedID, dateSeparatorStr)) + } + if dateSep != cdcconfig.DateSeparatorDay { + return fmt.Errorf("cluster %s: s3 changefeed %s date-separator is %q, but only %q is supported", + clusterID, s3ChangefeedID, dateSep.String(), cdcconfig.DateSeparatorDay.String()) + } + + // 3. Validate file index width must be DefaultFileIndexWidth + fileIndexWidth := util.GetOrZero(sinkConfig.FileIndexWidth) + if fileIndexWidth != cdcconfig.DefaultFileIndexWidth { + return fmt.Errorf("cluster %s: s3 changefeed %s file-index-width is %d, but only %d is supported", + clusterID, s3ChangefeedID, fileIndexWidth, cdcconfig.DefaultFileIndexWidth) + } + + log.Info("Validated s3 changefeed sink config from etcd", + zap.String("clusterID", clusterID), + zap.String("s3ChangefeedID", s3ChangefeedID), + zap.String("protocol", protocolStr), + zap.String("dateSeparator", dateSep.String()), + zap.Int("fileIndexWidth", fileIndexWidth), + ) + + return nil +} + func newPDClient(ctx context.Context, pdAddrs []string, securityConfig *security.Credential) (pd.Client, *etcd.CDCEtcdClientImpl, error) { pdClient, err := pd.NewClientWithContext( ctx, "consistency-checker", pdAddrs, securityConfig.PDSecurityOption(), diff --git a/cmd/multi-cluster-consistency-checker/types/types.go b/cmd/multi-cluster-consistency-checker/types/types.go index 85aaf197ab..a8f06265fd 100644 --- a/cmd/multi-cluster-consistency-checker/types/types.go +++ b/cmd/multi-cluster-consistency-checker/types/types.go @@ -55,11 +55,11 @@ type VersionKey struct { type TimeWindow struct { LeftBoundary uint64 `json:"left_boundary"` 
RightBoundary uint64 `json:"right_boundary"` - // CheckpointTs is the checkpoint timestamp for each changefeed from upstream cluster, - // mapping from downstream cluster ID to the checkpoint timestamp + // CheckpointTs is the checkpoint timestamp for each local-to-replicated changefeed, + // mapping from replicated cluster ID to the checkpoint timestamp CheckpointTs map[string]uint64 `json:"checkpoint_ts"` - // PDTimestampAfterTimeWindow is the max PD timestamp after the time window for each downstream cluster, - // mapping from upstream cluster ID to the max PD timestamp + // PDTimestampAfterTimeWindow is the max PD timestamp after the time window for each replicated cluster, + // mapping from local cluster ID to the max PD timestamp PDTimestampAfterTimeWindow map[string]uint64 `json:"pd_timestamp_after_time_window"` // NextMinLeftBoundary is the minimum left boundary of the next time window for the cluster NextMinLeftBoundary uint64 `json:"next_min_left_boundary"` @@ -68,8 +68,8 @@ type TimeWindow struct { func (t *TimeWindow) String() string { var builder strings.Builder fmt.Fprintf(&builder, "time window boundary: (%d, %d]\n", t.LeftBoundary, t.RightBoundary) - for downstreamClusterID, checkpointTs := range t.CheckpointTs { - fmt.Fprintf(&builder, "checkpoint ts [to cluster %s]: %d\n", downstreamClusterID, checkpointTs) + for replicatedClusterID, checkpointTs := range t.CheckpointTs { + fmt.Fprintf(&builder, "checkpoint ts [replicated cluster: %s]: %d\n", replicatedClusterID, checkpointTs) } return builder.String() } diff --git a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go index eab3063b9f..09c5827008 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go @@ -48,8 +48,8 @@ type waitCheckpointTask struct { } type CheckpointWatcher struct { - upstreamClusterID string - 
downstreamClusterID string + localClusterID string + replicatedClusterID string changefeedID common.ChangeFeedID etcdClient etcd.CDCEtcdClient @@ -65,13 +65,13 @@ type CheckpointWatcher struct { func NewCheckpointWatcher( ctx context.Context, - upstreamClusterID, downstreamClusterID, changefeedID string, + localClusterID, replicatedClusterID, changefeedID string, etcdClient etcd.CDCEtcdClient, ) *CheckpointWatcher { cctx, cancel := context.WithCancel(ctx) watcher := &CheckpointWatcher{ - upstreamClusterID: upstreamClusterID, - downstreamClusterID: downstreamClusterID, + localClusterID: localClusterID, + replicatedClusterID: replicatedClusterID, changefeedID: common.NewChangeFeedIDWithName(changefeedID, "default"), etcdClient: etcdClient, @@ -219,10 +219,10 @@ func (cw *CheckpointWatcher) watchOnce() error { statusKey := etcd.GetEtcdKeyJob(cw.etcdClient.GetClusterID(), cw.changefeedID.DisplayName) log.Debug("Starting to watch checkpoint", - zap.String("changefeedID", cw.changefeedID.String()), + zap.String("changefeed ID", cw.changefeedID.String()), zap.String("statusKey", statusKey), - zap.String("upstreamClusterID", cw.upstreamClusterID), - zap.String("downstreamClusterID", cw.downstreamClusterID), + zap.String("local cluster ID", cw.localClusterID), + zap.String("replicated cluster ID", cw.replicatedClusterID), zap.Uint64("checkpoint", status.CheckpointTs), zap.Int64("startRev", modRev+1)) diff --git a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher_test.go b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher_test.go index 5819dd73b2..c2f0055ad0 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher_test.go +++ b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher_test.go @@ -54,7 +54,7 @@ func TestCheckpointWatcher_AdvanceCheckpointTs_AlreadyExceeds(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", 
"test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) defer watcher.Close() // Wait for watcher to initialize @@ -92,7 +92,7 @@ func TestCheckpointWatcher_AdvanceCheckpointTs_WaitForUpdate(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) defer watcher.Close() // Wait for watcher to initialize @@ -159,7 +159,7 @@ func TestCheckpointWatcher_AdvanceCheckpointTs_ContextCanceled(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) defer watcher.Close() // Wait for watcher to initialize @@ -214,7 +214,7 @@ func TestCheckpointWatcher_Close(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) // Wait for watcher to initialize time.Sleep(50 * time.Millisecond) @@ -279,7 +279,7 @@ func TestCheckpointWatcher_MultiplePendingTasks(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) defer watcher.Close() // Wait for watcher to initialize @@ -388,7 +388,7 @@ func TestCheckpointWatcher_InitialCheckpointNotifiesPendingTasks(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", 
"test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) defer watcher.Close() // Wait for watcher to initialize and get the initial checkpoint @@ -434,7 +434,7 @@ func TestCheckpointWatcher_WatchErrorRetry(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) defer watcher.Close() // Wait for watcher to initialize @@ -482,7 +482,7 @@ func TestCheckpointWatcher_GetStatusRetry(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) defer watcher.Close() // Wait for retry to happen (backoff + processing time) @@ -518,7 +518,7 @@ func TestCheckpointWatcher_KeyDeleted(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) defer cancel() - watcher := NewCheckpointWatcher(ctx, "upstream-1", "downstream-1", "test-cf", mockEtcdClient) + watcher := NewCheckpointWatcher(ctx, "local-1", "replicated-1", "test-cf", mockEtcdClient) defer watcher.Close() // Wait for watcher to initialize From 4e339f62d0afac8bd1204335b65aee12c725f2c1 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Sat, 14 Feb 2026 19:59:38 +0800 Subject: [PATCH 21/23] meet the spec Signed-off-by: Jianjun Liao --- .../checker/checker.go | 66 +++--- .../checker/checker_test.go | 185 ++++++++++++++++- .../decoder/decoder.go | 12 +- .../decoder/decoder_test.go | 22 +- cmd/multi-cluster-consistency-checker/main.go | 52 ++++- .../main_test.go | 192 ++++++++++++++++++ .../recorder/recorder.go | 18 +- .../recorder/recorder_test.go | 121 +++++++++++ .../recorder/types.go | 4 +- .../recorder/types_test.go | 16 +- 
cmd/multi-cluster-consistency-checker/task.go | 85 ++++++-- 11 files changed, 684 insertions(+), 89 deletions(-) create mode 100644 cmd/multi-cluster-consistency-checker/main_test.go diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index c3df9088fd..1e9fbb075c 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -395,7 +395,7 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.String("schemaKey", schemaKey), - zap.String("pk", string(record.Pk)), + zap.String("pk", record.PkStr), zap.Uint64("commitTs", record.CommitTs)) cd.lwwSkippedRecordsCount++ continue @@ -406,14 +406,14 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataLossItem(replicatedClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs) + cd.report.AddDataLossItem(replicatedClusterID, schemaKey, record.PkStr, record.OriginTs, record.CommitTs) } else if !record.EqualReplicatedRecord(replicatedRecord) { // data inconsistent detected log.Error("data inconsistent detected", zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) + cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, record.PkStr, record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) } } } @@ -433,7 +433,7 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { zap.String("local 
cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.String("schemaKey", schemaKey), - zap.String("pk", string(record.Pk)), + zap.String("pk", record.PkStr), zap.Uint64("commitTs", record.CommitTs)) cd.lwwSkippedRecordsCount++ continue @@ -444,14 +444,14 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataLossItem(replicatedClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs) + cd.report.AddDataLossItem(replicatedClusterID, schemaKey, record.PkStr, record.OriginTs, record.CommitTs) } else if !record.EqualReplicatedRecord(replicatedRecord) { // data inconsistent detected log.Error("data inconsistent detected", zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, string(record.Pk), record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) + cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, record.PkStr, record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) } } } @@ -472,7 +472,7 @@ func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) { log.Error("data redundant detected", zap.String("replicated cluster ID", cd.clusterID), zap.Any("record", record)) - cd.report.AddDataRedundantItem(schemaKey, string(record.Pk), record.OriginTs, record.CommitTs) + cd.report.AddDataRedundantItem(schemaKey, record.PkStr, record.OriginTs, record.CommitTs) } } } @@ -521,13 +521,22 @@ func (cd *clusterDataChecker) lwwViolationDetection() { cd.clusterViolationChecker.UpdateCache() } -func (cd *clusterDataChecker) Check(checker *DataChecker) { +func (cd *clusterDataChecker) Check(checker *DataChecker, enableDataLoss, enableDataRedundant bool) { 
cd.report = recorder.NewClusterReport(cd.clusterID, cd.thisRoundTimeWindow) - // CHECK 1 - Data Loss Detection - cd.dataLossDetection(checker) - // CHECK 2 - Data Redundant Detection - cd.dataRedundantDetection(checker) + if enableDataLoss { + // CHECK 1 - Data Loss / Inconsistency Detection (round 2+) + // Needs [1] and [2] populated. + cd.dataLossDetection(checker) + } + if enableDataRedundant { + // CHECK 2 - Data Redundant Detection (round 3+) + // Needs [0], [1] and [2] all populated with real data; + // at round 2 [0] is still round 0 (empty), which would cause false positives. + cd.dataRedundantDetection(checker) + } // CHECK 3 - LWW Violation Detection + // Always runs to keep the version cache up-to-date; meaningful results + // start from round 1 once the cache has been seeded. cd.lwwViolationDetection() } @@ -607,17 +616,28 @@ func (c *DataChecker) CheckInNextTimeWindow(newTimeWindowData map[string]types.T return nil, errors.Annotate(err, "failed to decode new time window data") } report := recorder.NewReport(c.round) - if c.checkableRound >= 3 { - for clusterID, clusterDataChecker := range c.clusterDataCheckers { - clusterDataChecker.Check(c) - log.Info("checked records count", - zap.String("clusterID", clusterID), - zap.Int("checked records count", clusterDataChecker.checkedRecordsCount), - zap.Int("new time window records count", clusterDataChecker.newTimeWindowRecordsCount), - zap.Int("lww skipped records count", clusterDataChecker.lwwSkippedRecordsCount)) - report.AddClusterReport(clusterID, clusterDataChecker.GetReport()) - } - } else { + + // Round 0: seed the LWW cache (round 0 data is empty by convention). + // Round 1+: LWW violation detection produces meaningful results. + // Round 2+: data loss / inconsistency detection (needs [1] and [2]). + // Round 3+: data redundant detection (needs [0], [1] and [2] with real data). 
+ enableDataLoss := c.checkableRound >= 2 + enableDataRedundant := c.checkableRound >= 3 + + for clusterID, clusterDataChecker := range c.clusterDataCheckers { + clusterDataChecker.Check(c, enableDataLoss, enableDataRedundant) + log.Info("checked records count", + zap.String("clusterID", clusterID), + zap.Uint64("round", c.round), + zap.Bool("enableDataLoss", enableDataLoss), + zap.Bool("enableDataRedundant", enableDataRedundant), + zap.Int("checked records count", clusterDataChecker.checkedRecordsCount), + zap.Int("new time window records count", clusterDataChecker.newTimeWindowRecordsCount), + zap.Int("lww skipped records count", clusterDataChecker.lwwSkippedRecordsCount)) + report.AddClusterReport(clusterID, clusterDataChecker.GetReport()) + } + + if c.checkableRound < 3 { c.checkableRound++ } c.round++ diff --git a/cmd/multi-cluster-consistency-checker/checker/checker_test.go b/cmd/multi-cluster-consistency-checker/checker/checker_test.go index 11638f8817..7b690b4174 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker_test.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker_test.go @@ -361,8 +361,10 @@ var defaultSchemaKey = (&cloudstorage.DmlPathKey{}).GetKey() // TestDataChecker_FourRoundsCheck simulates 4 rounds with increasing data and verifies check results. // Setup: 2 clusters (c1 locally-written, c2 replicated from c1). -// Rounds 0-2: accumulate data, check not yet active (checkableRound < 3). -// Round 3: first real check runs, detecting violations. +// - Round 0: LWW cache is seeded (data is empty by convention). +// - Round 1+: LWW violation detection is active. +// - Round 2+: data loss / inconsistent detection is active. +// - Round 3+: data redundant detection is also active (needs [0],[1],[2] all populated). 
func TestDataChecker_FourRoundsCheck(t *testing.T) { t.Parallel() ctx := context.Background() @@ -413,15 +415,11 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { report, err := checker.CheckInNextTimeWindow(roundData) require.NoError(t, err, "round %d", i) require.Equal(t, uint64(i), report.Round) - if i < 3 { - require.Empty(t, report.ClusterReports, "round %d should have no cluster reports", i) - require.False(t, report.NeedFlush(), "round %d should not need flush", i) - } else { - require.Len(t, report.ClusterReports, 2) - require.False(t, report.NeedFlush(), "round 3 should not need flush (all consistent)") - for clusterID, cr := range report.ClusterReports { - require.Empty(t, cr.TableFailureItems, "cluster %s should have no table failure items", clusterID) - } + // Every round now produces cluster reports (LWW always runs). + require.Len(t, report.ClusterReports, 2, "round %d should have 2 cluster reports", i) + require.False(t, report.NeedFlush(), "round %d should not need flush (all consistent)", i) + for clusterID, cr := range report.ClusterReports { + require.Empty(t, cr.TableFailureItems, "round %d cluster %s should have no table failure items", i, clusterID) } } }) @@ -604,4 +602,169 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { require.Empty(t, c2TableItems.LWWViolationItems) } }) + + // lww violation detected at round 1: LWW is active from round 1, + // so a violation introduced in round 1 data should surface immediately. 
+ t.Run("lww violation detected at round 1", func(t *testing.T) { + t.Parallel() + checker := NewDataChecker(ctx, clusterCfg, nil, nil) + + // Round 0: [0, 100] — c1 writes pk=1 (commitTs=50, compareTs=50) + round0 := map[string]types.TimeWindowData{ + "c1": makeTWData(0, 100, nil, + makeContent(makeCanalJSON(1, 50, 0, "a"))), + "c2": makeTWData(0, 100, nil, nil), + } + // Round 1: [100, 200] — c1 writes pk=1 again: + // locally-written pk=1 (commitTs=150, originTs=0, compareTs=150) + // replicated pk=1 (commitTs=180, originTs=120, compareTs=120) + // The LWW cache already has pk=1 compareTs=50 from round 0. + // Record order: commitTs=150 (compareTs=150 > cached 50 → update cache), + // commitTs=180 (compareTs=120 < cached 150 → VIOLATION) + round1 := map[string]types.TimeWindowData{ + "c1": makeTWData(100, 200, nil, + makeContent( + makeCanalJSON(1, 150, 0, "b"), + makeCanalJSON(1, 180, 120, "b"), + )), + "c2": makeTWData(100, 200, nil, nil), + } + + report0, err := checker.CheckInNextTimeWindow(round0) + require.NoError(t, err) + require.False(t, report0.NeedFlush(), "round 0 should not need flush") + + report1, err := checker.CheckInNextTimeWindow(round1) + require.NoError(t, err) + + // LWW violation should be detected at round 1 + require.True(t, report1.NeedFlush(), "round 1 should detect LWW violation") + c1Report := report1.ClusterReports["c1"] + require.Contains(t, c1Report.TableFailureItems, defaultSchemaKey) + c1TableItems := c1Report.TableFailureItems[defaultSchemaKey] + require.Len(t, c1TableItems.LWWViolationItems, 1) + require.Equal(t, uint64(0), c1TableItems.LWWViolationItems[0].ExistingOriginTS) + require.Equal(t, uint64(150), c1TableItems.LWWViolationItems[0].ExistingCommitTS) + require.Equal(t, uint64(120), c1TableItems.LWWViolationItems[0].OriginTS) + require.Equal(t, uint64(180), c1TableItems.LWWViolationItems[0].CommitTS) + }) + + // data loss detected at round 2: Data loss detection is active from round 2. 
+ // A record in round 1 whose commitTs > checkpointTs will enter [1] at round 2, + // and if the replicated counterpart is missing, data loss is detected at round 2. + t.Run("data loss detected at round 2", func(t *testing.T) { + t.Parallel() + checker := NewDataChecker(ctx, clusterCfg, nil, nil) + + round0 := map[string]types.TimeWindowData{ + "c1": makeTWData(0, 100, nil, nil), + "c2": makeTWData(0, 100, nil, nil), + } + // Round 1: c1 writes pk=1 (commitTs=150), checkpointTs["c2"]=140 + // Since 150 > 140, this record needs replication checking. + round1 := map[string]types.TimeWindowData{ + "c1": makeTWData(100, 200, map[string]uint64{"c2": 140}, + makeContent(makeCanalJSON(1, 150, 0, "a"))), + "c2": makeTWData(100, 200, nil, nil), // c2 has NO replicated data + } + // Round 2: round 1 data is now in [1], data loss detection enabled. + round2 := map[string]types.TimeWindowData{ + "c1": makeTWData(200, 300, map[string]uint64{"c2": 280}, + makeContent(makeCanalJSON(2, 250, 0, "b"))), + "c2": makeTWData(200, 300, nil, + makeContent(makeCanalJSON(2, 260, 250, "b"))), + } + + report0, err := checker.CheckInNextTimeWindow(round0) + require.NoError(t, err) + require.False(t, report0.NeedFlush()) + + report1, err := checker.CheckInNextTimeWindow(round1) + require.NoError(t, err) + require.False(t, report1.NeedFlush(), "round 1 should not detect data loss yet") + + report2, err := checker.CheckInNextTimeWindow(round2) + require.NoError(t, err) + require.True(t, report2.NeedFlush(), "round 2 should detect data loss") + c1Report := report2.ClusterReports["c1"] + require.Contains(t, c1Report.TableFailureItems, defaultSchemaKey) + tableItems := c1Report.TableFailureItems[defaultSchemaKey] + require.Len(t, tableItems.DataLossItems, 1) + require.Equal(t, "c2", tableItems.DataLossItems[0].PeerClusterID) + require.Equal(t, uint64(150), tableItems.DataLossItems[0].CommitTS) + }) + + // data redundant detected at round 3 (not round 2): + // dataRedundantDetection checks 
timeWindowDataCaches[2] (latest round). + // At round 2 [0]=round 0 (empty) so FindSourceLocalData may miss data in + // that window → enableDataRedundant is false to avoid false positives. + // At round 3 [0]=round 1, [1]=round 2, [2]=round 3 are all populated + // with real data, so enableDataRedundant=true and an orphan in [2] is caught. + // + // This test puts the SAME orphan pk=99 in both round 2 and round 3: + // - Round 2: orphan in [2] but enableDataRedundant=false → NOT flagged. + // - Round 3: orphan in [2] and enableDataRedundant=true → flagged. + t.Run("data redundant detected at round 3 not round 2", func(t *testing.T) { + t.Parallel() + checker := NewDataChecker(ctx, clusterCfg, nil, nil) + + round0 := map[string]types.TimeWindowData{ + "c1": makeTWData(0, 100, nil, nil), + "c2": makeTWData(0, 100, nil, nil), + } + // Round 1: normal consistent data. + round1 := map[string]types.TimeWindowData{ + "c1": makeTWData(100, 200, map[string]uint64{"c2": 180}, + makeContent(makeCanalJSON(1, 150, 0, "a"))), + "c2": makeTWData(100, 200, nil, + makeContent(makeCanalJSON(1, 160, 150, "a"))), + } + // Round 2: c2 has orphan replicated pk=99 (originTs=230) in [2]. + // enableDataRedundant=false at round 2, so it must NOT be flagged. + round2 := map[string]types.TimeWindowData{ + "c1": makeTWData(200, 300, map[string]uint64{"c2": 280}, + makeContent(makeCanalJSON(2, 250, 0, "b"))), + "c2": makeTWData(200, 300, nil, + makeContent( + makeCanalJSON(2, 260, 250, "b"), + makeCanalJSON(99, 240, 230, "x"), // orphan replicated + )), + } + // Round 3: c2 has another orphan replicated pk=99 (originTs=330) in [2]. + // enableDataRedundant=true at round 3, so it IS caught. 
+ round3 := map[string]types.TimeWindowData{ + "c1": makeTWData(300, 400, map[string]uint64{"c2": 380}, + makeContent(makeCanalJSON(3, 350, 0, "c"))), + "c2": makeTWData(300, 400, nil, + makeContent( + makeCanalJSON(3, 360, 350, "c"), + makeCanalJSON(99, 340, 330, "y"), // orphan replicated + )), + } + + report0, err := checker.CheckInNextTimeWindow(round0) + require.NoError(t, err) + require.False(t, report0.NeedFlush(), "round 0 should not need flush") + + report1, err := checker.CheckInNextTimeWindow(round1) + require.NoError(t, err) + require.False(t, report1.NeedFlush(), "round 1 should not need flush") + + report2, err := checker.CheckInNextTimeWindow(round2) + require.NoError(t, err) + // Round 2: redundant detection is NOT enabled; the orphan pk=99 should NOT be flagged. + require.False(t, report2.NeedFlush(), "round 2 should not flag data redundant yet") + + report3, err := checker.CheckInNextTimeWindow(round3) + require.NoError(t, err) + // Round 3: redundant detection is enabled; the orphan pk=99 in [2] (round 3) + // is now caught. 
+ require.True(t, report3.NeedFlush(), "round 3 should detect data redundant") + c2Report := report3.ClusterReports["c2"] + require.Contains(t, c2Report.TableFailureItems, defaultSchemaKey) + c2TableItems := c2Report.TableFailureItems[defaultSchemaKey] + require.Len(t, c2TableItems.DataRedundantItems, 1) + require.Equal(t, uint64(330), c2TableItems.DataRedundantItems[0].OriginTS) + require.Equal(t, uint64(340), c2TableItems.DataRedundantItems[0].CommitTS) + }) } diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder.go b/cmd/multi-cluster-consistency-checker/decoder/decoder.go index 64d6c6eaef..ac03677229 100644 --- a/cmd/multi-cluster-consistency-checker/decoder/decoder.go +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder.go @@ -17,6 +17,7 @@ import ( "bytes" "encoding/hex" "encoding/json" + "fmt" "slices" "strconv" "strings" @@ -81,6 +82,7 @@ func defaultCanalJSONCodecConfig() *codecCommon.Config { type Record struct { types.CdcVersion Pk types.PkType + PkStr string ColumnValues map[string]any } @@ -193,9 +195,11 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { return nil, errors.New("invalid message") } + var pkStrBuilder strings.Builder + pkStrBuilder.WriteString("[") pkValues := make([]tiTypes.Datum, 0, len(d.msg.PkNames)) slices.Sort(d.msg.PkNames) - for _, pkName := range d.msg.PkNames { + for i, pkName := range d.msg.PkNames { mysqlType, ok := d.msg.MySQLType[pkName] if !ok { log.Error("mysql type not found", zap.String("pkName", pkName), zap.Any("msg", d.msg)) @@ -206,6 +210,10 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { log.Error("column value not found", zap.String("pkName", pkName), zap.Any("msg", d.msg)) return nil, errors.Errorf("column value of column %s not found", pkName) } + if i > 0 { + pkStrBuilder.WriteString(", ") + } + fmt.Fprintf(&pkStrBuilder, "%s: %v", pkName, columnValue) ft := newPKColumnFieldTypeFromMysqlType(mysqlType) datum := valueToDatum(columnValue, ft) if datum.IsNull() { @@ 
-215,6 +223,7 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { pkValues = append(pkValues, *datum) delete(d.msg.Data[0], pkName) } + pkStrBuilder.WriteString("]") pkEncoded, err := codec.EncodeKey(time.UTC, nil, pkValues...) if err != nil { return nil, errors.Annotate(err, "failed to encode primary key") @@ -238,6 +247,7 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { d.msg = nil return &Record{ Pk: types.PkType(pk), + PkStr: pkStrBuilder.String(), ColumnValues: columnValues, CdcVersion: types.CdcVersion{ CommitTs: commitTs, diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go b/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go index c2539b1b8e..0dba247b82 100644 --- a/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go @@ -33,14 +33,14 @@ const DataContent1 string = "" + `{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"UPDATE","es":1770303522494,"ts":1770303523900,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":[{"id":"8","first_name":"h","last_name":"H","_tidb_origin_ts":null,"_tidb_softdelete_time":null}],"data":[{"id":"8","first_name":"h","last_name":"H","_tidb_origin_ts":"464074446196178963","_tidb_softdelete_time":"2026-02-05 22:58:40.992217"}],"_tidb":{"commitTs":464074446600667164}}` var ExpectedRecords1 = []decoder.Record{ - {CdcVersion: types.CdcVersion{CommitTs: 464043256649875456, OriginTs: 0}, Pk: "038000000000000014", ColumnValues: map[string]any{"first_name": "t", "last_name": "TT", "_tidb_softdelete_time": nil}}, - {CdcVersion: types.CdcVersion{CommitTs: 464043256649875456, OriginTs: 0}, Pk: "038000000000000015", ColumnValues: map[string]any{"first_name": "u", "last_name": 
"UU", "_tidb_softdelete_time": nil}}, - {CdcVersion: types.CdcVersion{CommitTs: 464073967049113629, OriginTs: 464073966942421014}, Pk: "038000000000000005", ColumnValues: map[string]any{"first_name": "e", "last_name": "E", "_tidb_softdelete_time": nil}}, - {CdcVersion: types.CdcVersion{CommitTs: 464073967049113629, OriginTs: 464073966942421014}, Pk: "038000000000000006", ColumnValues: map[string]any{"first_name": "f", "last_name": "F", "_tidb_softdelete_time": nil}}, - {CdcVersion: types.CdcVersion{CommitTs: 464074440664678441, OriginTs: 464074440387592202}, Pk: "038000000000000007", ColumnValues: map[string]any{"first_name": "g", "last_name": "G", "_tidb_softdelete_time": nil}}, - {CdcVersion: types.CdcVersion{CommitTs: 464074446196178963, OriginTs: 0}, Pk: "038000000000000007", ColumnValues: map[string]any{"first_name": "g", "last_name": "G", "_tidb_softdelete_time": "2026-02-05 22:58:40.992217"}}, - {CdcVersion: types.CdcVersion{CommitTs: 464074440387592202, OriginTs: 0}, Pk: "038000000000000008", ColumnValues: map[string]any{"first_name": "h", "last_name": "H", "_tidb_softdelete_time": nil}}, - {CdcVersion: types.CdcVersion{CommitTs: 464074446600667164, OriginTs: 464074446196178963}, Pk: "038000000000000008", ColumnValues: map[string]any{"first_name": "h", "last_name": "H", "_tidb_softdelete_time": "2026-02-05 22:58:40.992217"}}, + {CdcVersion: types.CdcVersion{CommitTs: 464043256649875456, OriginTs: 0}, Pk: "038000000000000014", PkStr: "[id: 20]", ColumnValues: map[string]any{"first_name": "t", "last_name": "TT", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464043256649875456, OriginTs: 0}, Pk: "038000000000000015", PkStr: "[id: 21]", ColumnValues: map[string]any{"first_name": "u", "last_name": "UU", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464073967049113629, OriginTs: 464073966942421014}, Pk: "038000000000000005", PkStr: "[id: 5]", ColumnValues: map[string]any{"first_name": "e", "last_name": "E", 
"_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464073967049113629, OriginTs: 464073966942421014}, Pk: "038000000000000006", PkStr: "[id: 6]", ColumnValues: map[string]any{"first_name": "f", "last_name": "F", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464074440664678441, OriginTs: 464074440387592202}, Pk: "038000000000000007", PkStr: "[id: 7]", ColumnValues: map[string]any{"first_name": "g", "last_name": "G", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464074446196178963, OriginTs: 0}, Pk: "038000000000000007", PkStr: "[id: 7]", ColumnValues: map[string]any{"first_name": "g", "last_name": "G", "_tidb_softdelete_time": "2026-02-05 22:58:40.992217"}}, + {CdcVersion: types.CdcVersion{CommitTs: 464074440387592202, OriginTs: 0}, Pk: "038000000000000008", PkStr: "[id: 8]", ColumnValues: map[string]any{"first_name": "h", "last_name": "H", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464074446600667164, OriginTs: 464074446196178963}, Pk: "038000000000000008", PkStr: "[id: 8]", ColumnValues: map[string]any{"first_name": "h", "last_name": "H", "_tidb_softdelete_time": "2026-02-05 22:58:40.992217"}}, } func TestCanalJSONDecoder1(t *testing.T) { @@ -50,6 +50,7 @@ func TestCanalJSONDecoder1(t *testing.T) { for i, actualRecord := range records { expectedRecord := ExpectedRecords1[i] require.Equal(t, actualRecord.Pk, expectedRecord.Pk) + require.Equal(t, actualRecord.PkStr, expectedRecord.PkStr) require.Equal(t, actualRecord.ColumnValues, expectedRecord.ColumnValues) require.Equal(t, actualRecord.CdcVersion.CommitTs, expectedRecord.CdcVersion.CommitTs) require.Equal(t, actualRecord.CdcVersion.OriginTs, expectedRecord.CdcVersion.OriginTs) @@ -61,8 +62,8 @@ const DataContent2 string = "" + 
`{"id":0,"database":"test_active","table":"message2","pkNames":["id","first_name"],"isDdl":false,"type":"INSERT","es":1770344427851,"ts":1770344429772,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":null,"data":[{"id":"101","first_name":"b","last_name":"B","_tidb_origin_ts":null,"_tidb_softdelete_time":null}],"_tidb":{"commitTs":464085169694572575}}` + "\r\n" var ExpectedRecords2 = []decoder.Record{ - {CdcVersion: types.CdcVersion{CommitTs: 464085165736198159, OriginTs: 464085165262503958}, Pk: "016100000000000000f8038000000000000064", ColumnValues: map[string]any{"last_name": "A", "_tidb_softdelete_time": nil}}, - {CdcVersion: types.CdcVersion{CommitTs: 464085169694572575, OriginTs: 0}, Pk: "016200000000000000f8038000000000000065", ColumnValues: map[string]any{"last_name": "B", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464085165736198159, OriginTs: 464085165262503958}, Pk: "016100000000000000f8038000000000000064", PkStr: "[first_name: a, id: 100]", ColumnValues: map[string]any{"last_name": "A", "_tidb_softdelete_time": nil}}, + {CdcVersion: types.CdcVersion{CommitTs: 464085169694572575, OriginTs: 0}, Pk: "016200000000000000f8038000000000000065", PkStr: "[first_name: b, id: 101]", ColumnValues: map[string]any{"last_name": "B", "_tidb_softdelete_time": nil}}, } func TestCanalJSONDecoder2(t *testing.T) { @@ -72,6 +73,7 @@ func TestCanalJSONDecoder2(t *testing.T) { for i, actualRecord := range records { expectedRecord := ExpectedRecords2[i] require.Equal(t, actualRecord.Pk, expectedRecord.Pk) + require.Equal(t, actualRecord.PkStr, expectedRecord.PkStr) require.Equal(t, actualRecord.ColumnValues, expectedRecord.ColumnValues) require.Equal(t, actualRecord.CdcVersion.CommitTs, expectedRecord.CdcVersion.CommitTs) require.Equal(t, 
actualRecord.CdcVersion.OriginTs, expectedRecord.CdcVersion.OriginTs) diff --git a/cmd/multi-cluster-consistency-checker/main.go b/cmd/multi-cluster-consistency-checker/main.go index 8b811795bf..7f2390f99e 100644 --- a/cmd/multi-cluster-consistency-checker/main.go +++ b/cmd/multi-cluster-consistency-checker/main.go @@ -22,6 +22,7 @@ import ( "github.com/pingcap/log" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" + "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/logger" "github.com/spf13/cobra" "go.uber.org/zap" @@ -32,12 +33,43 @@ var ( dryRun bool ) +// Exit codes for multi-cluster-consistency-checker. +// +// 0 – clean shutdown (normal exit or graceful signal handling) +// 1 – transient error, safe to restart (network, I/O, temporary failures) +// 2 – invalid configuration (missing required flags / fields) +// 3 – configuration decode failure (malformed config file) +// 4 – checkpoint corruption, requires manual intervention +// 5 – unrecoverable internal error const ( - ExitCodeExecuteFailed = 1 - ExitCodeInvalidConfig = 2 - ExitCodeDecodeConfigFailed = 3 + ExitCodeTransient = 1 + ExitCodeInvalidConfig = 2 + ExitCodeDecodeConfigFailed = 3 + ExitCodeCheckpointCorruption = 4 + ExitCodeUnrecoverable = 5 ) +// ExitError wraps an error with a process exit code so that callers higher in +// the stack can translate domain errors into the correct exit status. +type ExitError struct { + Code int + Err error +} + +func (e *ExitError) Error() string { return e.Err.Error() } +func (e *ExitError) Unwrap() error { return e.Err } + +// exitCodeFromError extracts the exit code from an error. +// If the error is an *ExitError the embedded code is returned; +// otherwise the fallback code is returned. 
+func exitCodeFromError(err error, fallback int) int { + var ee *ExitError + if errors.As(err, &ee) { + return ee.Code + } + return fallback +} + const ( FlagConfig = "config" FlagDryRun = "dry-run" @@ -57,7 +89,7 @@ func main() { if err := rootCmd.Execute(); err != nil { fmt.Fprintf(os.Stderr, "Error: %v\n", err) - os.Exit(ExitCodeExecuteFailed) + os.Exit(ExitCodeUnrecoverable) } } @@ -84,7 +116,7 @@ func run(cmd *cobra.Command, args []string) { err = logger.InitLogger(loggerConfig) if err != nil { fmt.Fprintf(os.Stderr, "failed to init logger: %v\n", err) - os.Exit(ExitCodeExecuteFailed) + os.Exit(ExitCodeUnrecoverable) } log.Info("Logger initialized", zap.String("level", logLevel)) @@ -112,15 +144,17 @@ func run(cmd *cobra.Command, args []string) { fmt.Fprintf(os.Stdout, "\nReceived signal: %v, shutting down gracefully...\n", sig) cancel() // Wait for the task to finish - if err := <-errChan; err != nil && err != context.Canceled { - fmt.Fprintf(os.Stderr, "task error: %v\n", err) - os.Exit(ExitCodeExecuteFailed) + if err := <-errChan; err != nil && !errors.Is(err, context.Canceled) { + fmt.Fprintf(os.Stderr, "task error during shutdown: %v\n", err) + code := exitCodeFromError(err, ExitCodeTransient) + os.Exit(code) } fmt.Fprintf(os.Stdout, "Shutdown complete\n") case err := <-errChan: if err != nil { fmt.Fprintf(os.Stderr, "failed to run task: %v\n", err) - os.Exit(ExitCodeExecuteFailed) + code := exitCodeFromError(err, ExitCodeTransient) + os.Exit(code) } } } diff --git a/cmd/multi-cluster-consistency-checker/main_test.go b/cmd/multi-cluster-consistency-checker/main_test.go new file mode 100644 index 0000000000..ee3fc71f23 --- /dev/null +++ b/cmd/multi-cluster-consistency-checker/main_test.go @@ -0,0 +1,192 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "testing" + + "github.com/pingcap/ticdc/pkg/errors" + "github.com/stretchr/testify/require" +) + +func TestExitError_Error(t *testing.T) { + t.Parallel() + inner := fmt.Errorf("something went wrong") + ee := &ExitError{Code: ExitCodeTransient, Err: inner} + require.Equal(t, "something went wrong", ee.Error()) +} + +func TestExitError_Unwrap(t *testing.T) { + t.Parallel() + inner := fmt.Errorf("root cause") + ee := &ExitError{Code: ExitCodeCheckpointCorruption, Err: inner} + require.ErrorIs(t, ee, inner) + require.Equal(t, inner, ee.Unwrap()) +} + +func TestExitError_Unwrap_deep(t *testing.T) { + t.Parallel() + root := errors.New("root") + wrapped := fmt.Errorf("layer1: %w", root) + ee := &ExitError{Code: ExitCodeTransient, Err: wrapped} + require.ErrorIs(t, ee, root) +} + +func TestValidateS3BucketPrefix(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + changefeedURI string + configURI string + wantErr bool + errContains string + }{ + { + name: "exact match", + changefeedURI: "s3://my-bucket/cluster1/", + configURI: "s3://my-bucket/cluster1/", + wantErr: false, + }, + { + name: "match ignoring trailing slash", + changefeedURI: "s3://my-bucket/cluster1", + configURI: "s3://my-bucket/cluster1/", + wantErr: false, + }, + { + name: "match with query params in changefeed URI", + changefeedURI: "s3://my-bucket/prefix/?protocol=canal-json&date-separator=day", + configURI: "s3://my-bucket/prefix/", + wantErr: false, + }, + { + name: "bucket mismatch", + changefeedURI: "s3://bucket-a/prefix/", + configURI: "s3://bucket-b/prefix/", + wantErr: true, + errContains: 
"bucket/prefix mismatch", + }, + { + name: "prefix mismatch", + changefeedURI: "s3://my-bucket/cluster1/", + configURI: "s3://my-bucket/cluster2/", + wantErr: true, + errContains: "bucket/prefix mismatch", + }, + { + name: "scheme mismatch", + changefeedURI: "gcs://my-bucket/prefix/", + configURI: "s3://my-bucket/prefix/", + wantErr: true, + errContains: "bucket/prefix mismatch", + }, + { + name: "deeper prefix mismatch", + changefeedURI: "s3://my-bucket/a/b/c/", + configURI: "s3://my-bucket/a/b/d/", + wantErr: true, + errContains: "bucket/prefix mismatch", + }, + { + name: "empty config URI", + changefeedURI: "s3://my-bucket/prefix/", + configURI: "", + wantErr: true, + errContains: "bucket/prefix mismatch", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + err := validateS3BucketPrefix(tt.changefeedURI, tt.configURI, "test-cluster", "cf-1") + if tt.wantErr { + require.Error(t, err) + require.Contains(t, err.Error(), tt.errContains) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestExitCodeFromError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + fallback int + expected int + }{ + { + name: "ExitError with transient code", + err: &ExitError{Code: ExitCodeTransient, Err: fmt.Errorf("timeout")}, + fallback: ExitCodeUnrecoverable, + expected: ExitCodeTransient, + }, + { + name: "ExitError with checkpoint corruption code", + err: &ExitError{Code: ExitCodeCheckpointCorruption, Err: fmt.Errorf("bad checkpoint")}, + fallback: ExitCodeTransient, + expected: ExitCodeCheckpointCorruption, + }, + { + name: "ExitError with unrecoverable code", + err: &ExitError{Code: ExitCodeUnrecoverable, Err: fmt.Errorf("fatal")}, + fallback: ExitCodeTransient, + expected: ExitCodeUnrecoverable, + }, + { + name: "ExitError with invalid config code", + err: &ExitError{Code: ExitCodeInvalidConfig, Err: fmt.Errorf("missing field")}, + fallback: ExitCodeTransient, + expected: ExitCodeInvalidConfig, + 
}, + { + name: "ExitError with decode config failed code", + err: &ExitError{Code: ExitCodeDecodeConfigFailed, Err: fmt.Errorf("bad toml")}, + fallback: ExitCodeTransient, + expected: ExitCodeDecodeConfigFailed, + }, + { + name: "plain error returns fallback", + err: fmt.Errorf("some plain error"), + fallback: ExitCodeTransient, + expected: ExitCodeTransient, + }, + { + name: "plain error returns different fallback", + err: fmt.Errorf("another plain error"), + fallback: ExitCodeUnrecoverable, + expected: ExitCodeUnrecoverable, + }, + { + name: "wrapped ExitError is still extracted", + err: fmt.Errorf("outer: %w", &ExitError{Code: ExitCodeCheckpointCorruption, Err: fmt.Errorf("inner")}), + fallback: ExitCodeTransient, + expected: ExitCodeCheckpointCorruption, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + code := exitCodeFromError(tt.err, tt.fallback) + require.Equal(t, tt.expected, code) + }) + } +} diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder.go b/cmd/multi-cluster-consistency-checker/recorder/recorder.go index 2e0a54dcbe..25052ec8ab 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder.go @@ -27,6 +27,10 @@ import ( "go.uber.org/zap" ) +// ErrCheckpointCorruption is a sentinel error indicating that the persisted +// checkpoint data is corrupted and requires manual intervention to fix. 
+var ErrCheckpointCorruption = errors.New("checkpoint corruption") + type Recorder struct { reportDir string checkpointDir string @@ -79,11 +83,11 @@ func NewRecorder(dataDir string, clusters map[string]config.ClusterConfig, maxRe continue } if len(item.ClusterInfo) != len(clusters) { - return nil, errors.Errorf("checkpoint item (round %d) cluster info length mismatch, expected %d, got %d", item.Round, len(clusters), len(item.ClusterInfo)) + return nil, errors.Annotatef(ErrCheckpointCorruption, "checkpoint item (round %d) cluster info length mismatch, expected %d, got %d", item.Round, len(clusters), len(item.ClusterInfo)) } for clusterID := range clusters { if _, ok := item.ClusterInfo[clusterID]; !ok { - return nil, errors.Errorf("checkpoint item (round %d) cluster info missing for cluster %s", item.Round, clusterID) + return nil, errors.Annotatef(ErrCheckpointCorruption, "checkpoint item (round %d) cluster info missing for cluster %s", item.Round, clusterID) } } } @@ -103,10 +107,10 @@ func (r *Recorder) initializeCheckpoint() error { if _, err := os.Stat(checkpointFile); err == nil { data, err := os.ReadFile(checkpointFile) if err != nil { - return errors.Trace(err) + return errors.Trace(err) // transient I/O error } if err := json.Unmarshal(data, r.checkpoint); err != nil { - return errors.Trace(err) + return errors.Annotatef(ErrCheckpointCorruption, "failed to unmarshal checkpoint.json: %v", err) } return nil } @@ -118,14 +122,14 @@ func (r *Recorder) initializeCheckpoint() error { log.Warn("checkpoint.json not found, recovering from checkpoint.json.bak") data, err := os.ReadFile(bakFile) if err != nil { - return errors.Trace(err) + return errors.Trace(err) // transient I/O error } if err := json.Unmarshal(data, r.checkpoint); err != nil { - return errors.Trace(err) + return errors.Annotatef(ErrCheckpointCorruption, "failed to unmarshal checkpoint.json.bak: %v", err) } // Restore the backup as the primary file if err := os.Rename(bakFile, checkpointFile); err 
!= nil { - return errors.Trace(err) + return errors.Trace(err) // transient I/O error } return nil } diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go index 96a3d1871d..1bbfaa16e9 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go @@ -22,6 +22,7 @@ import ( "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/config" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/pingcap/ticdc/pkg/errors" "github.com/stretchr/testify/require" ) @@ -438,3 +439,123 @@ func TestRecorder_CheckpointPersistence(t *testing.T) { require.Equal(t, uint64(200), cp.CheckpointItems[2].ClusterInfo["c1"].TimeWindow.RightBoundary) }) } + +func TestErrCheckpointCorruption(t *testing.T) { + t.Parallel() + + t.Run("corrupted checkpoint file returns ErrCheckpointCorruption", func(t *testing.T) { + t.Parallel() + dataDir := t.TempDir() + + // Create report and checkpoint directories + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "report"), 0755)) + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "checkpoint"), 0755)) + + // Write invalid JSON to checkpoint.json + err := os.WriteFile(filepath.Join(dataDir, "checkpoint", "checkpoint.json"), []byte("{bad json"), 0600) + require.NoError(t, err) + + _, err = NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}}, 0) + require.Error(t, err) + require.True(t, errors.Is(err, ErrCheckpointCorruption)) + }) + + t.Run("cluster count mismatch returns ErrCheckpointCorruption", func(t *testing.T) { + t.Parallel() + dataDir := t.TempDir() + + // Write a valid checkpoint with 2 clusters + r1, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}, "c2": {}}, 0) + require.NoError(t, err) + twData := map[string]types.TimeWindowData{ + "c1": {TimeWindow: types.TimeWindow{LeftBoundary: 0, 
RightBoundary: 10}}, + "c2": {TimeWindow: types.TimeWindow{LeftBoundary: 0, RightBoundary: 10}}, + } + err = r1.RecordTimeWindow(twData, NewReport(0)) + require.NoError(t, err) + + // Reload with 1 cluster — should be ErrCheckpointCorruption + _, err = NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}}, 0) + require.Error(t, err) + require.True(t, errors.Is(err, ErrCheckpointCorruption)) + }) + + t.Run("missing cluster ID returns ErrCheckpointCorruption", func(t *testing.T) { + t.Parallel() + dataDir := t.TempDir() + + // Write a valid checkpoint with clusters c1 and c2 + r1, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}, "c2": {}}, 0) + require.NoError(t, err) + twData := map[string]types.TimeWindowData{ + "c1": {TimeWindow: types.TimeWindow{LeftBoundary: 0, RightBoundary: 10}}, + "c2": {TimeWindow: types.TimeWindow{LeftBoundary: 0, RightBoundary: 10}}, + } + err = r1.RecordTimeWindow(twData, NewReport(0)) + require.NoError(t, err) + + // Reload with c1 and c3 — should be ErrCheckpointCorruption + _, err = NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}, "c3": {}}, 0) + require.Error(t, err) + require.True(t, errors.Is(err, ErrCheckpointCorruption)) + }) + + t.Run("fresh start does not return ErrCheckpointCorruption", func(t *testing.T) { + t.Parallel() + dataDir := t.TempDir() + + _, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}}, 0) + require.NoError(t, err) + }) + + t.Run("unreadable checkpoint file does not return ErrCheckpointCorruption", func(t *testing.T) { + t.Parallel() + dataDir := t.TempDir() + + // Create directories + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "report"), 0755)) + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "checkpoint"), 0755)) + + // Create checkpoint.json as a directory so ReadFile fails with a non-corruption I/O error + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "checkpoint", "checkpoint.json"), 0755)) + + _, err := 
NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}}, 0) + require.Error(t, err) + require.False(t, errors.Is(err, ErrCheckpointCorruption), + "I/O errors should NOT be classified as ErrCheckpointCorruption, got: %v", err) + }) + + t.Run("unreadable backup checkpoint does not return ErrCheckpointCorruption", func(t *testing.T) { + t.Parallel() + dataDir := t.TempDir() + + // Create directories + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "report"), 0755)) + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "checkpoint"), 0755)) + + // Make checkpoint.json.bak a directory so ReadFile fails with an I/O error + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "checkpoint", "checkpoint.json.bak"), 0755)) + + _, err := NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}}, 0) + require.Error(t, err) + require.False(t, errors.Is(err, ErrCheckpointCorruption), + "I/O errors should NOT be classified as ErrCheckpointCorruption, got: %v", err) + }) + + t.Run("corrupted backup checkpoint returns ErrCheckpointCorruption", func(t *testing.T) { + t.Parallel() + dataDir := t.TempDir() + + // Create directories + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "report"), 0755)) + require.NoError(t, os.MkdirAll(filepath.Join(dataDir, "checkpoint"), 0755)) + + // Write invalid JSON to checkpoint.json.bak (simulate crash recovery with corrupted backup) + err := os.WriteFile(filepath.Join(dataDir, "checkpoint", "checkpoint.json.bak"), []byte("not valid json"), 0600) + require.NoError(t, err) + + _, err = NewRecorder(dataDir, map[string]config.ClusterConfig{"c1": {}}, 0) + require.Error(t, err) + require.True(t, errors.Is(err, ErrCheckpointCorruption)) + }) +} diff --git a/cmd/multi-cluster-consistency-checker/recorder/types.go b/cmd/multi-cluster-consistency-checker/recorder/types.go index 86ce1caa3d..ff5bc15bbe 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go 
@@ -29,7 +29,7 @@ type DataLossItem struct { } func (item *DataLossItem) String() string { - return fmt.Sprintf("peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d, type: data loss", item.PeerClusterID, item.PK, item.OriginTS, item.CommitTS) + return fmt.Sprintf("peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d", item.PeerClusterID, item.PK, item.OriginTS, item.CommitTS) } type InconsistentColumn struct { @@ -52,7 +52,7 @@ type DataInconsistentItem struct { func (item *DataInconsistentItem) String() string { var sb strings.Builder - fmt.Fprintf(&sb, "peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d, type: data inconsistent", + fmt.Fprintf(&sb, "peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d", item.PeerClusterID, item.PK, item.OriginTS, item.CommitTS) if len(item.InconsistentColumns) > 0 { sb.WriteString(", inconsistent columns: [") diff --git a/cmd/multi-cluster-consistency-checker/recorder/types_test.go b/cmd/multi-cluster-consistency-checker/recorder/types_test.go index 2ccd68f179..c7e8d69033 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types_test.go @@ -30,7 +30,7 @@ func TestDataLossItem_String(t *testing.T) { CommitTS: 200, } s := item.String() - require.Equal(t, "peer cluster: cluster-2, pk: pk-1, origin ts: 100, commit ts: 200, type: data loss", s) + require.Equal(t, "peer cluster: cluster-2, pk: pk-1, origin ts: 100, commit ts: 200", s) } func TestDataInconsistentItem_String(t *testing.T) { @@ -45,7 +45,7 @@ func TestDataInconsistentItem_String(t *testing.T) { CommitTS: 400, } s := item.String() - require.Equal(t, "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, type: data inconsistent", s) + require.Equal(t, "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400", s) }) t.Run("with inconsistent columns", func(t *testing.T) { @@ -62,7 +62,7 @@ func TestDataInconsistentItem_String(t *testing.T) { } s := item.String() 
require.Equal(t, - "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, type: data inconsistent, "+ + "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, "+ "inconsistent columns: [column: col1, local: val_a, replicated: val_b; column: col2, local: 100, replicated: 200]", s) }) @@ -80,7 +80,7 @@ func TestDataInconsistentItem_String(t *testing.T) { } s := item.String() require.Equal(t, - "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, type: data inconsistent, "+ + "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, "+ "inconsistent columns: [column: col1, local: val_a, replicated: ]", s) }) @@ -262,7 +262,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - " - [peer cluster: d1, pk: pk-1, origin ts: 100, commit ts: 200, type: data loss]\n\n", + " - [peer cluster: d1, pk: pk-1, origin ts: 100, commit ts: 200]\n\n", s) }) @@ -312,7 +312,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - " - [peer cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data loss]\n\n", + " - [peer cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2]\n\n", s) }) @@ -333,9 +333,9 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - " - [peer cluster: d0, pk: pk-0, origin ts: 0, commit ts: 1, type: data loss]\n"+ + " - [peer cluster: d0, pk: pk-0, origin ts: 0, commit ts: 1]\n"+ " - [data inconsistent items: 1]\n"+ - " - [peer cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, type: data inconsistent, inconsistent columns: [column: val, local: x, replicated: y]]\n"+ + " - [peer cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, inconsistent columns: [column: val, local: x, replicated: y]]\n"+ " - [data redundant items: 1]\n"+ " - [pk: 
pk-2, origin ts: 3, commit ts: 4]\n"+ " - [lww violation items: 1]\n"+ diff --git a/cmd/multi-cluster-consistency-checker/task.go b/cmd/multi-cluster-consistency-checker/task.go index 30f1c33996..1bb8c35d86 100644 --- a/cmd/multi-cluster-consistency-checker/task.go +++ b/cmd/multi-cluster-consistency-checker/task.go @@ -16,6 +16,7 @@ package main import ( "context" "fmt" + "net/url" "strings" "time" @@ -40,7 +41,8 @@ import ( func runTask(ctx context.Context, cfg *config.Config, dryRun bool) error { checkpointWatchers, s3Watchers, pdClients, etcdClients, err := initClients(ctx, cfg) if err != nil { - return errors.Trace(err) + // Client initialisation is typically a transient (network) failure. + return &ExitError{Code: ExitCodeTransient, Err: errors.Trace(err)} } // Ensure cleanup happens even if there's an error defer cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) @@ -50,15 +52,19 @@ func runTask(ctx context.Context, cfg *config.Config, dryRun bool) error { return nil } - recorder, err := recorder.NewRecorder(cfg.GlobalConfig.DataDir, cfg.Clusters, cfg.GlobalConfig.MaxReportFiles) + rec, err := recorder.NewRecorder(cfg.GlobalConfig.DataDir, cfg.Clusters, cfg.GlobalConfig.MaxReportFiles) if err != nil { - return errors.Trace(err) + if errors.Is(err, recorder.ErrCheckpointCorruption) { + return &ExitError{Code: ExitCodeCheckpointCorruption, Err: err} + } + // Other recorder init errors (e.g. mkdir, readdir) are transient. 
+ return &ExitError{Code: ExitCodeTransient, Err: errors.Trace(err)} } - timeWindowAdvancer, checkpointDataMap, err := advancer.NewTimeWindowAdvancer(ctx, checkpointWatchers, s3Watchers, pdClients, recorder.GetCheckpoint()) + timeWindowAdvancer, checkpointDataMap, err := advancer.NewTimeWindowAdvancer(ctx, checkpointWatchers, s3Watchers, pdClients, rec.GetCheckpoint()) if err != nil { - return errors.Trace(err) + return &ExitError{Code: ExitCodeTransient, Err: errors.Trace(err)} } - dataChecker := checker.NewDataChecker(ctx, cfg.Clusters, checkpointDataMap, recorder.GetCheckpoint()) + dataChecker := checker.NewDataChecker(ctx, cfg.Clusters, checkpointDataMap, rec.GetCheckpoint()) log.Info("Starting consistency checker task") for { @@ -72,16 +78,16 @@ func runTask(ctx context.Context, cfg *config.Config, dryRun bool) error { newTimeWindowData, err := timeWindowAdvancer.AdvanceTimeWindow(ctx) if err != nil { - return errors.Trace(err) + return &ExitError{Code: ExitCodeTransient, Err: errors.Trace(err)} } report, err := dataChecker.CheckInNextTimeWindow(newTimeWindowData) if err != nil { - return errors.Trace(err) + return &ExitError{Code: ExitCodeTransient, Err: errors.Trace(err)} } - if err := recorder.RecordTimeWindow(newTimeWindowData, report); err != nil { - return errors.Trace(err) + if err := rec.RecordTimeWindow(newTimeWindowData, report); err != nil { + return &ExitError{Code: ExitCodeTransient, Err: errors.Trace(err)} } } } @@ -115,7 +121,7 @@ func initClients(ctx context.Context, cfg *config.Config) ( checkpointWatchers[clusterID] = clusterCheckpointWatchers // Validate s3 changefeed sink config from etcd - if err := validateS3ChangefeedSinkConfig(ctx, etcdClient, clusterID, clusterConfig.S3ChangefeedID); err != nil { + if err := validateS3ChangefeedSinkConfig(ctx, etcdClient, clusterID, clusterConfig.S3ChangefeedID, clusterConfig.S3SinkURI); err != nil { cleanupClients(pdClients, etcdClients, checkpointWatchers, s3Watchers) return nil, nil, nil, nil, 
errors.Trace(err) } @@ -139,23 +145,30 @@ func initClients(ctx context.Context, cfg *config.Config) ( } // validateS3ChangefeedSinkConfig fetches the changefeed info from etcd and validates that: -// 1. The protocol must be canal-json -// 2. The date separator must be "day" -// 3. The file index width must be DefaultFileIndexWidth -func validateS3ChangefeedSinkConfig(ctx context.Context, etcdClient *etcd.CDCEtcdClientImpl, clusterID string, s3ChangefeedID string) error { +// 1. The changefeed SinkURI bucket/prefix matches the configured s3SinkURI +// 2. The protocol must be canal-json +// 3. The date separator must be "day" +// 4. The file index width must be DefaultFileIndexWidth +func validateS3ChangefeedSinkConfig(ctx context.Context, etcdClient *etcd.CDCEtcdClientImpl, clusterID string, s3ChangefeedID string, s3SinkURI string) error { displayName := common.NewChangeFeedDisplayName(s3ChangefeedID, "default") cfInfo, err := etcdClient.GetChangeFeedInfo(ctx, displayName) if err != nil { return errors.Annotate(err, fmt.Sprintf("failed to get changefeed info for s3 changefeed %s in cluster %s", s3ChangefeedID, clusterID)) } + // 1. Validate that the changefeed's SinkURI bucket/prefix matches the configured s3SinkURI. + // This prevents the checker from reading data that was written by a different changefeed. + if err := validateS3BucketPrefix(cfInfo.SinkURI, s3SinkURI, clusterID, s3ChangefeedID); err != nil { + return err + } + if cfInfo.Config == nil || cfInfo.Config.Sink == nil { return fmt.Errorf("cluster %s: s3 changefeed %s has no sink configuration", clusterID, s3ChangefeedID) } sinkConfig := cfInfo.Config.Sink - // 1. Validate protocol must be canal-json + // 2. 
Validate protocol must be canal-json protocolStr := strings.ToLower(util.GetOrZero(sinkConfig.Protocol)) if protocolStr == "" { return fmt.Errorf("cluster %s: s3 changefeed %s has no protocol configured in sink config", clusterID, s3ChangefeedID) @@ -169,7 +182,7 @@ func validateS3ChangefeedSinkConfig(ctx context.Context, etcdClient *etcd.CDCEtc clusterID, s3ChangefeedID, protocolStr, cdcconfig.ProtocolCanalJSON.String()) } - // 2. Validate date separator must be "day" + // 3. Validate date separator must be "day" dateSeparatorStr := util.GetOrZero(sinkConfig.DateSeparator) if dateSeparatorStr == "" { dateSeparatorStr = cdcconfig.DateSeparatorNone.String() @@ -183,7 +196,7 @@ func validateS3ChangefeedSinkConfig(ctx context.Context, etcdClient *etcd.CDCEtc clusterID, s3ChangefeedID, dateSep.String(), cdcconfig.DateSeparatorDay.String()) } - // 3. Validate file index width must be DefaultFileIndexWidth + // 4. Validate file index width must be DefaultFileIndexWidth fileIndexWidth := util.GetOrZero(sinkConfig.FileIndexWidth) if fileIndexWidth != cdcconfig.DefaultFileIndexWidth { return fmt.Errorf("cluster %s: s3 changefeed %s file-index-width is %d, but only %d is supported", @@ -201,6 +214,42 @@ func validateS3ChangefeedSinkConfig(ctx context.Context, etcdClient *etcd.CDCEtc return nil } +// validateS3BucketPrefix checks that the changefeed's SinkURI and the +// configured s3-sink-uri point to the same S3 bucket and prefix. +// This is a critical sanity check — a mismatch means the checker would +// read data from a different location than where the changefeed writes. 
+func validateS3BucketPrefix(changefeedSinkURI, configS3SinkURI, clusterID, s3ChangefeedID string) error { + cfURL, err := url.Parse(changefeedSinkURI) + if err != nil { + return fmt.Errorf("cluster %s: s3 changefeed %s has invalid sink URI %q: %v", + clusterID, s3ChangefeedID, changefeedSinkURI, err) + } + cfgURL, err := url.Parse(configS3SinkURI) + if err != nil { + return fmt.Errorf("cluster %s: configured s3-sink-uri %q is invalid: %v", + clusterID, configS3SinkURI, err) + } + + // Compare scheme (s3, gcs, …), bucket (Host) and prefix (Path). + // Path is normalized by trimming trailing slashes so that + // "s3://bucket/prefix" and "s3://bucket/prefix/" are considered equal. + cfScheme := strings.ToLower(cfURL.Scheme) + cfgScheme := strings.ToLower(cfgURL.Scheme) + cfBucket := cfURL.Host + cfgBucket := cfgURL.Host + cfPrefix := strings.TrimRight(cfURL.Path, "/") + cfgPrefix := strings.TrimRight(cfgURL.Path, "/") + + if cfScheme != cfgScheme || cfBucket != cfgBucket || cfPrefix != cfgPrefix { + return fmt.Errorf("cluster %s: s3 changefeed %s sink URI bucket/prefix mismatch: "+ + "changefeed has %s://%s%s but config has %s://%s%s", + clusterID, s3ChangefeedID, + cfScheme, cfBucket, cfURL.Path, + cfgScheme, cfgBucket, cfgURL.Path) + } + return nil +} + func newPDClient(ctx context.Context, pdAddrs []string, securityConfig *security.Credential) (pd.Client, *etcd.CDCEtcdClientImpl, error) { pdClient, err := pd.NewClientWithContext( ctx, "consistency-checker", pdAddrs, securityConfig.PDSecurityOption(), From 2dfb2b66d9c3bf737370183447480a4a6d1ad6b5 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 19 Feb 2026 19:48:28 +0800 Subject: [PATCH 22/23] meet the spec Signed-off-by: Jianjun Liao --- .../checker/checker.go | 74 +++++--------- .../checker/checker_test.go | 29 ++++-- .../config/config.go | 6 ++ .../consumer/consumer.go | 42 ++++---- .../decoder/decoder.go | 17 +++- .../recorder/recorder.go | 43 ++++++-- .../recorder/recorder_test.go | 8 +- 
.../recorder/types.go | 98 ++++++++++++++----- .../recorder/types_test.go | 85 ++++++++-------- .../types/types.go | 15 ++- .../watcher/checkpoint_watcher.go | 21 ++-- 11 files changed, 265 insertions(+), 173 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index 1e9fbb075c..998fbc472c 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -93,8 +93,8 @@ func (c *clusterViolationChecker) Check(schemaKey string, r *decoder.Record, rep log.Error("LWW violation detected", zap.String("clusterID", c.clusterID), zap.Any("entry", entry), - zap.Any("record", r)) - report.AddLWWViolationItem(schemaKey, string(r.Pk), entry.cdcVersion.OriginTs, entry.cdcVersion.CommitTs, r.OriginTs, r.CommitTs) + zap.String("pk", r.PkStr)) + report.AddLWWViolationItem(schemaKey, r.PkMap, r.PkStr, entry.cdcVersion.OriginTs, entry.cdcVersion.CommitTs, r.OriginTs, r.CommitTs) return } tableSchemaKeyVersionCache[r.Pk] = versionCacheEntry{ @@ -353,7 +353,7 @@ func diffColumns(local, replicated *decoder.Record) []recorder.InconsistentColum Local: localVal, Replicated: nil, }) - } else if localVal != replicatedVal { + } else if localVal != replicatedVal { // safe: ColumnValues only holds comparable types (see decoder.go) result = append(result, recorder.InconsistentColumn{ Column: colName, Local: localVal, @@ -381,11 +381,29 @@ func diffColumns(local, replicated *decoder.Record) []recorder.InconsistentColum // in the replicated data cache [1] or [2] or another new record is present in the replicated data // cache [1] or [2]. 
func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { - for schemaKey, tableDataCache := range cd.timeWindowDataCaches[1].tableDataCaches { + // Time window [1]: skip records whose commitTs <= checkpoint (already checked in previous round) + cd.checkLocalRecordsForDataLoss(1, func(commitTs, checkpointTs uint64) bool { + return commitTs <= checkpointTs + }, checker) + // Time window [2]: skip records whose commitTs > checkpoint (will be checked in next round) + cd.checkLocalRecordsForDataLoss(2, func(commitTs, checkpointTs uint64) bool { + return commitTs > checkpointTs + }, checker) +} + +// checkLocalRecordsForDataLoss iterates through the local-written data cache at timeWindowIdx +// and checks each record against the replicated data cache. Records for which shouldSkip returns +// true are skipped. This helper unifies the logic for time windows [1] and [2]. +func (cd *clusterDataChecker) checkLocalRecordsForDataLoss( + timeWindowIdx int, + shouldSkip func(commitTs, checkpointTs uint64) bool, + checker *DataChecker, +) { + for schemaKey, tableDataCache := range cd.timeWindowDataCaches[timeWindowIdx].tableDataCaches { for _, localDataCache := range tableDataCache.localDataCache { for _, record := range localDataCache { - for replicatedClusterID, checkpointTs := range cd.timeWindowDataCaches[1].checkpointTs { - if record.CommitTs <= checkpointTs { + for replicatedClusterID, checkpointTs := range cd.timeWindowDataCaches[timeWindowIdx].checkpointTs { + if shouldSkip(record.CommitTs, checkpointTs) { continue } cd.checkedRecordsCount++ @@ -406,52 +424,14 @@ func (cd *clusterDataChecker) dataLossDetection(checker *DataChecker) { zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataLossItem(replicatedClusterID, schemaKey, record.PkStr, record.OriginTs, record.CommitTs) + cd.report.AddDataLossItem(replicatedClusterID, schemaKey, record.PkMap, record.PkStr, 
record.OriginTs, record.CommitTs) } else if !record.EqualReplicatedRecord(replicatedRecord) { // data inconsistent detected log.Error("data inconsistent detected", zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, record.PkStr, record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) - } - } - } - } - } - for schemaKey, tableDataCache := range cd.timeWindowDataCaches[2].tableDataCaches { - for _, localDataCache := range tableDataCache.localDataCache { - for _, record := range localDataCache { - for replicatedClusterID, checkpointTs := range cd.timeWindowDataCaches[2].checkpointTs { - if record.CommitTs > checkpointTs { - continue - } - cd.checkedRecordsCount++ - replicatedRecord, skipped := checker.FindClusterReplicatedData(replicatedClusterID, schemaKey, record.Pk, record.CommitTs) - if skipped { - log.Debug("replicated record skipped by LWW", - zap.String("local cluster ID", cd.clusterID), - zap.String("replicated cluster ID", replicatedClusterID), - zap.String("schemaKey", schemaKey), - zap.String("pk", record.PkStr), - zap.Uint64("commitTs", record.CommitTs)) - cd.lwwSkippedRecordsCount++ - continue - } - if replicatedRecord == nil { - // data loss detected - log.Error("data loss detected", - zap.String("local cluster ID", cd.clusterID), - zap.String("replicated cluster ID", replicatedClusterID), - zap.Any("record", record)) - cd.report.AddDataLossItem(replicatedClusterID, schemaKey, record.PkStr, record.OriginTs, record.CommitTs) - } else if !record.EqualReplicatedRecord(replicatedRecord) { - // data inconsistent detected - log.Error("data inconsistent detected", - zap.String("local cluster ID", cd.clusterID), - zap.String("replicated cluster ID", replicatedClusterID), - zap.Any("record", record)) - cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, record.PkStr, record.OriginTs, 
record.CommitTs, diffColumns(record, replicatedRecord)) + cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, record.PkMap, record.PkStr, record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) } } } @@ -472,7 +452,7 @@ func (cd *clusterDataChecker) dataRedundantDetection(checker *DataChecker) { log.Error("data redundant detected", zap.String("replicated cluster ID", cd.clusterID), zap.Any("record", record)) - cd.report.AddDataRedundantItem(schemaKey, record.PkStr, record.OriginTs, record.CommitTs) + cd.report.AddDataRedundantItem(schemaKey, record.PkMap, record.PkStr, record.OriginTs, record.CommitTs) } } } diff --git a/cmd/multi-cluster-consistency-checker/checker/checker_test.go b/cmd/multi-cluster-consistency-checker/checker/checker_test.go index 7b690b4174..e1228d58b9 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker_test.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker_test.go @@ -92,7 +92,8 @@ func TestClusterViolationChecker_Check(t *testing.T) { report := recorder.NewClusterReport("cluster1", types.TimeWindow{}) record := &decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 100, OriginTs: 0, @@ -111,14 +112,16 @@ func TestClusterViolationChecker_Check(t *testing.T) { report := recorder.NewClusterReport("cluster1", types.TimeWindow{}) record1 := &decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 100, OriginTs: 0, }, } record2 := &decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 50, OriginTs: 0, @@ -136,14 +139,16 @@ func TestClusterViolationChecker_Check(t *testing.T) { report := recorder.NewClusterReport("cluster1", types.TimeWindow{}) record1 := &decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 100, OriginTs: 0, }, } record2 := 
&decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 150, OriginTs: 50, // OriginTs is less than record1's CommitTs, causing violation @@ -156,7 +161,7 @@ func TestClusterViolationChecker_Check(t *testing.T) { require.Contains(t, report.TableFailureItems, schemaKey) tableItems := report.TableFailureItems[schemaKey] require.Len(t, tableItems.LWWViolationItems, 1) - require.Equal(t, "pk1", tableItems.LWWViolationItems[0].PK) + require.Equal(t, map[string]any{"id": "1"}, tableItems.LWWViolationItems[0].PK) require.Equal(t, uint64(0), tableItems.LWWViolationItems[0].ExistingOriginTS) require.Equal(t, uint64(100), tableItems.LWWViolationItems[0].ExistingCommitTS) require.Equal(t, uint64(50), tableItems.LWWViolationItems[0].OriginTS) @@ -175,7 +180,8 @@ func TestClusterViolationChecker_UpdateCache(t *testing.T) { report := recorder.NewClusterReport("cluster1", types.TimeWindow{}) record := &decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 100, OriginTs: 0, @@ -230,7 +236,8 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) record := &decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 150, OriginTs: 0, @@ -247,7 +254,8 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) record := &decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 150, OriginTs: 100, @@ -264,7 +272,8 @@ func TestTimeWindowDataCache_NewRecord(t *testing.T) { t.Parallel() cache := newTimeWindowDataCache(100, 200, map[string]uint64{}) record := &decoder.Record{ - Pk: "pk1", + Pk: "pk1", + PkMap: map[string]any{"id": "1"}, CdcVersion: types.CdcVersion{ CommitTs: 50, OriginTs: 0, diff 
--git a/cmd/multi-cluster-consistency-checker/config/config.go b/cmd/multi-cluster-consistency-checker/config/config.go index ea648d756b..86fac59af3 100644 --- a/cmd/multi-cluster-consistency-checker/config/config.go +++ b/cmd/multi-cluster-consistency-checker/config/config.go @@ -120,6 +120,12 @@ func LoadConfig(path string) (*Config, error) { return nil, fmt.Errorf("cluster '%s': peer-cluster-changefeed-config is not entirely configured", name) } for peerClusterID, peerClusterChangefeedConfig := range cluster.PeerClusterChangefeedConfig { + if peerClusterID == name { + return nil, fmt.Errorf("cluster '%s': peer-cluster-changefeed-config references itself", name) + } + if _, ok := cfg.Clusters[peerClusterID]; !ok { + return nil, fmt.Errorf("cluster '%s': peer-cluster-changefeed-config references unknown cluster '%s'", name, peerClusterID) + } if peerClusterChangefeedConfig.ChangefeedID == "" { return nil, fmt.Errorf("cluster '%s': peer-cluster-changefeed-config[%s]: changefeed-id is required", name, peerClusterID) } diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer.go b/cmd/multi-cluster-consistency-checker/consumer/consumer.go index e251a06813..73544ddde5 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer.go @@ -440,32 +440,32 @@ func (c *S3Consumer) getNewFilesForSchemaPathKeyWithEndPath( // downloadSchemaFiles downloads schema files concurrently for given schema path keys func (c *S3Consumer) downloadSchemaFiles( - ctx context.Context, + _ context.Context, newVersionPaths map[cloudstorage.SchemaPathKey]string, ) error { - eg, _ := errgroup.WithContext(ctx) + // eg, ectx := errgroup.WithContext(ctx) log.Debug("starting concurrent schema file download", zap.Int("totalSchemas", len(newVersionPaths))) for schemaPathKey, filePath := range newVersionPaths { - eg.Go(func() error { - // content, err := c.s3Storage.ReadFile(egCtx, filePath) - // if err != nil { - // 
return errors.Annotatef(err, "failed to read schema file: %s", filePath) - // } - // - // Use canal-json decoder for S3 sink with .json file extension - // parser, err := types.NewTableParserWithFormat(schemaPathKey.GetKey(), content, config.ProtocolCanalJSON) - // if err != nil { - // return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) - // } - - c.schemaParser.SetSchemaParser(schemaPathKey, filePath, nil) - return nil - }) - } - if err := eg.Wait(); err != nil { - return errors.Trace(err) - } + // eg.Go(func() error { + // content, err := c.s3Storage.ReadFile(egCtx, filePath) + // if err != nil { + // return errors.Annotatef(err, "failed to read schema file: %s", filePath) + // } + // + // Use canal-json decoder for S3 sink with .json file extension + // parser, err := types.NewTableParserWithFormat(schemaPathKey.GetKey(), content, config.ProtocolCanalJSON) + // if err != nil { + // return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) + // } + // + c.schemaParser.SetSchemaParser(schemaPathKey, filePath, nil) + // return nil + // }) + } + //if err := eg.Wait(); err != nil { + // return errors.Trace(err) + //} return nil } diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder.go b/cmd/multi-cluster-consistency-checker/decoder/decoder.go index ac03677229..dff25ad771 100644 --- a/cmd/multi-cluster-consistency-checker/decoder/decoder.go +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder.go @@ -29,7 +29,6 @@ import ( "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/codec/common" - codecCommon "github.com/pingcap/ticdc/pkg/sink/codec/common" "github.com/pingcap/tidb/pkg/parser/mysql" ptypes "github.com/pingcap/tidb/pkg/parser/types" tiTypes "github.com/pingcap/tidb/pkg/types" @@ -70,8 +69,8 @@ type canalValueDecoderJSONMessageWithTiDBExtension struct { TiDBCommitTsExtension *TiDBCommitTsExtension `json:"_tidb"` } 
-func defaultCanalJSONCodecConfig() *codecCommon.Config { - codecConfig := codecCommon.NewConfig(config.ProtocolCanalJSON) +func defaultCanalJSONCodecConfig() *common.Config { + codecConfig := common.NewConfig(config.ProtocolCanalJSON) // Always enable tidb extension for canal-json protocol // because we need to get the commit ts from the extension field. codecConfig.EnableTiDBExtension = true @@ -83,6 +82,7 @@ type Record struct { types.CdcVersion Pk types.PkType PkStr string + PkMap map[string]any ColumnValues map[string]any } @@ -104,6 +104,10 @@ func (r *Record) EqualReplicatedRecord(replicatedRecord *Record) bool { if !ok { return false } + // NOTE: This comparison is safe because ColumnValues only holds comparable + // types (nil, string, int64, float64, etc.) as produced by the canal-json + // decoder. If a non-comparable type (e.g. []byte or map) were ever stored, + // the != operator would panic at runtime. if columnValue != replicatedColumnValue { return false } @@ -181,9 +185,9 @@ func (d *columnValueDecoder) tryNext() (common.MessageType, bool) { } if err := json.Unmarshal(encodedData, msg); err != nil { - log.Panic("canal-json decoder unmarshal data failed", + log.Error("canal-json decoder unmarshal data failed", zap.Error(err), zap.ByteString("data", encodedData)) - return common.MessageTypeUnknown, false + return common.MessageTypeUnknown, true } d.msg = msg return d.msg.messageType(), true @@ -198,6 +202,7 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { var pkStrBuilder strings.Builder pkStrBuilder.WriteString("[") pkValues := make([]tiTypes.Datum, 0, len(d.msg.PkNames)) + pkMap := make(map[string]any, len(d.msg.PkNames)) slices.Sort(d.msg.PkNames) for i, pkName := range d.msg.PkNames { mysqlType, ok := d.msg.MySQLType[pkName] @@ -214,6 +219,7 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { pkStrBuilder.WriteString(", ") } fmt.Fprintf(&pkStrBuilder, "%s: %v", pkName, columnValue) + pkMap[pkName] = columnValue ft := 
newPKColumnFieldTypeFromMysqlType(mysqlType) datum := valueToDatum(columnValue, ft) if datum.IsNull() { @@ -248,6 +254,7 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { return &Record{ Pk: types.PkType(pk), PkStr: pkStrBuilder.String(), + PkMap: pkMap, ColumnValues: columnValues, CdcVersion: types.CdcVersion{ CommitTs: commitTs, diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder.go b/cmd/multi-cluster-consistency-checker/recorder/recorder.go index 25052ec8ab..3affe7b7dd 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder.go @@ -161,19 +161,16 @@ func (r *Recorder) RecordTimeWindow(timeWindowData map[string]types.TimeWindowDa func (r *Recorder) flushReport(report *Report) error { reportName := fmt.Sprintf("report-%d.report", report.Round) - filename := filepath.Join(r.reportDir, reportName) - data := report.MarshalReport() - if err := os.WriteFile(filename, []byte(data), 0600); err != nil { + if err := atomicWriteFile(filepath.Join(r.reportDir, reportName), []byte(report.MarshalReport())); err != nil { return errors.Trace(err) } jsonName := fmt.Sprintf("report-%d.json", report.Round) - filename = filepath.Join(r.reportDir, jsonName) dataBytes, err := json.Marshal(report) if err != nil { return errors.Trace(err) } - if err := os.WriteFile(filename, dataBytes, 0600); err != nil { + if err := atomicWriteFile(filepath.Join(r.reportDir, jsonName), dataBytes); err != nil { return errors.Trace(err) } @@ -182,6 +179,38 @@ func (r *Recorder) flushReport(report *Report) error { return nil } +// atomicWriteFile writes data to a temporary file, fsyncs it to ensure +// durability, and then atomically renames it to the target path. +// This prevents partial / corrupt files on crash. 
+func atomicWriteFile(targetPath string, data []byte) error { + tempPath := targetPath + ".tmp" + if err := syncWriteFile(tempPath, data); err != nil { + return errors.Trace(err) + } + if err := os.Rename(tempPath, targetPath); err != nil { + return errors.Trace(err) + } + return nil +} + +// syncWriteFile writes data to a file and fsyncs it before returning, +// guaranteeing that the content is durable on disk. +func syncWriteFile(path string, data []byte) error { + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600) + if err != nil { + return errors.Trace(err) + } + if _, err := f.Write(data); err != nil { + f.Close() + return errors.Trace(err) + } + if err := f.Sync(); err != nil { + f.Close() + return errors.Trace(err) + } + return errors.Trace(f.Close()) +} + // cleanupOldReports removes the oldest report files from the in-memory cache // when the total number exceeds maxReportFiles * 2 (each round produces .report + .json). func (r *Recorder) cleanupOldReports() { @@ -215,8 +244,8 @@ func (r *Recorder) flushCheckpoint(round uint64, timeWindowData map[string]types return errors.Trace(err) } - // 1. Write the new content to a temp file first. - if err := os.WriteFile(tempFile, data, 0600); err != nil { + // 1. Write the new content to a temp file first and fsync it. 
+ if err := syncWriteFile(tempFile, data); err != nil { return errors.Trace(err) } diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go index 1bbfaa16e9..b2ca0578a0 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go @@ -230,7 +230,7 @@ func TestRecorder_RecordTimeWindow(t *testing.T) { } report := NewReport(5) cr := NewClusterReport("c1", types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}) - cr.AddDataLossItem("d1", "test_table", "pk-1", 100, 200) + cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, 100, 200) report.AddClusterReport("c1", cr) require.True(t, report.NeedFlush()) @@ -251,7 +251,7 @@ func TestRecorder_RecordTimeWindow(t *testing.T) { reportData, err := os.ReadFile(filepath.Join(dataDir, "report", "report-5.report")) require.NoError(t, err) require.Contains(t, string(reportData), "round: 5") - require.Contains(t, string(reportData), "pk-1") + require.Contains(t, string(reportData), `[id: 1]`) // Verify json report is valid JSON jsonData, err := os.ReadFile(filepath.Join(dataDir, "report", "report-5.json")) @@ -353,7 +353,7 @@ func TestRecorder_CheckpointPersistence(t *testing.T) { } report := NewReport(i) cr := NewClusterReport("c1", types.TimeWindow{LeftBoundary: i * 10, RightBoundary: (i + 1) * 10}) - cr.AddDataLossItem("d1", "test_table", "pk-1", i, i+1) + cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, i, i+1) report.AddClusterReport("c1", cr) require.True(t, report.NeedFlush()) @@ -395,7 +395,7 @@ func TestRecorder_CheckpointPersistence(t *testing.T) { } report := NewReport(i) cr := NewClusterReport("c1", types.TimeWindow{LeftBoundary: i * 10, RightBoundary: (i + 1) * 10}) - cr.AddDataLossItem("d1", "test_table", "pk-1", i, i+1) + cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, i, 
i+1) report.AddClusterReport("c1", cr) err = r.RecordTimeWindow(twData, report) diff --git a/cmd/multi-cluster-consistency-checker/recorder/types.go b/cmd/multi-cluster-consistency-checker/recorder/types.go index ff5bc15bbe..0fceddc3a5 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go @@ -15,6 +15,7 @@ package recorder import ( "fmt" + "sort" "strings" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" @@ -22,14 +23,16 @@ import ( ) type DataLossItem struct { - PeerClusterID string `json:"peer_cluster_id"` - PK string `json:"pk"` - OriginTS uint64 `json:"origin_ts"` - CommitTS uint64 `json:"commit_ts"` + PeerClusterID string `json:"peer_cluster_id"` + PK map[string]any `json:"pk"` + OriginTS uint64 `json:"origin_ts"` + CommitTS uint64 `json:"commit_ts"` + + PKStr string `json:"-"` } func (item *DataLossItem) String() string { - return fmt.Sprintf("peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d", item.PeerClusterID, item.PK, item.OriginTS, item.CommitTS) + return fmt.Sprintf("peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d", item.PeerClusterID, item.PKStr, item.OriginTS, item.CommitTS) } type InconsistentColumn struct { @@ -44,16 +47,18 @@ func (c *InconsistentColumn) String() string { type DataInconsistentItem struct { PeerClusterID string `json:"peer_cluster_id"` - PK string `json:"pk"` + PK map[string]any `json:"pk"` OriginTS uint64 `json:"origin_ts"` CommitTS uint64 `json:"commit_ts"` InconsistentColumns []InconsistentColumn `json:"inconsistent_columns,omitempty"` + + PKStr string `json:"-"` } func (item *DataInconsistentItem) String() string { var sb strings.Builder fmt.Fprintf(&sb, "peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d", - item.PeerClusterID, item.PK, item.OriginTS, item.CommitTS) + item.PeerClusterID, item.PKStr, item.OriginTS, item.CommitTS) if len(item.InconsistentColumns) > 0 { sb.WriteString(", inconsistent columns: [") for 
i, col := range item.InconsistentColumns { @@ -68,27 +73,31 @@ func (item *DataInconsistentItem) String() string { } type DataRedundantItem struct { - PK string `json:"pk"` - OriginTS uint64 `json:"origin_ts"` - CommitTS uint64 `json:"commit_ts"` + PK map[string]any `json:"pk"` + OriginTS uint64 `json:"origin_ts"` + CommitTS uint64 `json:"commit_ts"` + + PKStr string `json:"-"` } func (item *DataRedundantItem) String() string { - return fmt.Sprintf("pk: %s, origin ts: %d, commit ts: %d", item.PK, item.OriginTS, item.CommitTS) + return fmt.Sprintf("pk: %s, origin ts: %d, commit ts: %d", item.PKStr, item.OriginTS, item.CommitTS) } type LWWViolationItem struct { - PK string `json:"pk"` - ExistingOriginTS uint64 `json:"existing_origin_ts"` - ExistingCommitTS uint64 `json:"existing_commit_ts"` - OriginTS uint64 `json:"origin_ts"` - CommitTS uint64 `json:"commit_ts"` + PK map[string]any `json:"pk"` + ExistingOriginTS uint64 `json:"existing_origin_ts"` + ExistingCommitTS uint64 `json:"existing_commit_ts"` + OriginTS uint64 `json:"origin_ts"` + CommitTS uint64 `json:"commit_ts"` + + PKStr string `json:"-"` } func (item *LWWViolationItem) String() string { return fmt.Sprintf( "pk: %s, existing origin ts: %d, existing commit ts: %d, origin ts: %d, commit ts: %d", - item.PK, item.ExistingOriginTS, item.ExistingCommitTS, item.OriginTS, item.CommitTS) + item.PKStr, item.ExistingOriginTS, item.ExistingCommitTS, item.OriginTS, item.CommitTS) } type TableFailureItems struct { @@ -126,7 +135,12 @@ func NewClusterReport(clusterID string, timeWindow types.TimeWindow) *ClusterRep } } -func (r *ClusterReport) AddDataLossItem(peerClusterID, schemaKey, pk string, originTS, commitTS uint64) { +func (r *ClusterReport) AddDataLossItem( + peerClusterID, schemaKey string, + pk map[string]any, + pkStr string, + originTS, commitTS uint64, +) { tableFailureItems, exists := r.TableFailureItems[schemaKey] if !exists { tableFailureItems = NewTableFailureItems() @@ -137,11 +151,19 @@ func (r 
*ClusterReport) AddDataLossItem(peerClusterID, schemaKey, pk string, ori PK: pk, OriginTS: originTS, CommitTS: commitTS, + + PKStr: pkStr, }) r.needFlush = true } -func (r *ClusterReport) AddDataInconsistentItem(peerClusterID, schemaKey, pk string, originTS, commitTS uint64, inconsistentColumns []InconsistentColumn) { +func (r *ClusterReport) AddDataInconsistentItem( + peerClusterID, schemaKey string, + pk map[string]any, + pkStr string, + originTS, commitTS uint64, + inconsistentColumns []InconsistentColumn, +) { tableFailureItems, exists := r.TableFailureItems[schemaKey] if !exists { tableFailureItems = NewTableFailureItems() @@ -153,11 +175,18 @@ func (r *ClusterReport) AddDataInconsistentItem(peerClusterID, schemaKey, pk str OriginTS: originTS, CommitTS: commitTS, InconsistentColumns: inconsistentColumns, + + PKStr: pkStr, }) r.needFlush = true } -func (r *ClusterReport) AddDataRedundantItem(schemaKey, pk string, originTS, commitTS uint64) { +func (r *ClusterReport) AddDataRedundantItem( + schemaKey string, + pk map[string]any, + pkStr string, + originTS, commitTS uint64, +) { tableFailureItems, exists := r.TableFailureItems[schemaKey] if !exists { tableFailureItems = NewTableFailureItems() @@ -167,13 +196,16 @@ func (r *ClusterReport) AddDataRedundantItem(schemaKey, pk string, originTS, com PK: pk, OriginTS: originTS, CommitTS: commitTS, + + PKStr: pkStr, }) r.needFlush = true } func (r *ClusterReport) AddLWWViolationItem( schemaKey string, - pk string, + pk map[string]any, + pkStr string, existingOriginTS, existingCommitTS uint64, originTS, commitTS uint64, ) { @@ -188,6 +220,8 @@ func (r *ClusterReport) AddLWWViolationItem( ExistingCommitTS: existingCommitTS, OriginTS: originTS, CommitTS: commitTS, + + PKStr: pkStr, }) r.needFlush = true } @@ -214,13 +248,31 @@ func (r *Report) AddClusterReport(clusterID string, clusterReport *ClusterReport func (r *Report) MarshalReport() string { var reportMsg strings.Builder fmt.Fprintf(&reportMsg, "round: %d\n", r.Round) 
- for clusterID, clusterReport := range r.ClusterReports { + + // Sort cluster IDs for deterministic output + clusterIDs := make([]string, 0, len(r.ClusterReports)) + for clusterID := range r.ClusterReports { + clusterIDs = append(clusterIDs, clusterID) + } + sort.Strings(clusterIDs) + + for _, clusterID := range clusterIDs { + clusterReport := r.ClusterReports[clusterID] if !clusterReport.needFlush { continue } fmt.Fprintf(&reportMsg, "\n[cluster: %s]\n", clusterID) fmt.Fprintf(&reportMsg, "time window: %s\n", clusterReport.TimeWindow.String()) - for schemaKey, tableFailureItems := range clusterReport.TableFailureItems { + + // Sort schema keys for deterministic output + schemaKeys := make([]string, 0, len(clusterReport.TableFailureItems)) + for schemaKey := range clusterReport.TableFailureItems { + schemaKeys = append(schemaKeys, schemaKey) + } + sort.Strings(schemaKeys) + + for _, schemaKey := range schemaKeys { + tableFailureItems := clusterReport.TableFailureItems[schemaKey] fmt.Fprintf(&reportMsg, " - [table name: %s]\n", schemaKey) if len(tableFailureItems.DataLossItems) > 0 { fmt.Fprintf(&reportMsg, " - [data loss items: %d]\n", len(tableFailureItems.DataLossItems)) diff --git a/cmd/multi-cluster-consistency-checker/recorder/types_test.go b/cmd/multi-cluster-consistency-checker/recorder/types_test.go index c7e8d69033..e054b48d19 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types_test.go @@ -25,12 +25,13 @@ func TestDataLossItem_String(t *testing.T) { t.Parallel() item := &DataLossItem{ PeerClusterID: "cluster-2", - PK: "pk-1", + PK: map[string]any{"id": "1"}, OriginTS: 100, CommitTS: 200, + PKStr: `[id: 1]`, } s := item.String() - require.Equal(t, "peer cluster: cluster-2, pk: pk-1, origin ts: 100, commit ts: 200", s) + require.Equal(t, `peer cluster: cluster-2, pk: [id: 1], origin ts: 100, commit ts: 200`, s) } func TestDataInconsistentItem_String(t *testing.T) { @@ -40,21 
+41,23 @@ func TestDataInconsistentItem_String(t *testing.T) { t.Parallel() item := &DataInconsistentItem{ PeerClusterID: "cluster-3", - PK: "pk-2", + PK: map[string]any{"id": "2"}, OriginTS: 300, CommitTS: 400, + PKStr: `[id: 2]`, } s := item.String() - require.Equal(t, "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400", s) + require.Equal(t, `peer cluster: cluster-3, pk: [id: 2], origin ts: 300, commit ts: 400`, s) }) t.Run("with inconsistent columns", func(t *testing.T) { t.Parallel() item := &DataInconsistentItem{ PeerClusterID: "cluster-3", - PK: "pk-2", + PK: map[string]any{"id": "2"}, OriginTS: 300, CommitTS: 400, + PKStr: `[id: 2]`, InconsistentColumns: []InconsistentColumn{ {Column: "col1", Local: "val_a", Replicated: "val_b"}, {Column: "col2", Local: 100, Replicated: 200}, @@ -62,7 +65,7 @@ func TestDataInconsistentItem_String(t *testing.T) { } s := item.String() require.Equal(t, - "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, "+ + `peer cluster: cluster-3, pk: [id: 2], origin ts: 300, commit ts: 400, `+ "inconsistent columns: [column: col1, local: val_a, replicated: val_b; column: col2, local: 100, replicated: 200]", s) }) @@ -71,16 +74,17 @@ func TestDataInconsistentItem_String(t *testing.T) { t.Parallel() item := &DataInconsistentItem{ PeerClusterID: "cluster-3", - PK: "pk-2", + PK: map[string]any{"id": "2"}, OriginTS: 300, CommitTS: 400, + PKStr: `[id: 2]`, InconsistentColumns: []InconsistentColumn{ {Column: "col1", Local: "val_a", Replicated: nil}, }, } s := item.String() require.Equal(t, - "peer cluster: cluster-3, pk: pk-2, origin ts: 300, commit ts: 400, "+ + `peer cluster: cluster-3, pk: [id: 2], origin ts: 300, commit ts: 400, `+ "inconsistent columns: [column: col1, local: val_a, replicated: ]", s) }) @@ -88,22 +92,23 @@ func TestDataInconsistentItem_String(t *testing.T) { func TestDataRedundantItem_String(t *testing.T) { t.Parallel() - item := &DataRedundantItem{PK: "pk-x", OriginTS: 10, CommitTS: 20} + 
item := &DataRedundantItem{PK: map[string]any{"id": "x"}, PKStr: `[id: x]`, OriginTS: 10, CommitTS: 20} s := item.String() - require.Equal(t, "pk: pk-x, origin ts: 10, commit ts: 20", s) + require.Equal(t, `pk: [id: x], origin ts: 10, commit ts: 20`, s) } func TestLWWViolationItem_String(t *testing.T) { t.Parallel() item := &LWWViolationItem{ - PK: "pk-y", + PK: map[string]any{"id": "y"}, + PKStr: `[id: y]`, ExistingOriginTS: 1, ExistingCommitTS: 2, OriginTS: 3, CommitTS: 4, } s := item.String() - require.Equal(t, "pk: pk-y, existing origin ts: 1, existing commit ts: 2, origin ts: 3, commit ts: 4", s) + require.Equal(t, `pk: [id: y], existing origin ts: 1, existing commit ts: 2, origin ts: 3, commit ts: 4`, s) } const testSchemaKey = "test_table" @@ -122,14 +127,14 @@ func TestClusterReport(t *testing.T) { t.Run("add data loss item sets needFlush", func(t *testing.T) { t.Parallel() cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("peer-cluster-1", testSchemaKey, "pk-1", 100, 200) + cr.AddDataLossItem("peer-cluster-1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 100, 200) require.Len(t, cr.TableFailureItems, 1) require.Contains(t, cr.TableFailureItems, testSchemaKey) tableItems := cr.TableFailureItems[testSchemaKey] require.Len(t, tableItems.DataLossItems, 1) require.True(t, cr.needFlush) require.Equal(t, "peer-cluster-1", tableItems.DataLossItems[0].PeerClusterID) - require.Equal(t, "pk-1", tableItems.DataLossItems[0].PK) + require.Equal(t, map[string]any{"id": "1"}, tableItems.DataLossItems[0].PK) require.Equal(t, uint64(100), tableItems.DataLossItems[0].OriginTS) require.Equal(t, uint64(200), tableItems.DataLossItems[0].CommitTS) }) @@ -140,14 +145,14 @@ func TestClusterReport(t *testing.T) { cols := []InconsistentColumn{ {Column: "val", Local: "a", Replicated: "b"}, } - cr.AddDataInconsistentItem("peer-cluster-2", testSchemaKey, "pk-2", 300, 400, cols) + cr.AddDataInconsistentItem("peer-cluster-2", testSchemaKey, 
map[string]any{"id": "2"}, `[id: 2]`, 300, 400, cols) require.Len(t, cr.TableFailureItems, 1) require.Contains(t, cr.TableFailureItems, testSchemaKey) tableItems := cr.TableFailureItems[testSchemaKey] require.Len(t, tableItems.DataInconsistentItems, 1) require.True(t, cr.needFlush) require.Equal(t, "peer-cluster-2", tableItems.DataInconsistentItems[0].PeerClusterID) - require.Equal(t, "pk-2", tableItems.DataInconsistentItems[0].PK) + require.Equal(t, map[string]any{"id": "2"}, tableItems.DataInconsistentItems[0].PK) require.Equal(t, uint64(300), tableItems.DataInconsistentItems[0].OriginTS) require.Equal(t, uint64(400), tableItems.DataInconsistentItems[0].CommitTS) require.Len(t, tableItems.DataInconsistentItems[0].InconsistentColumns, 1) @@ -159,7 +164,7 @@ func TestClusterReport(t *testing.T) { t.Run("add data redundant item sets needFlush", func(t *testing.T) { t.Parallel() cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataRedundantItem(testSchemaKey, "pk-2", 300, 400) + cr.AddDataRedundantItem(testSchemaKey, map[string]any{"id": "2"}, `id: 2`, 300, 400) require.Len(t, cr.TableFailureItems, 1) tableItems := cr.TableFailureItems[testSchemaKey] require.Len(t, tableItems.DataRedundantItems, 1) @@ -169,7 +174,7 @@ func TestClusterReport(t *testing.T) { t.Run("add lww violation item sets needFlush", func(t *testing.T) { t.Parallel() cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddLWWViolationItem(testSchemaKey, "pk-3", 1, 2, 3, 4) + cr.AddLWWViolationItem(testSchemaKey, map[string]any{"id": "3"}, `id: 3`, 1, 2, 3, 4) require.Len(t, cr.TableFailureItems, 1) tableItems := cr.TableFailureItems[testSchemaKey] require.Len(t, tableItems.LWWViolationItems, 1) @@ -183,10 +188,10 @@ func TestClusterReport(t *testing.T) { t.Run("add multiple items", func(t *testing.T) { t.Parallel() cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2) - cr.AddDataInconsistentItem("d2", testSchemaKey, "pk-2", 3, 4, nil) - 
cr.AddDataRedundantItem(testSchemaKey, "pk-3", 5, 6) - cr.AddLWWViolationItem(testSchemaKey, "pk-4", 7, 8, 9, 10) + cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `id: 1`, 1, 2) + cr.AddDataInconsistentItem("d2", testSchemaKey, map[string]any{"id": "2"}, `[id: 2]`, 3, 4, nil) + cr.AddDataRedundantItem(testSchemaKey, map[string]any{"id": "3"}, `[id: 3]`, 5, 6) + cr.AddLWWViolationItem(testSchemaKey, map[string]any{"id": "4"}, `[id: 4]`, 7, 8, 9, 10) require.Len(t, cr.TableFailureItems, 1) tableItems := cr.TableFailureItems[testSchemaKey] require.Len(t, tableItems.DataLossItems, 1) @@ -220,7 +225,7 @@ func TestReport(t *testing.T) { t.Parallel() r := NewReport(1) cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2) + cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 1, 2) r.AddClusterReport("c1", cr) require.True(t, r.NeedFlush()) }) @@ -230,7 +235,7 @@ func TestReport(t *testing.T) { r := NewReport(1) cr1 := NewClusterReport("c1", types.TimeWindow{}) cr2 := NewClusterReport("c2", types.TimeWindow{}) - cr2.AddDataRedundantItem(testSchemaKey, "pk-1", 1, 2) + cr2.AddDataRedundantItem(testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 1, 2) r.AddClusterReport("c1", cr1) r.AddClusterReport("c2", cr2) require.True(t, r.NeedFlush()) @@ -254,7 +259,7 @@ func TestReport_MarshalReport(t *testing.T) { t.Parallel() r := NewReport(1) cr := NewClusterReport("c1", tw) - cr.AddDataLossItem("d1", testSchemaKey, "pk-1", 100, 200) + cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 100, 200) r.AddClusterReport("c1", cr) s := r.MarshalReport() require.Equal(t, "round: 1\n\n"+ @@ -262,7 +267,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - " - [peer cluster: d1, pk: pk-1, origin ts: 100, commit ts: 200]\n\n", + ` - [peer cluster: d1, pk: [id: 1], origin ts: 
100, commit ts: 200]`+"\n\n", s) }) @@ -270,7 +275,7 @@ func TestReport_MarshalReport(t *testing.T) { t.Parallel() r := NewReport(2) cr := NewClusterReport("c2", tw) - cr.AddDataRedundantItem(testSchemaKey, "pk-r", 10, 20) + cr.AddDataRedundantItem(testSchemaKey, map[string]any{"id": "r"}, `[id: r]`, 10, 20) r.AddClusterReport("c2", cr) s := r.MarshalReport() require.Equal(t, "round: 2\n\n"+ @@ -278,7 +283,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data redundant items: 1]\n"+ - " - [pk: pk-r, origin ts: 10, commit ts: 20]\n\n", + ` - [pk: [id: r], origin ts: 10, commit ts: 20]`+"\n\n", s) }) @@ -286,7 +291,7 @@ func TestReport_MarshalReport(t *testing.T) { t.Parallel() r := NewReport(3) cr := NewClusterReport("c3", tw) - cr.AddLWWViolationItem(testSchemaKey, "pk-v", 1, 2, 3, 4) + cr.AddLWWViolationItem(testSchemaKey, map[string]any{"id": "v"}, `[id: v]`, 1, 2, 3, 4) r.AddClusterReport("c3", cr) s := r.MarshalReport() require.Equal(t, "round: 3\n\n"+ @@ -294,7 +299,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [lww violation items: 1]\n"+ - " - [pk: pk-v, existing origin ts: 1, existing commit ts: 2, origin ts: 3, commit ts: 4]\n\n", + ` - [pk: [id: v], existing origin ts: 1, existing commit ts: 2, origin ts: 3, commit ts: 4]`+"\n\n", s) }) @@ -303,7 +308,7 @@ func TestReport_MarshalReport(t *testing.T) { r := NewReport(1) crEmpty := NewClusterReport("empty-cluster", tw) crFull := NewClusterReport("full-cluster", tw) - crFull.AddDataLossItem("d1", testSchemaKey, "pk-1", 1, 2) + crFull.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 1, 2) r.AddClusterReport("empty-cluster", crEmpty) r.AddClusterReport("full-cluster", crFull) s := r.MarshalReport() @@ -312,7 +317,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - 
[data loss items: 1]\n"+ - " - [peer cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2]\n\n", + ` - [peer cluster: d1, pk: [id: 1], origin ts: 1, commit ts: 2]`+"\n\n", s) }) @@ -320,12 +325,12 @@ func TestReport_MarshalReport(t *testing.T) { t.Parallel() r := NewReport(10) cr := NewClusterReport("c1", tw) - cr.AddDataLossItem("d0", testSchemaKey, "pk-0", 0, 1) - cr.AddDataInconsistentItem("d1", testSchemaKey, "pk-1", 1, 2, []InconsistentColumn{ + cr.AddDataLossItem("d0", testSchemaKey, map[string]any{"id": "0"}, `[id: 0]`, 0, 1) + cr.AddDataInconsistentItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 1, 2, []InconsistentColumn{ {Column: "val", Local: "x", Replicated: "y"}, }) - cr.AddDataRedundantItem(testSchemaKey, "pk-2", 3, 4) - cr.AddLWWViolationItem(testSchemaKey, "pk-3", 5, 6, 7, 8) + cr.AddDataRedundantItem(testSchemaKey, map[string]any{"id": "2"}, `[id: 2]`, 3, 4) + cr.AddLWWViolationItem(testSchemaKey, map[string]any{"id": "3"}, `[id: 3]`, 5, 6, 7, 8) r.AddClusterReport("c1", cr) s := r.MarshalReport() require.Equal(t, "round: 10\n\n"+ @@ -333,13 +338,13 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - " - [peer cluster: d0, pk: pk-0, origin ts: 0, commit ts: 1]\n"+ + ` - [peer cluster: d0, pk: [id: 0], origin ts: 0, commit ts: 1]`+"\n"+ " - [data inconsistent items: 1]\n"+ - " - [peer cluster: d1, pk: pk-1, origin ts: 1, commit ts: 2, inconsistent columns: [column: val, local: x, replicated: y]]\n"+ + ` - [peer cluster: d1, pk: [id: 1], origin ts: 1, commit ts: 2, inconsistent columns: [column: val, local: x, replicated: y]]`+"\n"+ " - [data redundant items: 1]\n"+ - " - [pk: pk-2, origin ts: 3, commit ts: 4]\n"+ + ` - [pk: [id: 2], origin ts: 3, commit ts: 4]`+"\n"+ " - [lww violation items: 1]\n"+ - " - [pk: pk-3, existing origin ts: 5, existing commit ts: 6, origin ts: 7, commit ts: 8]\n\n", + ` - [pk: [id: 3], existing origin ts: 5, 
existing commit ts: 6, origin ts: 7, commit ts: 8]`+"\n\n", s) }) } diff --git a/cmd/multi-cluster-consistency-checker/types/types.go b/cmd/multi-cluster-consistency-checker/types/types.go index a8f06265fd..e6309bd1b8 100644 --- a/cmd/multi-cluster-consistency-checker/types/types.go +++ b/cmd/multi-cluster-consistency-checker/types/types.go @@ -15,11 +15,14 @@ package types import ( "fmt" + "sort" "strings" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" ) +// PkType is a distinct type for encoded primary key strings, making it clear +// at the type level that the value has been serialized / encoded. type PkType string type CdcVersion struct { @@ -68,8 +71,16 @@ type TimeWindow struct { func (t *TimeWindow) String() string { var builder strings.Builder fmt.Fprintf(&builder, "time window boundary: (%d, %d]\n", t.LeftBoundary, t.RightBoundary) - for replicatedClusterID, checkpointTs := range t.CheckpointTs { - fmt.Fprintf(&builder, "checkpoint ts [replicated cluster: %s]: %d\n", replicatedClusterID, checkpointTs) + + // Sort cluster IDs for deterministic output + clusterIDs := make([]string, 0, len(t.CheckpointTs)) + for id := range t.CheckpointTs { + clusterIDs = append(clusterIDs, id) + } + sort.Strings(clusterIDs) + + for _, replicatedClusterID := range clusterIDs { + fmt.Fprintf(&builder, "checkpoint ts [replicated cluster: %s]: %d\n", replicatedClusterID, t.CheckpointTs[replicatedClusterID]) } return builder.String() } diff --git a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go index 09c5827008..6aa05c17f6 100644 --- a/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go +++ b/cmd/multi-cluster-consistency-checker/watcher/checkpoint_watcher.go @@ -15,7 +15,6 @@ package watcher import ( "context" - "strings" "sync" "time" @@ -28,6 +27,11 @@ import ( "go.uber.org/zap" ) +// errChangefeedKeyDeleted is a sentinel error indicating that the changefeed +// status 
key has been deleted from etcd. This is a non-recoverable error +// that should not be retried. +var errChangefeedKeyDeleted = errors.New("changefeed status key is deleted") + const ( // retryBackoffBase is the initial backoff duration for retries retryBackoffBase = 500 * time.Millisecond @@ -169,7 +173,7 @@ func (cw *CheckpointWatcher) run() { } // Check if this is a non-recoverable error - if isNonRecoverableError(err) { + if errors.Is(err, errChangefeedKeyDeleted) { cw.mu.Lock() cw.watchErr = err cw.mu.Unlock() @@ -248,8 +252,7 @@ func (cw *CheckpointWatcher) watchOnce() error { for _, event := range watchResp.Events { if event.Type == clientv3.EventTypeDelete { - // Key deletion is a non-recoverable error - return errors.Errorf("[changefeedID: %s] changefeed status key is deleted", cw.changefeedID.String()) + return errors.Annotatef(errChangefeedKeyDeleted, "[changefeedID: %s]", cw.changefeedID.String()) } // Parse the updated status @@ -278,16 +281,6 @@ func (cw *CheckpointWatcher) watchOnce() error { } } -// isNonRecoverableError checks if the error is non-recoverable and should not be retried -func isNonRecoverableError(err error) bool { - errMsg := err.Error() - // Key deletion is non-recoverable - if strings.Contains(errMsg, "deleted") { - return true - } - return false -} - // notifyPendingTasksLocked notifies pending tasks whose minCheckpointTs has been exceeded // Must be called with mu locked func (cw *CheckpointWatcher) notifyPendingTasksLocked() { From 0b9c0bba59c54c42dcefc26b788f563cfb3ae5d0 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Mon, 23 Feb 2026 23:04:58 +0800 Subject: [PATCH 23/23] meet the spec Signed-off-by: Jianjun Liao --- .../checker/checker.go | 8 +- .../checker/checker_test.go | 5 +- .../consumer/consumer.go | 140 ++++++++++-------- .../consumer/consumer_test.go | 135 +++++++++++++---- .../decoder/decoder.go | 36 ++++- .../decoder/decoder_test.go | 79 +++++++++- .../recorder/recorder_test.go | 6 +- .../recorder/types.go | 18 +-- 
.../recorder/types_test.go | 72 ++++----- .../types/types.go | 2 + 10 files changed, 354 insertions(+), 147 deletions(-) diff --git a/cmd/multi-cluster-consistency-checker/checker/checker.go b/cmd/multi-cluster-consistency-checker/checker/checker.go index 998fbc472c..1365b9142b 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker.go @@ -244,7 +244,7 @@ func (cd *clusterDataChecker) InitializeFromCheckpoint( schemaKey := schemaPathKey.GetKey() for _, contents := range incrementalData.DataContentSlices { for _, content := range contents { - records, err := decoder.Decode(content) + records, err := decoder.Decode(content, incrementalData.ColumnFieldTypes) if err != nil { return errors.Trace(err) } @@ -424,14 +424,14 @@ func (cd *clusterDataChecker) checkLocalRecordsForDataLoss( zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataLossItem(replicatedClusterID, schemaKey, record.PkMap, record.PkStr, record.OriginTs, record.CommitTs) + cd.report.AddDataLossItem(replicatedClusterID, schemaKey, record.PkMap, record.PkStr, record.CommitTs) } else if !record.EqualReplicatedRecord(replicatedRecord) { // data inconsistent detected log.Error("data inconsistent detected", zap.String("local cluster ID", cd.clusterID), zap.String("replicated cluster ID", replicatedClusterID), zap.Any("record", record)) - cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, record.PkMap, record.PkStr, record.OriginTs, record.CommitTs, diffColumns(record, replicatedRecord)) + cd.report.AddDataInconsistentItem(replicatedClusterID, schemaKey, record.PkMap, record.PkStr, replicatedRecord.OriginTs, record.CommitTs, replicatedRecord.CommitTs, diffColumns(record, replicatedRecord)) } } } @@ -641,7 +641,7 @@ func (c *DataChecker) decodeNewTimeWindowData(newTimeWindowData map[string]types schemaKey := schemaPathKey.GetKey() 
for _, contents := range incrementalData.DataContentSlices { for _, content := range contents { - records, err := decoder.Decode(content) + records, err := decoder.Decode(content, incrementalData.ColumnFieldTypes) if err != nil { return errors.Trace(err) } diff --git a/cmd/multi-cluster-consistency-checker/checker/checker_test.go b/cmd/multi-cluster-consistency-checker/checker/checker_test.go index e1228d58b9..18453f35c2 100644 --- a/cmd/multi-cluster-consistency-checker/checker/checker_test.go +++ b/cmd/multi-cluster-consistency-checker/checker/checker_test.go @@ -467,7 +467,6 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { tableItems := c1Report.TableFailureItems[defaultSchemaKey] require.Len(t, tableItems.DataLossItems, 1) require.Equal(t, "c2", tableItems.DataLossItems[0].PeerClusterID) - require.Equal(t, uint64(0), tableItems.DataLossItems[0].OriginTS) require.Equal(t, uint64(250), tableItems.DataLossItems[0].CommitTS) // c2 should have no issues c2Report := lastReport.ClusterReports["c2"] @@ -508,7 +507,9 @@ func TestDataChecker_FourRoundsCheck(t *testing.T) { require.Empty(t, tableItems.DataLossItems) require.Len(t, tableItems.DataInconsistentItems, 1) require.Equal(t, "c2", tableItems.DataInconsistentItems[0].PeerClusterID) - require.Equal(t, uint64(250), tableItems.DataInconsistentItems[0].CommitTS) + require.Equal(t, uint64(250), tableItems.DataInconsistentItems[0].OriginTS) + require.Equal(t, uint64(250), tableItems.DataInconsistentItems[0].LocalCommitTS) + require.Equal(t, uint64(260), tableItems.DataInconsistentItems[0].ReplicatedCommitTS) require.Len(t, tableItems.DataInconsistentItems[0].InconsistentColumns, 1) require.Equal(t, "val", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Column) require.Equal(t, "c", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Local) diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer.go b/cmd/multi-cluster-consistency-checker/consumer/consumer.go index 
73544ddde5..07736f7e67 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/consumer.go +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer.go @@ -15,6 +15,7 @@ package consumer import ( "context" + "encoding/json" "fmt" "path" "strings" @@ -28,6 +29,7 @@ import ( "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" "github.com/pingcap/tidb/br/pkg/storage" + ptypes "github.com/pingcap/tidb/pkg/parser/types" "go.uber.org/zap" "golang.org/x/sync/errgroup" ) @@ -57,9 +59,9 @@ func updateTableDMLIdxMap( } } -type schemaParser struct { - path string - parser *TableParser +type schemaDefinition struct { + path string + columnFieldTypes map[string]*ptypes.FieldType } type schemaKey struct { @@ -67,8 +69,6 @@ type schemaKey struct { table string } -type TableParser struct{} - var ErrWalkDirEnd = perrors.Normalize("walk dir end", perrors.RFCCodeText("CDC:ErrWalkDirEnd")) type CurrentTableVersion struct { @@ -96,49 +96,61 @@ func (cvt *CurrentTableVersion) UpdateCurrentTableVersion(schema, table string, cvt.currentTableVersionMap[schemaKey{schema: schema, table: table}] = version } -type SchemaParsers struct { - mu sync.RWMutex - schemaParserMap map[cloudstorage.SchemaPathKey]schemaParser +type SchemaDefinitions struct { + mu sync.RWMutex + schemaDefinitionMap map[cloudstorage.SchemaPathKey]schemaDefinition } -func NewSchemaParser() *SchemaParsers { - return &SchemaParsers{ - schemaParserMap: make(map[cloudstorage.SchemaPathKey]schemaParser), +func NewSchemaDefinitions() *SchemaDefinitions { + return &SchemaDefinitions{ + schemaDefinitionMap: make(map[cloudstorage.SchemaPathKey]schemaDefinition), } } -// GetSchemaParser returns the schema parser for a given schema and table version -func (sp *SchemaParsers) GetSchemaParser(schema, table string, version uint64) (*TableParser, error) { +// GetColumnFieldTypes returns the pre-parsed column field types for a given schema and table version +func (sp *SchemaDefinitions) 
GetColumnFieldTypes(schema, table string, version uint64) (map[string]*ptypes.FieldType, error) { schemaPathKey := cloudstorage.SchemaPathKey{ Schema: schema, Table: table, TableVersion: version, } sp.mu.RLock() - schemaParser, ok := sp.schemaParserMap[schemaPathKey] + schemaDefinition, ok := sp.schemaDefinitionMap[schemaPathKey] sp.mu.RUnlock() if !ok { - return nil, errors.Errorf("schema parser not found for schema: %s, table: %s, version: %d", schema, table, version) + return nil, errors.Errorf("schema definition not found for schema: %s, table: %s, version: %d", schema, table, version) } - return schemaParser.parser, nil + return schemaDefinition.columnFieldTypes, nil } -// SetSchemaParser sets the schema parser for a given schema and table version -func (sp *SchemaParsers) SetSchemaParser(schemaPathKey cloudstorage.SchemaPathKey, filePath string, parser *TableParser) { +// SetSchemaDefinition sets the schema definition for a given schema and table version. +// It pre-parses the column field types from the table definition for later use by the decoder. 
+func (sp *SchemaDefinitions) SetSchemaDefinition(schemaPathKey cloudstorage.SchemaPathKey, filePath string, tableDefinition *cloudstorage.TableDefinition) error { + columnFieldTypes := make(map[string]*ptypes.FieldType) + if tableDefinition != nil { + for i, col := range tableDefinition.Columns { + colInfo, err := col.ToTiColumnInfo(int64(i)) + if err != nil { + return errors.Annotatef(err, "failed to convert column %s to FieldType", col.Name) + } + columnFieldTypes[col.Name] = &colInfo.FieldType + } + } sp.mu.Lock() - sp.schemaParserMap[schemaPathKey] = schemaParser{ - path: filePath, - parser: parser, + sp.schemaDefinitionMap[schemaPathKey] = schemaDefinition{ + path: filePath, + columnFieldTypes: columnFieldTypes, } sp.mu.Unlock() + return nil } -// RemoveSchemaParserWithCondition removes the schema parser for a given condition -func (sp *SchemaParsers) RemoveSchemaParserWithCondition(condition func(schemaPathKey cloudstorage.SchemaPathKey) bool) { +// RemoveSchemaDefinitionWithCondition removes the schema definition for a given condition +func (sp *SchemaDefinitions) RemoveSchemaDefinitionWithCondition(condition func(schemaPathKey cloudstorage.SchemaPathKey) bool) { sp.mu.Lock() - for schemaPathkey := range sp.schemaParserMap { + for schemaPathkey := range sp.schemaDefinitionMap { if condition(schemaPathkey) { - delete(sp.schemaParserMap, schemaPathkey) + delete(sp.schemaDefinitionMap, schemaPathkey) } } sp.mu.Unlock() @@ -217,7 +229,7 @@ type S3Consumer struct { currentTableVersion *CurrentTableVersion tableDMLIdx *TableDMLIdx - schemaParser *SchemaParsers + schemaDefinitions *SchemaDefinitions } func NewS3Consumer( @@ -235,7 +247,7 @@ func NewS3Consumer( currentTableVersion: NewCurrentTableVersion(), tableDMLIdx: NewTableDMLIdx(), - schemaParser: NewSchemaParser(), + schemaDefinitions: NewSchemaDefinitions(), } } @@ -269,12 +281,12 @@ func (c *S3Consumer) InitializeFromCheckpoint( func( dmlPathKey cloudstorage.DmlPathKey, dmlSlices 
map[cloudstorage.FileIndexKey][][]byte, - parser *TableParser, + columnFieldTypes map[string]*ptypes.FieldType, ) { mu.Lock() result[dmlPathKey] = types.IncrementalData{ DataContentSlices: dmlSlices, - // Parser: parser, + ColumnFieldTypes: columnFieldTypes, } mu.Unlock() }, @@ -364,12 +376,18 @@ func (c *S3Consumer) downloadSchemaFilesWithScanRange( return scanVersions, nil } +// downloadDataFilesWithScanRange downloads data files for a given scan range. +// consumeFunc is called from multiple goroutines concurrently and must be goroutine-safe. func (c *S3Consumer) downloadDataFilesWithScanRange( ctx context.Context, schema, table string, scanVersions []types.VersionKey, scanRange *recorder.ScanRange, - consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *TableParser), + consumeFunc func( + dmlPathKey cloudstorage.DmlPathKey, + dmlSlices map[cloudstorage.FileIndexKey][][]byte, + columnFieldTypes map[string]*ptypes.FieldType, + ), ) error { eg, egCtx := errgroup.WithContext(ctx) for _, version := range scanVersions { @@ -382,12 +400,12 @@ func (c *S3Consumer) downloadDataFilesWithScanRange( if err != nil { return errors.Trace(err) } - parser, err := c.schemaParser.GetSchemaParser(schema, table, version.Version) + columnFieldTypes, err := c.schemaDefinitions.GetColumnFieldTypes(schema, table, version.Version) if err != nil { return errors.Trace(err) } for dmlPathKey, dmlSlices := range dmlData { - consumeFunc(dmlPathKey, dmlSlices, parser) + consumeFunc(dmlPathKey, dmlSlices, columnFieldTypes) } return nil }) @@ -440,32 +458,32 @@ func (c *S3Consumer) getNewFilesForSchemaPathKeyWithEndPath( // downloadSchemaFiles downloads schema files concurrently for given schema path keys func (c *S3Consumer) downloadSchemaFiles( - _ context.Context, + ctx context.Context, newVersionPaths map[cloudstorage.SchemaPathKey]string, ) error { - // eg, ectx := errgroup.WithContext(ctx) + eg, egCtx := errgroup.WithContext(ctx) 
log.Debug("starting concurrent schema file download", zap.Int("totalSchemas", len(newVersionPaths))) for schemaPathKey, filePath := range newVersionPaths { - // eg.Go(func() error { - // content, err := c.s3Storage.ReadFile(egCtx, filePath) - // if err != nil { - // return errors.Annotatef(err, "failed to read schema file: %s", filePath) - // } - // - // Use canal-json decoder for S3 sink with .json file extension - // parser, err := types.NewTableParserWithFormat(schemaPathKey.GetKey(), content, config.ProtocolCanalJSON) - // if err != nil { - // return errors.Annotatef(err, "failed to create table parser: %s", schemaPathKey.GetKey()) - // } - // - c.schemaParser.SetSchemaParser(schemaPathKey, filePath, nil) - // return nil - // }) - } - //if err := eg.Wait(); err != nil { - // return errors.Trace(err) - //} + eg.Go(func() error { + content, err := c.s3Storage.ReadFile(egCtx, filePath) + if err != nil { + return errors.Annotatef(err, "failed to read schema file: %s", filePath) + } + + tableDefinition := &cloudstorage.TableDefinition{} + if err := json.Unmarshal(content, tableDefinition); err != nil { + return errors.Annotatef(err, "failed to unmarshal schema file: %s", filePath) + } + if err := c.schemaDefinitions.SetSchemaDefinition(schemaPathKey, filePath, tableDefinition); err != nil { + return errors.Trace(err) + } + return nil + }) + } + if err := eg.Wait(); err != nil { + return errors.Trace(err) + } return nil } @@ -652,11 +670,17 @@ func (c *S3Consumer) downloadDMLFiles( return result, nil } +// downloadNewFilesWithVersions downloads new files for given schema versions. +// consumeFunc is called from multiple goroutines concurrently and must be goroutine-safe. 
func (c *S3Consumer) downloadNewFilesWithVersions( ctx context.Context, schema, table string, scanVersions []types.VersionKey, - consumeFunc func(dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, parser *TableParser), + consumeFunc func( + dmlPathKey cloudstorage.DmlPathKey, + dmlSlices map[cloudstorage.FileIndexKey][][]byte, + columnFieldTypes map[string]*ptypes.FieldType, + ), ) (*types.VersionKey, error) { var maxVersion *types.VersionKey eg, egCtx := errgroup.WithContext(ctx) @@ -674,12 +698,12 @@ func (c *S3Consumer) downloadNewFilesWithVersions( if err != nil { return errors.Trace(err) } - parser, err := c.schemaParser.GetSchemaParser(schema, table, versionp.Version) + columnFieldTypes, err := c.schemaDefinitions.GetColumnFieldTypes(schema, table, versionp.Version) if err != nil { return errors.Trace(err) } for dmlPathKey, dmlSlices := range dmlData { - consumeFunc(dmlPathKey, dmlSlices, parser) + consumeFunc(dmlPathKey, dmlSlices, columnFieldTypes) } return nil }) @@ -714,12 +738,12 @@ func (c *S3Consumer) ConsumeNewFiles( func( dmlPathKey cloudstorage.DmlPathKey, dmlSlices map[cloudstorage.FileIndexKey][][]byte, - parser *TableParser, + columnFieldTypes map[string]*ptypes.FieldType, ) { mu.Lock() result[dmlPathKey] = types.IncrementalData{ DataContentSlices: dmlSlices, - // Parser: parser, + ColumnFieldTypes: columnFieldTypes, } mu.Unlock() }, diff --git a/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go b/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go index 05900fda81..b51bdb80d1 100644 --- a/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go +++ b/cmd/multi-cluster-consistency-checker/consumer/consumer_test.go @@ -25,6 +25,8 @@ import ( "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/pkg/parser/mysql" + ptypes 
"github.com/pingcap/tidb/pkg/parser/types" "github.com/stretchr/testify/require" ) @@ -123,64 +125,139 @@ func TestCurrentTableVersion(t *testing.T) { }) } -func TestSchemaParser(t *testing.T) { +func TestSchemaDefinitions(t *testing.T) { t.Parallel() t.Run("get returns error for missing key", func(t *testing.T) { t.Parallel() - sp := NewSchemaParser() - _, err := sp.GetSchemaParser("db", "tbl", 1) + sp := NewSchemaDefinitions() + _, err := sp.GetColumnFieldTypes("db", "tbl", 1) require.Error(t, err) - require.Contains(t, err.Error(), "schema parser not found") + require.Contains(t, err.Error(), "schema definition not found") }) - t.Run("set and get", func(t *testing.T) { + t.Run("set and get empty table definition", func(t *testing.T) { t.Parallel() - sp := NewSchemaParser() + sp := NewSchemaDefinitions() key := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl", TableVersion: 1} - parser := &TableParser{} - sp.SetSchemaParser(key, "/path/to/schema.json", parser) + td := &cloudstorage.TableDefinition{} + err := sp.SetSchemaDefinition(key, "/path/to/schema.json", td) + require.NoError(t, err) + + got, err := sp.GetColumnFieldTypes("db", "tbl", 1) + require.NoError(t, err) + require.Equal(t, map[string]*ptypes.FieldType{}, got) + }) + + t.Run("set and get with columns parses field types correctly", func(t *testing.T) { + t.Parallel() + sp := NewSchemaDefinitions() + key := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl", TableVersion: 1} + td := &cloudstorage.TableDefinition{ + Table: "tbl", + Schema: "db", + Columns: []cloudstorage.TableCol{ + {Name: "id", Tp: "INT", IsPK: "true", Precision: "11"}, + {Name: "name", Tp: "VARCHAR", Precision: "255"}, + {Name: "score", Tp: "DECIMAL", Precision: "10", Scale: "2"}, + {Name: "duration", Tp: "TIME", Scale: "3"}, + {Name: "created_at", Tp: "TIMESTAMP", Scale: "6"}, + {Name: "big_id", Tp: "BIGINT UNSIGNED", Precision: "20"}, + }, + TotalColumns: 6, + } + err := sp.SetSchemaDefinition(key, "/path/to/schema.json", 
td) + require.NoError(t, err) - got, err := sp.GetSchemaParser("db", "tbl", 1) + got, err := sp.GetColumnFieldTypes("db", "tbl", 1) require.NoError(t, err) - require.Equal(t, parser, got) + require.Len(t, got, 6) + + // INT PK + require.Equal(t, mysql.TypeLong, got["id"].GetType()) + require.True(t, mysql.HasPriKeyFlag(got["id"].GetFlag())) + require.Equal(t, 11, got["id"].GetFlen()) + + // VARCHAR(255) + require.Equal(t, mysql.TypeVarchar, got["name"].GetType()) + require.Equal(t, 255, got["name"].GetFlen()) + + // DECIMAL(10,2) + require.Equal(t, mysql.TypeNewDecimal, got["score"].GetType()) + require.Equal(t, 10, got["score"].GetFlen()) + require.Equal(t, 2, got["score"].GetDecimal()) + + // TIME(3) — decimal stores FSP + require.Equal(t, mysql.TypeDuration, got["duration"].GetType()) + require.Equal(t, 3, got["duration"].GetDecimal()) + + // TIMESTAMP(6) — decimal stores FSP + require.Equal(t, mysql.TypeTimestamp, got["created_at"].GetType()) + require.Equal(t, 6, got["created_at"].GetDecimal()) + + // BIGINT UNSIGNED + require.Equal(t, mysql.TypeLonglong, got["big_id"].GetType()) + require.True(t, mysql.HasUnsignedFlag(got["big_id"].GetFlag())) + require.Equal(t, 20, got["big_id"].GetFlen()) + }) + + t.Run("set returns error for invalid column definition", func(t *testing.T) { + t.Parallel() + sp := NewSchemaDefinitions() + key := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl", TableVersion: 1} + td := &cloudstorage.TableDefinition{ + Table: "tbl", + Schema: "db", + Columns: []cloudstorage.TableCol{ + {Name: "id", Tp: "INT", Precision: "not_a_number"}, + }, + TotalColumns: 1, + } + err := sp.SetSchemaDefinition(key, "/path/to/schema.json", td) + require.Error(t, err) + require.Contains(t, err.Error(), "failed to convert column id to FieldType") + + // Verify the definition was NOT stored + _, err = sp.GetColumnFieldTypes("db", "tbl", 1) + require.Error(t, err) + require.Contains(t, err.Error(), "schema definition not found") }) t.Run("remove with 
condition", func(t *testing.T) { t.Parallel() - sp := NewSchemaParser() + sp := NewSchemaDefinitions() key1 := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl1", TableVersion: 1} key2 := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl2", TableVersion: 2} - sp.SetSchemaParser(key1, "/path1", nil) - sp.SetSchemaParser(key2, "/path2", nil) + require.NoError(t, sp.SetSchemaDefinition(key1, "/path1", nil)) + require.NoError(t, sp.SetSchemaDefinition(key2, "/path2", nil)) // Remove only entries for tbl1 - sp.RemoveSchemaParserWithCondition(func(k cloudstorage.SchemaPathKey) bool { + sp.RemoveSchemaDefinitionWithCondition(func(k cloudstorage.SchemaPathKey) bool { return k.Table == "tbl1" }) - _, err := sp.GetSchemaParser("db", "tbl1", 1) + _, err := sp.GetColumnFieldTypes("db", "tbl1", 1) require.Error(t, err) - _, err = sp.GetSchemaParser("db", "tbl2", 2) + _, err = sp.GetColumnFieldTypes("db", "tbl2", 2) require.NoError(t, err) }) t.Run("remove with condition matching all", func(t *testing.T) { t.Parallel() - sp := NewSchemaParser() + sp := NewSchemaDefinitions() key1 := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl1", TableVersion: 1} key2 := cloudstorage.SchemaPathKey{Schema: "db", Table: "tbl2", TableVersion: 2} - sp.SetSchemaParser(key1, "/path1", nil) - sp.SetSchemaParser(key2, "/path2", nil) + require.NoError(t, sp.SetSchemaDefinition(key1, "/path1", nil)) + require.NoError(t, sp.SetSchemaDefinition(key2, "/path2", nil)) - sp.RemoveSchemaParserWithCondition(func(k cloudstorage.SchemaPathKey) bool { + sp.RemoveSchemaDefinitionWithCondition(func(k cloudstorage.SchemaPathKey) bool { return true }) - _, err := sp.GetSchemaParser("db", "tbl1", 1) + _, err := sp.GetColumnFieldTypes("db", "tbl1", 1) require.Error(t, err) - _, err = sp.GetSchemaParser("db", "tbl2", 2) + _, err = sp.GetColumnFieldTypes("db", "tbl2", 2) require.Error(t, err) }) } @@ -344,7 +421,7 @@ func TestS3Consumer(t *testing.T) { t.Parallel() ctx := context.Background() round1Files := 
[]mockFile{ - {name: "test/t1/meta/schema_1_0000000001.json", content: []byte{}}, + {name: "test/t1/meta/schema_1_0000000001.json", content: []byte("{}")}, {name: "test/t1/1/2026-01-01/CDC00000000000000000001.json", content: []byte("1_2026-01-01_1.json")}, } round1TimeWindowData := types.TimeWindowData{ @@ -365,7 +442,7 @@ func TestS3Consumer(t *testing.T) { }, maxVersionMap[types.SchemaTableKey{Schema: "test", Table: "t1"}]) } round2Files := []mockFile{ - {name: "test/t1/meta/schema_1_0000000001.json", content: []byte{}}, + {name: "test/t1/meta/schema_1_0000000001.json", content: []byte("{}")}, {name: "test/t1/1/2026-01-01/CDC00000000000000000001.json", content: []byte("1_2026-01-01_1.json")}, {name: "test/t1/1/2026-01-01/CDC00000000000000000002.json", content: []byte("1_2026-01-01_2.json")}, {name: "test/t1/1/2026-01-02/CDC00000000000000000001.json", content: []byte("1_2026-01-02_1.json")}, @@ -387,6 +464,7 @@ func TestS3Consumer(t *testing.T) { DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("1_2026-01-01_2.json")}, }, + ColumnFieldTypes: map[string]*ptypes.FieldType{}, }, newData[cloudstorage.DmlPathKey{ SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, PartitionNum: 0, @@ -396,6 +474,7 @@ func TestS3Consumer(t *testing.T) { DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("1_2026-01-02_1.json")}, }, + ColumnFieldTypes: map[string]*ptypes.FieldType{}, }, newData[cloudstorage.DmlPathKey{ SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, PartitionNum: 0, @@ -409,8 +488,8 @@ func TestS3Consumer(t *testing.T) { }, maxVersionMap[types.SchemaTableKey{Schema: "test", Table: "t1"}]) } round3Files := []mockFile{ - {name: "test/t1/meta/schema_1_0000000001.json", content: []byte{}}, - {name: "test/t1/meta/schema_2_0000000001.json", content: []byte{}}, + 
{name: "test/t1/meta/schema_1_0000000001.json", content: []byte("{}")}, + {name: "test/t1/meta/schema_2_0000000001.json", content: []byte("{}")}, {name: "test/t1/1/2026-01-01/CDC00000000000000000001.json", content: []byte("1_2026-01-01_1.json")}, {name: "test/t1/1/2026-01-01/CDC00000000000000000002.json", content: []byte("1_2026-01-01_2.json")}, {name: "test/t1/1/2026-01-02/CDC00000000000000000001.json", content: []byte("1_2026-01-02_1.json")}, @@ -436,6 +515,7 @@ func TestS3Consumer(t *testing.T) { DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("1_2026-01-02_2.json")}, }, + ColumnFieldTypes: map[string]*ptypes.FieldType{}, }, newData[cloudstorage.DmlPathKey{ SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, PartitionNum: 0, @@ -445,6 +525,7 @@ func TestS3Consumer(t *testing.T) { DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("2_2026-01-02_1.json")}, }, + ColumnFieldTypes: map[string]*ptypes.FieldType{}, }, newData[cloudstorage.DmlPathKey{ SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 2}, PartitionNum: 0, @@ -475,6 +556,7 @@ func TestS3Consumer(t *testing.T) { DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("1_2026-01-01_2.json")}, }, + ColumnFieldTypes: map[string]*ptypes.FieldType{}, }, data[cloudstorage.DmlPathKey{ SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", TableVersion: 1}, PartitionNum: 0, @@ -496,6 +578,7 @@ func TestS3Consumer(t *testing.T) { DataContentSlices: map[cloudstorage.FileIndexKey][][]byte{ {DispatcherID: "", EnableTableAcrossNodes: false}: {[]byte("2_2026-01-02_1.json")}, }, + ColumnFieldTypes: map[string]*ptypes.FieldType{}, }, data[cloudstorage.DmlPathKey{ SchemaPathKey: cloudstorage.SchemaPathKey{Schema: "test", Table: "t1", 
TableVersion: 2}, PartitionNum: 0, diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder.go b/cmd/multi-cluster-consistency-checker/decoder/decoder.go index dff25ad771..af0f8d01d0 100644 --- a/cmd/multi-cluster-consistency-checker/decoder/decoder.go +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder.go @@ -119,7 +119,8 @@ type columnValueDecoder struct { data []byte config *common.Config - msg *canalValueDecoderJSONMessageWithTiDBExtension + msg *canalValueDecoderJSONMessageWithTiDBExtension + columnFieldTypes map[string]*ptypes.FieldType } func newColumnValueDecoder(data []byte) (*columnValueDecoder, error) { @@ -138,12 +139,14 @@ func newColumnValueDecoder(data []byte) (*columnValueDecoder, error) { }, nil } -func Decode(data []byte) ([]*Record, error) { +func Decode(data []byte, columnFieldTypes map[string]*ptypes.FieldType) ([]*Record, error) { decoder, err := newColumnValueDecoder(data) if err != nil { return nil, errors.Trace(err) } + decoder.columnFieldTypes = columnFieldTypes + records := make([]*Record, 0) for { msgType, hasNext := decoder.tryNext() @@ -187,6 +190,7 @@ func (d *columnValueDecoder) tryNext() (common.MessageType, bool) { if err := json.Unmarshal(encodedData, msg); err != nil { log.Error("canal-json decoder unmarshal data failed", zap.Error(err), zap.ByteString("data", encodedData)) + d.msg = nil return common.MessageTypeUnknown, true } d.msg = msg @@ -205,11 +209,6 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { pkMap := make(map[string]any, len(d.msg.PkNames)) slices.Sort(d.msg.PkNames) for i, pkName := range d.msg.PkNames { - mysqlType, ok := d.msg.MySQLType[pkName] - if !ok { - log.Error("mysql type not found", zap.String("pkName", pkName), zap.Any("msg", d.msg)) - return nil, errors.Errorf("mysql type of column %s not found", pkName) - } columnValue, ok := d.msg.Data[0][pkName] if !ok { log.Error("column value not found", zap.String("pkName", pkName), zap.Any("msg", d.msg)) @@ -220,7 +219,11 @@ func 
(d *columnValueDecoder) decodeNext() (*Record, error) { } fmt.Fprintf(&pkStrBuilder, "%s: %v", pkName, columnValue) pkMap[pkName] = columnValue - ft := newPKColumnFieldTypeFromMysqlType(mysqlType) + ft := d.getColumnFieldType(pkName) + if ft == nil { + log.Error("field type not found", zap.String("pkName", pkName), zap.Any("msg", d.msg)) + return nil, errors.Errorf("field type of column %s not found", pkName) + } datum := valueToDatum(columnValue, ft) if datum.IsNull() { log.Error("column value is null", zap.String("pkName", pkName), zap.Any("msg", d.msg)) @@ -263,6 +266,23 @@ func (d *columnValueDecoder) decodeNext() (*Record, error) { }, nil } +// getColumnFieldType returns the FieldType for a column. +// It first looks up from the tableDefinition-based columnFieldTypes map, +// then falls back to parsing the MySQLType string from the canal-json message. +func (d *columnValueDecoder) getColumnFieldType(columnName string) *ptypes.FieldType { + if d.columnFieldTypes != nil { + if ft, ok := d.columnFieldTypes[columnName]; ok { + return ft + } + } + // Fallback: parse from MySQLType in the canal-json message + mysqlType, ok := d.msg.MySQLType[columnName] + if !ok { + return nil + } + return newPKColumnFieldTypeFromMysqlType(mysqlType) +} + func newPKColumnFieldTypeFromMysqlType(mysqlType string) *ptypes.FieldType { tp := ptypes.NewFieldType(common.ExtractBasicMySQLType(mysqlType)) if common.IsBinaryMySQLType(mysqlType) { diff --git a/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go b/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go index 0dba247b82..972ab377cd 100644 --- a/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go +++ b/cmd/multi-cluster-consistency-checker/decoder/decoder_test.go @@ -18,9 +18,24 @@ import ( "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/decoder" "github.com/pingcap/ticdc/cmd/multi-cluster-consistency-checker/types" + "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + ptypes 
"github.com/pingcap/tidb/pkg/parser/types" "github.com/stretchr/testify/require" ) +// buildColumnFieldTypes converts a TableDefinition into a map of column name → FieldType, +// mimicking what SchemaDefinitions.SetSchemaDefinition does. +func buildColumnFieldTypes(t *testing.T, td *cloudstorage.TableDefinition) map[string]*ptypes.FieldType { + t.Helper() + result := make(map[string]*ptypes.FieldType, len(td.Columns)) + for i, col := range td.Columns { + colInfo, err := col.ToTiColumnInfo(int64(i)) + require.NoError(t, err) + result[col.Name] = &colInfo.FieldType + } + return result +} + // DataContent uses CRLF (\r\n) as line terminator to match the codec config const DataContent1 string = "" + `{"id":0,"database":"test_active","table":"message","pkNames":["id"],"isDdl":false,"type":"INSERT","es":1770184540709,"ts":1770184542274,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp","id":"int","first_name":"varchar"},"old":null,"data":[{"id":"20","first_name":"t","last_name":"TT","_tidb_origin_ts":null,"_tidb_softdelete_time":null}],"_tidb":{"commitTs":464043256649875456}}` + "\r\n" + @@ -43,8 +58,24 @@ var ExpectedRecords1 = []decoder.Record{ {CdcVersion: types.CdcVersion{CommitTs: 464074446600667164, OriginTs: 464074446196178963}, Pk: "038000000000000008", PkStr: "[id: 8]", ColumnValues: map[string]any{"first_name": "h", "last_name": "H", "_tidb_softdelete_time": "2026-02-05 22:58:40.992217"}}, } +// tableDefinition1 describes the "message" table: id(INT PK), first_name(VARCHAR), last_name(VARCHAR), +// _tidb_origin_ts(BIGINT), _tidb_softdelete_time(TIMESTAMP) +var tableDefinition1 = &cloudstorage.TableDefinition{ + Table: "message", + Schema: "test_active", + Version: 1, + Columns: []cloudstorage.TableCol{ + {Name: "id", Tp: "INT", IsPK: "true", Precision: "11"}, + {Name: "first_name", Tp: "VARCHAR", Precision: 
"255"}, + {Name: "last_name", Tp: "VARCHAR", Precision: "255"}, + {Name: "_tidb_origin_ts", Tp: "BIGINT", Precision: "20"}, + {Name: "_tidb_softdelete_time", Tp: "TIMESTAMP"}, + }, + TotalColumns: 5, +} + func TestCanalJSONDecoder1(t *testing.T) { - records, err := decoder.Decode([]byte(DataContent1)) + records, err := decoder.Decode([]byte(DataContent1), buildColumnFieldTypes(t, tableDefinition1)) require.NoError(t, err) require.Len(t, records, 8) for i, actualRecord := range records { @@ -66,8 +97,24 @@ var ExpectedRecords2 = []decoder.Record{ {CdcVersion: types.CdcVersion{CommitTs: 464085169694572575, OriginTs: 0}, Pk: "016200000000000000f8038000000000000065", PkStr: "[first_name: b, id: 101]", ColumnValues: map[string]any{"last_name": "B", "_tidb_softdelete_time": nil}}, } +// tableDefinition2 describes the "message2" table: id(INT PK), first_name(VARCHAR PK), last_name(VARCHAR), +// _tidb_origin_ts(BIGINT), _tidb_softdelete_time(TIMESTAMP) +var tableDefinition2 = &cloudstorage.TableDefinition{ + Table: "message2", + Schema: "test_active", + Version: 1, + Columns: []cloudstorage.TableCol{ + {Name: "id", Tp: "INT", IsPK: "true", Precision: "11"}, + {Name: "first_name", Tp: "VARCHAR", IsPK: "true", Precision: "255"}, + {Name: "last_name", Tp: "VARCHAR", Precision: "255"}, + {Name: "_tidb_origin_ts", Tp: "BIGINT", Precision: "20"}, + {Name: "_tidb_softdelete_time", Tp: "TIMESTAMP"}, + }, + TotalColumns: 5, +} + func TestCanalJSONDecoder2(t *testing.T) { - records, err := decoder.Decode([]byte(DataContent2)) + records, err := decoder.Decode([]byte(DataContent2), buildColumnFieldTypes(t, tableDefinition2)) require.NoError(t, err) require.Len(t, records, 2) for i, actualRecord := range records { @@ -80,6 +127,34 @@ func TestCanalJSONDecoder2(t *testing.T) { } } +// TestCanalJSONDecoderWithInvalidMessage verifies that when a malformed message appears in +// the data stream, it is skipped gracefully and subsequent valid messages are still decoded. 
+// This covers the fix where d.msg is cleared to nil on unmarshal failure to prevent stale +// message data from leaking into decodeNext. +func TestCanalJSONDecoderWithInvalidMessage(t *testing.T) { + // First line is invalid JSON, second line is a valid message. + dataWithInvalidLine := `{invalid json}` + "\r\n" + + `{"id":0,"database":"test_active","table":"message2","pkNames":["id","first_name"],"isDdl":false,"type":"INSERT","es":1770344412751,"ts":1770344413749,"sql":"","sqlType":{"id":4,"first_name":12,"last_name":12,"_tidb_origin_ts":-5,"_tidb_softdelete_time":93},"mysqlType":{"id":"int","first_name":"varchar","last_name":"varchar","_tidb_origin_ts":"bigint","_tidb_softdelete_time":"timestamp"},"old":null,"data":[{"id":"100","first_name":"a","last_name":"A","_tidb_origin_ts":"464085165262503958","_tidb_softdelete_time":null}],"_tidb":{"commitTs":464085165736198159}}` + "\r\n" + + records, err := decoder.Decode([]byte(dataWithInvalidLine), buildColumnFieldTypes(t, tableDefinition2)) + require.NoError(t, err) + // The invalid line should be skipped, only the valid record should be returned. + require.Len(t, records, 1) + require.Equal(t, ExpectedRecords2[0].Pk, records[0].Pk) + require.Equal(t, ExpectedRecords2[0].PkStr, records[0].PkStr) + require.Equal(t, ExpectedRecords2[0].CdcVersion.CommitTs, records[0].CdcVersion.CommitTs) + require.Equal(t, ExpectedRecords2[0].CdcVersion.OriginTs, records[0].CdcVersion.OriginTs) +} + +// TestCanalJSONDecoderAllInvalidMessages verifies that when all messages are malformed, +// the decoder returns an empty result without errors. 
+func TestCanalJSONDecoderAllInvalidMessages(t *testing.T) { + allInvalid := `{broken}` + "\r\n" + `{also broken}` + "\r\n" + records, err := decoder.Decode([]byte(allInvalid), nil) + require.NoError(t, err) + require.Empty(t, records) +} + func TestRecord_EqualReplicatedRecord(t *testing.T) { tests := []struct { name string diff --git a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go index b2ca0578a0..af3a8297b3 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/recorder_test.go @@ -230,7 +230,7 @@ func TestRecorder_RecordTimeWindow(t *testing.T) { } report := NewReport(5) cr := NewClusterReport("c1", types.TimeWindow{LeftBoundary: 1, RightBoundary: 10}) - cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, 100, 200) + cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, 200) report.AddClusterReport("c1", cr) require.True(t, report.NeedFlush()) @@ -353,7 +353,7 @@ func TestRecorder_CheckpointPersistence(t *testing.T) { } report := NewReport(i) cr := NewClusterReport("c1", types.TimeWindow{LeftBoundary: i * 10, RightBoundary: (i + 1) * 10}) - cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, i, i+1) + cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, i+1) report.AddClusterReport("c1", cr) require.True(t, report.NeedFlush()) @@ -395,7 +395,7 @@ func TestRecorder_CheckpointPersistence(t *testing.T) { } report := NewReport(i) cr := NewClusterReport("c1", types.TimeWindow{LeftBoundary: i * 10, RightBoundary: (i + 1) * 10}) - cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, i, i+1) + cr.AddDataLossItem("d1", "test_table", map[string]any{"id": "1"}, `[id: 1]`, i+1) report.AddClusterReport("c1", cr) err = r.RecordTimeWindow(twData, report) diff --git 
a/cmd/multi-cluster-consistency-checker/recorder/types.go b/cmd/multi-cluster-consistency-checker/recorder/types.go index 0fceddc3a5..2c316fc965 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types.go @@ -25,14 +25,13 @@ import ( type DataLossItem struct { PeerClusterID string `json:"peer_cluster_id"` PK map[string]any `json:"pk"` - OriginTS uint64 `json:"origin_ts"` CommitTS uint64 `json:"commit_ts"` PKStr string `json:"-"` } func (item *DataLossItem) String() string { - return fmt.Sprintf("peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d", item.PeerClusterID, item.PKStr, item.OriginTS, item.CommitTS) + return fmt.Sprintf("peer cluster: %s, pk: %s, commit ts: %d", item.PeerClusterID, item.PKStr, item.CommitTS) } type InconsistentColumn struct { @@ -49,7 +48,8 @@ type DataInconsistentItem struct { PeerClusterID string `json:"peer_cluster_id"` PK map[string]any `json:"pk"` OriginTS uint64 `json:"origin_ts"` - CommitTS uint64 `json:"commit_ts"` + LocalCommitTS uint64 `json:"local_commit_ts"` + ReplicatedCommitTS uint64 `json:"replicated_commit_ts"` InconsistentColumns []InconsistentColumn `json:"inconsistent_columns,omitempty"` PKStr string `json:"-"` @@ -57,8 +57,8 @@ type DataInconsistentItem struct { func (item *DataInconsistentItem) String() string { var sb strings.Builder - fmt.Fprintf(&sb, "peer cluster: %s, pk: %s, origin ts: %d, commit ts: %d", - item.PeerClusterID, item.PKStr, item.OriginTS, item.CommitTS) + fmt.Fprintf(&sb, "peer cluster: %s, pk: %s, origin ts: %d, local commit ts: %d, replicated commit ts: %d", + item.PeerClusterID, item.PKStr, item.OriginTS, item.LocalCommitTS, item.ReplicatedCommitTS) if len(item.InconsistentColumns) > 0 { sb.WriteString(", inconsistent columns: [") for i, col := range item.InconsistentColumns { @@ -139,7 +139,7 @@ func (r *ClusterReport) AddDataLossItem( peerClusterID, schemaKey string, pk map[string]any, pkStr string, - originTS, commitTS 
uint64, + commitTS uint64, ) { tableFailureItems, exists := r.TableFailureItems[schemaKey] if !exists { @@ -149,7 +149,6 @@ func (r *ClusterReport) AddDataLossItem( tableFailureItems.DataLossItems = append(tableFailureItems.DataLossItems, DataLossItem{ PeerClusterID: peerClusterID, PK: pk, - OriginTS: originTS, CommitTS: commitTS, PKStr: pkStr, @@ -161,7 +160,7 @@ func (r *ClusterReport) AddDataInconsistentItem( peerClusterID, schemaKey string, pk map[string]any, pkStr string, - originTS, commitTS uint64, + originTS, localCommitTS, replicatedCommitTS uint64, inconsistentColumns []InconsistentColumn, ) { tableFailureItems, exists := r.TableFailureItems[schemaKey] @@ -173,7 +172,8 @@ func (r *ClusterReport) AddDataInconsistentItem( PeerClusterID: peerClusterID, PK: pk, OriginTS: originTS, - CommitTS: commitTS, + LocalCommitTS: localCommitTS, + ReplicatedCommitTS: replicatedCommitTS, InconsistentColumns: inconsistentColumns, PKStr: pkStr, diff --git a/cmd/multi-cluster-consistency-checker/recorder/types_test.go b/cmd/multi-cluster-consistency-checker/recorder/types_test.go index e054b48d19..ec2df79d3c 100644 --- a/cmd/multi-cluster-consistency-checker/recorder/types_test.go +++ b/cmd/multi-cluster-consistency-checker/recorder/types_test.go @@ -26,12 +26,11 @@ func TestDataLossItem_String(t *testing.T) { item := &DataLossItem{ PeerClusterID: "cluster-2", PK: map[string]any{"id": "1"}, - OriginTS: 100, CommitTS: 200, PKStr: `[id: 1]`, } s := item.String() - require.Equal(t, `peer cluster: cluster-2, pk: [id: 1], origin ts: 100, commit ts: 200`, s) + require.Equal(t, `peer cluster: cluster-2, pk: [id: 1], commit ts: 200`, s) } func TestDataInconsistentItem_String(t *testing.T) { @@ -40,24 +39,26 @@ func TestDataInconsistentItem_String(t *testing.T) { t.Run("without inconsistent columns", func(t *testing.T) { t.Parallel() item := &DataInconsistentItem{ - PeerClusterID: "cluster-3", - PK: map[string]any{"id": "2"}, - OriginTS: 300, - CommitTS: 400, - PKStr: `[id: 2]`, + 
PeerClusterID: "cluster-3", + PK: map[string]any{"id": "2"}, + OriginTS: 300, + LocalCommitTS: 400, + ReplicatedCommitTS: 410, + PKStr: `[id: 2]`, } s := item.String() - require.Equal(t, `peer cluster: cluster-3, pk: [id: 2], origin ts: 300, commit ts: 400`, s) + require.Equal(t, `peer cluster: cluster-3, pk: [id: 2], origin ts: 300, local commit ts: 400, replicated commit ts: 410`, s) }) t.Run("with inconsistent columns", func(t *testing.T) { t.Parallel() item := &DataInconsistentItem{ - PeerClusterID: "cluster-3", - PK: map[string]any{"id": "2"}, - OriginTS: 300, - CommitTS: 400, - PKStr: `[id: 2]`, + PeerClusterID: "cluster-3", + PK: map[string]any{"id": "2"}, + OriginTS: 300, + LocalCommitTS: 400, + ReplicatedCommitTS: 410, + PKStr: `[id: 2]`, InconsistentColumns: []InconsistentColumn{ {Column: "col1", Local: "val_a", Replicated: "val_b"}, {Column: "col2", Local: 100, Replicated: 200}, @@ -65,7 +66,7 @@ func TestDataInconsistentItem_String(t *testing.T) { } s := item.String() require.Equal(t, - `peer cluster: cluster-3, pk: [id: 2], origin ts: 300, commit ts: 400, `+ + `peer cluster: cluster-3, pk: [id: 2], origin ts: 300, local commit ts: 400, replicated commit ts: 410, `+ "inconsistent columns: [column: col1, local: val_a, replicated: val_b; column: col2, local: 100, replicated: 200]", s) }) @@ -73,18 +74,19 @@ func TestDataInconsistentItem_String(t *testing.T) { t.Run("with missing column in replicated", func(t *testing.T) { t.Parallel() item := &DataInconsistentItem{ - PeerClusterID: "cluster-3", - PK: map[string]any{"id": "2"}, - OriginTS: 300, - CommitTS: 400, - PKStr: `[id: 2]`, + PeerClusterID: "cluster-3", + PK: map[string]any{"id": "2"}, + OriginTS: 300, + LocalCommitTS: 400, + ReplicatedCommitTS: 410, + PKStr: `[id: 2]`, InconsistentColumns: []InconsistentColumn{ {Column: "col1", Local: "val_a", Replicated: nil}, }, } s := item.String() require.Equal(t, - `peer cluster: cluster-3, pk: [id: 2], origin ts: 300, commit ts: 400, `+ + `peer cluster: 
cluster-3, pk: [id: 2], origin ts: 300, local commit ts: 400, replicated commit ts: 410, `+ "inconsistent columns: [column: col1, local: val_a, replicated: ]", s) }) @@ -127,7 +129,7 @@ func TestClusterReport(t *testing.T) { t.Run("add data loss item sets needFlush", func(t *testing.T) { t.Parallel() cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("peer-cluster-1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 100, 200) + cr.AddDataLossItem("peer-cluster-1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 200) require.Len(t, cr.TableFailureItems, 1) require.Contains(t, cr.TableFailureItems, testSchemaKey) tableItems := cr.TableFailureItems[testSchemaKey] @@ -135,7 +137,6 @@ func TestClusterReport(t *testing.T) { require.True(t, cr.needFlush) require.Equal(t, "peer-cluster-1", tableItems.DataLossItems[0].PeerClusterID) require.Equal(t, map[string]any{"id": "1"}, tableItems.DataLossItems[0].PK) - require.Equal(t, uint64(100), tableItems.DataLossItems[0].OriginTS) require.Equal(t, uint64(200), tableItems.DataLossItems[0].CommitTS) }) @@ -145,7 +146,7 @@ func TestClusterReport(t *testing.T) { cols := []InconsistentColumn{ {Column: "val", Local: "a", Replicated: "b"}, } - cr.AddDataInconsistentItem("peer-cluster-2", testSchemaKey, map[string]any{"id": "2"}, `[id: 2]`, 300, 400, cols) + cr.AddDataInconsistentItem("peer-cluster-2", testSchemaKey, map[string]any{"id": "2"}, `[id: 2]`, 300, 400, 410, cols) require.Len(t, cr.TableFailureItems, 1) require.Contains(t, cr.TableFailureItems, testSchemaKey) tableItems := cr.TableFailureItems[testSchemaKey] @@ -154,7 +155,8 @@ func TestClusterReport(t *testing.T) { require.Equal(t, "peer-cluster-2", tableItems.DataInconsistentItems[0].PeerClusterID) require.Equal(t, map[string]any{"id": "2"}, tableItems.DataInconsistentItems[0].PK) require.Equal(t, uint64(300), tableItems.DataInconsistentItems[0].OriginTS) - require.Equal(t, uint64(400), tableItems.DataInconsistentItems[0].CommitTS) + 
require.Equal(t, uint64(400), tableItems.DataInconsistentItems[0].LocalCommitTS) + require.Equal(t, uint64(410), tableItems.DataInconsistentItems[0].ReplicatedCommitTS) require.Len(t, tableItems.DataInconsistentItems[0].InconsistentColumns, 1) require.Equal(t, "val", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Column) require.Equal(t, "a", tableItems.DataInconsistentItems[0].InconsistentColumns[0].Local) @@ -188,8 +190,8 @@ func TestClusterReport(t *testing.T) { t.Run("add multiple items", func(t *testing.T) { t.Parallel() cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `id: 1`, 1, 2) - cr.AddDataInconsistentItem("d2", testSchemaKey, map[string]any{"id": "2"}, `[id: 2]`, 3, 4, nil) + cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `id: 1`, 2) + cr.AddDataInconsistentItem("d2", testSchemaKey, map[string]any{"id": "2"}, `[id: 2]`, 3, 4, 5, nil) cr.AddDataRedundantItem(testSchemaKey, map[string]any{"id": "3"}, `[id: 3]`, 5, 6) cr.AddLWWViolationItem(testSchemaKey, map[string]any{"id": "4"}, `[id: 4]`, 7, 8, 9, 10) require.Len(t, cr.TableFailureItems, 1) @@ -225,7 +227,7 @@ func TestReport(t *testing.T) { t.Parallel() r := NewReport(1) cr := NewClusterReport("c1", types.TimeWindow{}) - cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 1, 2) + cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 2) r.AddClusterReport("c1", cr) require.True(t, r.NeedFlush()) }) @@ -259,7 +261,7 @@ func TestReport_MarshalReport(t *testing.T) { t.Parallel() r := NewReport(1) cr := NewClusterReport("c1", tw) - cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 100, 200) + cr.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 200) r.AddClusterReport("c1", cr) s := r.MarshalReport() require.Equal(t, "round: 1\n\n"+ @@ -267,7 +269,7 @@ func TestReport_MarshalReport(t *testing.T) { "time 
window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - ` - [peer cluster: d1, pk: [id: 1], origin ts: 100, commit ts: 200]`+"\n\n", + ` - [peer cluster: d1, pk: [id: 1], commit ts: 200]`+"\n\n", s) }) @@ -308,7 +310,7 @@ func TestReport_MarshalReport(t *testing.T) { r := NewReport(1) crEmpty := NewClusterReport("empty-cluster", tw) crFull := NewClusterReport("full-cluster", tw) - crFull.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 1, 2) + crFull.AddDataLossItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 2) r.AddClusterReport("empty-cluster", crEmpty) r.AddClusterReport("full-cluster", crFull) s := r.MarshalReport() @@ -317,7 +319,7 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - ` - [peer cluster: d1, pk: [id: 1], origin ts: 1, commit ts: 2]`+"\n\n", + ` - [peer cluster: d1, pk: [id: 1], commit ts: 2]`+"\n\n", s) }) @@ -325,8 +327,8 @@ func TestReport_MarshalReport(t *testing.T) { t.Parallel() r := NewReport(10) cr := NewClusterReport("c1", tw) - cr.AddDataLossItem("d0", testSchemaKey, map[string]any{"id": "0"}, `[id: 0]`, 0, 1) - cr.AddDataInconsistentItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 1, 2, []InconsistentColumn{ + cr.AddDataLossItem("d0", testSchemaKey, map[string]any{"id": "0"}, `[id: 0]`, 1) + cr.AddDataInconsistentItem("d1", testSchemaKey, map[string]any{"id": "1"}, `[id: 1]`, 1, 2, 3, []InconsistentColumn{ {Column: "val", Local: "x", Replicated: "y"}, }) cr.AddDataRedundantItem(testSchemaKey, map[string]any{"id": "2"}, `[id: 2]`, 3, 4) @@ -338,9 +340,9 @@ func TestReport_MarshalReport(t *testing.T) { "time window: "+twStr+"\n"+ " - [table name: "+testSchemaKey+"]\n"+ " - [data loss items: 1]\n"+ - ` - [peer cluster: d0, pk: [id: 0], origin ts: 0, commit ts: 1]`+"\n"+ + ` - [peer cluster: d0, pk: [id: 0], commit ts: 1]`+"\n"+ " - [data inconsistent 
items: 1]\n"+ - ` - [peer cluster: d1, pk: [id: 1], origin ts: 1, commit ts: 2, inconsistent columns: [column: val, local: x, replicated: y]]`+"\n"+ + ` - [peer cluster: d1, pk: [id: 1], origin ts: 1, local commit ts: 2, replicated commit ts: 3, inconsistent columns: [column: val, local: x, replicated: y]]`+"\n"+ " - [data redundant items: 1]\n"+ ` - [pk: [id: 2], origin ts: 3, commit ts: 4]`+"\n"+ " - [lww violation items: 1]\n"+ diff --git a/cmd/multi-cluster-consistency-checker/types/types.go b/cmd/multi-cluster-consistency-checker/types/types.go index e6309bd1b8..cdb7e1760a 100644 --- a/cmd/multi-cluster-consistency-checker/types/types.go +++ b/cmd/multi-cluster-consistency-checker/types/types.go @@ -19,6 +19,7 @@ import ( "strings" "github.com/pingcap/ticdc/pkg/sink/cloudstorage" + ptypes "github.com/pingcap/tidb/pkg/parser/types" ) // PkType is a distinct type for encoded primary key strings, making it clear @@ -93,4 +94,5 @@ type TimeWindowData struct { type IncrementalData struct { DataContentSlices map[cloudstorage.FileIndexKey][][]byte + ColumnFieldTypes map[string]*ptypes.FieldType }