From 687a57e2af6840d6448bc4c13cc24839a333b3c7 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 14:44:52 -0500 Subject: [PATCH 001/130] move to common --- internal/verifier/change_stream.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 06aacca9..b53cbdbe 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -64,7 +64,7 @@ type changeEventBatch struct { clusterTime bson.Timestamp } -type ChangeStreamReader struct { +type ChangeReaderCommon struct { readerType whichCluster lastChangeEventTime *bson.Timestamp @@ -90,21 +90,29 @@ type ChangeStreamReader struct { onDDLEvent ddlEventHandling } +type ChangeStreamReader struct { + ChangeReaderCommon +} + func (verifier *Verifier) initializeChangeStreamReaders() { srcReader := &ChangeStreamReader{ - readerType: src, - namespaces: verifier.srcNamespaces, - watcherClient: verifier.srcClient, - clusterInfo: *verifier.srcClusterInfo, + ChangeReaderCommon: ChangeReaderCommon{ + readerType: src, + namespaces: verifier.srcNamespaces, + watcherClient: verifier.srcClient, + clusterInfo: *verifier.srcClusterInfo, + }, } verifier.srcChangeStreamReader = srcReader dstReader := &ChangeStreamReader{ - readerType: dst, - namespaces: verifier.dstNamespaces, - watcherClient: verifier.dstClient, - clusterInfo: *verifier.dstClusterInfo, - onDDLEvent: onDDLEventAllow, + ChangeReaderCommon: ChangeReaderCommon{ + readerType: dst, + namespaces: verifier.dstNamespaces, + watcherClient: verifier.dstClient, + clusterInfo: *verifier.dstClusterInfo, + onDDLEvent: onDDLEventAllow, + }, } verifier.dstChangeStreamReader = dstReader From 61b89e35c07f3a151ba4603211aa67897091d291 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 15:52:30 -0500 Subject: [PATCH 002/130] move --- internal/util/clusterinfo.go | 10 ++ internal/verifier/change_reader.go | 207 ++++++++++++++++++++++++ internal/verifier/change_stream.go | 37 +---- internal/verifier/change_stream_test.go | 84 +++++++--- internal/verifier/check.go | 20 +-- internal/verifier/compare.go | 4 +- internal/verifier/migration_verifier.go | 16 +- internal/verifier/summary.go | 8 +- 8 files changed, 305 insertions(+), 81 deletions(-) create mode 100644 internal/verifier/change_reader.go diff --git a/internal/util/clusterinfo.go b/internal/util/clusterinfo.go index 2364db93..7f66f8e7 100644 --- a/internal/util/clusterinfo.go +++ b/internal/util/clusterinfo.go @@ -19,6 +19,16 @@ type ClusterInfo struct { Topology ClusterTopology } +func ClusterHasBSONSize(va [2]int) bool { + major := va[0] + + if major == 4 { + return va[1] >= 4 + } + + return major > 4 +} + const ( TopologySharded ClusterTopology = "sharded" TopologyReplset ClusterTopology = "replset" diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go new file mode 100644 index 00000000..cd6e30dd --- /dev/null +++ b/internal/verifier/change_reader.go @@ -0,0 +1,207 @@ +package verifier + +import ( + "context" + "time" + + "github.com/10gen/migration-verifier/history" + "github.com/10gen/migration-verifier/internal/logger" + "github.com/10gen/migration-verifier/internal/util" + "github.com/10gen/migration-verifier/msync" + "github.com/10gen/migration-verifier/option" + "github.com/pkg/errors" + "github.com/samber/lo" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" +) + 
+type changeReader interface { + getWhichCluster() whichCluster + getReadChannel() <-chan changeEventBatch + getError() *util.Eventual[error] + getStartTimestamp() option.Option[bson.Timestamp] + getEventsPerSecond() option.Option[float64] + getLag() option.Option[time.Duration] + getBufferSaturation() float64 + setWritesOff(bson.Timestamp) + setPersistorError(error) + StartChangeStream(context.Context) error + done() <-chan struct{} + persistChangeStreamResumeToken(context.Context, bson.Raw) error + isRunning() bool + String() string +} + +type ChangeReaderCommon struct { + readerType whichCluster + + lastChangeEventTime *bson.Timestamp + logger *logger.Logger + namespaces []string + + metaDB *mongo.Database + watcherClient *mongo.Client + clusterInfo util.ClusterInfo + + resumeTokenTSExtractor func(bson.Raw) (bson.Timestamp, error) + + changeStreamRunning bool + changeEventBatchChan chan changeEventBatch + writesOffTs *util.Eventual[bson.Timestamp] + readerError *util.Eventual[error] + handlerError *util.Eventual[error] + doneChan chan struct{} + + startAtTs *bson.Timestamp + + lag *msync.TypedAtomic[option.Option[time.Duration]] + batchSizeHistory *history.History[int] + + onDDLEvent ddlEventHandling +} + +func (rc ChangeReaderCommon) getWhichCluster() whichCluster { + return rc.readerType +} + +func (rc ChangeReaderCommon) setPersistorError(err error) { + rc.handlerError.Set(err) +} + +func (rc ChangeReaderCommon) getError() *util.Eventual[error] { + return rc.readerError +} + +func (rc ChangeReaderCommon) getStartTimestamp() option.Option[bson.Timestamp] { + return option.FromPointer(rc.startAtTs) +} + +func (rc ChangeReaderCommon) setWritesOff(ts bson.Timestamp) { + rc.writesOffTs.Set(ts) +} + +func (rc ChangeReaderCommon) isRunning() bool { + return rc.isRunning() +} + +func (rc ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { + return rc.changeEventBatchChan +} + +func (rc ChangeReaderCommon) done() <-chan struct{} { + return rc.doneChan +} + +func (rc ChangeReaderCommon) getBufferSaturation() float64 { + return util.DivideToF64(len(rc.changeEventBatchChan), cap(rc.changeEventBatchChan)) +} + +func (rc ChangeReaderCommon) getLag() option.Option[time.Duration] { + return rc.lag.Load() +} + +// getEventsPerSecond returns the number of change events per second we’ve been +// seeing “recently”. (See implementation for the actual period over which we +// compile this metric.) +func (rc ChangeReaderCommon) getEventsPerSecond() option.Option[float64] { + logs := rc.batchSizeHistory.Get() + lastLog, hasLogs := lo.Last(logs) + + if hasLogs && lastLog.At != logs[0].At { + span := lastLog.At.Sub(logs[0].At) + + // Each log contains a time and a # of events that happened since + // the prior log. Thus, each log’s Datum is a count of events that + // happened before the timestamp. Since we want the # of events that + // happened between the first & last times, we only want events *after* + // the first time. Thus, we skip the first log entry here. 
+ totalEvents := 0 + for _, log := range logs[1:] { + totalEvents += log.Datum + } + + return option.Some(util.DivideToF64(totalEvents, span.Seconds())) + } + + return option.None[float64]() +} + +func (rc ChangeReaderCommon) persistChangeStreamResumeToken(ctx context.Context, token bson.Raw) error { + coll := rc.metaDB.Collection(metadataChangeStreamCollectionName) + _, err := coll.ReplaceOne( + ctx, + bson.D{{"_id", rc.resumeTokenDocID()}}, + token, + options.Replace().SetUpsert(true), + ) + + if err == nil { + ts, err := rc.resumeTokenTSExtractor(token) + + logEvent := rc.logger.Debug() + + if err == nil { + logEvent = addTimestampToLogEvent(ts, logEvent) + } else { + rc.logger.Warn().Err(err). + Msg("failed to extract resume token timestamp") + } + + logEvent.Msgf("Persisted %s's resume token.", rc.readerType) + + return nil + } + + return errors.Wrapf(err, "failed to persist change stream resume token (%v)", token) +} + +func (rc ChangeReaderCommon) resumeTokenDocID() string { + switch rc.readerType { + case src: + return "srcResumeToken" + case dst: + return "dstResumeToken" + default: + panic("unknown readerType: " + rc.readerType) + } +} + +func (rc ChangeReaderCommon) getMetadataCollection() *mongo.Collection { + return rc.metaDB.Collection(metadataChangeStreamCollectionName) +} + +func (rc ChangeReaderCommon) loadResumeToken(ctx context.Context) (option.Option[bson.Raw], error) { + coll := rc.getMetadataCollection() + + token, err := coll.FindOne( + ctx, + bson.D{{"_id", rc.resumeTokenDocID()}}, + ).Raw() + + if errors.Is(err, mongo.ErrNoDocuments) { + return option.None[bson.Raw](), nil + } + + return option.Some(token), err +} + +func (rc *ChangeReaderCommon) updateLag(sess *mongo.Session, token bson.Raw) { + var tokenTs bson.Timestamp + tokenTs, err := rc.resumeTokenTSExtractor(token) + if err == nil { + lagSecs := int64(sess.OperationTime().T) - int64(tokenTs.T) + rc.lag.Store(option.Some(time.Second * time.Duration(lagSecs))) + } else { + rc.logger.Warn(). + Err(err). + Msgf("Failed to extract timestamp from %s's resume token to compute lag.", rc.readerType) + } +} + +func (rc *ChangeReaderCommon) logIgnoredDDL(rawEvent bson.Raw) { + rc.logger.Info(). + Str("reader", string(rc.readerType)). + Stringer("event", rawEvent). + Msg("Ignoring event with unrecognized type on destination. 
(It’s assumedly internal to the migration.)") +} diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index b53cbdbe..8254be70 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -7,7 +7,6 @@ import ( "github.com/10gen/migration-verifier/history" "github.com/10gen/migration-verifier/internal/keystring" - "github.com/10gen/migration-verifier/internal/logger" "github.com/10gen/migration-verifier/internal/retry" "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/mbson" @@ -64,36 +63,12 @@ type changeEventBatch struct { clusterTime bson.Timestamp } -type ChangeReaderCommon struct { - readerType whichCluster - - lastChangeEventTime *bson.Timestamp - logger *logger.Logger - namespaces []string - - metaDB *mongo.Database - watcherClient *mongo.Client - clusterInfo util.ClusterInfo - - changeStreamRunning bool - changeEventBatchChan chan changeEventBatch - writesOffTs *util.Eventual[bson.Timestamp] - readerError *util.Eventual[error] - handlerError *util.Eventual[error] - doneChan chan struct{} - - startAtTs *bson.Timestamp - - lag *msync.TypedAtomic[option.Option[time.Duration]] - batchSizeHistory *history.History[int] - - onDDLEvent ddlEventHandling -} - type ChangeStreamReader struct { ChangeReaderCommon } +var _ changeReader = &ChangeStreamReader{} + func (verifier *Verifier) initializeChangeStreamReaders() { srcReader := &ChangeStreamReader{ ChangeReaderCommon: ChangeReaderCommon{ @@ -133,7 +108,7 @@ func (verifier *Verifier) initializeChangeStreamReaders() { // RunChangeEventHandler handles change event batches from the reader. // It needs to be started after the reader starts and should run in its own // goroutine. -func (verifier *Verifier) RunChangeEventHandler(ctx context.Context, reader *ChangeStreamReader) error { +func (verifier *Verifier) RunChangeEventHandler(ctx context.Context, reader changeReader) error { var err error var lastPersistedTime time.Time @@ -161,7 +136,7 @@ HandlerLoop: Err(err). Stringer("changeStreamReader", reader). Msg("Change event handler failed.") - case batch, more := <-reader.changeEventBatchChan: + case batch, more := <-reader.getReadChannel(): if !more { verifier.logger.Debug(). Stringer("changeStreamReader", reader). @@ -177,7 +152,7 @@ HandlerLoop: Msg("Handling change event batch.") err = errors.Wrap( - verifier.HandleChangeStreamEvents(ctx, batch, reader.readerType), + verifier.HandleChangeStreamEvents(ctx, batch, reader.getWhichCluster()), "failed to handle change stream events", ) @@ -190,7 +165,7 @@ HandlerLoop: // This will prevent the reader from hanging because the reader checks // this along with checks for context expiry. if err != nil { - reader.handlerError.Set(err) + reader.setPersistorError(err) } return err diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index cbe3928e..35fa08a1 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -32,7 +32,12 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_NoNamespaces() { verifier := suite.BuildVerifier() - filter := verifier.srcChangeStreamReader.GetChangeStreamFilter() + changeStreamReader, ok := verifier.srcChangeStreamReader.(*ChangeStreamReader) + if !ok { + suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeStreamReader, changeStreamReader) + } + + filter := changeStreamReader.GetChangeStreamFilter() _, err := suite.srcMongoClient. 
Database("realUserDatabase"). @@ -96,18 +101,23 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_BsonSize() { ctx := suite.Context() verifier := suite.BuildVerifier() - if !verifier.srcChangeStreamReader.hasBsonSize() { + if !util.ClusterHasBSONSize([2]int(verifier.srcClusterInfo.VersionArray)) { suite.T().Skip("Need a source version that has $bsonSize") } + changeStreamReader, ok := verifier.srcChangeStreamReader.(*ChangeStreamReader) + if !ok { + suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeStreamReader, changeStreamReader) + } + srcColl := verifier.srcClient.Database(suite.DBNameForTest()).Collection("coll") _, err := srcColl.InsertOne(ctx, bson.D{{"_id", 123}}) suite.Require().NoError(err) - verifier.srcChangeStreamReader.namespaces = mslices.Of(FullName(srcColl)) + changeStreamReader.namespaces = mslices.Of(FullName(srcColl)) - filter := verifier.srcChangeStreamReader.GetChangeStreamFilter() + filter := changeStreamReader.GetChangeStreamFilter() cs, err := suite.srcMongoClient.Watch( ctx, @@ -157,14 +167,20 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_WithNamespaces() { ctx := suite.Context() verifier := suite.BuildVerifier() - verifier.srcChangeStreamReader.namespaces = []string{ + + changeStreamReader, ok := verifier.srcChangeStreamReader.(*ChangeStreamReader) + if !ok { + suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeStreamReader, changeStreamReader) + } + + changeStreamReader.namespaces = []string{ "foo.bar", "foo.baz", "test.car", "test.chaz", } - filter := verifier.srcChangeStreamReader.GetChangeStreamFilter() + filter := changeStreamReader.GetChangeStreamFilter() cs, err := suite.srcMongoClient.Watch(ctx, filter) suite.Require().NoError(err) @@ -188,7 +204,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_WithNamespaces() { suite.Require().NoError(err) sctx := mongo.NewSessionContext(ctx, sess) - for _, ns := range verifier.srcChangeStreamReader.namespaces { + for _, ns := range changeStreamReader.namespaces { dbAndColl := strings.Split(ns, ".") _, err := suite.srcMongoClient. @@ -223,7 +239,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_WithNamespaces() { suite.Assert().Len( events, - len(verifier.srcChangeStreamReader.namespaces), + len(changeStreamReader.namespaces), "should have 1 event per in-filter namespace", ) suite.Assert().True( @@ -266,7 +282,9 @@ func (suite *IntegrationTestSuite) TestChangeStream_Resume_NoSkip() { defer v1Cancel(ctx.Err()) suite.startSrcChangeStreamReaderAndHandler(v1Ctx, verifier1) - changeStreamMetaColl := verifier1.srcChangeStreamReader.getChangeStreamMetadataCollection() + changeStreamMetaColl := verifier1.metaClient. + Database(verifier1.metaDBName). 
+ Collection(metadataChangeStreamCollectionName) var originalResumeToken bson.Raw @@ -417,10 +435,12 @@ func (suite *IntegrationTestSuite) TestChangeStreamResumability() { suite.startSrcChangeStreamReaderAndHandler(ctx, verifier2) - suite.Require().NotNil(verifier2.srcChangeStreamReader.startAtTs) + startAtTs, hasStartAtTs := verifier2.srcChangeStreamReader.getStartTimestamp().Get() + + suite.Require().True(hasStartAtTs) suite.Assert().False( - verifier2.srcChangeStreamReader.startAtTs.After(newTime), + startAtTs.After(newTime), "verifier2's change stream should be no later than this new session", ) @@ -569,7 +589,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamLag() { verifierRunner.AwaitGenerationEnd(), ) - return verifier.srcChangeStreamReader.GetLag().IsSome() + return verifier.srcChangeStreamReader.getLag().IsSome() }, time.Minute, 100*time.Millisecond, @@ -578,7 +598,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamLag() { // NB: The lag will include whatever time elapsed above before // verifier read the event, so it can be several seconds. suite.Assert().Less( - verifier.srcChangeStreamReader.GetLag().MustGet(), + verifier.srcChangeStreamReader.getLag().MustGet(), 10*time.Minute, "verifier lag is as expected", ) @@ -605,18 +625,20 @@ func (suite *IntegrationTestSuite) TestStartAtTimeNoChanges() { suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) - startAtTs := verifier.srcChangeStreamReader.startAtTs - suite.Require().NotNil(startAtTs, "startAtTs should be set") + startAtTs, hasStartAtTs := verifier.srcChangeStreamReader.getStartTimestamp().Get() + suite.Require().True(hasStartAtTs, "startAtTs should be set") + + verifier.srcChangeStreamReader.setWritesOff(insertTs) - verifier.srcChangeStreamReader.writesOffTs.Set(insertTs) + <-verifier.srcChangeStreamReader.done() - <-verifier.srcChangeStreamReader.doneChan + startAtTs2 := verifier.srcChangeStreamReader.getStartTimestamp().MustGet() suite.Require().False( - verifier.srcChangeStreamReader.startAtTs.Before(*startAtTs), + startAtTs2.Before(startAtTs), "new startAtTs (%+v) should be no earlier than last one (%+v)", - verifier.srcChangeStreamReader.startAtTs, - *startAtTs, + startAtTs2, + startAtTs, ) } } @@ -635,10 +657,13 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { suite.Require().NotNil(origSessionTime) suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) + startAtTs, hasStartAtTs := verifier.srcChangeStreamReader.getStartTimestamp().Get() + suite.Require().True(hasStartAtTs, "startAtTs should be set") + // srcStartAtTs derives from the change stream’s resume token, which can // postdate our session time but should not precede it. 
suite.Require().False( - verifier.srcChangeStreamReader.startAtTs.Before(*origSessionTime), + startAtTs.Before(*origSessionTime), "srcStartAtTs should be >= the insert’s optime", ) @@ -662,12 +687,15 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { "session time after events should exceed the original", ) - verifier.srcChangeStreamReader.writesOffTs.Set(*postEventsSessionTime) - <-verifier.srcChangeStreamReader.doneChan + verifier.srcChangeStreamReader.setWritesOff(*postEventsSessionTime) + <-verifier.srcChangeStreamReader.done() + + startAtTs, hasStartAtTs = verifier.srcChangeStreamReader.getStartTimestamp().Get() + suite.Require().True(hasStartAtTs, "startAtTs should be set") suite.Assert().Equal( *postEventsSessionTime, - *verifier.srcChangeStreamReader.startAtTs, + startAtTs, "verifier.srcStartAtTs should now be our session timestamp", ) } @@ -684,8 +712,12 @@ func (suite *IntegrationTestSuite) TestNoStartAtTime() { origStartTs := sess.OperationTime() suite.Require().NotNil(origStartTs) suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) - suite.Require().NotNil(verifier.srcChangeStreamReader.startAtTs) - suite.Require().LessOrEqual(origStartTs.Compare(*verifier.srcChangeStreamReader.startAtTs), 0) + + startAtTs, hasStartAtTs := verifier.srcChangeStreamReader.getStartTimestamp().Get() + suite.Require().True(hasStartAtTs, "startAtTs should be set") + + suite.Require().NotNil(startAtTs) + suite.Require().LessOrEqual(origStartTs.Compare(startAtTs), 0) } func (suite *IntegrationTestSuite) TestWithChangeEventsBatching() { diff --git a/internal/verifier/check.go b/internal/verifier/check.go index e8269ecd..29b97670 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -50,16 +50,16 @@ func (verifier *Verifier) Check(ctx context.Context, filter bson.D) { verifier.MaybeStartPeriodicHeapProfileCollection(ctx) } -func (verifier *Verifier) waitForChangeStream(ctx context.Context, csr *ChangeStreamReader) error { +func (verifier *Verifier) waitForChangeStream(ctx context.Context, csr changeReader) error { select { case <-ctx.Done(): return util.WrapCtxErrWithCause(ctx) - case <-csr.readerError.Ready(): - err := csr.readerError.Get() + case <-csr.getError().Ready(): + err := csr.getError().Get() verifier.logger.Warn().Err(err). Msgf("Received error from %s.", csr) return err - case <-csr.doneChan: + case <-csr.done(): verifier.logger.Debug(). Msgf("Received completion signal from %s.", csr) break @@ -93,11 +93,11 @@ func (verifier *Verifier) CheckWorker(ctxIn context.Context) error { // If the change stream fails, everything should stop. 
eg.Go(func() error { select { - case <-verifier.srcChangeStreamReader.readerError.Ready(): - err := verifier.srcChangeStreamReader.readerError.Get() + case <-verifier.srcChangeStreamReader.getError().Ready(): + err := verifier.srcChangeStreamReader.getError().Get() return errors.Wrapf(err, "%s failed", verifier.srcChangeStreamReader) - case <-verifier.dstChangeStreamReader.readerError.Ready(): - err := verifier.dstChangeStreamReader.readerError.Get() + case <-verifier.dstChangeStreamReader.getError().Ready(): + err := verifier.dstChangeStreamReader.getError().Get() return errors.Wrapf(err, "%s failed", verifier.dstChangeStreamReader) case <-ctx.Done(): return nil @@ -270,8 +270,8 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh }() ceHandlerGroup, groupCtx := contextplus.ErrGroup(ctx) - for _, csReader := range []*ChangeStreamReader{verifier.srcChangeStreamReader, verifier.dstChangeStreamReader} { - if csReader.changeStreamRunning { + for _, csReader := range mslices.Of(verifier.srcChangeStreamReader, verifier.dstChangeStreamReader) { + if csReader.isRunning() { verifier.logger.Debug().Msgf("Check: %s already running.", csReader) } else { verifier.logger.Debug().Msgf("%s not running; starting change stream", csReader) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index fc49aaa4..c21fe2d0 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -467,7 +467,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.srcClientCollection(task), verifier.srcClusterInfo, - verifier.srcChangeStreamReader.startAtTs, + verifier.srcChangeStreamReader.getStartTimestamp().ToPointer(), task, ) @@ -500,7 +500,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.dstClientCollection(task), verifier.dstClusterInfo, - verifier.dstChangeStreamReader.startAtTs, + verifier.dstChangeStreamReader.getStartTimestamp().ToPointer(), task, ) diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index c7c1102b..ef239dfb 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -132,8 +132,8 @@ type Verifier struct { mux sync.RWMutex - srcChangeStreamReader *ChangeStreamReader - dstChangeStreamReader *ChangeStreamReader + srcChangeStreamReader changeReader + dstChangeStreamReader changeReader readConcernSetting ReadConcernSetting @@ -273,19 +273,19 @@ func (verifier *Verifier) WritesOff(ctx context.Context) error { // might be inserting docs into the recheck queue, which happens // under the lock. 
select { - case <-verifier.srcChangeStreamReader.readerError.Ready(): - err := verifier.srcChangeStreamReader.readerError.Get() + case <-verifier.srcChangeStreamReader.getError().Ready(): + err := verifier.srcChangeStreamReader.getError().Get() return errors.Wrapf(err, "tried to send writes-off timestamp to %s, but change stream already failed", verifier.srcChangeStreamReader) default: - verifier.srcChangeStreamReader.writesOffTs.Set(srcFinalTs) + verifier.srcChangeStreamReader.setWritesOff(srcFinalTs) } select { - case <-verifier.dstChangeStreamReader.readerError.Ready(): - err := verifier.dstChangeStreamReader.readerError.Get() + case <-verifier.dstChangeStreamReader.getError().Ready(): + err := verifier.dstChangeStreamReader.getError().Get() return errors.Wrapf(err, "tried to send writes-off timestamp to %s, but change stream already failed", verifier.dstChangeStreamReader) default: - verifier.dstChangeStreamReader.writesOffTs.Set(dstFinalTs) + verifier.dstChangeStreamReader.setWritesOff(dstFinalTs) } return nil diff --git a/internal/verifier/summary.go b/internal/verifier/summary.go index f04607da..7079b294 100644 --- a/internal/verifier/summary.go +++ b/internal/verifier/summary.go @@ -557,7 +557,7 @@ func (verifier *Verifier) printChangeEventStatistics(builder io.Writer) { for _, cluster := range []struct { title string eventRecorder *EventRecorder - csReader *ChangeStreamReader + csReader changeReader }{ {"Source", verifier.srcEventRecorder, verifier.srcChangeStreamReader}, {"Destination", verifier.dstEventRecorder, verifier.dstChangeStreamReader}, @@ -584,16 +584,16 @@ func (verifier *Verifier) printChangeEventStatistics(builder io.Writer) { fmt.Fprintf(builder, "%s change events this generation: %s\n", cluster.title, eventsDescr) - if eventsPerSec, has := cluster.csReader.GetEventsPerSecond().Get(); has { + if eventsPerSec, has := cluster.csReader.getEventsPerSecond().Get(); has { var lagNote string - lag, hasLag := cluster.csReader.GetLag().Get() + lag, hasLag := cluster.csReader.getLag().Get() if hasLag { lagNote = fmt.Sprintf("lag: %s; ", reportutils.DurationToHMS(lag)) } - saturation := cluster.csReader.GetSaturation() + saturation := cluster.csReader.getBufferSaturation() fmt.Fprintf( builder, From 3d35d297118b13bb906d5409fe8cb206ba81d64a Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:05:26 -0500 Subject: [PATCH 003/130] fix lint --- internal/verifier/change_reader.go | 2 +- internal/verifier/change_stream.go | 71 +++++------------------------- 2 files changed, 11 insertions(+), 62 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index cd6e30dd..7e260ef7 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -82,7 +82,7 @@ func (rc ChangeReaderCommon) setWritesOff(ts bson.Timestamp) { } func (rc ChangeReaderCommon) isRunning() bool { - return rc.isRunning() + return rc.changeStreamRunning } func (rc ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 8254be70..204ee2d9 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -102,6 +102,7 @@ func (verifier *Verifier) initializeChangeStreamReaders() { csr.doneChan = make(chan struct{}) csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) csr.batchSizeHistory = history.New[int](time.Minute) + csr.resumeTokenTSExtractor = extractTimestampFromResumeToken } } @@ -407,10 +408,7 @@ 
func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( // indexes are created after initial sync. if csr.onDDLEvent == onDDLEventAllow { - csr.logger.Info(). - Stringer("changeStream", csr). - Stringer("event", cs.Current). - Msg("Ignoring event with unrecognized type on destination. (It’s assumedly internal to the migration.)") + csr.logIgnoredDDL(cs.Current) // Discard this event, then keep reading. changeEvents = changeEvents[:len(changeEvents)-1] @@ -437,16 +435,7 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( eventsRead++ } - var tokenTs bson.Timestamp - tokenTs, err := extractTimestampFromResumeToken(cs.ResumeToken()) - if err == nil { - lagSecs := int64(sess.OperationTime().T) - int64(tokenTs.T) - csr.lag.Store(option.Some(time.Second * time.Duration(lagSecs))) - } else { - csr.logger.Warn(). - Err(err). - Msgf("Failed to extract timestamp from %s's resume token to compute change stream lag.", csr) - } + csr.updateLag(sess, cs.ResumeToken()) if eventsRead == 0 { ri.NoteSuccess("received an empty change stream response") @@ -535,7 +524,7 @@ func (csr *ChangeStreamReader) iterateChangeStream( // (i.e., the `getMore` call returns empty) for { var curTs bson.Timestamp - curTs, err = extractTimestampFromResumeToken(cs.ResumeToken()) + curTs, err = csr.resumeTokenTSExtractor(cs.ResumeToken()) if err != nil { return errors.Wrap(err, "failed to extract timestamp from change stream's resume token") } @@ -608,18 +597,18 @@ func (csr *ChangeStreamReader) createChangeStream( ) } - savedResumeToken, err := csr.loadChangeStreamResumeToken(ctx) + savedResumeToken, err := csr.loadResumeToken(ctx) if err != nil { return nil, nil, bson.Timestamp{}, errors.Wrap(err, "failed to load persisted change stream resume token") } csStartLogEvent := csr.logger.Info() - if savedResumeToken != nil { + if token, hasToken := savedResumeToken.Get(); hasToken { logEvent := csStartLogEvent. 
- Stringer(csr.resumeTokenDocID(), savedResumeToken) + Stringer(csr.resumeTokenDocID(), token) - ts, err := extractTimestampFromResumeToken(savedResumeToken) + ts, err := csr.resumeTokenTSExtractor(token) if err == nil { logEvent = addTimestampToLogEvent(ts, logEvent) } else { @@ -630,7 +619,7 @@ func (csr *ChangeStreamReader) createChangeStream( logEvent.Msg("Starting change stream from persisted resume token.") - opts = opts.SetStartAfter(savedResumeToken) + opts = opts.SetStartAfter(token) } else { csStartLogEvent.Msgf("Starting change stream from current %s cluster time.", csr.readerType) } @@ -650,7 +639,7 @@ func (csr *ChangeStreamReader) createChangeStream( return nil, nil, bson.Timestamp{}, err } - startTs, err := extractTimestampFromResumeToken(changeStream.ResumeToken()) + startTs, err := csr.resumeTokenTSExtractor(changeStream.ResumeToken()) if err != nil { return nil, nil, bson.Timestamp{}, errors.Wrap(err, "failed to extract timestamp from change stream's resume token") } @@ -827,46 +816,6 @@ func (csr *ChangeStreamReader) String() string { return fmt.Sprintf("%s change stream reader", csr.readerType) } -func (csr *ChangeStreamReader) resumeTokenDocID() string { - switch csr.readerType { - case src: - return "srcResumeToken" - case dst: - return "dstResumeToken" - default: - panic("unknown readerType: " + csr.readerType) - } -} - -func (csr *ChangeStreamReader) persistChangeStreamResumeToken(ctx context.Context, token bson.Raw) error { - coll := csr.getChangeStreamMetadataCollection() - _, err := coll.ReplaceOne( - ctx, - bson.D{{"_id", csr.resumeTokenDocID()}}, - token, - options.Replace().SetUpsert(true), - ) - - if err == nil { - ts, err := extractTimestampFromResumeToken(token) - - logEvent := csr.logger.Debug() - - if err == nil { - logEvent = addTimestampToLogEvent(ts, logEvent) - } else { - csr.logger.Warn().Err(err). 
- Msg("failed to extract resume token timestamp") - } - - logEvent.Msgf("Persisted %s's resume token.", csr) - - return nil - } - - return errors.Wrapf(err, "failed to persist change stream resume token (%v)", token) -} - func extractTimestampFromResumeToken(resumeToken bson.Raw) (bson.Timestamp, error) { // Change stream token is always a V1 keystring in the _data field tokenDataRV, err := resumeToken.LookupErr("_data") From 1c672f9b34834232ba5cb7bc701430d82e995370 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:06:32 -0500 Subject: [PATCH 004/130] unused --- internal/verifier/change_stream.go | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 204ee2d9..b11548d6 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -793,25 +793,6 @@ func addTimestampToLogEvent(ts bson.Timestamp, event *zerolog.Event) *zerolog.Ev Time("time", time.Unix(int64(ts.T), int64(0))) } -func (csr *ChangeStreamReader) getChangeStreamMetadataCollection() *mongo.Collection { - return csr.metaDB.Collection(metadataChangeStreamCollectionName) -} - -func (csr *ChangeStreamReader) loadChangeStreamResumeToken(ctx context.Context) (bson.Raw, error) { - coll := csr.getChangeStreamMetadataCollection() - - token, err := coll.FindOne( - ctx, - bson.D{{"_id", csr.resumeTokenDocID()}}, - ).Raw() - - if errors.Is(err, mongo.ErrNoDocuments) { - return nil, nil - } - - return token, err -} - func (csr *ChangeStreamReader) String() string { return fmt.Sprintf("%s change stream reader", csr.readerType) } From aefe4d6fe80056fd7c9bc1fcf3a6268ac0febcb8 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:15:21 -0500 Subject: [PATCH 005/130] pointer receivers --- internal/verifier/change_reader.go | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 7e260ef7..1168c422 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -61,50 +61,50 @@ type ChangeReaderCommon struct { onDDLEvent ddlEventHandling } -func (rc ChangeReaderCommon) getWhichCluster() whichCluster { +func (rc *ChangeReaderCommon) getWhichCluster() whichCluster { return rc.readerType } -func (rc ChangeReaderCommon) setPersistorError(err error) { +func (rc *ChangeReaderCommon) setPersistorError(err error) { rc.handlerError.Set(err) } -func (rc ChangeReaderCommon) getError() *util.Eventual[error] { +func (rc *ChangeReaderCommon) getError() *util.Eventual[error] { return rc.readerError } -func (rc ChangeReaderCommon) getStartTimestamp() option.Option[bson.Timestamp] { +func (rc *ChangeReaderCommon) getStartTimestamp() option.Option[bson.Timestamp] { return option.FromPointer(rc.startAtTs) } -func (rc ChangeReaderCommon) setWritesOff(ts bson.Timestamp) { +func (rc *ChangeReaderCommon) setWritesOff(ts bson.Timestamp) { rc.writesOffTs.Set(ts) } -func (rc ChangeReaderCommon) isRunning() bool { +func (rc *ChangeReaderCommon) isRunning() bool { return rc.changeStreamRunning } -func (rc ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { +func (rc *ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { return rc.changeEventBatchChan } -func (rc ChangeReaderCommon) done() <-chan struct{} { +func (rc *ChangeReaderCommon) done() <-chan struct{} { return rc.doneChan } -func (rc ChangeReaderCommon) getBufferSaturation() float64 { +func (rc 
*ChangeReaderCommon) getBufferSaturation() float64 {
 	return util.DivideToF64(len(rc.changeEventBatchChan), cap(rc.changeEventBatchChan))
 }
 
-func (rc ChangeReaderCommon) getLag() option.Option[time.Duration] {
+func (rc *ChangeReaderCommon) getLag() option.Option[time.Duration] {
 	return rc.lag.Load()
 }
 
 // getEventsPerSecond returns the number of change events per second we’ve been
 // seeing “recently”. (See implementation for the actual period over which we
 // compile this metric.)
-func (rc ChangeReaderCommon) getEventsPerSecond() option.Option[float64] {
+func (rc *ChangeReaderCommon) getEventsPerSecond() option.Option[float64] {
 	logs := rc.batchSizeHistory.Get()
 	lastLog, hasLogs := lo.Last(logs)
 
@@ -127,7 +127,7 @@ func (rc ChangeReaderCommon) getEventsPerSecond() option.Option[float64] {
 	return option.None[float64]()
 }
 
-func (rc ChangeReaderCommon) persistChangeStreamResumeToken(ctx context.Context, token bson.Raw) error {
+func (rc *ChangeReaderCommon) persistChangeStreamResumeToken(ctx context.Context, token bson.Raw) error {
 	coll := rc.metaDB.Collection(metadataChangeStreamCollectionName)
 	_, err := coll.ReplaceOne(
 		ctx,
@@ -156,7 +156,7 @@ func (rc ChangeReaderCommon) persistChangeStreamResumeToken(ctx context.Context,
 	return errors.Wrapf(err, "failed to persist change stream resume token (%v)", token)
 }
 
-func (rc ChangeReaderCommon) resumeTokenDocID() string {
+func (rc *ChangeReaderCommon) resumeTokenDocID() string {
 	switch rc.readerType {
 	case src:
 		return "srcResumeToken"
@@ -167,11 +167,11 @@ func (rc ChangeReaderCommon) resumeTokenDocID() string {
 	}
 }
 
-func (rc ChangeReaderCommon) getMetadataCollection() *mongo.Collection {
+func (rc *ChangeReaderCommon) getMetadataCollection() *mongo.Collection {
 	return rc.metaDB.Collection(metadataChangeStreamCollectionName)
 }
 
-func (rc ChangeReaderCommon) loadResumeToken(ctx context.Context) (option.Option[bson.Raw], error) {
+func (rc *ChangeReaderCommon) loadResumeToken(ctx context.Context) (option.Option[bson.Raw], error) {
 	coll := rc.getMetadataCollection()
 
 	token, err := coll.FindOne(

From 1cb493b476540ac69d3b6cfe2ccbbe4fb9890f1f Mon Sep 17 00:00:00 2001
From: Felipe Gasper
Date: Tue, 11 Nov 2025 16:24:43 -0500
Subject: [PATCH 006/130] renames/refactors

---
 internal/verifier/change_reader.go           |   2 +-
 internal/verifier/change_stream.go           | 170 +-----------------
 internal/verifier/change_stream_test.go      |  44 ++---
 internal/verifier/check.go                   |  42 ++---
 internal/verifier/compare.go                 |   4 +-
 internal/verifier/integration_test_suite.go  |   2 +-
 internal/verifier/migration_verifier.go      |  24 +--
 internal/verifier/migration_verifier_test.go |  14 +-
 internal/verifier/recheck_persist.go         | 180 +++++++++++++++++++
 internal/verifier/summary.go                 |   8 +-
 10 files changed, 254 insertions(+), 236 deletions(-)
 create mode 100644 internal/verifier/recheck_persist.go

diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go
index 1168c422..663eb89f 100644
--- a/internal/verifier/change_reader.go
+++ b/internal/verifier/change_reader.go
@@ -26,7 +26,7 @@ type changeReader interface {
 	getBufferSaturation() float64
 	setWritesOff(bson.Timestamp)
 	setPersistorError(error)
-	StartChangeStream(context.Context) error
+	start(context.Context) error
 	done() <-chan struct{}
 	persistChangeStreamResumeToken(context.Context, bson.Raw) error
 	isRunning() bool
 	String() string
 }
diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go
index b11548d6..b69254b7 100644
--- a/internal/verifier/change_stream.go
+++ b/internal/verifier/change_stream.go
@@ -69,7 
+69,7 @@ type ChangeStreamReader struct { var _ changeReader = &ChangeStreamReader{} -func (verifier *Verifier) initializeChangeStreamReaders() { +func (verifier *Verifier) initializeChangeReaders() { srcReader := &ChangeStreamReader{ ChangeReaderCommon: ChangeReaderCommon{ readerType: src, @@ -78,7 +78,7 @@ func (verifier *Verifier) initializeChangeStreamReaders() { clusterInfo: *verifier.srcClusterInfo, }, } - verifier.srcChangeStreamReader = srcReader + verifier.srcChangeReader = srcReader dstReader := &ChangeStreamReader{ ChangeReaderCommon: ChangeReaderCommon{ @@ -89,7 +89,7 @@ func (verifier *Verifier) initializeChangeStreamReaders() { onDDLEvent: onDDLEventAllow, }, } - verifier.dstChangeStreamReader = dstReader + verifier.dstChangeReader = dstReader // Common elements in both readers: for _, csr := range mslices.Of(srcReader, dstReader) { @@ -106,168 +106,6 @@ func (verifier *Verifier) initializeChangeStreamReaders() { } } -// RunChangeEventHandler handles change event batches from the reader. -// It needs to be started after the reader starts and should run in its own -// goroutine. -func (verifier *Verifier) RunChangeEventHandler(ctx context.Context, reader changeReader) error { - var err error - - var lastPersistedTime time.Time - persistResumeTokenIfNeeded := func(ctx context.Context, token bson.Raw) { - if time.Since(lastPersistedTime) >= minChangeStreamPersistInterval { - persistErr := reader.persistChangeStreamResumeToken(ctx, token) - if persistErr != nil { - verifier.logger.Warn(). - Stringer("changeReader", reader). - Err(persistErr). - Msg("Failed to persist resume token. Because of this, if the verifier restarts, it will have to re-process already-handled change events. This error may be transient, but if it recurs, investigate.") - } else { - lastPersistedTime = time.Now() - } - } - } - -HandlerLoop: - for err == nil { - select { - case <-ctx.Done(): - err = util.WrapCtxErrWithCause(ctx) - - verifier.logger.Debug(). - Err(err). - Stringer("changeStreamReader", reader). - Msg("Change event handler failed.") - case batch, more := <-reader.getReadChannel(): - if !more { - verifier.logger.Debug(). - Stringer("changeStreamReader", reader). - Msg("Change event batch channel has been closed.") - - break HandlerLoop - } - - verifier.logger.Trace(). - Stringer("changeStreamReader", reader). - Int("batchSize", len(batch.events)). - Any("batch", batch). - Msg("Handling change event batch.") - - err = errors.Wrap( - verifier.HandleChangeStreamEvents(ctx, batch, reader.getWhichCluster()), - "failed to handle change stream events", - ) - - if err == nil && batch.resumeToken != nil { - persistResumeTokenIfNeeded(ctx, batch.resumeToken) - } - } - } - - // This will prevent the reader from hanging because the reader checks - // this along with checks for context expiry. - if err != nil { - reader.setPersistorError(err) - } - - return err -} - -// HandleChangeStreamEvents performs the necessary work for change stream events after receiving a batch. 
-func (verifier *Verifier) HandleChangeStreamEvents(ctx context.Context, batch changeEventBatch, eventOrigin whichCluster) error { - if len(batch.events) == 0 { - return nil - } - - dbNames := make([]string, len(batch.events)) - collNames := make([]string, len(batch.events)) - docIDs := make([]bson.RawValue, len(batch.events)) - dataSizes := make([]int32, len(batch.events)) - - latestTimestamp := bson.Timestamp{} - - for i, changeEvent := range batch.events { - if !supportedEventOpTypes.Contains(changeEvent.OpType) { - panic(fmt.Sprintf("Unsupported optype in event; should have failed already! event=%+v", changeEvent)) - } - - if changeEvent.ClusterTime == nil { - verifier.logger.Warn(). - Any("event", changeEvent). - Msg("Change event unexpectedly lacks a clusterTime?!?") - } else if changeEvent.ClusterTime.After(latestTimestamp) { - latestTimestamp = *changeEvent.ClusterTime - } - - var srcDBName, srcCollName string - - var eventRecorder EventRecorder - - // Recheck Docs are keyed by source namespaces. - // We need to retrieve the source namespaces if change events are from the destination. - switch eventOrigin { - case dst: - eventRecorder = *verifier.dstEventRecorder - - if verifier.nsMap.Len() == 0 { - // Namespace is not remapped. Source namespace is the same as the destination. - srcDBName = changeEvent.Ns.DB - srcCollName = changeEvent.Ns.Coll - } else { - dstNs := fmt.Sprintf("%s.%s", changeEvent.Ns.DB, changeEvent.Ns.Coll) - srcNs, exist := verifier.nsMap.GetSrcNamespace(dstNs) - if !exist { - return errors.Errorf("no source namespace corresponding to the destination namepsace %s", dstNs) - } - srcDBName, srcCollName = SplitNamespace(srcNs) - } - case src: - eventRecorder = *verifier.srcEventRecorder - - srcDBName = changeEvent.Ns.DB - srcCollName = changeEvent.Ns.Coll - default: - panic(fmt.Sprintf("unknown event origin: %s", eventOrigin)) - } - - dbNames[i] = srcDBName - collNames[i] = srcCollName - docIDs[i] = changeEvent.DocID - - if changeEvent.FullDocLen.OrZero() > 0 { - dataSizes[i] = int32(changeEvent.FullDocLen.OrZero()) - } else if changeEvent.FullDocument == nil { - // This happens for deletes and for some updates. - // The document is probably, but not necessarily, deleted. - dataSizes[i] = fauxDocSizeForDeleteEvents - } else { - // This happens for inserts, replaces, and most updates. - dataSizes[i] = int32(len(changeEvent.FullDocument)) - } - - if err := eventRecorder.AddEvent(&changeEvent); err != nil { - return errors.Wrapf( - err, - "failed to augment stats with %s change event (%+v)", - eventOrigin, - changeEvent, - ) - } - } - - latestTimestampTime := time.Unix(int64(latestTimestamp.T), 0) - lag := time.Unix(int64(batch.clusterTime.T), 0).Sub(latestTimestampTime) - - verifier.logger.Trace(). - Str("origin", string(eventOrigin)). - Int("count", len(docIDs)). - Any("latestTimestamp", latestTimestamp). - Time("latestTimestampTime", latestTimestampTime). - Stringer("lag", lag). - Msg("Persisting rechecks for change events.") - - return verifier.insertRecheckDocs(ctx, dbNames, collNames, docIDs, dataSizes) -} - // GetChangeStreamFilter returns an aggregation pipeline that filters // namespaces as per configuration. // @@ -666,7 +504,7 @@ func (csr *ChangeStreamReader) createChangeStream( } // StartChangeStream starts the change stream. 
-func (csr *ChangeStreamReader) StartChangeStream(ctx context.Context) error { +func (csr *ChangeStreamReader) start(ctx context.Context) error { // This channel holds the first change stream creation's result, whether // success or failure. Rather than using a Result we could make separate // Timestamp and error channels, but the single channel is cleaner since diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index 35fa08a1..c69e38c5 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -32,9 +32,9 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_NoNamespaces() { verifier := suite.BuildVerifier() - changeStreamReader, ok := verifier.srcChangeStreamReader.(*ChangeStreamReader) + changeStreamReader, ok := verifier.srcChangeReader.(*ChangeStreamReader) if !ok { - suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeStreamReader, changeStreamReader) + suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeReader, changeStreamReader) } filter := changeStreamReader.GetChangeStreamFilter() @@ -105,9 +105,9 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_BsonSize() { suite.T().Skip("Need a source version that has $bsonSize") } - changeStreamReader, ok := verifier.srcChangeStreamReader.(*ChangeStreamReader) + changeStreamReader, ok := verifier.srcChangeReader.(*ChangeStreamReader) if !ok { - suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeStreamReader, changeStreamReader) + suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeReader, changeStreamReader) } srcColl := verifier.srcClient.Database(suite.DBNameForTest()).Collection("coll") @@ -168,9 +168,9 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_WithNamespaces() { verifier := suite.BuildVerifier() - changeStreamReader, ok := verifier.srcChangeStreamReader.(*ChangeStreamReader) + changeStreamReader, ok := verifier.srcChangeReader.(*ChangeStreamReader) if !ok { - suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeStreamReader, changeStreamReader) + suite.T().Skipf("source change reader is a %T; this test needs a %T", verifier.srcChangeReader, changeStreamReader) } changeStreamReader.namespaces = []string{ @@ -254,10 +254,10 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_WithNamespaces() { } func (suite *IntegrationTestSuite) startSrcChangeStreamReaderAndHandler(ctx context.Context, verifier *Verifier) { - err := verifier.srcChangeStreamReader.StartChangeStream(ctx) + err := verifier.srcChangeReader.start(ctx) suite.Require().NoError(err) go func() { - err := verifier.RunChangeEventHandler(ctx, verifier.srcChangeStreamReader) + err := verifier.RunChangeEventPersistor(ctx, verifier.srcChangeReader) if errors.Is(err, context.Canceled) { return } @@ -435,7 +435,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamResumability() { suite.startSrcChangeStreamReaderAndHandler(ctx, verifier2) - startAtTs, hasStartAtTs := verifier2.srcChangeStreamReader.getStartTimestamp().Get() + startAtTs, hasStartAtTs := verifier2.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs) @@ -589,7 +589,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamLag() { verifierRunner.AwaitGenerationEnd(), ) - return verifier.srcChangeStreamReader.getLag().IsSome() + return verifier.srcChangeReader.getLag().IsSome() }, time.Minute, 
100*time.Millisecond, @@ -598,7 +598,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamLag() { // NB: The lag will include whatever time elapsed above before // verifier read the event, so it can be several seconds. suite.Assert().Less( - verifier.srcChangeStreamReader.getLag().MustGet(), + verifier.srcChangeReader.getLag().MustGet(), 10*time.Minute, "verifier lag is as expected", ) @@ -625,14 +625,14 @@ func (suite *IntegrationTestSuite) TestStartAtTimeNoChanges() { suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) - startAtTs, hasStartAtTs := verifier.srcChangeStreamReader.getStartTimestamp().Get() + startAtTs, hasStartAtTs := verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") - verifier.srcChangeStreamReader.setWritesOff(insertTs) + verifier.srcChangeReader.setWritesOff(insertTs) - <-verifier.srcChangeStreamReader.done() + <-verifier.srcChangeReader.done() - startAtTs2 := verifier.srcChangeStreamReader.getStartTimestamp().MustGet() + startAtTs2 := verifier.srcChangeReader.getStartTimestamp().MustGet() suite.Require().False( startAtTs2.Before(startAtTs), @@ -657,7 +657,7 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { suite.Require().NotNil(origSessionTime) suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) - startAtTs, hasStartAtTs := verifier.srcChangeStreamReader.getStartTimestamp().Get() + startAtTs, hasStartAtTs := verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") // srcStartAtTs derives from the change stream’s resume token, which can @@ -687,10 +687,10 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { "session time after events should exceed the original", ) - verifier.srcChangeStreamReader.setWritesOff(*postEventsSessionTime) - <-verifier.srcChangeStreamReader.done() + verifier.srcChangeReader.setWritesOff(*postEventsSessionTime) + <-verifier.srcChangeReader.done() - startAtTs, hasStartAtTs = verifier.srcChangeStreamReader.getStartTimestamp().Get() + startAtTs, hasStartAtTs = verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") suite.Assert().Equal( @@ -713,7 +713,7 @@ func (suite *IntegrationTestSuite) TestNoStartAtTime() { suite.Require().NotNil(origStartTs) suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) - startAtTs, hasStartAtTs := verifier.srcChangeStreamReader.getStartTimestamp().Get() + startAtTs, hasStartAtTs := verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") suite.Require().NotNil(startAtTs) @@ -1063,9 +1063,9 @@ func (suite *IntegrationTestSuite) TestRecheckDocsWithDstChangeEvents() { verifier.SetDstNamespaces([]string{dstDBName + ".dstColl1", dstDBName + ".dstColl2"}) verifier.SetNamespaceMap() - suite.Require().NoError(verifier.dstChangeStreamReader.StartChangeStream(ctx)) + suite.Require().NoError(verifier.dstChangeReader.start(ctx)) go func() { - err := verifier.RunChangeEventHandler(ctx, verifier.dstChangeStreamReader) + err := verifier.RunChangeEventPersistor(ctx, verifier.dstChangeReader) if errors.Is(err, context.Canceled) { return } diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 29b97670..44f80519 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -50,7 +50,7 @@ func (verifier *Verifier) Check(ctx context.Context, filter bson.D) { verifier.MaybeStartPeriodicHeapProfileCollection(ctx) } -func 
(verifier *Verifier) waitForChangeStream(ctx context.Context, csr changeReader) error { +func (verifier *Verifier) waitForChangeReader(ctx context.Context, csr changeReader) error { select { case <-ctx.Done(): return util.WrapCtxErrWithCause(ctx) @@ -90,15 +90,15 @@ func (verifier *Verifier) CheckWorker(ctxIn context.Context) error { cancelableCtx, canceler := contextplus.WithCancelCause(ctxIn) eg, ctx := contextplus.ErrGroup(cancelableCtx) - // If the change stream fails, everything should stop. + // If the change reader fails, everything should stop. eg.Go(func() error { select { - case <-verifier.srcChangeStreamReader.getError().Ready(): - err := verifier.srcChangeStreamReader.getError().Get() - return errors.Wrapf(err, "%s failed", verifier.srcChangeStreamReader) - case <-verifier.dstChangeStreamReader.getError().Ready(): - err := verifier.dstChangeStreamReader.getError().Get() - return errors.Wrapf(err, "%s failed", verifier.dstChangeStreamReader) + case <-verifier.srcChangeReader.getError().Ready(): + err := verifier.srcChangeReader.getError().Get() + return errors.Wrapf(err, "%s failed", verifier.srcChangeReader) + case <-verifier.dstChangeReader.getError().Ready(): + err := verifier.dstChangeReader.getError().Get() + return errors.Wrapf(err, "%s failed", verifier.dstChangeReader) case <-ctx.Done(): return nil } @@ -229,11 +229,11 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh } } - verifier.logger.Info().Msg("Starting change streams.") + verifier.logger.Info().Msg("Starting change readers.") // Now that we’ve initialized verifier.generation we can // start the change stream readers. - verifier.initializeChangeStreamReaders() + verifier.initializeChangeReaders() verifier.mux.Unlock() err = retry.New().WithCallback( @@ -270,18 +270,18 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh }() ceHandlerGroup, groupCtx := contextplus.ErrGroup(ctx) - for _, csReader := range mslices.Of(verifier.srcChangeStreamReader, verifier.dstChangeStreamReader) { - if csReader.isRunning() { - verifier.logger.Debug().Msgf("Check: %s already running.", csReader) + for _, changeReader := range mslices.Of(verifier.srcChangeReader, verifier.dstChangeReader) { + if changeReader.isRunning() { + verifier.logger.Debug().Msgf("Check: %s already running.", changeReader) } else { - verifier.logger.Debug().Msgf("%s not running; starting change stream", csReader) + verifier.logger.Debug().Msgf("%s not running; starting change reader", changeReader) - err = csReader.StartChangeStream(ctx) + err = changeReader.start(ctx) if err != nil { - return errors.Wrapf(err, "failed to start %s", csReader) + return errors.Wrapf(err, "failed to start %s", changeReader) } ceHandlerGroup.Go(func() error { - return verifier.RunChangeEventHandler(groupCtx, csReader) + return verifier.RunChangeEventPersistor(groupCtx, changeReader) }) } } @@ -364,14 +364,14 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh // caught again on the next iteration. if verifier.writesOff { verifier.logger.Debug(). - Msg("Waiting for change streams to end.") + Msg("Waiting for change readers to end.") - // It's necessary to wait for the change stream to finish before incrementing the + // It's necessary to wait for the change reader to finish before incrementing the // generation number, or the last changes will not be checked. 
verifier.mux.Unlock() - for _, csr := range mslices.Of(verifier.srcChangeStreamReader, verifier.dstChangeStreamReader) { - if err = verifier.waitForChangeStream(ctx, csr); err != nil { + for _, csr := range mslices.Of(verifier.srcChangeReader, verifier.dstChangeReader) { + if err = verifier.waitForChangeReader(ctx, csr); err != nil { return errors.Wrapf( err, "an error interrupted the wait for closure of %s", diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index c21fe2d0..3172c0d4 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -467,7 +467,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.srcClientCollection(task), verifier.srcClusterInfo, - verifier.srcChangeStreamReader.getStartTimestamp().ToPointer(), + verifier.srcChangeReader.getStartTimestamp().ToPointer(), task, ) @@ -500,7 +500,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.dstClientCollection(task), verifier.dstClusterInfo, - verifier.dstChangeStreamReader.getStartTimestamp().ToPointer(), + verifier.dstChangeReader.getStartTimestamp().ToPointer(), task, ) diff --git a/internal/verifier/integration_test_suite.go b/internal/verifier/integration_test_suite.go index 897678da..19aff0ab 100644 --- a/internal/verifier/integration_test_suite.go +++ b/internal/verifier/integration_test_suite.go @@ -191,7 +191,7 @@ func (suite *IntegrationTestSuite) BuildVerifier() *Verifier { "should set metadata connection string", ) verifier.SetMetaDBName(metaDBName) - verifier.initializeChangeStreamReaders() + verifier.initializeChangeReaders() suite.Require().NoError(verifier.srcClientCollection(&task).Drop(ctx)) suite.Require().NoError(verifier.dstClientCollection(&task).Drop(ctx)) diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index ef239dfb..9e75bf18 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -132,8 +132,8 @@ type Verifier struct { mux sync.RWMutex - srcChangeStreamReader changeReader - dstChangeStreamReader changeReader + srcChangeReader changeReader + dstChangeReader changeReader readConcernSetting ReadConcernSetting @@ -188,7 +188,7 @@ func NewVerifier(settings VerifierSettings, logPath string) *Verifier { readConcernSetting: readConcern, // This will get recreated once gen0 starts, but we want it - // here in case the change streams gets an event before then. + // here in case the change readers get an event before then. srcEventRecorder: NewEventRecorder(), dstEventRecorder: NewEventRecorder(), @@ -269,23 +269,23 @@ func (verifier *Verifier) WritesOff(ctx context.Context) error { return err } - // This has to happen outside the lock because the change streams + // This has to happen outside the lock because the change readers // might be inserting docs into the recheck queue, which happens // under the lock. 
select { - case <-verifier.srcChangeStreamReader.getError().Ready(): - err := verifier.srcChangeStreamReader.getError().Get() - return errors.Wrapf(err, "tried to send writes-off timestamp to %s, but change stream already failed", verifier.srcChangeStreamReader) + case <-verifier.srcChangeReader.getError().Ready(): + err := verifier.srcChangeReader.getError().Get() + return errors.Wrapf(err, "tried to send writes-off timestamp to %s, but change reader already failed", verifier.srcChangeReader) default: - verifier.srcChangeStreamReader.setWritesOff(srcFinalTs) + verifier.srcChangeReader.setWritesOff(srcFinalTs) } select { - case <-verifier.dstChangeStreamReader.getError().Ready(): - err := verifier.dstChangeStreamReader.getError().Get() - return errors.Wrapf(err, "tried to send writes-off timestamp to %s, but change stream already failed", verifier.dstChangeStreamReader) + case <-verifier.dstChangeReader.getError().Ready(): + err := verifier.dstChangeReader.getError().Get() + return errors.Wrapf(err, "tried to send writes-off timestamp to %s, but change reader already failed", verifier.dstChangeReader) default: - verifier.dstChangeStreamReader.setWritesOff(dstFinalTs) + verifier.dstChangeReader.setWritesOff(dstFinalTs) } return nil diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index caa2b714..4519f341 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -681,7 +681,7 @@ func (suite *IntegrationTestSuite) TestGetPersistedNamespaceStatistics_Recheck() ctx := suite.Context() verifier := suite.BuildVerifier() - err := verifier.HandleChangeStreamEvents( + err := verifier.PersistChangeEvents( ctx, changeEventBatch{ events: []ParsedEvent{{ @@ -697,7 +697,7 @@ func (suite *IntegrationTestSuite) TestGetPersistedNamespaceStatistics_Recheck() ) suite.Require().NoError(err) - err = verifier.HandleChangeStreamEvents( + err = verifier.PersistChangeEvents( ctx, changeEventBatch{ events: []ParsedEvent{{ @@ -972,23 +972,23 @@ func (suite *IntegrationTestSuite) TestFailedVerificationTaskInsertions() { events: mslices.Of(event), } - err = verifier.HandleChangeStreamEvents(ctx, batch, src) + err = verifier.PersistChangeEvents(ctx, batch, src) suite.Require().NoError(err) event.OpType = "insert" - err = verifier.HandleChangeStreamEvents(ctx, batch, src) + err = verifier.PersistChangeEvents(ctx, batch, src) suite.Require().NoError(err) event.OpType = "replace" - err = verifier.HandleChangeStreamEvents(ctx, batch, src) + err = verifier.PersistChangeEvents(ctx, batch, src) suite.Require().NoError(err) event.OpType = "update" - err = verifier.HandleChangeStreamEvents(ctx, batch, src) + err = verifier.PersistChangeEvents(ctx, batch, src) suite.Require().NoError(err) batch.events[0].OpType = "flibbity" suite.Assert().Panics( func() { - _ = verifier.HandleChangeStreamEvents(ctx, batch, src) + _ = verifier.PersistChangeEvents(ctx, batch, src) }, "HandleChangeStreamEvents should panic if it gets an unknown optype", ) diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go new file mode 100644 index 00000000..ae6b9728 --- /dev/null +++ b/internal/verifier/recheck_persist.go @@ -0,0 +1,180 @@ +package verifier + +import ( + "context" + "fmt" + "time" + + "github.com/10gen/migration-verifier/internal/util" + "github.com/pkg/errors" + "go.mongodb.org/mongo-driver/v2/bson" +) + +// RunChangeEventPersistor persists rechecks from change event batches. 
+// It needs to be started after the reader starts and should run in its own +// goroutine. +func (verifier *Verifier) RunChangeEventPersistor( + ctx context.Context, + reader changeReader, +) error { + clusterName := reader.getWhichCluster() + persistCallback := reader.persistChangeStreamResumeToken + in := reader.getReadChannel() + + var err error + + var lastPersistedTime time.Time + persistResumeTokenIfNeeded := func(ctx context.Context, token bson.Raw) { + if time.Since(lastPersistedTime) >= minChangeStreamPersistInterval { + persistErr := persistCallback(ctx, token) + if persistErr != nil { + verifier.logger.Warn(). + Str("changeReader", string(clusterName)). + Err(persistErr). + Msg("Failed to persist resume token. Because of this, if the verifier restarts, it will have to re-process already-handled change events. This error may be transient, but if it recurs, investigate.") + } else { + lastPersistedTime = time.Now() + } + } + } + +HandlerLoop: + for err == nil { + select { + case <-ctx.Done(): + err = util.WrapCtxErrWithCause(ctx) + + verifier.logger.Debug(). + Err(err). + Str("changeReader", string(clusterName)). + Msg("Change event handler failed.") + case batch, more := <-in: + if !more { + verifier.logger.Debug(). + Str("changeReader", string(clusterName)). + Msg("Change event batch channel has been closed.") + + break HandlerLoop + } + + verifier.logger.Trace(). + Str("changeReader", string(clusterName)). + Int("batchSize", len(batch.events)). + Any("batch", batch). + Msg("Handling change event batch.") + + err = errors.Wrap( + verifier.PersistChangeEvents(ctx, batch, clusterName), + "failed to handle change stream events", + ) + + if err == nil && batch.resumeToken != nil { + persistResumeTokenIfNeeded(ctx, batch.resumeToken) + } + } + } + + // This will prevent the reader from hanging because the reader checks + // this along with checks for context expiry. + if err != nil { + reader.setPersistorError(err) + } + + return err +} + +// PersistChangeEvents performs the necessary work for change events after receiving a batch. +func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeEventBatch, eventOrigin whichCluster) error { + if len(batch.events) == 0 { + return nil + } + + dbNames := make([]string, len(batch.events)) + collNames := make([]string, len(batch.events)) + docIDs := make([]bson.RawValue, len(batch.events)) + dataSizes := make([]int32, len(batch.events)) + + latestTimestamp := bson.Timestamp{} + + for i, changeEvent := range batch.events { + if !supportedEventOpTypes.Contains(changeEvent.OpType) { + panic(fmt.Sprintf("Unsupported optype in event; should have failed already! event=%+v", changeEvent)) + } + + if changeEvent.ClusterTime == nil { + verifier.logger.Warn(). + Any("event", changeEvent). + Msg("Change event unexpectedly lacks a clusterTime?!?") + } else if changeEvent.ClusterTime.After(latestTimestamp) { + latestTimestamp = *changeEvent.ClusterTime + } + + var srcDBName, srcCollName string + + var eventRecorder EventRecorder + + // Recheck Docs are keyed by source namespaces. + // We need to retrieve the source namespaces if change events are from the destination. + switch eventOrigin { + case dst: + eventRecorder = *verifier.dstEventRecorder + + if verifier.nsMap.Len() == 0 { + // Namespace is not remapped. Source namespace is the same as the destination. 
+ srcDBName = changeEvent.Ns.DB + srcCollName = changeEvent.Ns.Coll + } else { + dstNs := fmt.Sprintf("%s.%s", changeEvent.Ns.DB, changeEvent.Ns.Coll) + srcNs, exist := verifier.nsMap.GetSrcNamespace(dstNs) + if !exist { + return errors.Errorf("no source namespace corresponding to the destination namepsace %s", dstNs) + } + srcDBName, srcCollName = SplitNamespace(srcNs) + } + case src: + eventRecorder = *verifier.srcEventRecorder + + srcDBName = changeEvent.Ns.DB + srcCollName = changeEvent.Ns.Coll + default: + panic(fmt.Sprintf("unknown event origin: %s", eventOrigin)) + } + + dbNames[i] = srcDBName + collNames[i] = srcCollName + docIDs[i] = changeEvent.DocID + + if changeEvent.FullDocLen.OrZero() > 0 { + dataSizes[i] = int32(changeEvent.FullDocLen.OrZero()) + } else if changeEvent.FullDocument == nil { + // This happens for deletes and for some updates. + // The document is probably, but not necessarily, deleted. + dataSizes[i] = fauxDocSizeForDeleteEvents + } else { + // This happens for inserts, replaces, and most updates. + dataSizes[i] = int32(len(changeEvent.FullDocument)) + } + + if err := eventRecorder.AddEvent(&changeEvent); err != nil { + return errors.Wrapf( + err, + "failed to augment stats with %s change event (%+v)", + eventOrigin, + changeEvent, + ) + } + } + + latestTimestampTime := time.Unix(int64(latestTimestamp.T), 0) + lag := time.Unix(int64(batch.clusterTime.T), 0).Sub(latestTimestampTime) + + verifier.logger.Trace(). + Str("origin", string(eventOrigin)). + Int("count", len(docIDs)). + Any("latestTimestamp", latestTimestamp). + Time("latestTimestampTime", latestTimestampTime). + Stringer("lag", lag). + Msg("Persisting rechecks for change events.") + + return verifier.insertRecheckDocs(ctx, dbNames, collNames, docIDs, dataSizes) +} diff --git a/internal/verifier/summary.go b/internal/verifier/summary.go index 7079b294..d204ed88 100644 --- a/internal/verifier/summary.go +++ b/internal/verifier/summary.go @@ -559,8 +559,8 @@ func (verifier *Verifier) printChangeEventStatistics(builder io.Writer) { eventRecorder *EventRecorder csReader changeReader }{ - {"Source", verifier.srcEventRecorder, verifier.srcChangeStreamReader}, - {"Destination", verifier.dstEventRecorder, verifier.dstChangeStreamReader}, + {"Source", verifier.srcEventRecorder, verifier.srcChangeReader}, + {"Destination", verifier.dstEventRecorder, verifier.dstChangeReader}, } { nsStats := cluster.eventRecorder.Read() @@ -619,13 +619,13 @@ func (verifier *Verifier) printChangeEventStatistics(builder io.Writer) { } } - if cluster.csReader == verifier.srcChangeStreamReader { + if cluster.csReader == verifier.srcChangeReader { fmt.Fprint(builder, "\n") } // We only print event breakdowns for the source because we assume that // events on the destination will largely mirror the source’s. 
- if totalEvents > 0 && cluster.csReader == verifier.srcChangeStreamReader { + if totalEvents > 0 && cluster.csReader == verifier.srcChangeReader { reverseSortedNamespaces := maps.Keys(nsTotals) sort.Slice( reverseSortedNamespaces, From 08d56336dc747a0e8cef516b0feac443e54ea51d Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:28:50 -0500 Subject: [PATCH 007/130] cleanup --- internal/verifier/change_reader.go | 18 +++++++++++------- internal/verifier/change_stream.go | 11 +++++------ internal/verifier/change_stream_test.go | 2 +- internal/verifier/check.go | 10 +++++----- internal/verifier/recheck_persist.go | 2 +- internal/verifier/recheck_test.go | 2 +- 6 files changed, 24 insertions(+), 21 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 663eb89f..84a4f3e9 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -16,6 +16,10 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo/options" ) +const ( + changeReaderCollectionName = "changeReader" +) + type changeReader interface { getWhichCluster() whichCluster getReadChannel() <-chan changeEventBatch @@ -28,7 +32,7 @@ type changeReader interface { setPersistorError(error) start(context.Context) error done() <-chan struct{} - persistChangeStreamResumeToken(context.Context, bson.Raw) error + persistResumeToken(context.Context, bson.Raw) error isRunning() bool String() string } @@ -46,7 +50,7 @@ type ChangeReaderCommon struct { resumeTokenTSExtractor func(bson.Raw) (bson.Timestamp, error) - changeStreamRunning bool + running bool changeEventBatchChan chan changeEventBatch writesOffTs *util.Eventual[bson.Timestamp] readerError *util.Eventual[error] @@ -82,7 +86,7 @@ func (rc *ChangeReaderCommon) setWritesOff(ts bson.Timestamp) { } func (rc *ChangeReaderCommon) isRunning() bool { - return rc.changeStreamRunning + return rc.running } func (rc *ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { @@ -127,8 +131,8 @@ func (rc *ChangeReaderCommon) getEventsPerSecond() option.Option[float64] { return option.None[float64]() } -func (rc *ChangeReaderCommon) persistChangeStreamResumeToken(ctx context.Context, token bson.Raw) error { - coll := rc.metaDB.Collection(metadataChangeStreamCollectionName) +func (rc *ChangeReaderCommon) persistResumeToken(ctx context.Context, token bson.Raw) error { + coll := rc.metaDB.Collection(changeReaderCollectionName) _, err := coll.ReplaceOne( ctx, bson.D{{"_id", rc.resumeTokenDocID()}}, @@ -153,7 +157,7 @@ func (rc *ChangeReaderCommon) persistChangeStreamResumeToken(ctx context.Context return nil } - return errors.Wrapf(err, "failed to persist change stream resume token (%v)", token) + return errors.Wrapf(err, "failed to persist %s resume token (%v)", rc.readerType, token) } func (rc *ChangeReaderCommon) resumeTokenDocID() string { @@ -168,7 +172,7 @@ func (rc *ChangeReaderCommon) resumeTokenDocID() string { } func (rc *ChangeReaderCommon) getMetadataCollection() *mongo.Collection { - return rc.metaDB.Collection(metadataChangeStreamCollectionName) + return rc.metaDB.Collection(changeReaderCollectionName) } func (rc *ChangeReaderCommon) loadResumeToken(ctx context.Context) (option.Option[bson.Raw], error) { diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index b69254b7..f83814a5 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -44,9 +44,8 @@ var supportedEventOpTypes = mapset.NewSet( ) const ( - minChangeStreamPersistInterval = 
time.Second * 10 - maxChangeStreamAwaitTime = time.Second - metadataChangeStreamCollectionName = "changeStream" + minChangeStreamPersistInterval = time.Second * 10 + maxChangeStreamAwaitTime = time.Second ) type UnknownEventError struct { @@ -394,7 +393,7 @@ func (csr *ChangeStreamReader) iterateChangeStream( } if gotwritesOffTimestamp { - csr.changeStreamRunning = false + csr.running = false if csr.lastChangeEventTime != nil { csr.startAtTs = csr.lastChangeEventTime } @@ -472,7 +471,7 @@ func (csr *ChangeStreamReader) createChangeStream( return nil, nil, bson.Timestamp{}, errors.Wrap(err, "failed to open change stream") } - err = csr.persistChangeStreamResumeToken(ctx, changeStream.ResumeToken()) + err = csr.persistResumeToken(ctx, changeStream.ResumeToken()) if err != nil { return nil, nil, bson.Timestamp{}, err } @@ -581,7 +580,7 @@ func (csr *ChangeStreamReader) start(ctx context.Context) error { csr.startAtTs = &startTs - csr.changeStreamRunning = true + csr.running = true return nil } diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index c69e38c5..793426cf 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -284,7 +284,7 @@ func (suite *IntegrationTestSuite) TestChangeStream_Resume_NoSkip() { changeStreamMetaColl := verifier1.metaClient. Database(verifier1.metaDBName). - Collection(metadataChangeStreamCollectionName) + Collection(changeReaderCollectionName) var originalResumeToken bson.Raw diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 44f80519..794a7dbf 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -232,7 +232,7 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh verifier.logger.Info().Msg("Starting change readers.") // Now that we’ve initialized verifier.generation we can - // start the change stream readers. + // start the change readers. verifier.initializeChangeReaders() verifier.mux.Unlock() @@ -380,8 +380,8 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh } verifier.logger.Debug(). - Stringer("changeStreamReader", csr). - Msg("Change stream reader finished.") + Stringer("changeReader", csr). + Msg("Change reader finished.") } if err = ceHandlerGroup.Wait(); err != nil { @@ -391,9 +391,9 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh verifier.lastGeneration = true } - // Increment the in-memory generation so that the change streams will + // Increment the in-memory generation so that the change readers will // mark rechecks for the next generation. For example, if we just - // finished generation 2, the change streams need to mark generation 3 + // finished generation 2, the change readers need to mark generation 3 // on enqueued rechecks. Meanwhile, generaiton 3’s recheck tasks will // derive from rechecks enqueued during generation 2. 
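	// (See the schema notes in internal/verifier/metadata.go: since metadata
	// version 3, an enqueued recheck references the generation in which it
	// will be rechecked, i.e. the value verifier.generation holds once the
	// increment just below runs, not the generation during which the recheck
	// was enqueued.)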
verifier.generation++ diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index ae6b9728..6ed9b173 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -18,7 +18,7 @@ func (verifier *Verifier) RunChangeEventPersistor( reader changeReader, ) error { clusterName := reader.getWhichCluster() - persistCallback := reader.persistChangeStreamResumeToken + persistCallback := reader.persistResumeToken in := reader.getReadChannel() var err error diff --git a/internal/verifier/recheck_test.go b/internal/verifier/recheck_test.go index 1a213dba..0da76289 100644 --- a/internal/verifier/recheck_test.go +++ b/internal/verifier/recheck_test.go @@ -62,7 +62,7 @@ func (suite *IntegrationTestSuite) TestFailedCompareThenReplace() { }, } - err := verifier.HandleChangeStreamEvents( + err := verifier.PersistChangeEvents( ctx, changeEventBatch{events: mslices.Of(event)}, src, From c9e4610cd9d63ad5abd7f21c63d927873d22cff1 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:29:17 -0500 Subject: [PATCH 008/130] tidy --- internal/verifier/change_reader.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 84a4f3e9..8c0accd3 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -191,7 +191,6 @@ func (rc *ChangeReaderCommon) loadResumeToken(ctx context.Context) (option.Optio } func (rc *ChangeReaderCommon) updateLag(sess *mongo.Session, token bson.Raw) { - var tokenTs bson.Timestamp tokenTs, err := rc.resumeTokenTSExtractor(token) if err == nil { lagSecs := int64(sess.OperationTime().T) - int64(tokenTs.T) From c7d9edd8b74d2cc2d88b31dd9aa8bffd74cf19a3 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:29:55 -0500 Subject: [PATCH 009/130] string --- internal/verifier/migration_verifier_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 4519f341..8159c19c 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -990,7 +990,7 @@ func (suite *IntegrationTestSuite) TestFailedVerificationTaskInsertions() { func() { _ = verifier.PersistChangeEvents(ctx, batch, src) }, - "HandleChangeStreamEvents should panic if it gets an unknown optype", + "PersistChangeEvents should panic if it gets an unknown optype", ) verifier.generation++ From 196f8877096ff3ff4c7baf889a83195cc4c1bfda Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:39:56 -0500 Subject: [PATCH 010/130] more renames --- internal/verifier/change_reader.go | 18 +++++- internal/verifier/change_stream.go | 86 +++---------------------- internal/verifier/change_stream_test.go | 2 +- internal/verifier/recheck_persist.go | 6 ++ 4 files changed, 31 insertions(+), 81 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 8c0accd3..aa6e4926 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -10,6 +10,7 @@ import ( "github.com/10gen/migration-verifier/msync" "github.com/10gen/migration-verifier/option" "github.com/pkg/errors" + "github.com/rs/zerolog" "github.com/samber/lo" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" @@ -54,7 +55,7 @@ type ChangeReaderCommon struct { changeEventBatchChan chan changeEventBatch writesOffTs 
*util.Eventual[bson.Timestamp] readerError *util.Eventual[error] - handlerError *util.Eventual[error] + persistorError *util.Eventual[error] doneChan chan struct{} startAtTs *bson.Timestamp @@ -70,7 +71,7 @@ func (rc *ChangeReaderCommon) getWhichCluster() whichCluster { } func (rc *ChangeReaderCommon) setPersistorError(err error) { - rc.handlerError.Set(err) + rc.persistorError.Set(err) } func (rc *ChangeReaderCommon) getError() *util.Eventual[error] { @@ -208,3 +209,16 @@ func (rc *ChangeReaderCommon) logIgnoredDDL(rawEvent bson.Raw) { Stringer("event", rawEvent). Msg("Ignoring event with unrecognized type on destination. (It’s assumedly internal to the migration.)") } + +func (rc *ChangeReaderCommon) wrapPersistorErrorForReader() error { + return errors.Wrap( + rc.persistorError.Get(), + "event persistor failed, so no more events can be processed", + ) +} + +func addTimestampToLogEvent(ts bson.Timestamp, event *zerolog.Event) *zerolog.Event { + return event. + Any("timestamp", ts). + Time("time", time.Unix(int64(ts.T), int64(0))) +} diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index f83814a5..b8d9f22b 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -16,8 +16,6 @@ import ( mapset "github.com/deckarep/golang-set/v2" clone "github.com/huandu/go-clone/generic" "github.com/pkg/errors" - "github.com/rs/zerolog" - "github.com/samber/lo" "github.com/samber/mo" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" @@ -56,12 +54,6 @@ func (uee UnknownEventError) Error() string { return fmt.Sprintf("received event with unknown optype: %+v", uee.Event) } -type changeEventBatch struct { - events []ParsedEvent - resumeToken bson.Raw - clusterTime bson.Timestamp -} - type ChangeStreamReader struct { ChangeReaderCommon } @@ -97,11 +89,11 @@ func (verifier *Verifier) initializeChangeReaders() { csr.changeEventBatchChan = make(chan changeEventBatch, batchChanBufferSize) csr.writesOffTs = util.NewEventual[bson.Timestamp]() csr.readerError = util.NewEventual[error]() - csr.handlerError = util.NewEventual[error]() + csr.persistorError = util.NewEventual[error]() csr.doneChan = make(chan struct{}) csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) csr.batchSizeHistory = history.New[int](time.Minute) - csr.resumeTokenTSExtractor = extractTimestampFromResumeToken + csr.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken } } @@ -157,7 +149,7 @@ func (csr *ChangeStreamReader) GetChangeStreamFilter() (pipeline mongo.Pipeline) }, ) - if csr.hasBsonSize() { + if util.ClusterHasBSONSize([2]int(csr.clusterInfo.VersionArray)) { pipeline = append( pipeline, bson.D{ @@ -172,16 +164,6 @@ func (csr *ChangeStreamReader) GetChangeStreamFilter() (pipeline mongo.Pipeline) return pipeline } -func (csr *ChangeStreamReader) hasBsonSize() bool { - major := csr.clusterInfo.VersionArray[0] - - if major == 4 { - return csr.clusterInfo.VersionArray[1] >= 4 - } - - return major > 4 -} - // This function reads a single `getMore` response into a slice. // // Note that this doesn’t care about the writesOff timestamp. 
Thus, @@ -294,8 +276,8 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( select { case <-ctx.Done(): return util.WrapCtxErrWithCause(ctx) - case <-csr.handlerError.Ready(): - return csr.wrapHandlerErrorForReader() + case <-csr.persistorError.Ready(): + return csr.wrapPersistorErrorForReader() case csr.changeEventBatchChan <- changeEventBatch{ events: changeEvents, @@ -311,13 +293,6 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( return nil } -func (csr *ChangeStreamReader) wrapHandlerErrorForReader() error { - return errors.Wrap( - csr.handlerError.Get(), - "event handler failed, so no more events can be processed", - ) -} - func (csr *ChangeStreamReader) iterateChangeStream( ctx context.Context, ri *retry.FuncInfo, @@ -341,8 +316,8 @@ func (csr *ChangeStreamReader) iterateChangeStream( return err - case <-csr.handlerError.Ready(): - return csr.wrapHandlerErrorForReader() + case <-csr.persistorError.Ready(): + return csr.wrapPersistorErrorForReader() // If the ChangeStreamEnderChan has a message, the user has indicated that // source writes are ended and the migration tool is finished / committed. @@ -585,56 +560,11 @@ func (csr *ChangeStreamReader) start(ctx context.Context) error { return nil } -// GetLag returns the observed change stream lag (i.e., the delta between -// cluster time and the most-recently-seen change event). -func (csr *ChangeStreamReader) GetLag() option.Option[time.Duration] { - return csr.lag.Load() -} - -// GetSaturation returns the reader’s internal buffer’s saturation level as -// a fraction. If saturation rises, that means we’re reading events faster than -// we can persist them. -func (csr *ChangeStreamReader) GetSaturation() float64 { - return util.DivideToF64(len(csr.changeEventBatchChan), cap(csr.changeEventBatchChan)) -} - -// GetEventsPerSecond returns the number of change events per second we’ve been -// seeing “recently”. (See implementation for the actual period over which we -// compile this metric.) -func (csr *ChangeStreamReader) GetEventsPerSecond() option.Option[float64] { - logs := csr.batchSizeHistory.Get() - lastLog, hasLogs := lo.Last(logs) - - if hasLogs && lastLog.At != logs[0].At { - span := lastLog.At.Sub(logs[0].At) - - // Each log contains a time and a # of events that happened since - // the prior log. Thus, each log’s Datum is a count of events that - // happened before the timestamp. Since we want the # of events that - // happened between the first & last times, we only want events *after* - // the first time. Thus, we skip the first log entry here. - totalEvents := 0 - for _, log := range logs[1:] { - totalEvents += log.Datum - } - - return option.Some(util.DivideToF64(totalEvents, span.Seconds())) - } - - return option.None[float64]() -} - -func addTimestampToLogEvent(ts bson.Timestamp, event *zerolog.Event) *zerolog.Event { - return event. - Any("timestamp", ts). 
- Time("time", time.Unix(int64(ts.T), int64(0))) -} - func (csr *ChangeStreamReader) String() string { return fmt.Sprintf("%s change stream reader", csr.readerType) } -func extractTimestampFromResumeToken(resumeToken bson.Raw) (bson.Timestamp, error) { +func extractTSFromChangeStreamResumeToken(resumeToken bson.Raw) (bson.Timestamp, error) { // Change stream token is always a V1 keystring in the _data field tokenDataRV, err := resumeToken.LookupErr("_data") diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index 793426cf..0c12a64f 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -223,7 +223,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_WithNamespaces() { for { gotEvent := cs.TryNext(ctx) suite.Require().NoError(cs.Err()) - csOpTime, err := extractTimestampFromResumeToken(cs.ResumeToken()) + csOpTime, err := extractTSFromChangeStreamResumeToken(cs.ResumeToken()) suite.Require().NoError(err, "should get timestamp from resume token") if gotEvent { diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 6ed9b173..799040be 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -10,6 +10,12 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" ) +type changeEventBatch struct { + events []ParsedEvent + resumeToken bson.Raw + clusterTime bson.Timestamp +} + // RunChangeEventPersistor persists rechecks from change event batches. // It needs to be started after the reader starts and should run in its own // goroutine. From 06f7794a7a87f8adec4ea34b051767cc6ca7d49e Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:52:39 -0500 Subject: [PATCH 011/130] move --- internal/verifier/change_reader.go | 9 +++++++++ internal/verifier/change_stream.go | 11 ----------- internal/verifier/metadata.go | 1 + 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index aa6e4926..40e8ccc7 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -17,7 +17,16 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo/options" ) +type ddlEventHandling string + const ( + fauxDocSizeForDeleteEvents = 1024 + + // The number of batches we’ll hold in memory at once. + batchChanBufferSize = 100 + + onDDLEventAllow ddlEventHandling = "allow" + changeReaderCollectionName = "changeReader" ) diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index b8d9f22b..c3841a86 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -23,17 +23,6 @@ import ( "golang.org/x/exp/slices" ) -type ddlEventHandling string - -const ( - fauxDocSizeForDeleteEvents = 1024 - - // The number of batches we’ll hold in memory at once. - batchChanBufferSize = 100 - - onDDLEventAllow ddlEventHandling = "allow" -) - var supportedEventOpTypes = mapset.NewSet( "insert", "update", diff --git a/internal/verifier/metadata.go b/internal/verifier/metadata.go index 7c317a2c..dbd87300 100644 --- a/internal/verifier/metadata.go +++ b/internal/verifier/metadata.go @@ -5,5 +5,6 @@ package verifier // 2: Split failed-task discrepancies into separate collection. // 3: Enqueued rechecks now reference the generation in which they’ll be // rechecked rather than the generation during which they were enqueued. +// 4: Use “changeReader” instead of “changeStream” collection name. 
const verifierMetadataVersion = 3 From 0b13627539d07ff1dfc0a19ed6c60d9f31d98fbe Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:52:47 -0500 Subject: [PATCH 012/130] metadata --- internal/verifier/metadata.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/verifier/metadata.go b/internal/verifier/metadata.go index dbd87300..906117c9 100644 --- a/internal/verifier/metadata.go +++ b/internal/verifier/metadata.go @@ -7,4 +7,4 @@ package verifier // rechecked rather than the generation during which they were enqueued. // 4: Use “changeReader” instead of “changeStream” collection name. -const verifierMetadataVersion = 3 +const verifierMetadataVersion = 4 From 25b7e61c5f01c71b52a0f9cc23dc1ec9f3c716a4 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 16:59:23 -0500 Subject: [PATCH 013/130] comment --- internal/util/clusterinfo.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/util/clusterinfo.go b/internal/util/clusterinfo.go index 7f66f8e7..b2354e96 100644 --- a/internal/util/clusterinfo.go +++ b/internal/util/clusterinfo.go @@ -19,6 +19,8 @@ type ClusterInfo struct { Topology ClusterTopology } +// ClusterHasBSONSize indicates whether a cluster with the given +// major & minor version numbers supports the $bsonSize aggregation operator. func ClusterHasBSONSize(va [2]int) bool { major := va[0] From 3284468d4a3454afbfb90772a8cf9bb2956b5d9d Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 11 Nov 2025 17:01:09 -0500 Subject: [PATCH 014/130] comments --- internal/verifier/change_reader.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 40e8ccc7..01579434 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -107,10 +107,15 @@ func (rc *ChangeReaderCommon) done() <-chan struct{} { return rc.doneChan } +// getBufferSaturation returns the reader’s internal buffer’s saturation level +// as a fraction. If saturation rises, that means we’re reading events faster +// than we can persist them. func (rc *ChangeReaderCommon) getBufferSaturation() float64 { return util.DivideToF64(len(rc.changeEventBatchChan), cap(rc.changeEventBatchChan)) } +// getLag returns the observed change stream lag (i.e., the delta between +// cluster time and the most-recently-seen change event). 
func (rc *ChangeReaderCommon) getLag() option.Option[time.Duration] { return rc.lag.Load() } From 15a7636394d4681e85e0a538aa394ca681df7453 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 09:13:40 -0500 Subject: [PATCH 015/130] remove persistor error --- internal/verifier/change_reader.go | 16 +--- internal/verifier/change_stream.go | 106 ++++++++++++------------ internal/verifier/change_stream_test.go | 8 +- internal/verifier/check.go | 2 +- internal/verifier/recheck_persist.go | 6 -- 5 files changed, 60 insertions(+), 78 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 01579434..13ea1e62 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -15,6 +15,7 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" + "golang.org/x/sync/errgroup" ) type ddlEventHandling string @@ -39,8 +40,7 @@ type changeReader interface { getLag() option.Option[time.Duration] getBufferSaturation() float64 setWritesOff(bson.Timestamp) - setPersistorError(error) - start(context.Context) error + start(context.Context, *errgroup.Group) error done() <-chan struct{} persistResumeToken(context.Context, bson.Raw) error isRunning() bool @@ -64,7 +64,6 @@ type ChangeReaderCommon struct { changeEventBatchChan chan changeEventBatch writesOffTs *util.Eventual[bson.Timestamp] readerError *util.Eventual[error] - persistorError *util.Eventual[error] doneChan chan struct{} startAtTs *bson.Timestamp @@ -79,10 +78,6 @@ func (rc *ChangeReaderCommon) getWhichCluster() whichCluster { return rc.readerType } -func (rc *ChangeReaderCommon) setPersistorError(err error) { - rc.persistorError.Set(err) -} - func (rc *ChangeReaderCommon) getError() *util.Eventual[error] { return rc.readerError } @@ -224,13 +219,6 @@ func (rc *ChangeReaderCommon) logIgnoredDDL(rawEvent bson.Raw) { Msg("Ignoring event with unrecognized type on destination. (It’s assumedly internal to the migration.)") } -func (rc *ChangeReaderCommon) wrapPersistorErrorForReader() error { - return errors.Wrap( - rc.persistorError.Get(), - "event persistor failed, so no more events can be processed", - ) -} - func addTimestampToLogEvent(ts bson.Timestamp, event *zerolog.Event) *zerolog.Event { return event. Any("timestamp", ts). 
diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index c3841a86..ae477b14 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -21,6 +21,7 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" ) var supportedEventOpTypes = mapset.NewSet( @@ -78,7 +79,6 @@ func (verifier *Verifier) initializeChangeReaders() { csr.changeEventBatchChan = make(chan changeEventBatch, batchChanBufferSize) csr.writesOffTs = util.NewEventual[bson.Timestamp]() csr.readerError = util.NewEventual[error]() - csr.persistorError = util.NewEventual[error]() csr.doneChan = make(chan struct{}) csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) csr.batchSizeHistory = history.New[int](time.Minute) @@ -265,8 +265,6 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( select { case <-ctx.Done(): return util.WrapCtxErrWithCause(ctx) - case <-csr.persistorError.Ready(): - return csr.wrapPersistorErrorForReader() case csr.changeEventBatchChan <- changeEventBatch{ events: changeEvents, @@ -305,9 +303,6 @@ func (csr *ChangeStreamReader) iterateChangeStream( return err - case <-csr.persistorError.Ready(): - return csr.wrapPersistorErrorForReader() - // If the ChangeStreamEnderChan has a message, the user has indicated that // source writes are ended and the migration tool is finished / committed. // This means we should exit rather than continue reading the change stream @@ -467,79 +462,80 @@ func (csr *ChangeStreamReader) createChangeStream( } // StartChangeStream starts the change stream. -func (csr *ChangeStreamReader) start(ctx context.Context) error { +func (csr *ChangeStreamReader) start( + ctx context.Context, + eg *errgroup.Group, +) error { // This channel holds the first change stream creation's result, whether // success or failure. Rather than using a Result we could make separate // Timestamp and error channels, but the single channel is cleaner since // there's no chance of "nonsense" like both channels returning a payload. initialCreateResultChan := make(chan mo.Result[bson.Timestamp]) - go func() { - // Closing changeEventBatchChan at the end of change stream goroutine - // notifies the verifier's change event handler to exit. - defer func() { - csr.logger.Debug(). - Stringer("changeStreamReader", csr). - Msg("Closing change event batch channel.") + eg.Go( + func() error { + // Closing changeEventBatchChan at the end of change stream goroutine + // notifies the verifier's change event handler to exit. + defer func() { + csr.logger.Debug(). + Stringer("changeStreamReader", csr). + Msg("Closing change event batch channel.") - close(csr.changeEventBatchChan) - }() + close(csr.changeEventBatchChan) + }() - retryer := retry.New().WithErrorCodes(util.CursorKilledErrCode) + retryer := retry.New().WithErrorCodes(util.CursorKilledErrCode) - parentThreadWaiting := true + parentThreadWaiting := true - err := retryer.WithCallback( - func(ctx context.Context, ri *retry.FuncInfo) error { - changeStream, sess, startTs, err := csr.createChangeStream(ctx) - if err != nil { - logEvent := csr.logger.Debug(). - Err(err). - Stringer("changeStreamReader", csr) + return retryer.WithCallback( + func(ctx context.Context, ri *retry.FuncInfo) error { + changeStream, sess, startTs, err := csr.createChangeStream(ctx) + if err != nil { + logEvent := csr.logger.Debug(). + Err(err). 
+ Stringer("changeStreamReader", csr) - if parentThreadWaiting { - logEvent.Msg("First change stream open failed.") + if parentThreadWaiting { + logEvent.Msg("First change stream open failed.") - initialCreateResultChan <- mo.Err[bson.Timestamp](err) - return nil - } + initialCreateResultChan <- mo.Err[bson.Timestamp](err) + return nil + } - logEvent.Msg("Retried change stream open failed.") + logEvent.Msg("Retried change stream open failed.") - return err - } - - defer changeStream.Close(ctx) + return err + } - logEvent := csr.logger.Debug(). - Stringer("changeStreamReader", csr). - Any("startTimestamp", startTs) + defer changeStream.Close(ctx) - if parentThreadWaiting { - logEvent.Msg("First change stream open succeeded.") + logEvent := csr.logger.Debug(). + Stringer("changeStreamReader", csr). + Any("startTimestamp", startTs) - initialCreateResultChan <- mo.Ok(startTs) - close(initialCreateResultChan) - parentThreadWaiting = false - } else { - logEvent.Msg("Retried change stream open succeeded.") - } + if parentThreadWaiting { + logEvent.Msg("First change stream open succeeded.") - return csr.iterateChangeStream(ctx, ri, changeStream, sess) - }, - "running %s", csr, - ).Run(ctx, csr.logger) + initialCreateResultChan <- mo.Ok(startTs) + close(initialCreateResultChan) + parentThreadWaiting = false + } else { + logEvent.Msg("Retried change stream open succeeded.") + } - if err != nil { - csr.readerError.Set(err) - } - }() + return csr.iterateChangeStream(ctx, ri, changeStream, sess) + }, + "running %s", csr, + ).Run(ctx, csr.logger) + }, + ) result := <-initialCreateResultChan startTs, err := result.Get() if err != nil { - return err + return errors.Wrapf(err, "creating change stream") } csr.startAtTs = &startTs diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index 0c12a64f..d3fb166c 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -254,7 +254,9 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_WithNamespaces() { } func (suite *IntegrationTestSuite) startSrcChangeStreamReaderAndHandler(ctx context.Context, verifier *Verifier) { - err := verifier.srcChangeReader.start(ctx) + eg, egCtx := contextplus.ErrGroup(ctx) + + err := verifier.srcChangeReader.start(egCtx, eg) suite.Require().NoError(err) go func() { err := verifier.RunChangeEventPersistor(ctx, verifier.srcChangeReader) @@ -1063,7 +1065,9 @@ func (suite *IntegrationTestSuite) TestRecheckDocsWithDstChangeEvents() { verifier.SetDstNamespaces([]string{dstDBName + ".dstColl1", dstDBName + ".dstColl2"}) verifier.SetNamespaceMap() - suite.Require().NoError(verifier.dstChangeReader.start(ctx)) + eg, egCtx := contextplus.ErrGroup(ctx) + + suite.Require().NoError(verifier.dstChangeReader.start(egCtx, eg)) go func() { err := verifier.RunChangeEventPersistor(ctx, verifier.dstChangeReader) if errors.Is(err, context.Canceled) { diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 794a7dbf..66c36993 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -276,7 +276,7 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh } else { verifier.logger.Debug().Msgf("%s not running; starting change reader", changeReader) - err = changeReader.start(ctx) + err = changeReader.start(groupCtx, ceHandlerGroup) if err != nil { return errors.Wrapf(err, "failed to start %s", changeReader) } diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 
799040be..564e33a2 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -80,12 +80,6 @@ HandlerLoop: } } - // This will prevent the reader from hanging because the reader checks - // this along with checks for context expiry. - if err != nil { - reader.setPersistorError(err) - } - return err } From df29cdef625a5a29b657225c528590311e91781f Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 09:18:29 -0500 Subject: [PATCH 016/130] remove doneChan --- internal/verifier/change_reader.go | 6 ------ internal/verifier/change_stream.go | 11 ++++++----- internal/verifier/change_stream_test.go | 6 ++++-- internal/verifier/check.go | 15 +++++++++------ 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 13ea1e62..56bf5e0e 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -41,7 +41,6 @@ type changeReader interface { getBufferSaturation() float64 setWritesOff(bson.Timestamp) start(context.Context, *errgroup.Group) error - done() <-chan struct{} persistResumeToken(context.Context, bson.Raw) error isRunning() bool String() string @@ -64,7 +63,6 @@ type ChangeReaderCommon struct { changeEventBatchChan chan changeEventBatch writesOffTs *util.Eventual[bson.Timestamp] readerError *util.Eventual[error] - doneChan chan struct{} startAtTs *bson.Timestamp @@ -98,10 +96,6 @@ func (rc *ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { return rc.changeEventBatchChan } -func (rc *ChangeReaderCommon) done() <-chan struct{} { - return rc.doneChan -} - // getBufferSaturation returns the reader’s internal buffer’s saturation level // as a fraction. If saturation rises, that means we’re reading events faster // than we can persist them. diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index ae477b14..4c0d6c0a 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -79,7 +79,6 @@ func (verifier *Verifier) initializeChangeReaders() { csr.changeEventBatchChan = make(chan changeEventBatch, batchChanBufferSize) csr.writesOffTs = util.NewEventual[bson.Timestamp]() csr.readerError = util.NewEventual[error]() - csr.doneChan = make(chan struct{}) csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) csr.batchSizeHistory = history.New[int](time.Minute) csr.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken @@ -356,9 +355,7 @@ func (csr *ChangeStreamReader) iterateChangeStream( if csr.lastChangeEventTime != nil { csr.startAtTs = csr.lastChangeEventTime } - // since we have started Recheck, we must signal that we have - // finished the change stream changes so that Recheck can continue. 
- close(csr.doneChan) + break } } @@ -488,7 +485,7 @@ func (csr *ChangeStreamReader) start( parentThreadWaiting := true - return retryer.WithCallback( + err := retryer.WithCallback( func(ctx context.Context, ri *retry.FuncInfo) error { changeStream, sess, startTs, err := csr.createChangeStream(ctx) if err != nil { @@ -528,6 +525,10 @@ func (csr *ChangeStreamReader) start( }, "running %s", csr, ).Run(ctx, csr.logger) + + csr.readerError.Set(err) + + return err }, ) diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index d3fb166c..1eef5e87 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -632,7 +632,8 @@ func (suite *IntegrationTestSuite) TestStartAtTimeNoChanges() { verifier.srcChangeReader.setWritesOff(insertTs) - <-verifier.srcChangeReader.done() + <-verifier.srcChangeReader.getError().Ready() + suite.Require().NoError(verifier.srcChangeReader.getError().Get()) startAtTs2 := verifier.srcChangeReader.getStartTimestamp().MustGet() @@ -690,7 +691,8 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { ) verifier.srcChangeReader.setWritesOff(*postEventsSessionTime) - <-verifier.srcChangeReader.done() + <-verifier.srcChangeReader.getError().Ready() + suite.Require().NoError(verifier.srcChangeReader.getError().Get()) startAtTs, hasStartAtTs = verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 66c36993..6ad7261e 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -56,13 +56,16 @@ func (verifier *Verifier) waitForChangeReader(ctx context.Context, csr changeRea return util.WrapCtxErrWithCause(ctx) case <-csr.getError().Ready(): err := csr.getError().Get() - verifier.logger.Warn().Err(err). - Msgf("Received error from %s.", csr) + + if err != nil { + verifier.logger.Warn().Err(err). + Msgf("Received error from %s.", csr) + } else { + verifier.logger.Debug(). + Msgf("Received completion signal from %s.", csr) + } + return err - case <-csr.done(): - verifier.logger.Debug(). 
- Msgf("Received completion signal from %s.", csr) - break } return nil From fc6c8ced58e4bfdcd024cccae5d117f81cc0d49d Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 09:53:25 -0500 Subject: [PATCH 017/130] remove reader error channel --- internal/verifier/change_reader.go | 6 -- internal/verifier/change_stream.go | 3 - internal/verifier/change_stream_test.go | 31 +++++----- internal/verifier/check.go | 75 ++++++++++--------------- internal/verifier/migration_verifier.go | 33 ++++++----- 5 files changed, 66 insertions(+), 82 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 56bf5e0e..143f5f04 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -34,7 +34,6 @@ const ( type changeReader interface { getWhichCluster() whichCluster getReadChannel() <-chan changeEventBatch - getError() *util.Eventual[error] getStartTimestamp() option.Option[bson.Timestamp] getEventsPerSecond() option.Option[float64] getLag() option.Option[time.Duration] @@ -62,7 +61,6 @@ type ChangeReaderCommon struct { running bool changeEventBatchChan chan changeEventBatch writesOffTs *util.Eventual[bson.Timestamp] - readerError *util.Eventual[error] startAtTs *bson.Timestamp @@ -76,10 +74,6 @@ func (rc *ChangeReaderCommon) getWhichCluster() whichCluster { return rc.readerType } -func (rc *ChangeReaderCommon) getError() *util.Eventual[error] { - return rc.readerError -} - func (rc *ChangeReaderCommon) getStartTimestamp() option.Option[bson.Timestamp] { return option.FromPointer(rc.startAtTs) } diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 4c0d6c0a..02f63620 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -78,7 +78,6 @@ func (verifier *Verifier) initializeChangeReaders() { csr.metaDB = verifier.metaClient.Database(verifier.metaDBName) csr.changeEventBatchChan = make(chan changeEventBatch, batchChanBufferSize) csr.writesOffTs = util.NewEventual[bson.Timestamp]() - csr.readerError = util.NewEventual[error]() csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) csr.batchSizeHistory = history.New[int](time.Minute) csr.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken @@ -526,8 +525,6 @@ func (csr *ChangeStreamReader) start( "running %s", csr, ).Run(ctx, csr.logger) - csr.readerError.Set(err) - return err }, ) diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index 1eef5e87..5f746f14 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -25,6 +25,7 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" + "golang.org/x/sync/errgroup" ) func (suite *IntegrationTestSuite) TestChangeStreamFilter_NoNamespaces() { @@ -253,18 +254,21 @@ func (suite *IntegrationTestSuite) TestChangeStreamFilter_WithNamespaces() { ) } -func (suite *IntegrationTestSuite) startSrcChangeStreamReaderAndHandler(ctx context.Context, verifier *Verifier) { +func (suite *IntegrationTestSuite) startSrcChangeStreamReaderAndHandler( + ctx context.Context, + verifier *Verifier, +) *errgroup.Group { eg, egCtx := contextplus.ErrGroup(ctx) err := verifier.srcChangeReader.start(egCtx, eg) suite.Require().NoError(err) - go func() { - err := verifier.RunChangeEventPersistor(ctx, verifier.srcChangeReader) - if errors.Is(err, context.Canceled) { - return - } - suite.Require().NoError(err) - 
}() + eg.Go( + func() error { + return verifier.RunChangeEventPersistor(egCtx, verifier.srcChangeReader) + }, + ) + + return eg } func (suite *IntegrationTestSuite) TestChangeStream_Resume_NoSkip() { @@ -625,15 +629,14 @@ func (suite *IntegrationTestSuite) TestStartAtTimeNoChanges() { insertTs, err := util.GetClusterTimeFromSession(sess) suite.Require().NoError(err, "should get cluster time") - suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) + eg := suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) startAtTs, hasStartAtTs := verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") verifier.srcChangeReader.setWritesOff(insertTs) - <-verifier.srcChangeReader.getError().Ready() - suite.Require().NoError(verifier.srcChangeReader.getError().Get()) + suite.Require().NoError(eg.Wait()) startAtTs2 := verifier.srcChangeReader.getStartTimestamp().MustGet() @@ -658,7 +661,7 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { origSessionTime := sess.OperationTime() suite.Require().NotNil(origSessionTime) - suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) + eg := suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) startAtTs, hasStartAtTs := verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") @@ -691,8 +694,8 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { ) verifier.srcChangeReader.setWritesOff(*postEventsSessionTime) - <-verifier.srcChangeReader.getError().Ready() - suite.Require().NoError(verifier.srcChangeReader.getError().Get()) + + suite.Require().NoError(eg.Wait()) startAtTs, hasStartAtTs = verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 6ad7261e..42ce141d 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -1,6 +1,7 @@ package verifier import ( + "cmp" "context" "fmt" "time" @@ -27,9 +28,11 @@ const ( findTaskTimeWarnThreshold = 5 * time.Second ) -var failedStatuses = mapset.NewSet( - verificationTaskFailed, - verificationTaskMetadataMismatch, +var ( + failedStatuses = mapset.NewSet( + verificationTaskFailed, + verificationTaskMetadataMismatch, + ) ) // Check is the asynchronous entry point to Check, should only be called by the web server. Use @@ -50,27 +53,6 @@ func (verifier *Verifier) Check(ctx context.Context, filter bson.D) { verifier.MaybeStartPeriodicHeapProfileCollection(ctx) } -func (verifier *Verifier) waitForChangeReader(ctx context.Context, csr changeReader) error { - select { - case <-ctx.Done(): - return util.WrapCtxErrWithCause(ctx) - case <-csr.getError().Ready(): - err := csr.getError().Get() - - if err != nil { - verifier.logger.Warn().Err(err). - Msgf("Received error from %s.", csr) - } else { - verifier.logger.Debug(). - Msgf("Received completion signal from %s.", csr) - } - - return err - } - - return nil -} - func (verifier *Verifier) CheckWorker(ctxIn context.Context) error { generation := verifier.generation @@ -96,12 +78,14 @@ func (verifier *Verifier) CheckWorker(ctxIn context.Context) error { // If the change reader fails, everything should stop. 
eg.Go(func() error { select { - case <-verifier.srcChangeReader.getError().Ready(): - err := verifier.srcChangeReader.getError().Get() - return errors.Wrapf(err, "%s failed", verifier.srcChangeReader) - case <-verifier.dstChangeReader.getError().Ready(): - err := verifier.dstChangeReader.getError().Get() - return errors.Wrapf(err, "%s failed", verifier.dstChangeReader) + case <-verifier.changeReaderErr.Ready(): + return errors.Wrap( + cmp.Or( + verifier.changeReaderErr.Get(), + fmt.Errorf("change handling stopped prematurely"), + ), + verifier.dstChangeReader.String(), + ) case <-ctx.Done(): return nil } @@ -272,23 +256,28 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh verifier.phase = Idle }() - ceHandlerGroup, groupCtx := contextplus.ErrGroup(ctx) + changeReaderGroup, groupCtx := contextplus.ErrGroup(ctx) for _, changeReader := range mslices.Of(verifier.srcChangeReader, verifier.dstChangeReader) { if changeReader.isRunning() { verifier.logger.Debug().Msgf("Check: %s already running.", changeReader) } else { verifier.logger.Debug().Msgf("%s not running; starting change reader", changeReader) - err = changeReader.start(groupCtx, ceHandlerGroup) + err = changeReader.start(groupCtx, changeReaderGroup) if err != nil { return errors.Wrapf(err, "failed to start %s", changeReader) } - ceHandlerGroup.Go(func() error { + changeReaderGroup.Go(func() error { return verifier.RunChangeEventPersistor(groupCtx, changeReader) }) } } + verifier.changeReaderErr = util.NewEventual[error]() + go func() { + verifier.changeReaderErr.Set(changeReaderGroup.Wait()) + }() + // Log the verification status when initially booting up so it's easy to see the current state verificationStatus, err := verifier.GetVerificationStatus(ctx) if err != nil { @@ -373,23 +362,19 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh // generation number, or the last changes will not be checked. verifier.mux.Unlock() - for _, csr := range mslices.Of(verifier.srcChangeReader, verifier.dstChangeReader) { - if err = verifier.waitForChangeReader(ctx, csr); err != nil { - return errors.Wrapf( - err, - "an error interrupted the wait for closure of %s", - csr, - ) + select { + case <-ctx.Done(): + return ctx.Err() + case <-verifier.changeReaderErr.Ready(): + err := verifier.changeReaderErr.Get() + if err != nil { + return errors.Wrap(err, "handling change events") } verifier.logger.Debug(). - Stringer("changeReader", csr). - Msg("Change reader finished.") + Msg("Change readers finished.") } - if err = ceHandlerGroup.Wait(); err != nil { - return err - } verifier.mux.Lock() verifier.lastGeneration = true } diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index 9e75bf18..76f98e61 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -103,6 +103,8 @@ type Verifier struct { srcEventRecorder *EventRecorder dstEventRecorder *EventRecorder + changeReaderErr *util.Eventual[error] + // Used only with generation 0 to defer the first // progress report until after we’ve finished partitioning // every collection. @@ -272,20 +274,23 @@ func (verifier *Verifier) WritesOff(ctx context.Context) error { // This has to happen outside the lock because the change readers // might be inserting docs into the recheck queue, which happens // under the lock. 
- select { - case <-verifier.srcChangeReader.getError().Ready(): - err := verifier.srcChangeReader.getError().Get() - return errors.Wrapf(err, "tried to send writes-off timestamp to %s, but change reader already failed", verifier.srcChangeReader) - default: - verifier.srcChangeReader.setWritesOff(srcFinalTs) - } - - select { - case <-verifier.dstChangeReader.getError().Ready(): - err := verifier.dstChangeReader.getError().Get() - return errors.Wrapf(err, "tried to send writes-off timestamp to %s, but change reader already failed", verifier.dstChangeReader) - default: - verifier.dstChangeReader.setWritesOff(dstFinalTs) + for _, readerAndTS := range []struct { + reader changeReader + ts bson.Timestamp + }{ + {verifier.srcChangeReader, srcFinalTs}, + {verifier.dstChangeReader, dstFinalTs}, + } { + select { + case <-ctx.Done(): + return ctx.Err() + case <-verifier.changeReaderErr.Ready(): + return errors.Wrapf( + verifier.changeReaderErr.Get(), + "tried to send writes-off timestamp to %s, but change handling already failed", readerAndTS.reader) + default: + readerAndTS.reader.setWritesOff(readerAndTS.ts) + } } return nil From d47c40a5bd9d17d9fe5c35a9c38fd5e80c8fdb25 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 10:56:39 -0500 Subject: [PATCH 018/130] allow eventual to accept nil -- MUST TEST! --- internal/util/eventual.go | 20 ++++++++++---------- internal/util/eventual_test.go | 24 ++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/internal/util/eventual.go b/internal/util/eventual.go index ad2c6dd7..4bd37271 100644 --- a/internal/util/eventual.go +++ b/internal/util/eventual.go @@ -2,8 +2,6 @@ package util import ( "sync" - - "github.com/10gen/migration-verifier/option" ) // Eventual solves the “one writer, many readers” problem: a value gets @@ -14,7 +12,7 @@ import ( // generalized to any data type. type Eventual[T any] struct { ready chan struct{} - val option.Option[T] + val T mux sync.RWMutex } @@ -37,12 +35,12 @@ func (e *Eventual[T]) Get() T { e.mux.RLock() defer e.mux.RUnlock() - val, has := e.val.Get() - if has { - return val + select { + case <-e.ready: + return e.val + default: + panic("Eventual's Get() called before value was ready.") } - - panic("Eventual's Get() called before value was ready.") } // Set sets the Eventual’s value. It may be called only once; @@ -51,13 +49,15 @@ func (e *Eventual[T]) Set(val T) { e.mux.Lock() defer e.mux.Unlock() - if e.val.IsSome() { + select { + case <-e.ready: panic("Tried to set an eventual twice!") + default: } // NB: This *must* happen before the close(), or else a fast reader may // not see this value. 
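	// Because readiness is now signaled only by the closed channel (rather
	// than by option.IsSome), the stored value may legitimately be the
	// type's zero value, e.g. Eventual[error].Set(nil), which is exactly
	// what TestEventualNil below exercises.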
- e.val = option.Some(val) + e.val = val close(e.ready) } diff --git a/internal/util/eventual_test.go b/internal/util/eventual_test.go index 17f4c4b7..6196e4e2 100644 --- a/internal/util/eventual_test.go +++ b/internal/util/eventual_test.go @@ -15,14 +15,14 @@ func (s *UnitTestSuite) TestEventual() { select { case <-eventual.Ready(): s.Require().Fail("should not be ready") - case <-time.NewTimer(time.Second).C: + case <-time.NewTimer(time.Millisecond).C: } eventual.Set(123) select { case <-eventual.Ready(): - case <-time.NewTimer(time.Second).C: + case <-time.NewTimer(time.Millisecond).C: s.Require().Fail("should be ready") } @@ -32,3 +32,23 @@ func (s *UnitTestSuite) TestEventual() { "Get() should return the value", ) } + +func (s *UnitTestSuite) TestEventualNil() { + eventual := NewEventual[error]() + + select { + case <-eventual.Ready(): + s.Require().Fail("should not be ready") + case <-time.NewTimer(time.Millisecond).C: + } + + eventual.Set(nil) + + select { + case <-eventual.Ready(): + case <-time.NewTimer(time.Millisecond).C: + s.Require().Fail("should be ready") + } + + s.Assert().Nil(eventual.Get()) +} From 4e38f0429ee93ea9f6cd606c4ccfc0b804487464 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 11:39:23 -0500 Subject: [PATCH 019/130] allow premature exit --- internal/verifier/check.go | 6 +----- internal/verifier/timeseries_test.go | 3 ++- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 42ce141d..12296f0d 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -1,7 +1,6 @@ package verifier import ( - "cmp" "context" "fmt" "time" @@ -80,10 +79,7 @@ func (verifier *Verifier) CheckWorker(ctxIn context.Context) error { select { case <-verifier.changeReaderErr.Ready(): return errors.Wrap( - cmp.Or( - verifier.changeReaderErr.Get(), - fmt.Errorf("change handling stopped prematurely"), - ), + verifier.changeReaderErr.Get(), verifier.dstChangeReader.String(), ) case <-ctx.Done(): diff --git a/internal/verifier/timeseries_test.go b/internal/verifier/timeseries_test.go index ae8548d9..9497b0a9 100644 --- a/internal/verifier/timeseries_test.go +++ b/internal/verifier/timeseries_test.go @@ -298,7 +298,8 @@ func (suite *IntegrationTestSuite) TestTimeSeries_Simple() { suite.Assert().Equal( 0, verificationStatus.FailedTasks, - "should be no failed tasks", + "should be no failed tasks (status: %+v)", + verificationStatus, ) suite.Assert().Equal( 3, From 0d4e40607133b90aa19a32705c24623023416436 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 12:43:14 -0500 Subject: [PATCH 020/130] rename & move --- internal/verifier/check.go | 13 ++++++------- internal/verifier/migration_verifier.go | 8 +++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 12296f0d..3930fccc 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -8,7 +8,6 @@ import ( "github.com/10gen/migration-verifier/contextplus" "github.com/10gen/migration-verifier/internal/logger" "github.com/10gen/migration-verifier/internal/retry" - "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/mslices" mapset "github.com/deckarep/golang-set/v2" "github.com/goaux/timer" @@ -77,9 +76,9 @@ func (verifier *Verifier) CheckWorker(ctxIn context.Context) error { // If the change reader fails, everything should stop. 
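	// (Note on the "allow premature exit" change above: with the cmp.Or
	// fallback removed, a clean shutdown of the reader group surfaces here as
	// a nil error. github.com/pkg/errors.Wrap returns nil when its err
	// argument is nil, so in that case this watchdog goroutine simply returns
	// nil rather than reporting a spurious failure.)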
eg.Go(func() error { select { - case <-verifier.changeReaderErr.Ready(): + case <-verifier.changeHandlingErr.Ready(): return errors.Wrap( - verifier.changeReaderErr.Get(), + verifier.changeHandlingErr.Get(), verifier.dstChangeReader.String(), ) case <-ctx.Done(): @@ -269,9 +268,9 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh } } - verifier.changeReaderErr = util.NewEventual[error]() + changeHandlingErr := verifier.changeHandlingErr go func() { - verifier.changeReaderErr.Set(changeReaderGroup.Wait()) + changeHandlingErr.Set(changeReaderGroup.Wait()) }() // Log the verification status when initially booting up so it's easy to see the current state @@ -361,8 +360,8 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh select { case <-ctx.Done(): return ctx.Err() - case <-verifier.changeReaderErr.Ready(): - err := verifier.changeReaderErr.Get() + case <-verifier.changeHandlingErr.Ready(): + err := verifier.changeHandlingErr.Get() if err != nil { return errors.Wrap(err, "handling change events") } diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index 76f98e61..e23445cb 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -103,7 +103,7 @@ type Verifier struct { srcEventRecorder *EventRecorder dstEventRecorder *EventRecorder - changeReaderErr *util.Eventual[error] + changeHandlingErr *util.Eventual[error] // Used only with generation 0 to defer the first // progress report until after we’ve finished partitioning @@ -198,6 +198,8 @@ func NewVerifier(settings VerifierSettings, logPath string) *Verifier { verificationStatusCheckInterval: 2 * time.Second, nsMap: NewNSMap(), + + changeHandlingErr: util.NewEventual[error](), } } @@ -284,9 +286,9 @@ func (verifier *Verifier) WritesOff(ctx context.Context) error { select { case <-ctx.Done(): return ctx.Err() - case <-verifier.changeReaderErr.Ready(): + case <-verifier.changeHandlingErr.Ready(): return errors.Wrapf( - verifier.changeReaderErr.Get(), + verifier.changeHandlingErr.Get(), "tried to send writes-off timestamp to %s, but change handling already failed", readerAndTS.reader) default: readerAndTS.reader.setWritesOff(readerAndTS.ts) From e23a50334c229aa996ddc1f15b3c402714b3f44a Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 13:02:19 -0500 Subject: [PATCH 021/130] move --- internal/verifier/change_stream.go | 37 ----------------------------- internal/verifier/check.go | 38 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 02f63620..4a91a3c3 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -5,13 +5,10 @@ import ( "fmt" "time" - "github.com/10gen/migration-verifier/history" "github.com/10gen/migration-verifier/internal/keystring" "github.com/10gen/migration-verifier/internal/retry" "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/mbson" - "github.com/10gen/migration-verifier/mslices" - "github.com/10gen/migration-verifier/msync" "github.com/10gen/migration-verifier/option" mapset "github.com/deckarep/golang-set/v2" clone "github.com/huandu/go-clone/generic" @@ -50,40 +47,6 @@ type ChangeStreamReader struct { var _ changeReader = &ChangeStreamReader{} -func (verifier *Verifier) initializeChangeReaders() { - srcReader := &ChangeStreamReader{ - ChangeReaderCommon: 
ChangeReaderCommon{ - readerType: src, - namespaces: verifier.srcNamespaces, - watcherClient: verifier.srcClient, - clusterInfo: *verifier.srcClusterInfo, - }, - } - verifier.srcChangeReader = srcReader - - dstReader := &ChangeStreamReader{ - ChangeReaderCommon: ChangeReaderCommon{ - readerType: dst, - namespaces: verifier.dstNamespaces, - watcherClient: verifier.dstClient, - clusterInfo: *verifier.dstClusterInfo, - onDDLEvent: onDDLEventAllow, - }, - } - verifier.dstChangeReader = dstReader - - // Common elements in both readers: - for _, csr := range mslices.Of(srcReader, dstReader) { - csr.logger = verifier.logger - csr.metaDB = verifier.metaClient.Database(verifier.metaDBName) - csr.changeEventBatchChan = make(chan changeEventBatch, batchChanBufferSize) - csr.writesOffTs = util.NewEventual[bson.Timestamp]() - csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) - csr.batchSizeHistory = history.New[int](time.Minute) - csr.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken - } -} - // GetChangeStreamFilter returns an aggregation pipeline that filters // namespaces as per configuration. // diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 3930fccc..3fea7dcf 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -6,9 +6,13 @@ import ( "time" "github.com/10gen/migration-verifier/contextplus" + "github.com/10gen/migration-verifier/history" "github.com/10gen/migration-verifier/internal/logger" "github.com/10gen/migration-verifier/internal/retry" + "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/mslices" + "github.com/10gen/migration-verifier/msync" + "github.com/10gen/migration-verifier/option" mapset "github.com/deckarep/golang-set/v2" "github.com/goaux/timer" "github.com/pkg/errors" @@ -595,3 +599,37 @@ func (verifier *Verifier) work(ctx context.Context, workerNum int) error { } } } + +func (verifier *Verifier) initializeChangeReaders() { + srcReader := &ChangeStreamReader{ + ChangeReaderCommon: ChangeReaderCommon{ + readerType: src, + namespaces: verifier.srcNamespaces, + watcherClient: verifier.srcClient, + clusterInfo: *verifier.srcClusterInfo, + }, + } + verifier.srcChangeReader = srcReader + + dstReader := &ChangeStreamReader{ + ChangeReaderCommon: ChangeReaderCommon{ + readerType: dst, + namespaces: verifier.dstNamespaces, + watcherClient: verifier.dstClient, + clusterInfo: *verifier.dstClusterInfo, + onDDLEvent: onDDLEventAllow, + }, + } + verifier.dstChangeReader = dstReader + + // Common elements in both readers: + for _, csr := range mslices.Of(srcReader, dstReader) { + csr.logger = verifier.logger + csr.metaDB = verifier.metaClient.Database(verifier.metaDBName) + csr.changeEventBatchChan = make(chan changeEventBatch, batchChanBufferSize) + csr.writesOffTs = util.NewEventual[bson.Timestamp]() + csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) + csr.batchSizeHistory = history.New[int](time.Minute) + csr.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken + } +} From d61a8f1966fe2274d681a267d8ffd78bf3f3eb2b Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 13:07:07 -0500 Subject: [PATCH 022/130] handling --- internal/verifier/check.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 3fea7dcf..529c3e3f 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -371,7 +371,7 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter 
bson.D, testCh } verifier.logger.Debug(). - Msg("Change readers finished.") + Msg("Change handling finished.") } verifier.mux.Lock() From b05ffe30e840b8a0ecd5bad3fed9c1d909441bad Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 13:49:03 -0500 Subject: [PATCH 023/130] add oplog reader & alia --- agg/agg.go | 197 ++++++++++++ agg/helpers/string.go | 22 ++ internal/verifier/change_reader.go | 15 + internal/verifier/change_stream.go | 20 ++ internal/verifier/check.go | 97 ++++-- internal/verifier/namespaces/meta.go | 10 + internal/verifier/oplog/oplog.go | 125 ++++++++ internal/verifier/oplog/start_time.go | 159 ++++++++++ internal/verifier/oplog_reader.go | 416 ++++++++++++++++++++++++++ mbson/raw_value.go | 6 +- mmongo/cursor.go | 55 ++++ 11 files changed, 1089 insertions(+), 33 deletions(-) create mode 100644 agg/agg.go create mode 100644 agg/helpers/string.go create mode 100644 internal/verifier/namespaces/meta.go create mode 100644 internal/verifier/oplog/oplog.go create mode 100644 internal/verifier/oplog/start_time.go create mode 100644 internal/verifier/oplog_reader.go create mode 100644 mmongo/cursor.go diff --git a/agg/agg.go b/agg/agg.go new file mode 100644 index 00000000..8c753d91 --- /dev/null +++ b/agg/agg.go @@ -0,0 +1,197 @@ +package agg + +import ( + "go.mongodb.org/mongo-driver/v2/bson" +) + +func Eq(comparands ...any) bson.D { + return bson.D{{"$eq", comparands}} +} + +func In[T any](needle any, haystack ...T) bson.D { + return bson.D{{"$in", bson.A{needle, haystack}}} +} + +func BSONSize(ref any) bson.D { + return bson.D{{"$bsonSize", ref}} +} + +func Type(ref any) bson.D { + return bson.D{{"$type", ref}} +} + +func Concat(refs ...any) bson.D { + return bson.D{{"$concat", refs}} +} + +// --------------------------------------------- + +type Not struct { + Ref any +} + +var _ bson.Marshaler = Not{} + +func (n Not) MarshalBSON() ([]byte, error) { + return bson.Marshal(bson.D{ + {"$not", n.Ref}, + }) +} + +// --------------------------------------------- + +type And []any + +var _ bson.Marshaler = And{} + +func (a And) MarshalBSON() ([]byte, error) { + return bson.Marshal(bson.D{ + {"$and", []any(a)}, + }) +} + +// --------------------------------------------- + +type Or []any + +var _ bson.Marshaler = Or{} + +func (o Or) MarshalBSON() ([]byte, error) { + return bson.Marshal(bson.D{ + {"$or", []any(o)}, + }) +} + +// --------------------------------------------- + +type SubstrBytes [3]any + +var _ bson.Marshaler = SubstrBytes{} + +func (s SubstrBytes) MarshalBSON() ([]byte, error) { + return bson.Marshal(bson.D{ + {"$substr", []any(s[:])}, + }) +} + +// --------------------------------------------- + +type MergeObjects []any + +var _ bson.Marshaler = MergeObjects{} + +func (m MergeObjects) MarshalBSON() ([]byte, error) { + return bson.Marshal(bson.D{ + {"$mergeObjects", []any(m)}, + }) +} + +// --------------------------------------------- + +type Cond struct { + If, Then, Else any +} + +var _ bson.Marshaler = Cond{} + +func (c Cond) D() bson.D { + return bson.D{ + {"$cond", bson.D{ + {"if", c.If}, + {"then", c.Then}, + {"else", c.Else}, + }}, + } +} + +func (c Cond) MarshalBSON() ([]byte, error) { + return bson.Marshal(c.D()) +} + +// --------------------------------------------- + +type Switch struct { + Branches []SwitchCase + Default any +} + +type SwitchCase struct { + Case any + Then any +} + +func (s Switch) D() bson.D { + return bson.D{{"$switch", bson.D{ + {"branches", s.Branches}, + {"default", s.Default}, + }}} +} + +func (s Switch) MarshalBSON() 
([]byte, error) { + return bson.Marshal(s.D()) +} + +// --------------------------------------------- + +type ArrayElemAt struct { + Array any + Index int +} + +func (a ArrayElemAt) D() bson.D { + return bson.D{{"$arrayElemAt", bson.A{ + a.Array, + a.Index, + }}} +} + +func (a ArrayElemAt) MarshalBSON() ([]byte, error) { + return bson.Marshal(a.D()) +} + +// --------------------------------------------- + +type Map struct { + Input, As, In any +} + +var _ bson.Marshaler = Map{} + +func (m Map) D() bson.D { + return bson.D{ + {"$map", bson.D{ + {"input", m.Input}, + {"as", m.As}, + {"in", m.In}, + }}, + } +} + +func (m Map) MarshalBSON() ([]byte, error) { + return bson.Marshal(m.D()) +} + +// ------------------------------------------ + +type Filter struct { + Input, As, Cond, Limit any +} + +var _ bson.Marshaler = Filter{} + +func (f Filter) D() bson.D { + d := bson.D{ + {"input", f.Input}, + {"as", f.As}, + {"cond", f.Cond}, + } + + if f.Limit != nil { + d = append(d, bson.E{"limit", f.Limit}) + } + return bson.D{{"$filter", d}} +} + +func (f Filter) MarshalBSON() ([]byte, error) { + return bson.Marshal(f.D()) +} diff --git a/agg/helpers/string.go b/agg/helpers/string.go new file mode 100644 index 00000000..852845df --- /dev/null +++ b/agg/helpers/string.go @@ -0,0 +1,22 @@ +package helpers + +import "go.mongodb.org/mongo-driver/v2/bson" + +type StringHasPrefix struct { + FieldRef any + Prefix string +} + +func (sp StringHasPrefix) MarshalBSON() ([]byte, error) { + return bson.Marshal(bson.D{ + {"$eq", bson.A{ + 0, + bson.D{{"$indexOfCP", bson.A{ + sp.FieldRef, + sp.Prefix, + 0, + 1, + }}}, + }}, + }) +} diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 143f5f04..31794fae 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -70,6 +70,21 @@ type ChangeReaderCommon struct { onDDLEvent ddlEventHandling } +func newChangeReaderCommon(clusterName whichCluster) ChangeReaderCommon { + return ChangeReaderCommon{ + readerType: clusterName, + changeEventBatchChan: make(chan changeEventBatch, batchChanBufferSize), + writesOffTs: util.NewEventual[bson.Timestamp](), + lag: msync.NewTypedAtomic(option.None[time.Duration]()), + batchSizeHistory: history.New[int](time.Minute), + onDDLEvent: lo.Ternary( + clusterName == dst, + onDDLEventAllow, + "", + ), + } +} + func (rc *ChangeReaderCommon) getWhichCluster() whichCluster { return rc.readerType } diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 4a91a3c3..623a553e 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -47,6 +47,26 @@ type ChangeStreamReader struct { var _ changeReader = &ChangeStreamReader{} +func (v *Verifier) newChangeStreamReader( + namespaces []string, + cluster whichCluster, + client *mongo.Client, + clusterInfo util.ClusterInfo, +) *ChangeStreamReader { + common := newChangeReaderCommon(cluster) + common.namespaces = namespaces + common.readerType = cluster + common.watcherClient = client + common.clusterInfo = clusterInfo + + common.logger = v.logger + common.metaDB = v.metaClient.Database(v.metaDBName) + + common.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken + + return &ChangeStreamReader{ChangeReaderCommon: common} +} + // GetChangeStreamFilter returns an aggregation pipeline that filters // namespaces as per configuration. 
// diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 529c3e3f..89716015 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -6,13 +6,10 @@ import ( "time" "github.com/10gen/migration-verifier/contextplus" - "github.com/10gen/migration-verifier/history" "github.com/10gen/migration-verifier/internal/logger" "github.com/10gen/migration-verifier/internal/retry" "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/mslices" - "github.com/10gen/migration-verifier/msync" - "github.com/10gen/migration-verifier/option" mapset "github.com/deckarep/golang-set/v2" "github.com/goaux/timer" "github.com/pkg/errors" @@ -600,36 +597,72 @@ func (verifier *Verifier) work(ctx context.Context, workerNum int) error { } } -func (verifier *Verifier) initializeChangeReaders() { - srcReader := &ChangeStreamReader{ - ChangeReaderCommon: ChangeReaderCommon{ - readerType: src, - namespaces: verifier.srcNamespaces, - watcherClient: verifier.srcClient, - clusterInfo: *verifier.srcClusterInfo, - }, +func (v *Verifier) initializeChangeReaders() { + var whyCS string + + switch { + case len(v.srcNamespaces) > 0: + whyCS = "ns filter" + case v.srcClusterInfo.Topology == util.TopologySharded: + whyCS = "sharded" + case !util.ClusterHasBSONSize([2]int(v.srcClusterInfo.VersionArray)): + whyCS = "no $bsonSize" } - verifier.srcChangeReader = srcReader - - dstReader := &ChangeStreamReader{ - ChangeReaderCommon: ChangeReaderCommon{ - readerType: dst, - namespaces: verifier.dstNamespaces, - watcherClient: verifier.dstClient, - clusterInfo: *verifier.dstClusterInfo, - onDDLEvent: onDDLEventAllow, - }, + + srcLogEvent := v.logger.Info() + + if whyCS == "" { + v.srcChangeReader = v.newOplogReader( + v.srcNamespaces, + src, + v.srcClient, + *v.srcClusterInfo, + ) + } else { + srcLogEvent.Str("whyChangeStream", whyCS) + + v.srcChangeReader = v.newChangeStreamReader( + v.srcNamespaces, + src, + v.srcClient, + *v.srcClusterInfo, + ) } - verifier.dstChangeReader = dstReader - - // Common elements in both readers: - for _, csr := range mslices.Of(srcReader, dstReader) { - csr.logger = verifier.logger - csr.metaDB = verifier.metaClient.Database(verifier.metaDBName) - csr.changeEventBatchChan = make(chan changeEventBatch, batchChanBufferSize) - csr.writesOffTs = util.NewEventual[bson.Timestamp]() - csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) - csr.batchSizeHistory = history.New[int](time.Minute) - csr.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken + + srcLogEvent. + Stringer("reader", v.srcChangeReader). + Msg("Listening for writes to source.") + + switch { + case len(v.dstNamespaces) > 0: + whyCS = "ns filter" + case v.dstClusterInfo.Topology == util.TopologySharded: + whyCS = "sharded" + case !util.ClusterHasBSONSize([2]int(v.dstClusterInfo.VersionArray)): + whyCS = "no $bsonSize" } + + dstLogEvent := v.logger.Info() + + if whyCS == "" { + v.dstChangeReader = v.newOplogReader( + v.dstNamespaces, + dst, + v.dstClient, + *v.dstClusterInfo, + ) + } else { + dstLogEvent.Str("whyChangeStream", whyCS) + + v.dstChangeReader = v.newChangeStreamReader( + v.dstNamespaces, + dst, + v.dstClient, + *v.dstClusterInfo, + ) + } + + dstLogEvent. + Stringer("reader", v.dstChangeReader). 
+ Msg("Listening for writes to destination.") } diff --git a/internal/verifier/namespaces/meta.go b/internal/verifier/namespaces/meta.go new file mode 100644 index 00000000..97cf09af --- /dev/null +++ b/internal/verifier/namespaces/meta.go @@ -0,0 +1,10 @@ +package namespaces + +import "github.com/10gen/migration-verifier/mslices" + +var ( + MongosyncMetaDBPrefixes = mslices.Of( + "mongosync_internal_", + "mongosync_reserved_", + ) +) diff --git a/internal/verifier/oplog/oplog.go b/internal/verifier/oplog/oplog.go new file mode 100644 index 00000000..91d31a8a --- /dev/null +++ b/internal/verifier/oplog/oplog.go @@ -0,0 +1,125 @@ +package oplog + +import ( + "encoding/binary" + "fmt" + "slices" + + "github.com/10gen/migration-verifier/mbson" + "github.com/pkg/errors" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/x/bsonx/bsoncore" +) + +const ( + rtBSONLength = 4 + 1 + 2 + 1 + 8 + 1 +) + +type Op struct { + Op string + TS bson.Timestamp + Ns string + CmdName string + DocLen int32 + DocID bson.RawValue + Ops []Op +} + +type ResumeToken struct { + TS bson.Timestamp +} + +func GetRawResumeTokenTimestamp(token bson.Raw) (bson.Timestamp, error) { + rv, err := token.LookupErr("ts") + if err != nil { + return bson.Timestamp{}, errors.Wrap(err, "getting ts") + } + + return mbson.CastRawValue[bson.Timestamp](rv) +} + +func (rt ResumeToken) MarshalToBSON() []byte { + buf := make([]byte, 4, rtBSONLength) + + binary.LittleEndian.PutUint32(buf, uint32(cap(buf))) + + buf = bsoncore.AppendTimestampElement(buf, "ts", rt.TS.T, rt.TS.I) + + buf = append(buf, 0) + + if len(buf) != rtBSONLength { + panic(fmt.Sprintf("bad resume token BSON length: %d", len(buf))) + } + + return buf +} + +func (o *Op) UnmarshalFromBSON(in []byte) error { + //fmt.Printf("---- unmarshaling: %+v\n\n", bson.Raw(in)) + + for el, err := range mbson.RawElements(bson.Raw(in)) { + if err != nil { + return errors.Wrap(err, "iterating BSON document") + } + + key, err := el.KeyErr() + if err != nil { + return errors.Wrap(err, "reading BSON field name") + } + + switch key { + case "op": + err = mbson.UnmarshalElementValue(el, &o.Op) + case "ts": + err = mbson.UnmarshalElementValue(el, &o.TS) + case "ns": + err = mbson.UnmarshalElementValue(el, &o.Ns) + case "cmdName": + err = mbson.UnmarshalElementValue(el, &o.CmdName) + case "docLen": + err = mbson.UnmarshalElementValue(el, &o.DocLen) + case "docID": + o.DocID, err = el.ValueErr() + if err != nil { + err = errors.Wrapf(err, "parsing %#q value", key) + } + o.DocID.Value = slices.Clone(o.DocID.Value) + case "ops": + var arr bson.RawArray + err = errors.Wrapf( + mbson.UnmarshalElementValue(el, &arr), + "parsing ops", + ) + + if err == nil { + vals, err := arr.Values() + if err != nil { + return errors.Wrap(err, "parsing applyOps") + } + + o.Ops = make([]Op, len(vals)) + + for i, val := range vals { + + var opRaw bson.Raw + err := mbson.UnmarshalRawValue(val, &opRaw) + if err != nil { + return errors.Wrapf(err, "parsing applyOps field") + } + + if err := (&o.Ops[i]).UnmarshalFromBSON(opRaw); err != nil { + return errors.Wrapf(err, "parsing applyOps value") + } + } + } + default: + err = errors.Wrapf(err, "unexpected field %#q", key) + } + + if err != nil { + return err + } + } + + return nil +} diff --git a/internal/verifier/oplog/start_time.go b/internal/verifier/oplog/start_time.go new file mode 100644 index 00000000..eb042b58 --- /dev/null +++ b/internal/verifier/oplog/start_time.go @@ -0,0 +1,159 @@ +package oplog + +import ( + "context" + "fmt" + + 
"github.com/10gen/migration-verifier/option" + "github.com/pkg/errors" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" + "go.mongodb.org/mongo-driver/v2/mongo/readconcern" +) + +func GetTailingStartTimes( + ctx context.Context, + client *mongo.Client, +) (OpTime, OpTime, error) { + oldestTxn, err := getOldestTransactionTime(ctx, client) + if err != nil { + return OpTime{}, OpTime{}, errors.Wrapf(err, "finding oldest txn") + } + + latestTime, err := getLatestVisibleOplogOpTime(ctx, client) + if err != nil { + return OpTime{}, OpTime{}, errors.Wrapf(err, "finding latest optime") + } + + if oldestTime, has := oldestTxn.Get(); has { + return oldestTime, latestTime, nil + } + + return latestTime, latestTime, nil +} + +type OpTime struct { + TS bson.Timestamp + T int64 + H option.Option[int64] +} + +func (ot OpTime) Equals(ot2 OpTime) bool { + if !ot.TS.Equal(ot2.TS) { + return false + } + + if ot.T != ot2.T { + return false + } + + return ot.H.OrZero() == ot2.H.OrZero() +} + +// GetLatestOplogOpTime returns the optime of the most recent oplog +// record satisfying the given `query` or a zero-value db.OpTime{} if +// no oplog record matches. This method does not ensure that all prior oplog +// entries are visible (i.e. have been storage-committed). +func getLatestOplogOpTime( + ctx context.Context, + client *mongo.Client, +) (OpTime, error) { + var optime OpTime + + opts := options.FindOne(). + SetProjection(bson.M{"ts": 1, "t": 1, "h": 1}). + SetSort(bson.D{{"$natural", -1}}) + + coll := client.Database("local").Collection("oplog.rs") + + res := coll.FindOne(ctx, bson.D{}, opts) + if err := res.Err(); err != nil { + return OpTime{}, err + } + + if err := res.Decode(&optime); err != nil { + return OpTime{}, err + } + return optime, nil +} + +func getLatestVisibleOplogOpTime( + ctx context.Context, + client *mongo.Client, +) (OpTime, error) { + + latestOpTime, err := getLatestOplogOpTime(ctx, client) + if err != nil { + return OpTime{}, err + } + + coll := client.Database("local").Collection("oplog.rs") + + // Do a forward scan starting at the last op fetched to ensure that + // all operations with earlier oplog times have been storage-committed. + result, err := coll.FindOne(ctx, + bson.M{"ts": bson.M{"$gte": latestOpTime.TS}}, + options.FindOne().SetOplogReplay(true), + ).Raw() + if err != nil { + if errors.Is(err, mongo.ErrNoDocuments) { + return OpTime{}, fmt.Errorf( + "last op was not confirmed. last optime: %+v. confirmation time was not found", + latestOpTime, + ) + } + return OpTime{}, err + } + + var optime OpTime + + if err := bson.Unmarshal(result, &optime); err != nil { + return OpTime{}, errors.Wrap(err, "local.oplog.rs error") + } + + if !optime.Equals(latestOpTime) { + return OpTime{}, fmt.Errorf( + "last op was not confirmed. last optime: %+v. confirmation time: %+v", + latestOpTime, + optime, + ) + } + + return latestOpTime, nil +} + +func getOldestTransactionTime( + ctx context.Context, + client *mongo.Client, +) (option.Option[OpTime], error) { + coll := client.Database("config"). 
+ Collection( + "transactions", + options.Collection().SetReadConcern(readconcern.Local()), + ) + + decoded := struct { + StartOpTime OpTime + }{} + + err := coll.FindOne( + ctx, + bson.D{ + {"state", bson.D{ + {"$in", bson.A{"prepared", "inProgress"}}, + }}, + }, + options.FindOne().SetSort(bson.D{{"startOpTime", 1}}), + ).Decode(&decoded) + + if errors.Is(err, mongo.ErrNoDocuments) { + return option.None[OpTime](), nil + } + + if err != nil { + return option.None[OpTime](), errors.Wrap(err, "config.transactions.findOne") + } + + return option.Some(decoded.StartOpTime), nil +} diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go new file mode 100644 index 00000000..84b5a415 --- /dev/null +++ b/internal/verifier/oplog_reader.go @@ -0,0 +1,416 @@ +package verifier + +import ( + "context" + "fmt" + + "github.com/10gen/migration-verifier/agg" + "github.com/10gen/migration-verifier/agg/helpers" + "github.com/10gen/migration-verifier/internal/types" + "github.com/10gen/migration-verifier/internal/util" + "github.com/10gen/migration-verifier/internal/verifier/namespaces" + "github.com/10gen/migration-verifier/internal/verifier/oplog" + "github.com/10gen/migration-verifier/mmongo" + "github.com/10gen/migration-verifier/option" + "github.com/pkg/errors" + "github.com/samber/lo" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" + "go.mongodb.org/mongo-driver/v2/mongo/readconcern" + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" +) + +// OplogReader reads change events via oplog tailing instead of a change stream. +// This significantly lightens server load and allows verification of heavier +// workloads than change streams allow. It only works with replica sets. +type OplogReader struct { + curDocs []bson.Raw + scratch []byte + ChangeReaderCommon +} + +var _ changeReader = &OplogReader{} + +func (v *Verifier) newOplogReader( + namespaces []string, + cluster whichCluster, + client *mongo.Client, + clusterInfo util.ClusterInfo, +) *OplogReader { + common := newChangeReaderCommon(cluster) + common.namespaces = namespaces + common.watcherClient = client + common.clusterInfo = clusterInfo + + common.logger = v.logger + common.metaDB = v.metaClient.Database(v.metaDBName) + + common.resumeTokenTSExtractor = oplog.GetRawResumeTokenTimestamp + + return &OplogReader{ChangeReaderCommon: common} +} + +func (o *OplogReader) start(ctx context.Context, eg *errgroup.Group) error { + // TODO: retryer + + savedResumeToken, err := o.loadResumeToken(ctx) + if err != nil { + return errors.Wrap(err, "loading persisted resume token") + } + + var allowDDLBeforeTS bson.Timestamp + + if token, has := savedResumeToken.Get(); has { + var rt oplog.ResumeToken + if err := bson.Unmarshal(token, &rt); err != nil { + return errors.Wrap(err, "parsing persisted resume token") + } + + // TODO: Smarten this rather than assuming we’ve passed the original + // latest optime. + allowDDLBeforeTS = rt.TS + allowDDLBeforeTS.T-- + + o.startAtTs = &rt.TS + } else { + startOpTime, latestOpTime, err := oplog.GetTailingStartTimes(ctx, o.watcherClient) + if err != nil { + return errors.Wrapf(err, "getting start optime from %s", o.readerType) + } + + allowDDLBeforeTS = latestOpTime.TS + + o.startAtTs = &startOpTime.TS + } + + o.logger.Info(). + Any("startReadTs", *o.startAtTs). + Any("currentOplogTs", allowDDLBeforeTS). 
+ Msg("Tailing oplog.") + + sess, err := o.watcherClient.StartSession() + if err != nil { + return errors.Wrap(err, "creating session") + } + + sctx := mongo.NewSessionContext(ctx, sess) + + cursor, err := o.watcherClient. + Database("local"). + Collection( + "oplog.rs", + options.Collection().SetReadConcern(readconcern.Majority()), + ). + Find( + sctx, + bson.D{{"$and", []any{ + bson.D{{"ts", bson.D{{"$gte", o.startAtTs}}}}, + + bson.D{{"$expr", agg.Or{ + // plain ops: one write per op + append( + agg.And{agg.In("$op", "d", "i", "u")}, + o.getDefaultNSExclusions("$$ROOT")..., + ), + + // op=c is for applyOps, and also to detect forbidden DDL. + // op=n is for no-ops, so we stay up-to-date. + agg.In("$op", "c", "n"), + }}}, + }}}, + + options.Find(). + SetCursorType(options.TailableAwait). + SetProjection(bson.D{ + {"ts", 1}, + {"op", 1}, + {"ns", 1}, + + // TODO: Adjust for 4.2. + {"docLen", getOplogDocLenExpr("$$ROOT")}, + + {"docID", getOplogDocIDExpr("$$ROOT")}, + + {"cmdName", agg.Cond{ + If: agg.Eq("$op", "c"), + Then: agg.ArrayElemAt{ + Array: agg.Map{ + Input: bson.D{ + {"$objectToArray", "$o"}, + }, + As: "field", + In: "$$field.k", + }, + Index: 0, + }, + Else: "$$REMOVE", + }}, + + {"o", agg.Cond{ + If: agg.And{ + agg.Eq("$op", "c"), + agg.Eq("missing", agg.Type("$o.applyOps")), + }, + Then: "$o", + Else: "$$REMOVE", + }}, + + {"ops", agg.Cond{ + If: agg.And{ + agg.Eq("$op", "c"), + agg.Eq(agg.Type("$o.applyOps"), "array"), + }, + Then: agg.Map{ + Input: agg.Filter{ + Input: "$o.applyOps", + As: "opEntry", + Cond: o.getDefaultNSExclusions("$$opEntry"), + }, + As: "opEntry", + In: bson.D{ + {"op", "$$opEntry.op"}, + {"ns", "$$opEntry.ns"}, + {"docID", getOplogDocIDExpr("$$opEntry")}, + {"docLen", getOplogDocLenExpr("$$opEntry")}, + }, + }, + Else: "$$REMOVE", + }}, + }), + ) + + if err != nil { + return errors.Wrapf(err, "opening cursor to tail oplog") + } + + eg.Go( + func() error { + return o.iterate(sctx, cursor, allowDDLBeforeTS) + }, + ) + + return nil +} + +func (o *OplogReader) iterate( + sctx context.Context, + cursor *mongo.Cursor, + allowDDLBeforeTS bson.Timestamp, +) error { +CursorLoop: + for { + var err error + + select { + case <-sctx.Done(): + return sctx.Err() + case <-o.writesOffTs.Ready(): + break CursorLoop + default: + err = o.readAndHandleOneBatch(sctx, cursor, allowDDLBeforeTS) + if err != nil { + return err + } + } + } + + writesOffTS := o.writesOffTs.Get() + + for { + if o.lastChangeEventTime != nil { + if !o.lastChangeEventTime.Before(writesOffTS) { + fmt.Printf("----------- %s reached writes off ts %v", o, writesOffTS) + break + } + } + + err := o.readAndHandleOneBatch(sctx, cursor, allowDDLBeforeTS) + if err != nil { + return err + } + } + + // TODO: deduplicate + o.running = false + + infoLog := o.logger.Info() + if o.lastChangeEventTime != nil { + infoLog = infoLog.Any("lastEventTime", o.lastChangeEventTime) + o.startAtTs = lo.ToPtr(*o.lastChangeEventTime) + } else { + infoLog = infoLog.Str("lastEventTime", "none") + } + + infoLog. + Stringer("reader", o). 
+ Msg("Change stream reader is done.") + + return nil +} + +var oplogOpToOperationType = map[string]string{ + "i": "insert", + "u": "update", // don’t need to distinguish from replace + "d": "delete", +} + +func (o *OplogReader) readAndHandleOneBatch( + sctx context.Context, + cursor *mongo.Cursor, + allowDDLBeforeTS bson.Timestamp, +) error { + var err error + + o.curDocs = o.curDocs[:0] + o.scratch = o.scratch[:0] + + o.curDocs, o.scratch, err = mmongo.GetBatch(sctx, cursor, o.curDocs, o.scratch) + if err != nil { + return errors.Wrap(err, "reading cursor") + } + + events := make([]ParsedEvent, 0, len(o.curDocs)) + + var latestTS bson.Timestamp + + for _, rawDoc := range o.curDocs { + var op oplog.Op + + if err := (&op).UnmarshalFromBSON(rawDoc); err != nil { + return errors.Wrapf(err, "reading oplog entry") + } + + latestTS = op.TS + + switch op.Op { + case "n": + case "c": + if op.CmdName != "applyOps" { + if o.onDDLEvent == onDDLEventAllow { + o.logIgnoredDDL(rawDoc) + continue + } + + if !op.TS.After(allowDDLBeforeTS) { + o.logger.Info(). + Stringer("event", rawDoc). + Msg("Ignoring unrecognized write from the past.") + + continue + } + + return UnknownEventError{rawDoc} + } + + events = append( + events, + lo.Map( + op.Ops, + func(subOp oplog.Op, _ int) ParsedEvent { + return ParsedEvent{ + OpType: oplogOpToOperationType[subOp.Op], + Ns: NewNamespace(SplitNamespace(subOp.Ns)), + DocID: subOp.DocID, + FullDocLen: option.Some(types.ByteCount(subOp.DocLen)), + ClusterTime: &op.TS, + } + }, + )..., + ) + default: + events = append( + events, + ParsedEvent{ + OpType: oplogOpToOperationType[op.Op], + Ns: NewNamespace(SplitNamespace(op.Ns)), + DocID: op.DocID, + FullDocLen: option.Some(types.ByteCount(op.DocLen)), + ClusterTime: &op.TS, + }, + ) + } + } + + sess := mongo.SessionFromContext(sctx) + resumeToken := oplog.ResumeToken{latestTS}.MarshalToBSON() + + o.updateLag(sess, resumeToken) + + o.batchSizeHistory.Add(len(events)) + + select { + case <-sctx.Done(): + return err + case o.changeEventBatchChan <- changeEventBatch{ + events: events, + resumeToken: resumeToken, + clusterTime: *sess.OperationTime(), + }: + } + + o.lastChangeEventTime = &latestTS + + return nil +} + +func (o *OplogReader) getDefaultNSExclusions(docroot string) agg.And { + prefixes := append( + slices.Clone(namespaces.MongosyncMetaDBPrefixes), + o.metaDB.Name()+".", + "config.", + "admin.", + ) + + return agg.And(lo.Map( + prefixes, + func(prefix string, _ int) any { + return agg.Not{helpers.StringHasPrefix{ + FieldRef: docroot + ".ns", + Prefix: prefix, + }} + }, + )) +} + +func getOplogDocLenExpr(docroot string) any { + return agg.Switch{ + Branches: []agg.SwitchCase{ + { + Case: agg.Or{ + agg.Eq(docroot+".op", "i"), + agg.And{ + agg.Eq(docroot+".op", "u"), + agg.Not{agg.Eq("missing", docroot+".o._id")}, + }, + }, + Then: agg.BSONSize(docroot + ".o"), + }, + }, + Default: "$$REMOVE", + } +} + +func getOplogDocIDExpr(docroot string) any { + return agg.Switch{ + Branches: []agg.SwitchCase{ + { + Case: agg.Eq(docroot+".op", "c"), + Then: "$$REMOVE", + }, + { + Case: agg.In(docroot+".op", "i", "d"), + Then: docroot + ".o._id", + }, + { + Case: agg.In(docroot+".op", "u"), + Then: docroot + ".o2._id", + }, + }, + } +} + +func (o *OplogReader) String() string { + return fmt.Sprintf("%s oplog reader", o.readerType) +} diff --git a/mbson/raw_value.go b/mbson/raw_value.go index b0b1c96c..7674c56e 100644 --- a/mbson/raw_value.go +++ b/mbson/raw_value.go @@ -9,7 +9,7 @@ import ( ) type bsonCastRecipient interface { - bson.Raw 
| bson.Timestamp | bson.ObjectID | string | int32 + bson.Raw | bson.RawArray | bson.Timestamp | bson.ObjectID | string | int32 } type bsonSourceTypes interface { @@ -36,6 +36,10 @@ func CastRawValue[T bsonCastRecipient](in bson.RawValue) (T, error) { if doc, isDoc := in.DocumentOK(); isDoc { return any(doc).(T), nil } + case bson.RawArray: + if arr, ok := in.ArrayOK(); ok { + return any(arr).(T), nil + } case bson.Timestamp: if t, i, ok := in.TimestampOK(); ok { return any(bson.Timestamp{t, i}).(T), nil diff --git a/mmongo/cursor.go b/mmongo/cursor.go new file mode 100644 index 00000000..c76b8a18 --- /dev/null +++ b/mmongo/cursor.go @@ -0,0 +1,55 @@ +package mmongo + +import ( + "context" + + "github.com/pkg/errors" + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" +) + +// GetBatch returns a batch of documents from a cursor. It does so by appending +// to passed-in slices, which lets you optimize memory handling. +func GetBatch( + ctx context.Context, + cursor *mongo.Cursor, + docs []bson.Raw, + buffer []byte, +) ([]bson.Raw, []byte, error) { + for hasDocs := true; hasDocs; hasDocs = cursor.RemainingBatchLength() > 0 { + got := cursor.TryNext(ctx) + + if cursor.Err() != nil { + return nil, nil, errors.Wrap(cursor.Err(), "cursor iteration failed") + } + + if !got { + break + } + + docPos := len(buffer) + buffer = append(buffer, cursor.Current...) + docs = append(docs, buffer[docPos:]) + } + + /* + batchLen := cursor.RemainingBatchLength() + + docs = slices.Grow(docs, batchLen) + + for range batchLen { + if !cursor.Next(ctx) { + return nil, nil, mcmp.Or( + errors.Wrap(cursor.Err(), "iterating cursor mid-batch"), + fmt.Errorf("expected %d docs from cursor but only saw %d", batchLen, len(docs)), + ) + } + + docPos := len(buffer) + buffer = append(buffer, cursor.Current...) 
+ docs = append(docs, buffer[docPos:]) + } + */ + + return docs, buffer, nil +} From c6025f55545f09d2133e9f1d8bdfeb4c07e51692 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 14:22:27 -0500 Subject: [PATCH 024/130] dedupe retry of change reader --- internal/verifier/change_reader.go | 94 +++++++++++++++++++ internal/verifier/change_stream.go | 143 +++++++---------------------- internal/verifier/oplog_reader.go | 48 +++++----- 3 files changed, 153 insertions(+), 132 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 31794fae..fb7b12bd 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -6,12 +6,14 @@ import ( "github.com/10gen/migration-verifier/history" "github.com/10gen/migration-verifier/internal/logger" + "github.com/10gen/migration-verifier/internal/retry" "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/msync" "github.com/10gen/migration-verifier/option" "github.com/pkg/errors" "github.com/rs/zerolog" "github.com/samber/lo" + "github.com/samber/mo" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" @@ -67,6 +69,9 @@ type ChangeReaderCommon struct { lag *msync.TypedAtomic[option.Option[time.Duration]] batchSizeHistory *history.History[int] + createIteratorCb func(context.Context, *mongo.Session) (bson.Timestamp, error) + iterateCb func(context.Context, *retry.FuncInfo, *mongo.Session) error + onDDLEvent ddlEventHandling } @@ -144,6 +149,95 @@ func (rc *ChangeReaderCommon) getEventsPerSecond() option.Option[float64] { return option.None[float64]() } +// start starts the change reader +func (rc *ChangeReaderCommon) start( + ctx context.Context, + eg *errgroup.Group, +) error { + // This channel holds the first change stream creation's result, whether + // success or failure. Rather than using a Result we could make separate + // Timestamp and error channels, but the single channel is cleaner since + // there's no chance of "nonsense" like both channels returning a payload. + initialCreateResultChan := make(chan mo.Result[bson.Timestamp]) + + eg.Go( + func() error { + // Closing changeEventBatchChan at the end of change stream goroutine + // notifies the verifier's change event handler to exit. + defer func() { + rc.logger.Debug(). + Str("reader", string(rc.readerType)). + Msg("Closing change event batch channel.") + + close(rc.changeEventBatchChan) + }() + + retryer := retry.New().WithErrorCodes(util.CursorKilledErrCode) + + parentThreadWaiting := true + + err := retryer.WithCallback( + func(ctx context.Context, ri *retry.FuncInfo) error { + sess, err := rc.watcherClient.StartSession() + if err != nil { + return errors.Wrap(err, "failed to start session") + } + + startTs, err := rc.createIteratorCb(ctx, sess) + if err != nil { + logEvent := rc.logger.Debug(). + Err(err). + Str("reader", string(rc.readerType)) + + if parentThreadWaiting { + logEvent.Msg("First change stream open failed.") + + initialCreateResultChan <- mo.Err[bson.Timestamp](err) + return nil + } + + logEvent.Msg("Retried change stream open failed.") + + return err + } + + logEvent := rc.logger.Debug(). + Str("reader", string(rc.readerType)). 
+ Any("startTimestamp", startTs) + + if parentThreadWaiting { + logEvent.Msg("First change stream open succeeded.") + + initialCreateResultChan <- mo.Ok(startTs) + close(initialCreateResultChan) + parentThreadWaiting = false + } else { + logEvent.Msg("Retried change stream open succeeded.") + } + + return rc.iterateCb(ctx, ri, sess) + }, + "running %s", rc, + ).Run(ctx, rc.logger) + + return err + }, + ) + + result := <-initialCreateResultChan + + startTs, err := result.Get() + if err != nil { + return errors.Wrapf(err, "creating change stream") + } + + rc.startAtTs = &startTs + + rc.running = true + + return nil +} + func (rc *ChangeReaderCommon) persistResumeToken(ctx context.Context, token bson.Raw) error { coll := rc.metaDB.Collection(changeReaderCollectionName) _, err := coll.ReplaceOne( diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 623a553e..da5d5664 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -13,12 +13,10 @@ import ( mapset "github.com/deckarep/golang-set/v2" clone "github.com/huandu/go-clone/generic" "github.com/pkg/errors" - "github.com/samber/mo" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" "golang.org/x/exp/slices" - "golang.org/x/sync/errgroup" ) var supportedEventOpTypes = mapset.NewSet( @@ -42,7 +40,8 @@ func (uee UnknownEventError) Error() string { } type ChangeStreamReader struct { - ChangeReaderCommon + changeStream *mongo.ChangeStream + *ChangeReaderCommon } var _ changeReader = &ChangeStreamReader{} @@ -64,7 +63,12 @@ func (v *Verifier) newChangeStreamReader( common.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken - return &ChangeStreamReader{ChangeReaderCommon: common} + csr := &ChangeStreamReader{ChangeReaderCommon: &common} + + common.createIteratorCb = csr.createChangeStream + common.iterateCb = csr.iterateChangeStream + + return csr } // GetChangeStreamFilter returns an aggregation pipeline that filters @@ -143,10 +147,9 @@ func (csr *ChangeStreamReader) GetChangeStreamFilter() (pipeline mongo.Pipeline) // is unideal but shouldn’t impede correctness since post-writesOff events // shouldn’t really happen anyway by definition. 
func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( - ctx context.Context, + sctx context.Context, ri *retry.FuncInfo, cs *mongo.ChangeStream, - sess *mongo.Session, ) error { eventsRead := 0 var changeEvents []ParsedEvent @@ -155,7 +158,7 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( var batchTotalBytes int for hasEventInBatch := true; hasEventInBatch; hasEventInBatch = cs.RemainingBatchLength() > 0 { - gotEvent := cs.TryNext(ctx) + gotEvent := cs.TryNext(sctx) if cs.Err() != nil { return errors.Wrap(cs.Err(), "change stream iteration failed") @@ -224,6 +227,8 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( eventsRead++ } + sess := mongo.SessionFromContext(sctx) + csr.updateLag(sess, cs.ResumeToken()) if eventsRead == 0 { @@ -244,8 +249,8 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( ri.NoteSuccess("parsed %d-event batch", len(changeEvents)) select { - case <-ctx.Done(): - return util.WrapCtxErrWithCause(ctx) + case <-sctx.Done(): + return util.WrapCtxErrWithCause(sctx) case csr.changeEventBatchChan <- changeEventBatch{ events: changeEvents, @@ -264,9 +269,14 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( func (csr *ChangeStreamReader) iterateChangeStream( ctx context.Context, ri *retry.FuncInfo, - cs *mongo.ChangeStream, sess *mongo.Session, ) error { + sctx := mongo.NewSessionContext(ctx, sess) + + cs := csr.changeStream + + defer cs.Close(sctx) + for { var err error var gotwritesOffTimestamp bool @@ -317,7 +327,7 @@ func (csr *ChangeStreamReader) iterateChangeStream( break } - err = csr.readAndHandleOneChangeEventBatch(ctx, ri, cs, sess) + err = csr.readAndHandleOneChangeEventBatch(sctx, ri, cs) if err != nil { return err @@ -325,7 +335,7 @@ func (csr *ChangeStreamReader) iterateChangeStream( } default: - err = csr.readAndHandleOneChangeEventBatch(ctx, ri, cs, sess) + err = csr.readAndHandleOneChangeEventBatch(sctx, ri, cs) if err != nil { return err @@ -358,7 +368,8 @@ func (csr *ChangeStreamReader) iterateChangeStream( func (csr *ChangeStreamReader) createChangeStream( ctx context.Context, -) (*mongo.ChangeStream, *mongo.Session, bson.Timestamp, error) { + sess *mongo.Session, +) (bson.Timestamp, error) { pipeline := csr.GetChangeStreamFilter() opts := options.ChangeStream(). 
SetMaxAwaitTime(maxChangeStreamAwaitTime) @@ -374,7 +385,7 @@ func (csr *ChangeStreamReader) createChangeStream( savedResumeToken, err := csr.loadResumeToken(ctx) if err != nil { - return nil, nil, bson.Timestamp{}, errors.Wrap(err, "failed to load persisted change stream resume token") + return bson.Timestamp{}, errors.Wrap(err, "failed to load persisted change stream resume token") } csStartLogEvent := csr.logger.Info() @@ -399,24 +410,23 @@ func (csr *ChangeStreamReader) createChangeStream( csStartLogEvent.Msgf("Starting change stream from current %s cluster time.", csr.readerType) } - sess, err := csr.watcherClient.StartSession() - if err != nil { - return nil, nil, bson.Timestamp{}, errors.Wrap(err, "failed to start session") - } sctx := mongo.NewSessionContext(ctx, sess) + changeStream, err := csr.watcherClient.Watch(sctx, pipeline, opts) if err != nil { - return nil, nil, bson.Timestamp{}, errors.Wrap(err, "failed to open change stream") + return bson.Timestamp{}, errors.Wrap(err, "failed to open change stream") } err = csr.persistResumeToken(ctx, changeStream.ResumeToken()) if err != nil { - return nil, nil, bson.Timestamp{}, err + changeStream.Close(sctx) + return bson.Timestamp{}, err } startTs, err := csr.resumeTokenTSExtractor(changeStream.ResumeToken()) if err != nil { - return nil, nil, bson.Timestamp{}, errors.Wrap(err, "failed to extract timestamp from change stream's resume token") + changeStream.Close(sctx) + return bson.Timestamp{}, errors.Wrap(err, "failed to extract timestamp from change stream's resume token") } // With sharded clusters the resume token might lead the cluster time @@ -424,7 +434,8 @@ func (csr *ChangeStreamReader) createChangeStream( // otherwise we will get errors. clusterTime, err := util.GetClusterTimeFromSession(sess) if err != nil { - return nil, nil, bson.Timestamp{}, errors.Wrap(err, "failed to read cluster time from session") + changeStream.Close(sctx) + return bson.Timestamp{}, errors.Wrap(err, "failed to read cluster time from session") } csr.logger.Debug(). @@ -437,93 +448,9 @@ func (csr *ChangeStreamReader) createChangeStream( startTs = clusterTime } - return changeStream, sess, startTs, nil -} - -// StartChangeStream starts the change stream. -func (csr *ChangeStreamReader) start( - ctx context.Context, - eg *errgroup.Group, -) error { - // This channel holds the first change stream creation's result, whether - // success or failure. Rather than using a Result we could make separate - // Timestamp and error channels, but the single channel is cleaner since - // there's no chance of "nonsense" like both channels returning a payload. - initialCreateResultChan := make(chan mo.Result[bson.Timestamp]) - - eg.Go( - func() error { - // Closing changeEventBatchChan at the end of change stream goroutine - // notifies the verifier's change event handler to exit. - defer func() { - csr.logger.Debug(). - Stringer("changeStreamReader", csr). - Msg("Closing change event batch channel.") - - close(csr.changeEventBatchChan) - }() - - retryer := retry.New().WithErrorCodes(util.CursorKilledErrCode) - - parentThreadWaiting := true - - err := retryer.WithCallback( - func(ctx context.Context, ri *retry.FuncInfo) error { - changeStream, sess, startTs, err := csr.createChangeStream(ctx) - if err != nil { - logEvent := csr.logger.Debug(). - Err(err). 
- Stringer("changeStreamReader", csr) - - if parentThreadWaiting { - logEvent.Msg("First change stream open failed.") - - initialCreateResultChan <- mo.Err[bson.Timestamp](err) - return nil - } - - logEvent.Msg("Retried change stream open failed.") - - return err - } - - defer changeStream.Close(ctx) - - logEvent := csr.logger.Debug(). - Stringer("changeStreamReader", csr). - Any("startTimestamp", startTs) - - if parentThreadWaiting { - logEvent.Msg("First change stream open succeeded.") - - initialCreateResultChan <- mo.Ok(startTs) - close(initialCreateResultChan) - parentThreadWaiting = false - } else { - logEvent.Msg("Retried change stream open succeeded.") - } - - return csr.iterateChangeStream(ctx, ri, changeStream, sess) - }, - "running %s", csr, - ).Run(ctx, csr.logger) - - return err - }, - ) - - result := <-initialCreateResultChan - - startTs, err := result.Get() - if err != nil { - return errors.Wrapf(err, "creating change stream") - } - - csr.startAtTs = &startTs + csr.changeStream = changeStream - csr.running = true - - return nil + return startTs, nil } func (csr *ChangeStreamReader) String() string { diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 84b5a415..cbf9b8f2 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -19,16 +19,17 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo/options" "go.mongodb.org/mongo-driver/v2/mongo/readconcern" "golang.org/x/exp/slices" - "golang.org/x/sync/errgroup" ) // OplogReader reads change events via oplog tailing instead of a change stream. // This significantly lightens server load and allows verification of heavier // workloads than change streams allow. It only works with replica sets. type OplogReader struct { + *ChangeReaderCommon + curDocs []bson.Raw scratch []byte - ChangeReaderCommon + cursor *mongo.Cursor } var _ changeReader = &OplogReader{} @@ -49,23 +50,31 @@ func (v *Verifier) newOplogReader( common.resumeTokenTSExtractor = oplog.GetRawResumeTokenTimestamp - return &OplogReader{ChangeReaderCommon: common} -} + o := &OplogReader{ChangeReaderCommon: &common} + + common.createIteratorCb = o.createCursor + common.iterateCb = o.iterateCb -func (o *OplogReader) start(ctx context.Context, eg *errgroup.Group) error { - // TODO: retryer + return o +} +func (o *OplogReader) createCursor( + ctx context.Context, + sess *mongo.Session, +) (bson.Timestamp, error) { savedResumeToken, err := o.loadResumeToken(ctx) if err != nil { - return errors.Wrap(err, "loading persisted resume token") + return bson.Timestamp{}, errors.Wrap(err, "loading persisted resume token") } var allowDDLBeforeTS bson.Timestamp + var startTS bson.Timestamp + if token, has := savedResumeToken.Get(); has { var rt oplog.ResumeToken if err := bson.Unmarshal(token, &rt); err != nil { - return errors.Wrap(err, "parsing persisted resume token") + return bson.Timestamp{}, errors.Wrap(err, "parsing persisted resume token") } // TODO: Smarten this rather than assuming we’ve passed the original @@ -73,16 +82,16 @@ func (o *OplogReader) start(ctx context.Context, eg *errgroup.Group) error { allowDDLBeforeTS = rt.TS allowDDLBeforeTS.T-- - o.startAtTs = &rt.TS + startTS = rt.TS } else { startOpTime, latestOpTime, err := oplog.GetTailingStartTimes(ctx, o.watcherClient) if err != nil { - return errors.Wrapf(err, "getting start optime from %s", o.readerType) + return bson.Timestamp{}, errors.Wrapf(err, "getting start optime from %s", o.readerType) } allowDDLBeforeTS = latestOpTime.TS - o.startAtTs = 
&startOpTime.TS + startTS = startOpTime.TS } o.logger.Info(). @@ -90,11 +99,6 @@ func (o *OplogReader) start(ctx context.Context, eg *errgroup.Group) error { Any("currentOplogTs", allowDDLBeforeTS). Msg("Tailing oplog.") - sess, err := o.watcherClient.StartSession() - if err != nil { - return errors.Wrap(err, "creating session") - } - sctx := mongo.NewSessionContext(ctx, sess) cursor, err := o.watcherClient. @@ -182,19 +186,15 @@ func (o *OplogReader) start(ctx context.Context, eg *errgroup.Group) error { ) if err != nil { - return errors.Wrapf(err, "opening cursor to tail oplog") + return bson.Timestamp{}, errors.Wrapf(err, "opening cursor to tail oplog") } - eg.Go( - func() error { - return o.iterate(sctx, cursor, allowDDLBeforeTS) - }, - ) + o.cursor = cursor - return nil + return startTS, nil } -func (o *OplogReader) iterate( +func (o *OplogReader) iterateCursor( sctx context.Context, cursor *mongo.Cursor, allowDDLBeforeTS bson.Timestamp, From 8751b117958b0132182a3e4e49290dbb79337735 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 14:31:37 -0500 Subject: [PATCH 025/130] format --- internal/verifier/change_reader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index fb7b12bd..ee61ea72 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -217,7 +217,7 @@ func (rc *ChangeReaderCommon) start( return rc.iterateCb(ctx, ri, sess) }, - "running %s", rc, + "reading %s’s changes", rc.readerType, ).Run(ctx, rc.logger) return err From d3070e2be2e46232b2fde92457eb97f6a0ee6435 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 14:34:12 -0500 Subject: [PATCH 026/130] oops --- internal/verifier/oplog_reader.go | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index cbf9b8f2..61ec4c8a 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -6,6 +6,7 @@ import ( "github.com/10gen/migration-verifier/agg" "github.com/10gen/migration-verifier/agg/helpers" + "github.com/10gen/migration-verifier/internal/retry" "github.com/10gen/migration-verifier/internal/types" "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/internal/verifier/namespaces" @@ -27,9 +28,10 @@ import ( type OplogReader struct { *ChangeReaderCommon - curDocs []bson.Raw - scratch []byte - cursor *mongo.Cursor + curDocs []bson.Raw + scratch []byte + cursor *mongo.Cursor + allowDDLBeforeTS bson.Timestamp } var _ changeReader = &OplogReader{} @@ -53,7 +55,7 @@ func (v *Verifier) newOplogReader( o := &OplogReader{ChangeReaderCommon: &common} common.createIteratorCb = o.createCursor - common.iterateCb = o.iterateCb + common.iterateCb = o.iterateCursor return o } @@ -190,15 +192,24 @@ func (o *OplogReader) createCursor( } o.cursor = cursor + o.allowDDLBeforeTS = allowDDLBeforeTS return startTS, nil } func (o *OplogReader) iterateCursor( - sctx context.Context, - cursor *mongo.Cursor, - allowDDLBeforeTS bson.Timestamp, + ctx context.Context, + _ *retry.FuncInfo, + sess *mongo.Session, + /* + cursor *mongo.Cursor, + allowDDLBeforeTS bson.Timestamp, + */ ) error { + sctx := mongo.NewSessionContext(ctx, sess) + cursor := o.cursor + allowDDLBeforeTS := o.allowDDLBeforeTS + CursorLoop: for { var err error From 06bc31b0b8992e46cf7b0c97ba87c3c1835a5b7a Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 
12 Nov 2025 14:36:03 -0500 Subject: [PATCH 027/130] nolint --- internal/verifier/oplog/start_time.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/verifier/oplog/start_time.go b/internal/verifier/oplog/start_time.go index eb042b58..494ac8a8 100644 --- a/internal/verifier/oplog/start_time.go +++ b/internal/verifier/oplog/start_time.go @@ -94,6 +94,8 @@ func getLatestVisibleOplogOpTime( // all operations with earlier oplog times have been storage-committed. result, err := coll.FindOne(ctx, bson.M{"ts": bson.M{"$gte": latestOpTime.TS}}, + + //nolint SA1019 options.FindOne().SetOplogReplay(true), ).Raw() if err != nil { From 1cbb232b3c10f3bef6b25e5ca5852778db2f115e Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 14:47:00 -0500 Subject: [PATCH 028/130] fix test --- internal/verifier/change_stream_test.go | 4 ++-- internal/verifier/oplog_reader.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index 5f746f14..b606cf90 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -353,7 +353,7 @@ func (suite *IntegrationTestSuite) TestChangeStream_Resume_NoSkip() { return !bytes.Equal(rt, originalResumeToken) }, time.Minute, - 50*time.Millisecond, + 500*time.Millisecond, "should see a new change stream resume token persisted", ) @@ -955,7 +955,7 @@ func (suite *IntegrationTestSuite) TestCreateForbidden() { eventErr := UnknownEventError{} suite.Require().ErrorAs(err, &eventErr) - suite.Assert().Equal("create", eventErr.Event.Lookup("operationType").StringValue()) + suite.Assert().Contains(string(eventErr.Event), "create") } func (suite *IntegrationTestSuite) TestTolerateDestinationCollMod() { diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 61ec4c8a..e2915788 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -97,7 +97,7 @@ func (o *OplogReader) createCursor( } o.logger.Info(). - Any("startReadTs", *o.startAtTs). + Any("startReadTs", startTS). Any("currentOplogTs", allowDDLBeforeTS). Msg("Tailing oplog.") @@ -112,7 +112,7 @@ func (o *OplogReader) createCursor( Find( sctx, bson.D{{"$and", []any{ - bson.D{{"ts", bson.D{{"$gte", o.startAtTs}}}}, + bson.D{{"ts", bson.D{{"$gte", startTS}}}}, bson.D{{"$expr", agg.Or{ // plain ops: one write per op From 51f4163d6dcde02095dc8e38657a4d66378fc9ec Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 15:25:28 -0500 Subject: [PATCH 029/130] ddl allowance --- internal/retry/retry.go | 12 +++++++- internal/verifier/change_stream.go | 6 ++-- internal/verifier/oplog_reader.go | 46 ++++++++++++++++++++++++++---- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/internal/retry/retry.go b/internal/retry/retry.go index 62f74e8d..8f7b260d 100644 --- a/internal/retry/retry.go +++ b/internal/retry/retry.go @@ -180,6 +180,10 @@ func (r *Retryer) runRetryLoop( // Not a transient error? Fail immediately. if !r.shouldRetryWithSleep(logger, sleepTime, descriptions, cbErr) { + if descr, has := r.description.Get(); has { + cbErr = errors.Wrap(cbErr, descr) + } + return cbErr } @@ -187,11 +191,17 @@ func (r *Retryer) runRetryLoop( // then fail. 
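+			// As in the non-transient case above, attach the retryer's
+			// description (when one is set) so the caller can tell which
+			// operation exhausted its retry budget.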
if failedFuncInfo.GetDurationSoFar() > li.durationLimit { - return RetryDurationLimitExceededErr{ + var err error = RetryDurationLimitExceededErr{ attempts: li.attemptsSoFar, duration: failedFuncInfo.GetDurationSoFar(), lastErr: groupErr.errFromCallback, } + + if descr, has := r.description.Get(); has { + err = errors.Wrap(err, descr) + } + + return err } // Sleep and increase the sleep time for the next retry, diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index da5d5664..1938b118 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -417,13 +417,15 @@ func (csr *ChangeStreamReader) createChangeStream( return bson.Timestamp{}, errors.Wrap(err, "failed to open change stream") } - err = csr.persistResumeToken(ctx, changeStream.ResumeToken()) + resumeToken := changeStream.ResumeToken() + + err = csr.persistResumeToken(ctx, resumeToken) if err != nil { changeStream.Close(sctx) return bson.Timestamp{}, err } - startTs, err := csr.resumeTokenTSExtractor(changeStream.ResumeToken()) + startTs, err := csr.resumeTokenTSExtractor(resumeToken) if err != nil { changeStream.Close(sctx) return bson.Timestamp{}, errors.Wrap(err, "failed to extract timestamp from change stream's resume token") diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index e2915788..b80e212e 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -11,6 +11,7 @@ import ( "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/internal/verifier/namespaces" "github.com/10gen/migration-verifier/internal/verifier/oplog" + "github.com/10gen/migration-verifier/mbson" "github.com/10gen/migration-verifier/mmongo" "github.com/10gen/migration-verifier/option" "github.com/pkg/errors" @@ -79,10 +80,22 @@ func (o *OplogReader) createCursor( return bson.Timestamp{}, errors.Wrap(err, "parsing persisted resume token") } - // TODO: Smarten this rather than assuming we’ve passed the original - // latest optime. - allowDDLBeforeTS = rt.TS - allowDDLBeforeTS.T-- + ddlAllowanceResult := o.getMetadataCollection().FindOne( + ctx, + bson.D{ + {"_id", o.ddlAllowanceDocID()}, + }, + ) + + allowanceRaw, err := ddlAllowanceResult.Raw() + if err != nil { + return bson.Timestamp{}, errors.Wrap(err, "fetching DDL allowance timestamp") + } + + allowDDLBeforeTS, err = mbson.Lookup[bson.Timestamp](allowanceRaw, "ts") + if err != nil { + return bson.Timestamp{}, errors.Wrap(err, "parsing DDL allowance timestamp doc") + } startTS = rt.TS } else { @@ -93,7 +106,26 @@ func (o *OplogReader) createCursor( allowDDLBeforeTS = latestOpTime.TS + _, err = o.getMetadataCollection().ReplaceOne( + ctx, + bson.D{ + {"_id", o.ddlAllowanceDocID()}, + }, + bson.D{ + {"ts", allowDDLBeforeTS}, + }, + options.Replace().SetUpsert(true), + ) + if err != nil { + return bson.Timestamp{}, errors.Wrapf(err, "persisting DDL-allowance timestamp") + } + startTS = startOpTime.TS + + err = o.persistResumeToken(ctx, oplog.ResumeToken{startTS}.MarshalToBSON()) + if err != nil { + return bson.Timestamp{}, errors.Wrap(err, "persisting resume token") + } } o.logger.Info(). 
@@ -188,7 +220,7 @@ func (o *OplogReader) createCursor( ) if err != nil { - return bson.Timestamp{}, errors.Wrapf(err, "opening cursor to tail oplog") + return bson.Timestamp{}, errors.Wrapf(err, "opening cursor to tail %s’s oplog", o.readerType) } o.cursor = cursor @@ -197,6 +229,10 @@ func (o *OplogReader) createCursor( return startTS, nil } +func (o *OplogReader) ddlAllowanceDocID() string { + return string(o.readerType) + "-ddlAllowanceTS" +} + func (o *OplogReader) iterateCursor( ctx context.Context, _ *retry.FuncInfo, From 6918fda08d77f465f0d89c268c359c33f6b3c3da Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 15:33:48 -0500 Subject: [PATCH 030/130] token --- internal/verifier/change_reader.go | 10 +++++----- internal/verifier/change_stream.go | 2 +- internal/verifier/change_stream_test.go | 7 ++++++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index ee61ea72..877120a9 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -242,7 +242,7 @@ func (rc *ChangeReaderCommon) persistResumeToken(ctx context.Context, token bson coll := rc.metaDB.Collection(changeReaderCollectionName) _, err := coll.ReplaceOne( ctx, - bson.D{{"_id", rc.resumeTokenDocID()}}, + bson.D{{"_id", resumeTokenDocID(rc.getWhichCluster())}}, token, options.Replace().SetUpsert(true), ) @@ -267,14 +267,14 @@ func (rc *ChangeReaderCommon) persistResumeToken(ctx context.Context, token bson return errors.Wrapf(err, "failed to persist %s resume token (%v)", rc.readerType, token) } -func (rc *ChangeReaderCommon) resumeTokenDocID() string { - switch rc.readerType { +func resumeTokenDocID(clusterType whichCluster) string { + switch clusterType { case src: return "srcResumeToken" case dst: return "dstResumeToken" default: - panic("unknown readerType: " + rc.readerType) + panic("unknown readerType: " + clusterType) } } @@ -287,7 +287,7 @@ func (rc *ChangeReaderCommon) loadResumeToken(ctx context.Context) (option.Optio token, err := coll.FindOne( ctx, - bson.D{{"_id", rc.resumeTokenDocID()}}, + bson.D{{"_id", resumeTokenDocID(rc.getWhichCluster())}}, ).Raw() if errors.Is(err, mongo.ErrNoDocuments) { diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 1938b118..bcc62cef 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -392,7 +392,7 @@ func (csr *ChangeStreamReader) createChangeStream( if token, hasToken := savedResumeToken.Get(); hasToken { logEvent := csStartLogEvent. 
- Stringer(csr.resumeTokenDocID(), token) + Stringer(resumeTokenDocID(csr.readerType), token) ts, err := csr.resumeTokenTSExtractor(token) if err == nil { diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index b606cf90..b97d06d6 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -345,7 +345,12 @@ func (suite *IntegrationTestSuite) TestChangeStream_Resume_NoSkip() { assert.Eventually( suite.T(), func() bool { - rt, err := changeStreamMetaColl.FindOne(ctx, bson.D{}).Raw() + rt, err := changeStreamMetaColl.FindOne( + ctx, + bson.D{ + {"_id", resumeTokenDocID(src)}, + }, + ).Raw() require.NoError(suite.T(), err) suite.T().Logf("found rt: %v\n", rt) From eb0464da02ca361c6dd07066367beb4d578e3395 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 15:34:41 -0500 Subject: [PATCH 031/130] test 2nd time --- internal/util/eventual_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/util/eventual_test.go b/internal/util/eventual_test.go index 6196e4e2..4bcc209e 100644 --- a/internal/util/eventual_test.go +++ b/internal/util/eventual_test.go @@ -31,6 +31,12 @@ func (s *UnitTestSuite) TestEventual() { eventual.Get(), "Get() should return the value", ) + + s.Assert().Equal( + 123, + eventual.Get(), + "Get() should return the value a 2nd time", + ) } func (s *UnitTestSuite) TestEventualNil() { From 7b30bff2a3ad850cb112168481de54d3a7c901b7 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 16:11:31 -0500 Subject: [PATCH 032/130] save --- internal/verifier/change_reader.go | 32 +++++++++++++------------ internal/verifier/change_stream.go | 6 +---- internal/verifier/change_stream_test.go | 2 +- internal/verifier/oplog_reader.go | 5 ++-- internal/verifier/recheck_persist.go | 3 --- 5 files changed, 21 insertions(+), 27 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 877120a9..7eff72f5 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -239,32 +239,34 @@ func (rc *ChangeReaderCommon) start( } func (rc *ChangeReaderCommon) persistResumeToken(ctx context.Context, token bson.Raw) error { + ts, err := rc.resumeTokenTSExtractor(token) + if err != nil { + return errors.Wrapf(err, "parsing resume token %#q", token) + } + + if ts.IsZero() { + panic("empty ts in resume token is invalid!") + } + coll := rc.metaDB.Collection(changeReaderCollectionName) - _, err := coll.ReplaceOne( + _, err = coll.ReplaceOne( ctx, bson.D{{"_id", resumeTokenDocID(rc.getWhichCluster())}}, token, options.Replace().SetUpsert(true), ) - if err == nil { - ts, err := rc.resumeTokenTSExtractor(token) - - logEvent := rc.logger.Debug() + if err != nil { + return errors.Wrapf(err, "persisting %s resume token (%v)", rc.readerType, token) + } - if err == nil { - logEvent = addTimestampToLogEvent(ts, logEvent) - } else { - rc.logger.Warn().Err(err). 
- Msg("failed to extract resume token timestamp") - } + logEvent := rc.logger.Debug() - logEvent.Msgf("Persisted %s's resume token.", rc.readerType) + logEvent = addTimestampToLogEvent(ts, logEvent) - return nil - } + logEvent.Msgf("Persisted %s’s resume token.", rc.readerType) - return errors.Wrapf(err, "failed to persist %s resume token (%v)", rc.readerType, token) + return nil } func resumeTokenDocID(clusterType whichCluster) string { diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index bcc62cef..29f3164d 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -252,12 +252,8 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( case <-sctx.Done(): return util.WrapCtxErrWithCause(sctx) case csr.changeEventBatchChan <- changeEventBatch{ - events: changeEvents, - + events: changeEvents, resumeToken: cs.ResumeToken(), - - // NB: We know by now that OperationTime is non-nil. - clusterTime: *sess.OperationTime(), }: } diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index b97d06d6..1ef9c7a5 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -353,7 +353,7 @@ func (suite *IntegrationTestSuite) TestChangeStream_Resume_NoSkip() { ).Raw() require.NoError(suite.T(), err) - suite.T().Logf("found rt: %v\n", rt) + suite.T().Logf("found rt: %v", rt) return !bytes.Equal(rt, originalResumeToken) }, diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index b80e212e..d4df2e2f 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -268,7 +268,7 @@ CursorLoop: for { if o.lastChangeEventTime != nil { if !o.lastChangeEventTime.Before(writesOffTS) { - fmt.Printf("----------- %s reached writes off ts %v", o, writesOffTS) + fmt.Printf("----------- %s reached writes off ts %v\n", o, writesOffTS) break } } @@ -292,7 +292,7 @@ CursorLoop: infoLog. Stringer("reader", o). - Msg("Change stream reader is done.") + Msg("Oplog reader is done.") return nil } @@ -393,7 +393,6 @@ func (o *OplogReader) readAndHandleOneBatch( case o.changeEventBatchChan <- changeEventBatch{ events: events, resumeToken: resumeToken, - clusterTime: *sess.OperationTime(), }: } diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 564e33a2..82db17fc 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -13,7 +13,6 @@ import ( type changeEventBatch struct { events []ParsedEvent resumeToken bson.Raw - clusterTime bson.Timestamp } // RunChangeEventPersistor persists rechecks from change event batches. @@ -166,14 +165,12 @@ func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeE } latestTimestampTime := time.Unix(int64(latestTimestamp.T), 0) - lag := time.Unix(int64(batch.clusterTime.T), 0).Sub(latestTimestampTime) verifier.logger.Trace(). Str("origin", string(eventOrigin)). Int("count", len(docIDs)). Any("latestTimestamp", latestTimestamp). Time("latestTimestampTime", latestTimestampTime). - Stringer("lag", lag). 
Msg("Persisting rechecks for change events.") return verifier.insertRecheckDocs(ctx, dbNames, collNames, docIDs, dataSizes) From 4df242e13f56ff35bf6d5848a68822ac7c79143a Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 16:11:41 -0500 Subject: [PATCH 033/130] add comment --- internal/util/eventual.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/util/eventual.go b/internal/util/eventual.go index 4bd37271..901d2a1f 100644 --- a/internal/util/eventual.go +++ b/internal/util/eventual.go @@ -35,6 +35,8 @@ func (e *Eventual[T]) Get() T { e.mux.RLock() defer e.mux.RUnlock() + // If the ready channel is still open then there’s no value yet, + // which means this method should not have been called. select { case <-e.ready: return e.val @@ -59,5 +61,6 @@ func (e *Eventual[T]) Set(val T) { // not see this value. e.val = val + // This allows Get() to work: close(e.ready) } From e5a15969b4d894e22f6bcad506753871542da976 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 16:17:15 -0500 Subject: [PATCH 034/130] tolerate later startAtTs --- internal/verifier/change_stream_test.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index 1ef9c7a5..05b1d90e 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -705,10 +705,9 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { startAtTs, hasStartAtTs = verifier.srcChangeReader.getStartTimestamp().Get() suite.Require().True(hasStartAtTs, "startAtTs should be set") - suite.Assert().Equal( - *postEventsSessionTime, - startAtTs, - "verifier.srcStartAtTs should now be our session timestamp", + suite.Assert().False( + startAtTs.Before(*postEventsSessionTime), + "verifier.srcStartAtTs should now be at least at the session timestamp", ) } From 90f912374b89acccd327ccd5e342176371365aa7 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 16:22:57 -0500 Subject: [PATCH 035/130] collmod test --- internal/verifier/oplog_reader.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index d4df2e2f..9f4968f6 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -153,9 +153,17 @@ func (o *OplogReader) createCursor( o.getDefaultNSExclusions("$$ROOT")..., ), - // op=c is for applyOps, and also to detect forbidden DDL. // op=n is for no-ops, so we stay up-to-date. - agg.In("$op", "c", "n"), + agg.Eq("$op", "n"), + + // op=c is for applyOps, and also to detect forbidden DDL. + agg.And{ + agg.Eq("$op", "c"), + agg.Not{helpers.StringHasPrefix{ + FieldRef: "$ns", + Prefix: "config.", + }}, + }, }}}, }}}, From 11140aee81620a54bdfa95977a658d1e012561c5 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 16:37:04 -0500 Subject: [PATCH 036/130] empty ts --- internal/verifier/oplog_reader.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 9f4968f6..5575ce9a 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -326,6 +326,11 @@ func (o *OplogReader) readAndHandleOneBatch( return errors.Wrap(err, "reading cursor") } + if len(o.curDocs) == 0 { + // If there were no oplog events, then there’s nothing for us to do. 
+ return nil + } + events := make([]ParsedEvent, 0, len(o.curDocs)) var latestTS bson.Timestamp @@ -393,7 +398,12 @@ func (o *OplogReader) readAndHandleOneBatch( o.updateLag(sess, resumeToken) - o.batchSizeHistory.Add(len(events)) + // NB: events can legitimately be empty here because we might only have + // gotten op=n oplog entries, which we just use to advance the reader. + // (Similar to a change stream’s post-batch resume token.) + if len(events) > 0 { + o.batchSizeHistory.Add(len(events)) + } select { case <-sctx.Done(): From 8f6871b8364dcd4fbfde5a4f5f4a30f5ff4299cd Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 16:41:05 -0500 Subject: [PATCH 037/130] read oplog for 4.2 --- internal/verifier/change_reader.go | 2 +- internal/verifier/check.go | 4 ---- internal/verifier/oplog_reader.go | 15 ++++++++++----- internal/verifier/recheck_persist.go | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 7eff72f5..ec490cf3 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -23,7 +23,7 @@ import ( type ddlEventHandling string const ( - fauxDocSizeForDeleteEvents = 1024 + defaultUserDocumentSize = 1024 // The number of batches we’ll hold in memory at once. batchChanBufferSize = 100 diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 89716015..2c88919a 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -605,8 +605,6 @@ func (v *Verifier) initializeChangeReaders() { whyCS = "ns filter" case v.srcClusterInfo.Topology == util.TopologySharded: whyCS = "sharded" - case !util.ClusterHasBSONSize([2]int(v.srcClusterInfo.VersionArray)): - whyCS = "no $bsonSize" } srcLogEvent := v.logger.Info() @@ -638,8 +636,6 @@ func (v *Verifier) initializeChangeReaders() { whyCS = "ns filter" case v.dstClusterInfo.Topology == util.TopologySharded: whyCS = "sharded" - case !util.ClusterHasBSONSize([2]int(v.dstClusterInfo.VersionArray)): - whyCS = "no $bsonSize" } dstLogEvent := v.logger.Info() diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 5575ce9a..78be0760 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -135,6 +135,8 @@ func (o *OplogReader) createCursor( sctx := mongo.NewSessionContext(ctx, sess) + clientHasBSONSize := util.ClusterHasBSONSize([2]int(o.clusterInfo.VersionArray)) + cursor, err := o.watcherClient. Database("local"). Collection( @@ -174,8 +176,7 @@ func (o *OplogReader) createCursor( {"op", 1}, {"ns", 1}, - // TODO: Adjust for 4.2. 
- {"docLen", getOplogDocLenExpr("$$ROOT")}, + {"docLen", getOplogDocLenExpr("$$ROOT", clientHasBSONSize)}, {"docID", getOplogDocIDExpr("$$ROOT")}, @@ -219,7 +220,7 @@ func (o *OplogReader) createCursor( {"op", "$$opEntry.op"}, {"ns", "$$opEntry.ns"}, {"docID", getOplogDocIDExpr("$$opEntry")}, - {"docLen", getOplogDocLenExpr("$$opEntry")}, + {"docLen", getOplogDocLenExpr("$$opEntry", clientHasBSONSize)}, }, }, Else: "$$REMOVE", @@ -438,7 +439,7 @@ func (o *OplogReader) getDefaultNSExclusions(docroot string) agg.And { )) } -func getOplogDocLenExpr(docroot string) any { +func getOplogDocLenExpr(docroot string, useBSONSize bool) any { return agg.Switch{ Branches: []agg.SwitchCase{ { @@ -449,7 +450,11 @@ func getOplogDocLenExpr(docroot string) any { agg.Not{agg.Eq("missing", docroot+".o._id")}, }, }, - Then: agg.BSONSize(docroot + ".o"), + Then: lo.Ternary[any]( + useBSONSize, + agg.BSONSize(docroot+".o"), + defaultUserDocumentSize, + ), }, }, Default: "$$REMOVE", diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 82db17fc..c941791f 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -148,7 +148,7 @@ func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeE } else if changeEvent.FullDocument == nil { // This happens for deletes and for some updates. // The document is probably, but not necessarily, deleted. - dataSizes[i] = fauxDocSizeForDeleteEvents + dataSizes[i] = defaultUserDocumentSize } else { // This happens for inserts, replaces, and most updates. dataSizes[i] = int32(len(changeEvent.FullDocument)) From 0fbc28e69f4829add4bda4c7a38738c63c00a9b8 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 17:45:01 -0500 Subject: [PATCH 038/130] maybe fix test? --- internal/verifier/migration_verifier_test.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 8159c19c..455a5ec3 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -1685,19 +1685,21 @@ func (suite *IntegrationTestSuite) TestVerifierCompareIndexes() { func (suite *IntegrationTestSuite) TestVerifierDocMismatches() { ctx := suite.Context() + dbName := suite.DBNameForTest() + suite.Require().NoError( suite.srcMongoClient. - Database("test"). + Database(dbName). Collection("coll").Drop(ctx), ) suite.Require().NoError( suite.dstMongoClient. - Database("test"). + Database(dbName). Collection("coll").Drop(ctx), ) _, err := suite.srcMongoClient. - Database("test"). + Database(dbName). Collection("coll"). InsertMany( ctx, @@ -1716,7 +1718,7 @@ func (suite *IntegrationTestSuite) TestVerifierDocMismatches() { // The first has a mismatched `foo` value, // and the 2nd lacks `foo` entirely. _, err = suite.dstMongoClient. - Database("test"). + Database(dbName). Collection("coll"). InsertMany(ctx, lo.ToAnySlice([]bson.D{ {{"_id", 100000}, {"foo", 1}}, @@ -1727,7 +1729,7 @@ func (suite *IntegrationTestSuite) TestVerifierDocMismatches() { verifier := suite.BuildVerifier() verifier.failureDisplaySize = 10 - ns := "test.coll" + ns := dbName + ".coll" verifier.SetSrcNamespaces([]string{ns}) verifier.SetDstNamespaces([]string{ns}) verifier.SetNamespaceMap() From 6052ecb274a985439f722412419fb98c3dd8c032 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 21:38:36 -0500 Subject: [PATCH 039/130] avoid $switch for 4.2. 
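The rewrite here is mechanical: a one-branch $switch with a default collapses
into a single $cond. Roughly, in terms of this repo's agg helpers (sketch only;
C, T, and D stand in for the case, then, and default expressions):

    // {$switch: {branches: [{case: C, then: T}], default: D}}
    agg.Switch{
        Branches: []agg.SwitchCase{{Case: C, Then: T}},
        Default:  D,
    }

    // ...evaluates the same as {$cond: {if: C, then: T, else: D}}:
    agg.Cond{If: C, Then: T, Else: D}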
--- internal/verifier/oplog_reader.go | 33 ++++++++++++++----------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 78be0760..38424c12 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -440,29 +440,26 @@ func (o *OplogReader) getDefaultNSExclusions(docroot string) agg.And { } func getOplogDocLenExpr(docroot string, useBSONSize bool) any { - return agg.Switch{ - Branches: []agg.SwitchCase{ - { - Case: agg.Or{ - agg.Eq(docroot+".op", "i"), - agg.And{ - agg.Eq(docroot+".op", "u"), - agg.Not{agg.Eq("missing", docroot+".o._id")}, - }, - }, - Then: lo.Ternary[any]( - useBSONSize, - agg.BSONSize(docroot+".o"), - defaultUserDocumentSize, - ), + return agg.Cond{ + If: agg.Or{ + agg.Eq(docroot+".op", "i"), + agg.And{ + agg.Eq(docroot+".op", "u"), + agg.Not{agg.Eq("missing", docroot+".o._id")}, }, }, - Default: "$$REMOVE", + Then: lo.Ternary[any]( + useBSONSize, + agg.BSONSize(docroot+".o"), + defaultUserDocumentSize, + ), + Else: "$$REMOVE", } } func getOplogDocIDExpr(docroot string) any { - return agg.Switch{ + // $switch was new in MongoDB 4.2, so use $cond instead. + return helpers.SwitchToCond(agg.Switch{ Branches: []agg.SwitchCase{ { Case: agg.Eq(docroot+".op", "c"), @@ -477,7 +474,7 @@ func getOplogDocIDExpr(docroot string) any { Then: docroot + ".o2._id", }, }, - } + }) } func (o *OplogReader) String() string { From 491e8f819ae97f939b3b8f2dbad09771eddcbd81 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 21:38:48 -0500 Subject: [PATCH 040/130] 4.4 --- internal/verifier/oplog_reader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 38424c12..f70fe704 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -458,7 +458,7 @@ func getOplogDocLenExpr(docroot string, useBSONSize bool) any { } func getOplogDocIDExpr(docroot string) any { - // $switch was new in MongoDB 4.2, so use $cond instead. + // $switch was new in MongoDB 4.4, so use $cond instead. 
return helpers.SwitchToCond(agg.Switch{ Branches: []agg.SwitchCase{ { From 6c0aa334f7549ee19c279b87e3e6c9dda79d1d02 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 21:40:14 -0500 Subject: [PATCH 041/130] compat --- .github/workflows/all.yml | 1 + agg/helpers/compat.go | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 agg/helpers/compat.go diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index ced7c10d..c4702dba 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -38,6 +38,7 @@ jobs: - [ '4.2', '4.4' ] - [ '4.2', '5.0' ] - [ '4.2', '6.0' ] + - [ '4.2', '8.0' ] - [ '4.4', '4.4' ] - [ '4.4', '5.0' ] diff --git a/agg/helpers/compat.go b/agg/helpers/compat.go new file mode 100644 index 00000000..2f4f7803 --- /dev/null +++ b/agg/helpers/compat.go @@ -0,0 +1,25 @@ +package helpers + +import "github.com/10gen/migration-verifier/agg" + +func SwitchToCond(in agg.Switch) agg.Cond { + rootCond := agg.Cond{ + If: in.Branches[0].Case, + Then: in.Branches[0].Then, + } + + curCond := &rootCond + + for _, branch := range in.Branches[1:] { + newCond := agg.Cond{ + If: branch.Case, + Then: branch.Then, + } + + curCond.Else = &newCond + + curCond = &newCond + } + + return rootCond +} From 8c0a1d386a41b2d58127faff703673c33b507a12 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 21:47:53 -0500 Subject: [PATCH 042/130] switch is OK --- .github/workflows/all.yml | 2 ++ agg/helpers/compat.go | 25 ------------------------- internal/verifier/oplog_reader.go | 4 ++-- 3 files changed, 4 insertions(+), 27 deletions(-) delete mode 100644 agg/helpers/compat.go diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index c4702dba..9e9963ae 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -31,6 +31,8 @@ jobs: toHashedIndexKey: true - mongodb_versions: [ '4.2', '6.0' ] toHashedIndexKey: true + - mongodb_versions: [ '4.2', '8.0' ] + toHashedIndexKey: true # versions are: source, destination mongodb_versions: diff --git a/agg/helpers/compat.go b/agg/helpers/compat.go deleted file mode 100644 index 2f4f7803..00000000 --- a/agg/helpers/compat.go +++ /dev/null @@ -1,25 +0,0 @@ -package helpers - -import "github.com/10gen/migration-verifier/agg" - -func SwitchToCond(in agg.Switch) agg.Cond { - rootCond := agg.Cond{ - If: in.Branches[0].Case, - Then: in.Branches[0].Then, - } - - curCond := &rootCond - - for _, branch := range in.Branches[1:] { - newCond := agg.Cond{ - If: branch.Case, - Then: branch.Then, - } - - curCond.Else = &newCond - - curCond = &newCond - } - - return rootCond -} diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index f70fe704..b7490690 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -459,7 +459,7 @@ func getOplogDocLenExpr(docroot string, useBSONSize bool) any { func getOplogDocIDExpr(docroot string) any { // $switch was new in MongoDB 4.4, so use $cond instead. 
- return helpers.SwitchToCond(agg.Switch{ + return agg.Switch{ Branches: []agg.SwitchCase{ { Case: agg.Eq(docroot+".op", "c"), @@ -474,7 +474,7 @@ func getOplogDocIDExpr(docroot string) any { Then: docroot + ".o2._id", }, }, - }) + } } func (o *OplogReader) String() string { From 1bfbe362756a935276a7dc12e570da3c40f818ac Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 21:49:04 -0500 Subject: [PATCH 043/130] no oplog for 4.2 for now --- internal/verifier/check.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 2c88919a..0e9f42a1 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -601,6 +601,8 @@ func (v *Verifier) initializeChangeReaders() { var whyCS string switch { + case !util.ClusterHasBSONSize([2]int(v.srcClusterInfo.VersionArray)): + whyCS = "no expr projection" case len(v.srcNamespaces) > 0: whyCS = "ns filter" case v.srcClusterInfo.Topology == util.TopologySharded: @@ -632,6 +634,8 @@ func (v *Verifier) initializeChangeReaders() { Msg("Listening for writes to source.") switch { + case !util.ClusterHasBSONSize([2]int(v.dstClusterInfo.VersionArray)): + whyCS = "no expr projection" case len(v.dstNamespaces) > 0: whyCS = "ns filter" case v.dstClusterInfo.Topology == util.TopologySharded: From e80cba357219a9c6e72545e80865de5bde93ee05 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 21:49:59 -0500 Subject: [PATCH 044/130] projection is only 4.4 anyway --- internal/verifier/oplog_reader.go | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index b7490690..613766d5 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -135,8 +135,6 @@ func (o *OplogReader) createCursor( sctx := mongo.NewSessionContext(ctx, sess) - clientHasBSONSize := util.ClusterHasBSONSize([2]int(o.clusterInfo.VersionArray)) - cursor, err := o.watcherClient. Database("local"). 
Collection( @@ -176,7 +174,7 @@ func (o *OplogReader) createCursor( {"op", 1}, {"ns", 1}, - {"docLen", getOplogDocLenExpr("$$ROOT", clientHasBSONSize)}, + {"docLen", getOplogDocLenExpr("$$ROOT")}, {"docID", getOplogDocIDExpr("$$ROOT")}, @@ -220,7 +218,7 @@ func (o *OplogReader) createCursor( {"op", "$$opEntry.op"}, {"ns", "$$opEntry.ns"}, {"docID", getOplogDocIDExpr("$$opEntry")}, - {"docLen", getOplogDocLenExpr("$$opEntry", clientHasBSONSize)}, + {"docLen", getOplogDocLenExpr("$$opEntry")}, }, }, Else: "$$REMOVE", @@ -439,7 +437,7 @@ func (o *OplogReader) getDefaultNSExclusions(docroot string) agg.And { )) } -func getOplogDocLenExpr(docroot string, useBSONSize bool) any { +func getOplogDocLenExpr(docroot string) any { return agg.Cond{ If: agg.Or{ agg.Eq(docroot+".op", "i"), @@ -448,11 +446,7 @@ func getOplogDocLenExpr(docroot string, useBSONSize bool) any { agg.Not{agg.Eq("missing", docroot+".o._id")}, }, }, - Then: lo.Ternary[any]( - useBSONSize, - agg.BSONSize(docroot+".o"), - defaultUserDocumentSize, - ), + Then: agg.BSONSize(docroot + ".o"), Else: "$$REMOVE", } } From f3bf38740d06936e4b2e96dfdaa6f2db8ee0c529 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 12 Nov 2025 22:18:09 -0500 Subject: [PATCH 045/130] allow non-expr oplog --- internal/verifier/check.go | 4 - internal/verifier/oplog_reader.go | 281 +++++++++++++++++++++--------- 2 files changed, 197 insertions(+), 88 deletions(-) diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 0e9f42a1..2c88919a 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -601,8 +601,6 @@ func (v *Verifier) initializeChangeReaders() { var whyCS string switch { - case !util.ClusterHasBSONSize([2]int(v.srcClusterInfo.VersionArray)): - whyCS = "no expr projection" case len(v.srcNamespaces) > 0: whyCS = "ns filter" case v.srcClusterInfo.Topology == util.TopologySharded: @@ -634,8 +632,6 @@ func (v *Verifier) initializeChangeReaders() { Msg("Listening for writes to source.") switch { - case !util.ClusterHasBSONSize([2]int(v.dstClusterInfo.VersionArray)): - whyCS = "no expr projection" case len(v.dstNamespaces) > 0: whyCS = "ns filter" case v.dstClusterInfo.Topology == util.TopologySharded: diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 613766d5..cf36e03d 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -20,6 +20,7 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" "go.mongodb.org/mongo-driver/v2/mongo/readconcern" + "go.mongodb.org/mongo-driver/v2/x/bsonx/bsoncore" "golang.org/x/exp/slices" ) @@ -135,6 +136,13 @@ func (o *OplogReader) createCursor( sctx := mongo.NewSessionContext(ctx, sess) + findOpts := options.Find(). + SetCursorType(options.TailableAwait) + + if util.ClusterHasBSONSize([2]int(o.clusterInfo.VersionArray)) { + findOpts.SetProjection(o.getExprProjection()) + } + cursor, err := o.watcherClient. Database("local"). Collection( @@ -166,64 +174,7 @@ func (o *OplogReader) createCursor( }, }}}, }}}, - - options.Find(). - SetCursorType(options.TailableAwait). 
- SetProjection(bson.D{ - {"ts", 1}, - {"op", 1}, - {"ns", 1}, - - {"docLen", getOplogDocLenExpr("$$ROOT")}, - - {"docID", getOplogDocIDExpr("$$ROOT")}, - - {"cmdName", agg.Cond{ - If: agg.Eq("$op", "c"), - Then: agg.ArrayElemAt{ - Array: agg.Map{ - Input: bson.D{ - {"$objectToArray", "$o"}, - }, - As: "field", - In: "$$field.k", - }, - Index: 0, - }, - Else: "$$REMOVE", - }}, - - {"o", agg.Cond{ - If: agg.And{ - agg.Eq("$op", "c"), - agg.Eq("missing", agg.Type("$o.applyOps")), - }, - Then: "$o", - Else: "$$REMOVE", - }}, - - {"ops", agg.Cond{ - If: agg.And{ - agg.Eq("$op", "c"), - agg.Eq(agg.Type("$o.applyOps"), "array"), - }, - Then: agg.Map{ - Input: agg.Filter{ - Input: "$o.applyOps", - As: "opEntry", - Cond: o.getDefaultNSExclusions("$$opEntry"), - }, - As: "opEntry", - In: bson.D{ - {"op", "$$opEntry.op"}, - {"ns", "$$opEntry.ns"}, - {"docID", getOplogDocIDExpr("$$opEntry")}, - {"docLen", getOplogDocLenExpr("$$opEntry")}, - }, - }, - Else: "$$REMOVE", - }}, - }), + findOpts, ) if err != nil { @@ -236,6 +187,64 @@ func (o *OplogReader) createCursor( return startTS, nil } +func (o *OplogReader) getExprProjection() bson.D { + return bson.D{ + {"ts", 1}, + {"op", 1}, + {"ns", 1}, + + {"docLen", getOplogDocLenExpr("$$ROOT")}, + + {"docID", getOplogDocIDExpr("$$ROOT")}, + + {"cmdName", agg.Cond{ + If: agg.Eq("$op", "c"), + Then: agg.ArrayElemAt{ + Array: agg.Map{ + Input: bson.D{ + {"$objectToArray", "$o"}, + }, + As: "field", + In: "$$field.k", + }, + Index: 0, + }, + Else: "$$REMOVE", + }}, + + {"o", agg.Cond{ + If: agg.And{ + agg.Eq("$op", "c"), + agg.Eq("missing", agg.Type("$o.applyOps")), + }, + Then: "$o", + Else: "$$REMOVE", + }}, + + {"ops", agg.Cond{ + If: agg.And{ + agg.Eq("$op", "c"), + agg.Eq(agg.Type("$o.applyOps"), "array"), + }, + Then: agg.Map{ + Input: agg.Filter{ + Input: "$o.applyOps", + As: "opEntry", + Cond: o.getDefaultNSExclusions("$$opEntry"), + }, + As: "opEntry", + In: bson.D{ + {"op", "$$opEntry.op"}, + {"ns", "$$opEntry.ns"}, + {"docID", getOplogDocIDExpr("$$opEntry")}, + {"docLen", getOplogDocLenExpr("$$opEntry")}, + }, + }, + Else: "$$REMOVE", + }}, + } +} + func (o *OplogReader) ddlAllowanceDocID() string { return string(o.readerType) + "-ddlAllowanceTS" } @@ -330,15 +339,142 @@ func (o *OplogReader) readAndHandleOneBatch( return nil } + var latestTS bson.Timestamp + events := make([]ParsedEvent, 0, len(o.curDocs)) + if util.ClusterHasBSONSize([2]int(o.clusterInfo.VersionArray)) { + events, latestTS, err = o.parseExprProjectedOps(events, allowDDLBeforeTS) + } else { + events, latestTS, err = o.parseRawOps(events, allowDDLBeforeTS) + } + + if err != nil { + return err + } + + sess := mongo.SessionFromContext(sctx) + resumeToken := oplog.ResumeToken{latestTS}.MarshalToBSON() + + o.updateLag(sess, resumeToken) + + // NB: events can legitimately be empty here because we might only have + // gotten op=n oplog entries, which we just use to advance the reader. + // (Similar to a change stream’s post-batch resume token.) 
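+	// An all-noop batch would add a zero-event sample to batchSizeHistory,
+	// so only record batches that actually carried events.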
+ if len(events) > 0 { + o.batchSizeHistory.Add(len(events)) + } + + select { + case <-sctx.Done(): + return err + case o.changeEventBatchChan <- changeEventBatch{ + events: events, + resumeToken: resumeToken, + }: + } + + o.lastChangeEventTime = &latestTS + + return nil +} + +func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Timestamp) ([]ParsedEvent, bson.Timestamp, error) { + var latestTS bson.Timestamp + + for _, rawDoc := range o.curDocs { + opName, err := mbson.Lookup[string](rawDoc, "op") + if err != nil { + return nil, bson.Timestamp{}, err + } + + err = mbson.LookupTo(rawDoc, &latestTS, "ts") + if err != nil { + return nil, bson.Timestamp{}, err + } + + switch opName { + case "n": + case "c": + default: + nsStr, err := mbson.Lookup[string](rawDoc, "ns") + if err != nil { + return nil, bson.Timestamp{}, err + } + + var docID bson.RawValue + var docLength types.ByteCount + var docField string + + switch opName { + case "i": + docField = "o" + case "d": + docID, err = rawDoc.LookupErr("o", "_id") + if err != nil { + return nil, bson.Timestamp{}, errors.Wrap(err, "extracting o._id from delete") + } + case "u": + _, err := rawDoc.LookupErr("o", "_id") + if err == nil { + // replace, so we have the full doc + docField = "o" + } else if errors.Is(err, bsoncore.ErrElementNotFound) { + docID, err = rawDoc.LookupErr("o2", "_id") + if err != nil { + return nil, bson.Timestamp{}, errors.Wrap(err, "extracting o2._id from update") + } + } else { + return nil, bson.Timestamp{}, errors.Wrap(err, "extracting o._id from update") + } + default: + panic(fmt.Sprintf("op=%#q unexpected (%v)", opName, rawDoc)) + } + + if docField != "" { + doc, err := mbson.Lookup[bson.Raw](rawDoc, docField) + if err != nil { + return nil, bson.Timestamp{}, errors.Wrap(err, "extracting doc from op") + } + + docLength = types.ByteCount(len(doc)) + docID, err = doc.LookupErr("_id") + if err != nil { + return nil, bson.Timestamp{}, errors.Wrap(err, "extracting doc ID from op") + } + } else { + if docID.IsZero() { + panic("zero doc ID!") + } + + docLength = defaultUserDocumentSize + } + + events = append( + events, + ParsedEvent{ + OpType: oplogOpToOperationType[opName], + Ns: NewNamespace(SplitNamespace(nsStr)), + DocID: docID, + FullDocLen: option.Some(docLength), + ClusterTime: lo.ToPtr(latestTS), + }, + ) + } + } + + return events, latestTS, nil +} + +func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBeforeTS bson.Timestamp) ([]ParsedEvent, bson.Timestamp, error) { + var latestTS bson.Timestamp for _, rawDoc := range o.curDocs { var op oplog.Op if err := (&op).UnmarshalFromBSON(rawDoc); err != nil { - return errors.Wrapf(err, "reading oplog entry") + return nil, bson.Timestamp{}, errors.Wrapf(err, "reading oplog entry") } latestTS = op.TS @@ -360,7 +496,7 @@ func (o *OplogReader) readAndHandleOneBatch( continue } - return UnknownEventError{rawDoc} + return nil, bson.Timestamp{}, UnknownEventError{rawDoc} } events = append( @@ -392,30 +528,7 @@ func (o *OplogReader) readAndHandleOneBatch( } } - sess := mongo.SessionFromContext(sctx) - resumeToken := oplog.ResumeToken{latestTS}.MarshalToBSON() - - o.updateLag(sess, resumeToken) - - // NB: events can legitimately be empty here because we might only have - // gotten op=n oplog entries, which we just use to advance the reader. - // (Similar to a change stream’s post-batch resume token.) 
- if len(events) > 0 { - o.batchSizeHistory.Add(len(events)) - } - - select { - case <-sctx.Done(): - return err - case o.changeEventBatchChan <- changeEventBatch{ - events: events, - resumeToken: resumeToken, - }: - } - - o.lastChangeEventTime = &latestTS - - return nil + return events, latestTS, nil } func (o *OplogReader) getDefaultNSExclusions(docroot string) agg.And { From f07b8ded60802a8f2c4d056c50e2c77a6be4eda8 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 09:15:52 -0500 Subject: [PATCH 046/130] handle applyOps --- internal/verifier/oplog_reader.go | 167 +++++++++++++++++++++--------- 1 file changed, 119 insertions(+), 48 deletions(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index cf36e03d..9e049666 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -382,6 +382,74 @@ func (o *OplogReader) readAndHandleOneBatch( func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Timestamp) ([]ParsedEvent, bson.Timestamp, error) { var latestTS bson.Timestamp + parseOneDocumentOp := func(opName string, ts bson.Timestamp, rawDoc bson.Raw) error { + nsStr, err := mbson.Lookup[string](rawDoc, "ns") + if err != nil { + return err + } + + var docID bson.RawValue + var docLength types.ByteCount + var docField string + + switch opName { + case "i": + docField = "o" + case "d": + docID, err = rawDoc.LookupErr("o", "_id") + if err != nil { + return errors.Wrap(err, "extracting o._id from delete") + } + case "u": + _, err := rawDoc.LookupErr("o", "_id") + if err == nil { + // replace, so we have the full doc + docField = "o" + } else if errors.Is(err, bsoncore.ErrElementNotFound) { + docID, err = rawDoc.LookupErr("o2", "_id") + if err != nil { + return errors.Wrap(err, "extracting o2._id from update") + } + } else { + return errors.Wrap(err, "extracting o._id from update") + } + default: + panic(fmt.Sprintf("op=%#q unexpected (%v)", opName, rawDoc)) + } + + if docField != "" { + doc, err := mbson.Lookup[bson.Raw](rawDoc, docField) + if err != nil { + return errors.Wrap(err, "extracting doc from op") + } + + docLength = types.ByteCount(len(doc)) + docID, err = doc.LookupErr("_id") + if err != nil { + return errors.Wrap(err, "extracting doc ID from op") + } + } else { + if docID.IsZero() { + panic("zero doc ID!") + } + + docLength = defaultUserDocumentSize + } + + events = append( + events, + ParsedEvent{ + OpType: oplogOpToOperationType[opName], + Ns: NewNamespace(SplitNamespace(nsStr)), + DocID: docID, + FullDocLen: option.Some(docLength), + ClusterTime: lo.ToPtr(ts), + }, + ) + + return nil + } + for _, rawDoc := range o.curDocs { opName, err := mbson.Lookup[string](rawDoc, "op") if err != nil { @@ -396,70 +464,73 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti switch opName { case "n": case "c": - default: - nsStr, err := mbson.Lookup[string](rawDoc, "ns") + oDoc, err := mbson.Lookup[bson.Raw](rawDoc, "o") if err != nil { return nil, bson.Timestamp{}, err } - var docID bson.RawValue - var docLength types.ByteCount - var docField string + el, err := oDoc.IndexErr(0) + if err != nil { + return nil, bson.Timestamp{}, errors.Wrap(err, "getting first el of o doc") + } - switch opName { - case "i": - docField = "o" - case "d": - docID, err = rawDoc.LookupErr("o", "_id") - if err != nil { - return nil, bson.Timestamp{}, errors.Wrap(err, "extracting o._id from delete") + cmdName, err := el.KeyErr() + if err != nil { + return nil, bson.Timestamp{}, 
errors.Wrap(err, "getting first field name of o doc") + } + + if cmdName != "applyOps" { + if o.onDDLEvent == onDDLEventAllow { + o.logIgnoredDDL(rawDoc) + continue } - case "u": - _, err := rawDoc.LookupErr("o", "_id") - if err == nil { - // replace, so we have the full doc - docField = "o" - } else if errors.Is(err, bsoncore.ErrElementNotFound) { - docID, err = rawDoc.LookupErr("o2", "_id") - if err != nil { - return nil, bson.Timestamp{}, errors.Wrap(err, "extracting o2._id from update") - } - } else { - return nil, bson.Timestamp{}, errors.Wrap(err, "extracting o._id from update") + + if !latestTS.After(allowDDLBeforeTS) { + o.logger.Info(). + Stringer("event", rawDoc). + Msg("Ignoring unrecognized write from the past.") + + continue } - default: - panic(fmt.Sprintf("op=%#q unexpected (%v)", opName, rawDoc)) + + return nil, bson.Timestamp{}, UnknownEventError{rawDoc} } - if docField != "" { - doc, err := mbson.Lookup[bson.Raw](rawDoc, docField) + var opsArray bson.Raw + err = mbson.UnmarshalElementValue(el, &opsArray) + if err != nil { + return nil, bson.Timestamp{}, errors.Wrap(err, "parsing applyOps") + } + + arrayVals, err := opsArray.Values() + if err != nil { + return nil, bson.Timestamp{}, errors.Wrap(err, "getting applyOps values") + } + + // Might as well ... + events = slices.Grow(events, len(arrayVals)) + + for i, opRV := range arrayVals { + opRaw, err := mbson.CastRawValue[bson.Raw](opRV) if err != nil { - return nil, bson.Timestamp{}, errors.Wrap(err, "extracting doc from op") + return nil, bson.Timestamp{}, errors.Wrapf(err, "extracting applyOps[%d]", i) } - docLength = types.ByteCount(len(doc)) - docID, err = doc.LookupErr("_id") + opName, err := mbson.Lookup[string](opRaw, "op") if err != nil { - return nil, bson.Timestamp{}, errors.Wrap(err, "extracting doc ID from op") - } - } else { - if docID.IsZero() { - panic("zero doc ID!") + return nil, bson.Timestamp{}, errors.Wrapf(err, "extracting applyOps[%d].op", i) } - docLength = defaultUserDocumentSize + err = parseOneDocumentOp(opName, latestTS, opRaw) + if err != nil { + return nil, bson.Timestamp{}, errors.Wrapf(err, "processing applyOps[%d]", i) + } + } + default: + err := parseOneDocumentOp(opName, latestTS, rawDoc) + if err != nil { + return nil, bson.Timestamp{}, err } - - events = append( - events, - ParsedEvent{ - OpType: oplogOpToOperationType[opName], - Ns: NewNamespace(SplitNamespace(nsStr)), - DocID: docID, - FullDocLen: option.Some(docLength), - ClusterTime: lo.ToPtr(latestTS), - }, - ) } } From 7840134853a089a447b36294fc8c34a8af0fb143 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 10:10:08 -0500 Subject: [PATCH 047/130] clone --- internal/verifier/oplog_reader.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 9e049666..b6c6d25e 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -436,6 +436,8 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti docLength = defaultUserDocumentSize } + docID.Value = slices.Clone(docID.Value) + events = append( events, ParsedEvent{ From acbbbbeaa692bbb6c67b0ffc8b49baaa921372b1 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 13:57:00 -0500 Subject: [PATCH 048/130] options --- internal/verifier/change_stream.go | 5 +- internal/verifier/check.go | 62 +++++++++++-------------- internal/verifier/migration_verifier.go | 50 ++++++++++++++++++++ internal/verifier/oplog_reader.go | 4 ++ 
internal/verifier/recheck_persist.go | 6 ++- main/migration_verifier.go | 38 ++++++++++++++- 6 files changed, 127 insertions(+), 38 deletions(-) diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 29f3164d..070806b0 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -27,8 +27,9 @@ var supportedEventOpTypes = mapset.NewSet( ) const ( - minChangeStreamPersistInterval = time.Second * 10 - maxChangeStreamAwaitTime = time.Second + maxChangeStreamAwaitTime = time.Second + + ChangeReaderOptChangeStream = "changeStream" ) type UnknownEventError struct { diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 2c88919a..b0ae1859 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -8,7 +8,6 @@ import ( "github.com/10gen/migration-verifier/contextplus" "github.com/10gen/migration-verifier/internal/logger" "github.com/10gen/migration-verifier/internal/retry" - "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/mslices" mapset "github.com/deckarep/golang-set/v2" "github.com/goaux/timer" @@ -32,6 +31,11 @@ var ( verificationTaskFailed, verificationTaskMetadataMismatch, ) + + ChangeReaderOpts = mslices.Of( + ChangeReaderOptChangeStream, + ChangeReaderOptOplog, + ) ) // Check is the asynchronous entry point to Check, should only be called by the web server. Use @@ -216,9 +220,13 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh // Now that we’ve initialized verifier.generation we can // start the change readers. - verifier.initializeChangeReaders() + err = verifier.initializeChangeReaders() verifier.mux.Unlock() + if err != nil { + return err + } + err = retry.New().WithCallback( func(ctx context.Context, _ *retry.FuncInfo) error { err = verifier.AddMetaIndexes(ctx) @@ -597,68 +605,54 @@ func (verifier *Verifier) work(ctx context.Context, workerNum int) error { } } -func (v *Verifier) initializeChangeReaders() { - var whyCS string - - switch { - case len(v.srcNamespaces) > 0: - whyCS = "ns filter" - case v.srcClusterInfo.Topology == util.TopologySharded: - whyCS = "sharded" +func (v *Verifier) initializeChangeReaders() error { + warnAboutOplog := func(cluster whichCluster) { + v.logger.Warn(). + Str("cluster", string(cluster)). + Msg("Reading writes via oplog tailing. This feature is experimental.") } - srcLogEvent := v.logger.Info() + switch v.srcChangeReaderMethod { + case ChangeReaderOptOplog: + warnAboutOplog(src) - if whyCS == "" { v.srcChangeReader = v.newOplogReader( v.srcNamespaces, src, v.srcClient, *v.srcClusterInfo, ) - } else { - srcLogEvent.Str("whyChangeStream", whyCS) - + case ChangeReaderOptChangeStream: v.srcChangeReader = v.newChangeStreamReader( v.srcNamespaces, src, v.srcClient, *v.srcClusterInfo, ) + default: + return fmt.Errorf("bad source change reader: %#q", v.srcChangeReaderMethod) } - srcLogEvent. - Stringer("reader", v.srcChangeReader). 
- Msg("Listening for writes to source.") - - switch { - case len(v.dstNamespaces) > 0: - whyCS = "ns filter" - case v.dstClusterInfo.Topology == util.TopologySharded: - whyCS = "sharded" - } - - dstLogEvent := v.logger.Info() + switch v.dstChangeReaderMethod { + case ChangeReaderOptOplog: + warnAboutOplog(dst) - if whyCS == "" { v.dstChangeReader = v.newOplogReader( v.dstNamespaces, dst, v.dstClient, *v.dstClusterInfo, ) - } else { - dstLogEvent.Str("whyChangeStream", whyCS) - + case ChangeReaderOptChangeStream: v.dstChangeReader = v.newChangeStreamReader( v.dstNamespaces, dst, v.dstClient, *v.dstClusterInfo, ) + default: + return fmt.Errorf("bad destination change reader: %#q", v.srcChangeReaderMethod) } - dstLogEvent. - Stringer("reader", v.dstChangeReader). - Msg("Listening for writes to destination.") + return nil } diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index e23445cb..5eb8b4ac 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -103,6 +103,9 @@ type Verifier struct { srcEventRecorder *EventRecorder dstEventRecorder *EventRecorder + srcChangeReaderMethod string + dstChangeReaderMethod string + changeHandlingErr *util.Eventual[error] // Used only with generation 0 to defer the first @@ -378,6 +381,53 @@ func (verifier *Verifier) SetDocCompareMethod(method DocCompareMethod) { verifier.docCompareMethod = method } +func (verifier *Verifier) SetSrcChangeReader(method string) error { + err := validateChangeReaderOpt(method, verifier.srcNamespaces, *verifier.srcClusterInfo) + if err != nil { + return errors.Wrap(err, "setting source change reader method") + } + + verifier.srcChangeReaderMethod = method + + return nil +} + +func (verifier *Verifier) SetDstChangeReader(method string) error { + err := validateChangeReaderOpt(method, verifier.dstNamespaces, *verifier.dstClusterInfo) + if err != nil { + return errors.Wrap(err, "setting source change reader method") + } + + verifier.dstChangeReaderMethod = method + + return nil +} + +func validateChangeReaderOpt( + method string, + namespaces []string, + clusterInfo util.ClusterInfo, +) error { + if method != ChangeReaderOptOplog { + return nil + } + + var whyNoOplog string + + switch { + case len(namespaces) > 0: + whyNoOplog = "ns filter" + case clusterInfo.Topology == util.TopologySharded: + whyNoOplog = "sharded" + } + + if whyNoOplog != "" { + return fmt.Errorf("cannot read oplog (%s)", whyNoOplog) + } + + return nil +} + func (verifier *Verifier) SetVerifyAll(arg bool) { verifier.verifyAll = arg } diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index b6c6d25e..79592345 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -24,6 +24,10 @@ import ( "golang.org/x/exp/slices" ) +const ( + ChangeReaderOptOplog = "tailOplog" +) + // OplogReader reads change events via oplog tailing instead of a change stream. // This significantly lightens server load and allows verification of heavier // workloads than change streams allow. It only works with replica sets. diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index c941791f..54c86d47 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -15,6 +15,10 @@ type changeEventBatch struct { resumeToken bson.Raw } +const ( + minResumeTokenPersistInterval = 10 * time.Second +) + // RunChangeEventPersistor persists rechecks from change event batches. 
// It needs to be started after the reader starts and should run in its own // goroutine. @@ -30,7 +34,7 @@ func (verifier *Verifier) RunChangeEventPersistor( var lastPersistedTime time.Time persistResumeTokenIfNeeded := func(ctx context.Context, token bson.Raw) { - if time.Since(lastPersistedTime) >= minChangeStreamPersistInterval { + if time.Since(lastPersistedTime) >= minResumeTokenPersistInterval { persistErr := persistCallback(ctx, token) if persistErr != nil { verifier.logger.Warn(). diff --git a/main/migration_verifier.go b/main/migration_verifier.go index aa1c8a65..f203c96e 100644 --- a/main/migration_verifier.go +++ b/main/migration_verifier.go @@ -33,6 +33,8 @@ const ( logPath = "logPath" srcNamespace = "srcNamespace" dstNamespace = "dstNamespace" + srcChangeReader = "srcChangeReader" + dstChangeReader = "dstChangeReader" metaDBName = "metaDBName" docCompareMethod = "docCompareMethod" verifyAll = "verifyAll" @@ -126,6 +128,22 @@ func main() { Name: dstNamespace, Usage: "destination `namespaces` to check", }), + altsrc.NewStringFlag(cli.StringFlag{ + Name: srcChangeReader, + Value: verifier.ChangeReaderOptChangeStream, + Usage: "How to read changes from the source. One of: " + strings.Join( + verifier.ChangeReaderOpts, + ", ", + ), + }), + altsrc.NewStringFlag(cli.StringFlag{ + Name: dstChangeReader, + Value: verifier.ChangeReaderOptChangeStream, + Usage: "How to read changes from the destination. One of: " + strings.Join( + verifier.ChangeReaderOpts, + ", ", + ), + }), altsrc.NewStringFlag(cli.StringFlag{ Name: metaDBName, Value: "migration_verification_metadata", @@ -344,9 +362,27 @@ func handleArgs(ctx context.Context, cCtx *cli.Context) (*verifier.Verifier, err } v.SetMetaDBName(cCtx.String(metaDBName)) + srcChangeReaderVal := cCtx.String(srcChangeReader) + if !slices.Contains(verifier.ChangeReaderOpts, srcChangeReaderVal) { + return nil, errors.Errorf("invalid %#q (%s); valid values are: %#q", srcChangeReader, srcChangeReaderVal, verifier.ChangeReaderOpts) + } + err = v.SetSrcChangeReader(srcChangeReaderVal) + if err != nil { + return nil, err + } + + dstChangeReaderVal := cCtx.String(dstChangeReader) + if !slices.Contains(verifier.ChangeReaderOpts, dstChangeReaderVal) { + return nil, errors.Errorf("invalid %#q (%s); valid values are: %#q", dstChangeReader, dstChangeReaderVal, verifier.ChangeReaderOpts) + } + err = v.SetDstChangeReader(srcChangeReaderVal) + if err != nil { + return nil, err + } + docCompareMethod := verifier.DocCompareMethod(cCtx.String(docCompareMethod)) if !slices.Contains(verifier.DocCompareMethods, docCompareMethod) { - return nil, errors.Errorf("invalid doc compare method (%s); valid value are: %v", docCompareMethod, verifier.DocCompareMethods) + return nil, errors.Errorf("invalid doc compare method (%s); valid values are: %#q", docCompareMethod, verifier.DocCompareMethods) } v.SetDocCompareMethod(docCompareMethod) From ddeb8f321ec50f05d1e068253c58e8eddd37b1d5 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 14:34:50 -0500 Subject: [PATCH 049/130] github ci --- .github/workflows/all.yml | 76 ++++++++++++++------- internal/verifier/integration_test_suite.go | 12 ++++ 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index 9e9963ae..7c23d251 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -7,6 +7,12 @@ on: pull_request: workflow_dispatch: +env: + replsetSrcConnStr: mongodb://localhost:27020,localhost:27021,localhost:27022 + replsetDstConnStr: 
mongodb://localhost:27030,localhost:27031,localhost:27032 + shardedSrcConnStr: mongodb://localhost:27020 + shardedDstConnStr: mongodb://localhost:27030 + jobs: basics: strategy: @@ -17,10 +23,37 @@ jobs: # Testing fallback when `hello` isn’t implemented # (but appendOplogNote is). - mongodb_versions: [ '4.2.5', '6.0' ] - topology: - name: replset - srcConnStr: mongodb://localhost:27020,localhost:27021,localhost:27022 - dstConnStr: mongodb://localhost:27030,localhost:27031,localhost:27032 + topology: replset + + - mongodb_versions: [ '4.2', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '4.4', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '5.0', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '6.0', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '7.0', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '8.0', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog exclude: - mongodb_versions: [ '4.2', '4.2' ] @@ -63,27 +96,19 @@ jobs: toHashedIndexKey: [true, false] - topology: - - name: replset - srcConnStr: mongodb://localhost:27020,localhost:27021,localhost:27022 - dstConnStr: mongodb://localhost:27030,localhost:27031,localhost:27032 - - - name: replset-to-sharded - dstArgs: --sharded 2 - srcConnStr: mongodb://localhost:27020,localhost:27021,localhost:27022 - dstConnStr: mongodb://localhost:27030 + srcChangeReader: changeStream + dstChangeReader: changeStream - - name: sharded - srcArgs: --sharded 2 - dstArgs: --sharded 2 - srcConnStr: mongodb://localhost:27020 - dstConnStr: mongodb://localhost:27030 + topology: + - replset + - replset-to-sharded + - sharded # Ubuntu 24 lacks OpenSSL 1.1.1’s libcrypto, which pre-v6 MongoDB # versions need. runs-on: ubuntu-22.04 - name: ${{ matrix.mongodb_versions[0] }} to ${{ matrix.mongodb_versions[1] }}, ${{ matrix.topology.name }}${{ matrix.toHashedIndexKey && ', hashed doc compare' || '' }} + name: ${{ matrix.mongodb_versions[0] }} to ${{ matrix.mongodb_versions[1] }}, ${{ matrix.topology }}${{ matrix.toHashedIndexKey && ', hashed doc compare' || '' }}, srcChangeReader=${{ matrix.srcChangeReader }}, dstChangeReader=${{ matrix.dstChangeReader }} steps: - run: uname -a @@ -118,8 +143,8 @@ jobs: run: |- { echo ./build.sh - echo mlaunch init --binarypath $(cat .srcpath) --port 27020 --dir src --replicaset ${{ matrix.topology.srcArgs }} - echo mlaunch init --binarypath $(cat .dstpath) --port 27030 --dir dst --replicaset ${{ matrix.topology.dstArgs }} + echo mlaunch init --binarypath $(cat .srcpath) --port 27020 --dir src --replicaset ${{ (matrix.topology == "sharded") && "--sharded 2" || "" }} + echo mlaunch init --binarypath $(cat .dstpath) --port 27030 --dir dst --replicaset ${{ (matrix.topology == "sharded" || matrix.topology == "replset-to-sharded") && "--sharded 2" || "" }} echo mlaunch init --binarypath $(cat .metapath) --port 27040 --dir meta --replicaset --nodes 1 } | parallel @@ -127,6 +152,11 @@ jobs: run: go test -v ./... 
-race env: MVTEST_DOC_COMPARE_METHOD: ${{matrix.toHashedIndexKey && 'toHashedIndexKey' || ''}} - MVTEST_SRC: ${{matrix.topology.srcConnStr}} - MVTEST_DST: ${{matrix.topology.dstConnStr}} + + MVTEST_SRC_CHANGE_READER: ${{matrix.srcChangeReader}} + MVTEST_DST_CHANGE_READER: ${{matrix.dstChangeReader}} + + MVTEST_SRC: ${{ (matrix.topology == "sharded") && env.shardedSrcConnStr || env.replsetSrcConnStr }} + MVTEST_DST: ${{ (matrix.topology == "sharded" || matrix.topology == "replset-to-sharded") && env.shardedDstConnStr || env.replsetDstConnStr }} + MVTEST_META: mongodb://localhost:27040 diff --git a/internal/verifier/integration_test_suite.go b/internal/verifier/integration_test_suite.go index 19aff0ab..9a1aeb57 100644 --- a/internal/verifier/integration_test_suite.go +++ b/internal/verifier/integration_test_suite.go @@ -191,11 +191,23 @@ func (suite *IntegrationTestSuite) BuildVerifier() *Verifier { "should set metadata connection string", ) verifier.SetMetaDBName(metaDBName) + + envSrcChangeReader := os.Getenv("MVTEST_SRC_CHANGE_READER") + if envSrcChangeReader != "" { + verifier.SetSrcChangeReader(envSrcChangeReader) + } + + envDstChangeReader := os.Getenv("MVTEST_DST_CHANGE_READER") + if envDstChangeReader != "" { + verifier.SetDstChangeReader(envDstChangeReader) + } + verifier.initializeChangeReaders() suite.Require().NoError(verifier.srcClientCollection(&task).Drop(ctx)) suite.Require().NoError(verifier.dstClientCollection(&task).Drop(ctx)) suite.Require().NoError(verifier.AddMetaIndexes(ctx)) + return verifier } From e32835b679ebdda505d1e780ccf67a6f733b34d5 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 14:37:30 -0500 Subject: [PATCH 050/130] err check --- internal/verifier/integration_test_suite.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/verifier/integration_test_suite.go b/internal/verifier/integration_test_suite.go index 9a1aeb57..7800debe 100644 --- a/internal/verifier/integration_test_suite.go +++ b/internal/verifier/integration_test_suite.go @@ -194,15 +194,15 @@ func (suite *IntegrationTestSuite) BuildVerifier() *Verifier { envSrcChangeReader := os.Getenv("MVTEST_SRC_CHANGE_READER") if envSrcChangeReader != "" { - verifier.SetSrcChangeReader(envSrcChangeReader) + suite.Require().NoError(verifier.SetSrcChangeReader(envSrcChangeReader)) } envDstChangeReader := os.Getenv("MVTEST_DST_CHANGE_READER") if envDstChangeReader != "" { - verifier.SetDstChangeReader(envDstChangeReader) + suite.Require().NoError(verifier.SetDstChangeReader(envDstChangeReader)) } - verifier.initializeChangeReaders() + suite.Require().NoError(verifier.initializeChangeReaders()) suite.Require().NoError(verifier.srcClientCollection(&task).Drop(ctx)) suite.Require().NoError(verifier.dstClientCollection(&task).Drop(ctx)) From bb780433591ee1479226ad50da97d1bb0fbecd4a Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 14:38:45 -0500 Subject: [PATCH 051/130] array --- .github/workflows/all.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index 7c23d251..a8b3387d 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -96,8 +96,8 @@ jobs: toHashedIndexKey: [true, false] - srcChangeReader: changeStream - dstChangeReader: changeStream + srcChangeReader: [changeStream] + dstChangeReader: [changeStream] topology: - replset From bf719a65d6c797945bbae2d906d145c31f9d3911 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 14:43:44 -0500 
Subject: [PATCH 052/130] quotes --- .github/workflows/all.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index a8b3387d..b1fb98a7 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -143,8 +143,8 @@ jobs: run: |- { echo ./build.sh - echo mlaunch init --binarypath $(cat .srcpath) --port 27020 --dir src --replicaset ${{ (matrix.topology == "sharded") && "--sharded 2" || "" }} - echo mlaunch init --binarypath $(cat .dstpath) --port 27030 --dir dst --replicaset ${{ (matrix.topology == "sharded" || matrix.topology == "replset-to-sharded") && "--sharded 2" || "" }} + echo mlaunch init --binarypath $(cat .srcpath) --port 27020 --dir src --replicaset ${{ (matrix.topology == 'sharded') && '--sharded 2' || '' }} + echo mlaunch init --binarypath $(cat .dstpath) --port 27030 --dir dst --replicaset ${{ (matrix.topology == 'sharded' || matrix.topology == 'replset-to-sharded') && '--sharded 2' || "" }} echo mlaunch init --binarypath $(cat .metapath) --port 27040 --dir meta --replicaset --nodes 1 } | parallel @@ -156,7 +156,7 @@ jobs: MVTEST_SRC_CHANGE_READER: ${{matrix.srcChangeReader}} MVTEST_DST_CHANGE_READER: ${{matrix.dstChangeReader}} - MVTEST_SRC: ${{ (matrix.topology == "sharded") && env.shardedSrcConnStr || env.replsetSrcConnStr }} - MVTEST_DST: ${{ (matrix.topology == "sharded" || matrix.topology == "replset-to-sharded") && env.shardedDstConnStr || env.replsetDstConnStr }} + MVTEST_SRC: ${{ (matrix.topology == 'sharded') && env.shardedSrcConnStr || env.replsetSrcConnStr }} + MVTEST_DST: ${{ (matrix.topology == 'sharded' || matrix.topology == 'replset-to-sharded') && env.shardedDstConnStr || env.replsetDstConnStr }} MVTEST_META: mongodb://localhost:27040 From a74853e9d4ec70d9e759b48a16ef1126af0a8ddd Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 14:44:25 -0500 Subject: [PATCH 053/130] quotes again --- .github/workflows/all.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index b1fb98a7..1ede5caf 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -144,7 +144,7 @@ jobs: { echo ./build.sh echo mlaunch init --binarypath $(cat .srcpath) --port 27020 --dir src --replicaset ${{ (matrix.topology == 'sharded') && '--sharded 2' || '' }} - echo mlaunch init --binarypath $(cat .dstpath) --port 27030 --dir dst --replicaset ${{ (matrix.topology == 'sharded' || matrix.topology == 'replset-to-sharded') && '--sharded 2' || "" }} + echo mlaunch init --binarypath $(cat .dstpath) --port 27030 --dir dst --replicaset ${{ (matrix.topology == 'sharded' || matrix.topology == 'replset-to-sharded') && '--sharded 2' || '' }} echo mlaunch init --binarypath $(cat .metapath) --port 27040 --dir meta --replicaset --nodes 1 } | parallel From 1f1fdb450e02e31aef0e9b71fc43dcc50b44385d Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 14:52:54 -0500 Subject: [PATCH 054/130] support ns filter in oplog --- internal/verifier/migration_verifier.go | 7 ++----- internal/verifier/oplog_reader.go | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index 5eb8b4ac..c0c14c38 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -382,7 +382,7 @@ func (verifier *Verifier) SetDocCompareMethod(method DocCompareMethod) { } func (verifier *Verifier) 
SetSrcChangeReader(method string) error { - err := validateChangeReaderOpt(method, verifier.srcNamespaces, *verifier.srcClusterInfo) + err := validateChangeReaderOpt(method, *verifier.srcClusterInfo) if err != nil { return errors.Wrap(err, "setting source change reader method") } @@ -393,7 +393,7 @@ func (verifier *Verifier) SetSrcChangeReader(method string) error { } func (verifier *Verifier) SetDstChangeReader(method string) error { - err := validateChangeReaderOpt(method, verifier.dstNamespaces, *verifier.dstClusterInfo) + err := validateChangeReaderOpt(method, *verifier.dstClusterInfo) if err != nil { return errors.Wrap(err, "setting source change reader method") } @@ -405,7 +405,6 @@ func (verifier *Verifier) SetDstChangeReader(method string) error { func validateChangeReaderOpt( method string, - namespaces []string, clusterInfo util.ClusterInfo, ) error { if method != ChangeReaderOptOplog { @@ -415,8 +414,6 @@ func validateChangeReaderOpt( var whyNoOplog string switch { - case len(namespaces) > 0: - whyNoOplog = "ns filter" case clusterInfo.Topology == util.TopologySharded: whyNoOplog = "sharded" } diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 79592345..1fba90e6 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -162,7 +162,7 @@ func (o *OplogReader) createCursor( // plain ops: one write per op append( agg.And{agg.In("$op", "d", "i", "u")}, - o.getDefaultNSExclusions("$$ROOT")..., + o.getNSExclusions("$$ROOT")..., ), // op=n is for no-ops, so we stay up-to-date. @@ -234,7 +234,7 @@ func (o *OplogReader) getExprProjection() bson.D { Input: agg.Filter{ Input: "$o.applyOps", As: "opEntry", - Cond: o.getDefaultNSExclusions("$$opEntry"), + Cond: o.getNSExclusions("$$opEntry"), }, As: "opEntry", In: bson.D{ @@ -608,7 +608,7 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore return events, latestTS, nil } -func (o *OplogReader) getDefaultNSExclusions(docroot string) agg.And { +func (o *OplogReader) getNSExclusions(docroot string) agg.And { prefixes := append( slices.Clone(namespaces.MongosyncMetaDBPrefixes), o.metaDB.Name()+".", @@ -616,7 +616,7 @@ func (o *OplogReader) getDefaultNSExclusions(docroot string) agg.And { "admin.", ) - return agg.And(lo.Map( + filter := agg.And(lo.Map( prefixes, func(prefix string, _ int) any { return agg.Not{helpers.StringHasPrefix{ @@ -625,6 +625,15 @@ func (o *OplogReader) getDefaultNSExclusions(docroot string) agg.And { }} }, )) + + if len(o.namespaces) > 0 { + filter = append( + filter, + agg.In(docroot+".ns", o.namespaces...), + ) + } + + return filter } func getOplogDocLenExpr(docroot string) any { From 0622651cc883d652c9169328e38d4d4e315397b0 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 16:43:06 -0500 Subject: [PATCH 055/130] show Max the wonky --- agg/agg.go | 12 ---- agg/helpers/string.go | 15 +++- internal/verifier/compare.go | 36 ++++++---- internal/verifier/integration_test_suite.go | 20 +++--- internal/verifier/oplog_reader.go | 76 +++++++++++---------- internal/verifier/recheck.go | 1 + internal/verifier/recheck_persist.go | 33 +++++---- 7 files changed, 110 insertions(+), 83 deletions(-) diff --git a/agg/agg.go b/agg/agg.go index 8c753d91..f1202cf7 100644 --- a/agg/agg.go +++ b/agg/agg.go @@ -64,18 +64,6 @@ func (o Or) MarshalBSON() ([]byte, error) { // --------------------------------------------- -type SubstrBytes [3]any - -var _ bson.Marshaler = SubstrBytes{} - -func (s SubstrBytes) MarshalBSON() ([]byte, 
error) { - return bson.Marshal(bson.D{ - {"$substr", []any(s[:])}, - }) -} - -// --------------------------------------------- - type MergeObjects []any var _ bson.Marshaler = MergeObjects{} diff --git a/agg/helpers/string.go b/agg/helpers/string.go index 852845df..99502190 100644 --- a/agg/helpers/string.go +++ b/agg/helpers/string.go @@ -1,6 +1,8 @@ package helpers -import "go.mongodb.org/mongo-driver/v2/bson" +import ( + "go.mongodb.org/mongo-driver/v2/bson" +) type StringHasPrefix struct { FieldRef any @@ -19,4 +21,15 @@ func (sp StringHasPrefix) MarshalBSON() ([]byte, error) { }}}, }}, }) + + /* + return bson.Marshal(agg.Eq( + sp.Prefix, + agg.SubstrBytes{ + sp.FieldRef, + 0, + len(sp.Prefix), + }, + )) + */ } diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 85fbfca0..18f73c44 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -18,6 +18,7 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" + "go.mongodb.org/mongo-driver/v2/mongo/readconcern" "go.mongodb.org/mongo-driver/v2/mongo/readpref" "golang.org/x/exp/slices" ) @@ -537,6 +538,8 @@ func iterateCursorToChannel( for cursor.Next(sctx) { state.NoteSuccess("received a document") + fmt.Printf("----- received a document: %+v\n\n", cursor.Current) + clusterTime, err := util.GetClusterTimeFromSession(sess) if err != nil { return errors.Wrap(err, "reading cluster time from session") @@ -590,6 +593,7 @@ func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mo case DocQueryFunctionFind: findOptions = bson.D{ bson.E{"filter", filter}, + bson.E{"readConcern", readconcern.Majority()}, } case DocQueryFunctionAggregate: aggOptions = bson.D{ @@ -674,22 +678,26 @@ func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mo // Suppress this log for recheck tasks because the list of IDs can be // quite long. - if !task.IsRecheck() { - if verifier.logger.Trace().Enabled() { - evt := verifier.logger.Trace(). - Any("task", task.PrimaryKey) + /* + if !task.IsRecheck() { + if verifier.logger.Trace().Enabled() { + */ + evt := verifier.logger.Debug(). + Any("task", task.PrimaryKey) + + cmdStr, err := bson.MarshalExtJSON(cmd, true, false) + if err != nil { + cmdStr = fmt.Appendf(nil, "%s", cmd) + } - cmdStr, err := bson.MarshalExtJSON(cmd, true, false) - if err != nil { - cmdStr = fmt.Appendf(nil, "%s", cmd) + evt. + Str("cmd", string(cmdStr)). + Str("options", fmt.Sprintf("%v", *runCommandOptions)). + Msg("getDocuments command.") + /* + } } - - evt. - Str("cmd", string(cmdStr)). - Str("options", fmt.Sprintf("%v", *runCommandOptions)). 
- Msg("getDocuments command.") - } - } + */ return collection.Database().RunCommandCursor(ctx, cmd, runCommandOptions) } diff --git a/internal/verifier/integration_test_suite.go b/internal/verifier/integration_test_suite.go index 7800debe..19291208 100644 --- a/internal/verifier/integration_test_suite.go +++ b/internal/verifier/integration_test_suite.go @@ -1,6 +1,7 @@ package verifier import ( + "cmp" "context" "os" "strings" @@ -192,15 +193,18 @@ func (suite *IntegrationTestSuite) BuildVerifier() *Verifier { ) verifier.SetMetaDBName(metaDBName) - envSrcChangeReader := os.Getenv("MVTEST_SRC_CHANGE_READER") - if envSrcChangeReader != "" { - suite.Require().NoError(verifier.SetSrcChangeReader(envSrcChangeReader)) - } + envSrcChangeReader := cmp.Or( + os.Getenv("MVTEST_SRC_CHANGE_READER"), + ChangeReaderOptChangeStream, + ) + suite.Require().NoError(verifier.SetSrcChangeReader(envSrcChangeReader)) - envDstChangeReader := os.Getenv("MVTEST_DST_CHANGE_READER") - if envDstChangeReader != "" { - suite.Require().NoError(verifier.SetDstChangeReader(envDstChangeReader)) - } + envDstChangeReader := cmp.Or( + os.Getenv("MVTEST_DST_CHANGE_READER"), + ChangeReaderOptChangeStream, + ) + + suite.Require().NoError(verifier.SetDstChangeReader(envDstChangeReader)) suite.Require().NoError(verifier.initializeChangeReaders()) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 1fba90e6..633dcdad 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -147,6 +147,32 @@ func (o *OplogReader) createCursor( findOpts.SetProjection(o.getExprProjection()) } + oplogFilter := bson.D{{"$and", []any{ + bson.D{{"ts", bson.D{{"$gte", startTS}}}}, + + bson.D{{"$expr", agg.Or{ + // plain ops: one write per op + append( + agg.And{agg.In("$op", "d", "i", "u")}, + o.getNSFilter("$$ROOT")..., + ), + + // op=n is for no-ops, so we stay up-to-date. + agg.Eq("$op", "n"), + + // op=c is for applyOps, and also to detect forbidden DDL. + agg.And{ + agg.Eq("$op", "c"), + agg.Not{helpers.StringHasPrefix{ + FieldRef: "$ns", + Prefix: "config.", + }}, + }, + }}}, + }}} + + fmt.Printf("------ oplogFilter: %v\n\n", oplogFilter) + cursor, err := o.watcherClient. Database("local"). Collection( @@ -155,29 +181,7 @@ func (o *OplogReader) createCursor( ). Find( sctx, - bson.D{{"$and", []any{ - bson.D{{"ts", bson.D{{"$gte", startTS}}}}, - - bson.D{{"$expr", agg.Or{ - // plain ops: one write per op - append( - agg.And{agg.In("$op", "d", "i", "u")}, - o.getNSExclusions("$$ROOT")..., - ), - - // op=n is for no-ops, so we stay up-to-date. - agg.Eq("$op", "n"), - - // op=c is for applyOps, and also to detect forbidden DDL. 
- agg.And{ - agg.Eq("$op", "c"), - agg.Not{helpers.StringHasPrefix{ - FieldRef: "$ns", - Prefix: "config.", - }}, - }, - }}}, - }}}, + oplogFilter, findOpts, ) @@ -234,7 +238,7 @@ func (o *OplogReader) getExprProjection() bson.D { Input: agg.Filter{ Input: "$o.applyOps", As: "opEntry", - Cond: o.getNSExclusions("$$opEntry"), + Cond: o.getNSFilter("$$opEntry"), }, As: "opEntry", In: bson.D{ @@ -257,10 +261,6 @@ func (o *OplogReader) iterateCursor( ctx context.Context, _ *retry.FuncInfo, sess *mongo.Session, - /* - cursor *mongo.Cursor, - allowDDLBeforeTS bson.Timestamp, - */ ) error { sctx := mongo.NewSessionContext(ctx, sess) cursor := o.cursor @@ -387,6 +387,8 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti var latestTS bson.Timestamp parseOneDocumentOp := func(opName string, ts bson.Timestamp, rawDoc bson.Raw) error { + fmt.Printf("---- got op: %+v\n\n", rawDoc) + nsStr, err := mbson.Lookup[string](rawDoc, "ns") if err != nil { return err @@ -608,7 +610,9 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore return events, latestTS, nil } -func (o *OplogReader) getNSExclusions(docroot string) agg.And { +func (o *OplogReader) getNSFilter(docroot string) agg.And { + return agg.And{} + prefixes := append( slices.Clone(namespaces.MongosyncMetaDBPrefixes), o.metaDB.Name()+".", @@ -626,12 +630,14 @@ func (o *OplogReader) getNSExclusions(docroot string) agg.And { }, )) - if len(o.namespaces) > 0 { - filter = append( - filter, - agg.In(docroot+".ns", o.namespaces...), - ) - } + /* + if len(o.namespaces) > 0 { + filter = append( + filter, + agg.In(docroot+".ns", o.namespaces...), + ) + } + */ return filter } diff --git a/internal/verifier/recheck.go b/internal/verifier/recheck.go index 70cbe09b..ebf7620f 100644 --- a/internal/verifier/recheck.go +++ b/internal/verifier/recheck.go @@ -86,6 +86,7 @@ func (verifier *Verifier) insertRecheckDocs( insertThreads := 0 sendRechecks := func(rechecks []bson.Raw) { + fmt.Printf("----- inserting rechecks: %+v\n\n", rechecks) insertThreads++ eg.Go(func() error { diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 54c86d47..27b0d395 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -74,7 +74,7 @@ HandlerLoop: err = errors.Wrap( verifier.PersistChangeEvents(ctx, batch, clusterName), - "failed to handle change stream events", + "persisting rechecks for change events", ) if err == nil && batch.resumeToken != nil { @@ -92,14 +92,14 @@ func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeE return nil } - dbNames := make([]string, len(batch.events)) - collNames := make([]string, len(batch.events)) - docIDs := make([]bson.RawValue, len(batch.events)) - dataSizes := make([]int32, len(batch.events)) + dbNames := make([]string, 0, len(batch.events)) + collNames := make([]string, 0, len(batch.events)) + docIDs := make([]bson.RawValue, 0, len(batch.events)) + dataSizes := make([]int32, 0, len(batch.events)) latestTimestamp := bson.Timestamp{} - for i, changeEvent := range batch.events { + for _, changeEvent := range batch.events { if !supportedEventOpTypes.Contains(changeEvent.OpType) { panic(fmt.Sprintf("Unsupported optype in event; should have failed already! 
event=%+v", changeEvent)) } @@ -127,10 +127,14 @@ func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeE srcDBName = changeEvent.Ns.DB srcCollName = changeEvent.Ns.Coll } else { + if changeEvent.Ns.DB == "VERIFIER_TEST_META" { + continue + } + dstNs := fmt.Sprintf("%s.%s", changeEvent.Ns.DB, changeEvent.Ns.Coll) srcNs, exist := verifier.nsMap.GetSrcNamespace(dstNs) if !exist { - return errors.Errorf("no source namespace corresponding to the destination namepsace %s", dstNs) + return errors.Errorf("no source namespace matches the destination namepsace %#q", dstNs) } srcDBName, srcCollName = SplitNamespace(srcNs) } @@ -143,21 +147,24 @@ func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeE panic(fmt.Sprintf("unknown event origin: %s", eventOrigin)) } - dbNames[i] = srcDBName - collNames[i] = srcCollName - docIDs[i] = changeEvent.DocID + dbNames = append(dbNames, srcDBName) + collNames = append(collNames, srcCollName) + docIDs = append(docIDs, changeEvent.DocID) + var dataSize int32 if changeEvent.FullDocLen.OrZero() > 0 { - dataSizes[i] = int32(changeEvent.FullDocLen.OrZero()) + dataSize = int32(changeEvent.FullDocLen.OrZero()) } else if changeEvent.FullDocument == nil { // This happens for deletes and for some updates. // The document is probably, but not necessarily, deleted. - dataSizes[i] = defaultUserDocumentSize + dataSize = defaultUserDocumentSize } else { // This happens for inserts, replaces, and most updates. - dataSizes[i] = int32(len(changeEvent.FullDocument)) + dataSize = int32(len(changeEvent.FullDocument)) } + dataSizes = append(dataSizes, dataSize) + if err := eventRecorder.AddEvent(&changeEvent); err != nil { return errors.Wrapf( err, From 0b686379b6e797c980416d8546a2af653e6a9ce5 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 13 Nov 2025 22:23:28 -0500 Subject: [PATCH 056/130] wth --- internal/verifier/change_reader.go | 13 ++++++++++--- internal/verifier/change_stream.go | 12 +++++------- internal/verifier/compare.go | 31 +++++++++++++----------------- internal/verifier/oplog_reader.go | 20 +++++++++---------- 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index ec490cf3..2ef4525a 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -37,6 +37,7 @@ type changeReader interface { getWhichCluster() whichCluster getReadChannel() <-chan changeEventBatch getStartTimestamp() option.Option[bson.Timestamp] + getLatestTimestamp() option.Option[bson.Timestamp] getEventsPerSecond() option.Option[float64] getLag() option.Option[time.Duration] getBufferSaturation() float64 @@ -50,9 +51,8 @@ type changeReader interface { type ChangeReaderCommon struct { readerType whichCluster - lastChangeEventTime *bson.Timestamp - logger *logger.Logger - namespaces []string + logger *logger.Logger + namespaces []string metaDB *mongo.Database watcherClient *mongo.Client @@ -64,6 +64,8 @@ type ChangeReaderCommon struct { changeEventBatchChan chan changeEventBatch writesOffTs *util.Eventual[bson.Timestamp] + lastChangeEventTime *msync.TypedAtomic[option.Option[bson.Timestamp]] + startAtTs *bson.Timestamp lag *msync.TypedAtomic[option.Option[time.Duration]] @@ -81,6 +83,7 @@ func newChangeReaderCommon(clusterName whichCluster) ChangeReaderCommon { changeEventBatchChan: make(chan changeEventBatch, batchChanBufferSize), writesOffTs: util.NewEventual[bson.Timestamp](), lag: msync.NewTypedAtomic(option.None[time.Duration]()), 
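		// lastChangeEventTime tracks the newest change-event cluster time this
		// reader has seen. Once writes are off it becomes the reader's final
		// timestamp, and getLatestTimestamp exposes it so document reads can be
		// anchored at or after that point.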
+ lastChangeEventTime: msync.NewTypedAtomic(option.None[bson.Timestamp]()), batchSizeHistory: history.New[int](time.Minute), onDDLEvent: lo.Ternary( clusterName == dst, @@ -110,6 +113,10 @@ func (rc *ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { return rc.changeEventBatchChan } +func (rc *ChangeReaderCommon) getLatestTimestamp() option.Option[bson.Timestamp] { + return rc.lastChangeEventTime.Load() +} + // getBufferSaturation returns the reader’s internal buffer’s saturation level // as a fraction. If saturation rises, that means we’re reading events faster // than we can persist them. diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 070806b0..6bfa3c64 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -217,11 +217,9 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( return errors.Errorf("Change event lacks a namespace: %+v", changeEvents[eventsRead]) } - if changeEvents[eventsRead].ClusterTime != nil && - (csr.lastChangeEventTime == nil || - csr.lastChangeEventTime.Before(*changeEvents[eventsRead].ClusterTime)) { - - csr.lastChangeEventTime = changeEvents[eventsRead].ClusterTime + eventTime := changeEvents[eventsRead].ClusterTime + if eventTime != nil && csr.lastChangeEventTime.Load().OrZero().Before(*eventTime) { + csr.lastChangeEventTime.Store(option.Some(*eventTime)) latestEvent = option.Some(changeEvents[eventsRead]) } @@ -341,8 +339,8 @@ func (csr *ChangeStreamReader) iterateChangeStream( if gotwritesOffTimestamp { csr.running = false - if csr.lastChangeEventTime != nil { - csr.startAtTs = csr.lastChangeEventTime + if ts, has := csr.lastChangeEventTime.Load().Get(); has { + csr.startAtTs = &ts } break diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 18f73c44..d1d02693 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -18,8 +18,6 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" - "go.mongodb.org/mongo-driver/v2/mongo/readconcern" - "go.mongodb.org/mongo-driver/v2/mongo/readpref" "golang.org/x/exp/slices" ) @@ -468,7 +466,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.srcClientCollection(task), verifier.srcClusterInfo, - verifier.srcChangeReader.getStartTimestamp().ToPointer(), + verifier.srcChangeReader.getLatestTimestamp().ToPointer(), task, ) @@ -501,7 +499,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.dstClientCollection(task), verifier.dstClusterInfo, - verifier.dstChangeReader.getStartTimestamp().ToPointer(), + verifier.dstChangeReader.getLatestTimestamp().ToPointer(), task, ) @@ -593,7 +591,6 @@ func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mo case DocQueryFunctionFind: findOptions = bson.D{ bson.E{"filter", filter}, - bson.E{"readConcern", readconcern.Majority()}, } case DocQueryFunctionAggregate: aggOptions = bson.D{ @@ -660,20 +657,18 @@ func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mo ) } - if verifier.readPreference.Mode() != readpref.PrimaryMode { - runCommandOptions = runCommandOptions.SetReadPreference(verifier.readPreference) - if startAtTs != nil { - readConcern := bson.D{ - {"afterClusterTime", *startAtTs}, - } - - // We never want to read before the change stream start time, - // or for the last generation, the change stream end time. 
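			// The guarantee described in the comment above is enforced by a read
			// concern that includes afterClusterTime: the read will reflect a state
			// no older than the given timestamp. A minimal, hedged sketch of the
			// same pattern with the v2 Go driver (the collection name, empty filter,
			// and timestamp are illustrative assumptions, not the verifier's
			// configuration):
			//
			//	func findAfter(ctx context.Context, db *mongo.Database, ts bson.Timestamp) (*mongo.Cursor, error) {
			//		cmd := bson.D{
			//			{"find", "someCollection"},
			//			{"filter", bson.D{}},
			//			// afterClusterTime is valid with "local" or "majority",
			//			// but not with "snapshot".
			//			{"readConcern", bson.D{
			//				{"level", "majority"},
			//				{"afterClusterTime", ts},
			//			}},
			//		}
			//		return db.RunCommandCursor(ctx, cmd)
			//	}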
- cmd = append( - cmd, - bson.E{"readConcern", readConcern}, - ) + runCommandOptions = runCommandOptions.SetReadPreference(verifier.readPreference) + if startAtTs != nil { + readConcern := bson.D{ + {"afterClusterTime", *startAtTs}, } + + // We never want to read before the change stream start time, + // or for the last generation, the change stream end time. + cmd = append( + cmd, + bson.E{"readConcern", readConcern}, + ) } // Suppress this log for recheck tasks because the list of IDs can be diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 633dcdad..393d954b 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -286,11 +286,9 @@ CursorLoop: writesOffTS := o.writesOffTs.Get() for { - if o.lastChangeEventTime != nil { - if !o.lastChangeEventTime.Before(writesOffTS) { - fmt.Printf("----------- %s reached writes off ts %v\n", o, writesOffTS) - break - } + if o.lastChangeEventTime.Load().OrZero().Before(writesOffTS) { + fmt.Printf("----------- %s reached writes off ts %v\n", o, writesOffTS) + break } err := o.readAndHandleOneBatch(sctx, cursor, allowDDLBeforeTS) @@ -303,9 +301,9 @@ CursorLoop: o.running = false infoLog := o.logger.Info() - if o.lastChangeEventTime != nil { - infoLog = infoLog.Any("lastEventTime", o.lastChangeEventTime) - o.startAtTs = lo.ToPtr(*o.lastChangeEventTime) + if ts, has := o.lastChangeEventTime.Load().Get(); has { + infoLog = infoLog.Any("lastEventTime", ts) + o.startAtTs = lo.ToPtr(ts) } else { infoLog = infoLog.Str("lastEventTime", "none") } @@ -378,12 +376,13 @@ func (o *OplogReader) readAndHandleOneBatch( }: } - o.lastChangeEventTime = &latestTS + o.lastChangeEventTime.Store(option.Some(latestTS)) return nil } func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Timestamp) ([]ParsedEvent, bson.Timestamp, error) { + fmt.Printf("--------------- parseRawOps\n\n\n") var latestTS bson.Timestamp parseOneDocumentOp := func(opName string, ts bson.Timestamp, rawDoc bson.Raw) error { @@ -550,6 +549,7 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore var latestTS bson.Timestamp for _, rawDoc := range o.curDocs { + fmt.Printf("----- %s got op: %+v\n\n", o, rawDoc) var op oplog.Op if err := (&op).UnmarshalFromBSON(rawDoc); err != nil { @@ -611,8 +611,6 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore } func (o *OplogReader) getNSFilter(docroot string) agg.And { - return agg.And{} - prefixes := append( slices.Clone(namespaces.MongosyncMetaDBPrefixes), o.metaDB.Name()+".", From 00d5a24b1c3eed57040e683b44122813de57d7d3 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 09:01:14 -0500 Subject: [PATCH 057/130] still failing :( --- internal/verifier/compare.go | 51 +++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index d1d02693..60666d6e 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -15,9 +15,11 @@ import ( "github.com/10gen/migration-verifier/option" pool "github.com/libp2p/go-buffer-pool" "github.com/pkg/errors" + "github.com/samber/lo" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" + "go.mongodb.org/mongo-driver/v2/mongo/readconcern" "golang.org/x/exp/slices" ) @@ -574,7 +576,7 @@ func getMapKey(docKeyValues []bson.RawValue) string { return keyBuffer.String() } -func (verifier 
*Verifier) getDocumentsCursor(ctx context.Context, collection *mongo.Collection, clusterInfo *util.ClusterInfo, +func (verifier *Verifier) getDocumentsCursor(sctx context.Context, collection *mongo.Collection, clusterInfo *util.ClusterInfo, startAtTs *bson.Timestamp, task *VerificationTask) (*mongo.Cursor, error) { var findOptions bson.D runCommandOptions := options.RunCmd() @@ -591,6 +593,7 @@ func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mo case DocQueryFunctionFind: findOptions = bson.D{ bson.E{"filter", filter}, + bson.E{"readConcern", readconcern.Majority()}, } case DocQueryFunctionAggregate: aggOptions = bson.D{ @@ -658,18 +661,20 @@ func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mo } runCommandOptions = runCommandOptions.SetReadPreference(verifier.readPreference) - if startAtTs != nil { - readConcern := bson.D{ - {"afterClusterTime", *startAtTs}, - } + /* + if startAtTs != nil { + readConcern := bson.D{ + {"afterClusterTime", *startAtTs}, + } - // We never want to read before the change stream start time, - // or for the last generation, the change stream end time. - cmd = append( - cmd, - bson.E{"readConcern", readConcern}, - ) - } + // We never want to read before the change stream start time, + // or for the last generation, the change stream end time. + cmd = append( + cmd, + bson.E{"readConcern", readConcern}, + ) + } + */ // Suppress this log for recheck tasks because the list of IDs can be // quite long. @@ -694,7 +699,27 @@ func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mo } */ - return collection.Database().RunCommandCursor(ctx, cmd, runCommandOptions) + /* + sess := lo.Must(collection.Database().Client().StartSession()) + defer sess.EndSession(ctx) + + sess.AdvanceOperationTime(startAtTs) + */ + + lo.Must0(mongo.SessionFromContext(sctx).AdvanceOperationTime(startAtTs)) + lo.Must0(mongo.SessionFromContext(sctx).AdvanceClusterTime(lo.Must(bson.Marshal( + bson.D{ + {"$clusterTime", bson.D{ + {"clusterTime", *startAtTs}, + }}, + }, + )))) + + return collection.Database().RunCommandCursor( + sctx, + cmd, + runCommandOptions, + ) } func transformPipelineForToHashedIndexKey( From 3d20e9cc65d209bc728a17c7a462296a0fbe0198 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 09:17:02 -0500 Subject: [PATCH 058/130] neutralize session --- internal/verifier/compare.go | 64 +++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 60666d6e..f31b34f7 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -15,11 +15,9 @@ import ( "github.com/10gen/migration-verifier/option" pool "github.com/libp2p/go-buffer-pool" "github.com/pkg/errors" - "github.com/samber/lo" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" - "go.mongodb.org/mongo-driver/v2/mongo/readconcern" "golang.org/x/exp/slices" ) @@ -533,27 +531,29 @@ func iterateCursorToChannel( ) error { defer close(writer) - sess := mongo.SessionFromContext(sctx) + //sess := mongo.SessionFromContext(sctx) for cursor.Next(sctx) { state.NoteSuccess("received a document") fmt.Printf("----- received a document: %+v\n\n", cursor.Current) - clusterTime, err := util.GetClusterTimeFromSession(sess) - if err != nil { - return errors.Wrap(err, "reading cluster time from session") - } + /* + clusterTime, err := 
util.GetClusterTimeFromSession(sess) + if err != nil { + return errors.Wrap(err, "reading cluster time from session") + } + */ buf := pool.Get(len(cursor.Current)) copy(buf, cursor.Current) - err = chanutil.WriteWithDoneCheck( + err := chanutil.WriteWithDoneCheck( sctx, writer, docWithTs{ doc: buf, - ts: clusterTime, + //ts: clusterTime, }, ) @@ -593,7 +593,7 @@ func (verifier *Verifier) getDocumentsCursor(sctx context.Context, collection *m case DocQueryFunctionFind: findOptions = bson.D{ bson.E{"filter", filter}, - bson.E{"readConcern", readconcern.Majority()}, + //bson.E{"readConcern", readconcern.Majority()}, } case DocQueryFunctionAggregate: aggOptions = bson.D{ @@ -661,20 +661,20 @@ func (verifier *Verifier) getDocumentsCursor(sctx context.Context, collection *m } runCommandOptions = runCommandOptions.SetReadPreference(verifier.readPreference) - /* - if startAtTs != nil { - readConcern := bson.D{ - {"afterClusterTime", *startAtTs}, - } - // We never want to read before the change stream start time, - // or for the last generation, the change stream end time. - cmd = append( - cmd, - bson.E{"readConcern", readConcern}, - ) + if startAtTs != nil { + readConcern := bson.D{ + {"level", "snapshot"}, + {"afterClusterTime", *startAtTs}, } - */ + + // We never want to read before the change stream start time, + // or for the last generation, the change stream end time. + cmd = append( + cmd, + bson.E{"readConcern", readConcern}, + ) + } // Suppress this log for recheck tasks because the list of IDs can be // quite long. @@ -706,17 +706,19 @@ func (verifier *Verifier) getDocumentsCursor(sctx context.Context, collection *m sess.AdvanceOperationTime(startAtTs) */ - lo.Must0(mongo.SessionFromContext(sctx).AdvanceOperationTime(startAtTs)) - lo.Must0(mongo.SessionFromContext(sctx).AdvanceClusterTime(lo.Must(bson.Marshal( - bson.D{ - {"$clusterTime", bson.D{ - {"clusterTime", *startAtTs}, - }}, - }, - )))) + /* + lo.Must0(mongo.SessionFromContext(sctx).AdvanceOperationTime(startAtTs)) + lo.Must0(mongo.SessionFromContext(sctx).AdvanceClusterTime(lo.Must(bson.Marshal( + bson.D{ + {"$clusterTime", bson.D{ + {"clusterTime", *startAtTs}, + }}, + }, + )))) + */ return collection.Database().RunCommandCursor( - sctx, + mongo.NewSessionContext(sctx, nil), cmd, runCommandOptions, ) From 2a5e55a38a7982b121015e9f87985d18379b5e8e Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 12:34:44 -0500 Subject: [PATCH 059/130] fixed test --- internal/verifier/change_reader.go | 2 +- internal/verifier/check.go | 3 ++- internal/verifier/migration_verifier.go | 2 +- internal/verifier/migration_verifier_test.go | 24 +++++++++++++------ internal/verifier/oplog_reader.go | 25 +++++++++++--------- 5 files changed, 35 insertions(+), 21 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 2ef4525a..f611e716 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -174,7 +174,7 @@ func (rc *ChangeReaderCommon) start( defer func() { rc.logger.Debug(). Str("reader", string(rc.readerType)). 
- Msg("Closing change event batch channel.") + Msg("Finished.") close(rc.changeEventBatchChan) }() diff --git a/internal/verifier/check.go b/internal/verifier/check.go index b0ae1859..ba73ff71 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -272,6 +272,7 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh return errors.Wrapf(err, "failed to start %s", changeReader) } changeReaderGroup.Go(func() error { + defer fmt.Printf("----- %s persistor finished\n", changeReader.String()) return verifier.RunChangeEventPersistor(groupCtx, changeReader) }) } @@ -360,7 +361,7 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh // caught again on the next iteration. if verifier.writesOff { verifier.logger.Debug(). - Msg("Waiting for change readers to end.") + Msg("Waiting for change handling to finish.") // It's necessary to wait for the change reader to finish before incrementing the // generation number, or the last changes will not be checked. diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index c0c14c38..b26baa5b 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -248,7 +248,7 @@ func (verifier *Verifier) WritesOff(ctx context.Context) error { } verifier.writesOff = true - verifier.logger.Debug().Msg("Signalling that writes are done.") + verifier.logger.Debug().Msg("Signaling that writes are done.") srcFinalTs, err = GetNewClusterTime( ctx, diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index fb17f3ed..a933a722 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -2320,15 +2320,23 @@ func (suite *IntegrationTestSuite) TestVerifierWithFilter() { _, err = srcColl.InsertOne(ctx, bson.M{"_id": 201, "x": 201, "inFilter": true}) suite.Require().NoError(err) - // Tell check to start the next generation. - checkContinueChan <- struct{}{} + suite.Require().Eventually( + func() bool { + suite.T().Log("checking to see if a failure was found yet") - // Wait for one generation to finish. - <-checkDoneChan - status = waitForTasks() + // Tell check to start the next generation. + checkContinueChan <- struct{}{} - // There should be a failure from the src insert of a document in the filter. - suite.Require().Equal(VerificationStatus{TotalTasks: 1, FailedTasks: 1}, *status) + // Wait for one generation to finish. + <-checkDoneChan + status = waitForTasks() + + return *status == VerificationStatus{TotalTasks: 1, FailedTasks: 1} + }, + time.Minute, + time.Second, + "we should see a failure from the src insert of a document in the filter.", + ) // Now patch up the destination. _, err = dstColl.InsertOne(ctx, bson.M{"_id": 201, "x": 201, "inFilter": true}) @@ -2344,6 +2352,8 @@ func (suite *IntegrationTestSuite) TestVerifierWithFilter() { // There should be no failures now, since they are equivalent at this point in time. suite.Require().Equal(VerificationStatus{TotalTasks: 1, CompletedTasks: 1}, *status) + suite.T().Log("Finalizing test") + // Turn writes off. suite.Require().NoError(verifier.WritesOff(ctx)) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 393d954b..cff4aaaa 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -274,6 +274,11 @@ CursorLoop: case <-sctx.Done(): return sctx.Err() case <-o.writesOffTs.Ready(): + o.logger.Debug(). 
+ Stringer("reader", o). + Any("timestamp", o.writesOffTs.Get()). + Msg("Received writes-off timestamp.") + break CursorLoop default: err = o.readAndHandleOneBatch(sctx, cursor, allowDDLBeforeTS) @@ -286,7 +291,7 @@ CursorLoop: writesOffTS := o.writesOffTs.Get() for { - if o.lastChangeEventTime.Load().OrZero().Before(writesOffTS) { + if !o.lastChangeEventTime.Load().OrZero().Before(writesOffTS) { fmt.Printf("----------- %s reached writes off ts %v\n", o, writesOffTS) break } @@ -386,7 +391,7 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti var latestTS bson.Timestamp parseOneDocumentOp := func(opName string, ts bson.Timestamp, rawDoc bson.Raw) error { - fmt.Printf("---- got op: %+v\n\n", rawDoc) + //fmt.Printf("---- got op: %+v\n\n", rawDoc) nsStr, err := mbson.Lookup[string](rawDoc, "ns") if err != nil { @@ -549,7 +554,7 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore var latestTS bson.Timestamp for _, rawDoc := range o.curDocs { - fmt.Printf("----- %s got op: %+v\n\n", o, rawDoc) + //fmt.Printf("----- %s got op: %+v\n\n", o, rawDoc) var op oplog.Op if err := (&op).UnmarshalFromBSON(rawDoc); err != nil { @@ -628,14 +633,12 @@ func (o *OplogReader) getNSFilter(docroot string) agg.And { }, )) - /* - if len(o.namespaces) > 0 { - filter = append( - filter, - agg.In(docroot+".ns", o.namespaces...), - ) - } - */ + if len(o.namespaces) > 0 { + filter = append( + filter, + agg.In(docroot+".ns", o.namespaces...), + ) + } return filter } From eeb1f4fa1a1ae6cb96cba23235c91d8e59759bdf Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 13:34:47 -0500 Subject: [PATCH 060/130] more log --- internal/verifier/migration_verifier_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index a933a722..67524876 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -2296,6 +2296,8 @@ func (suite *IntegrationTestSuite) TestVerifierWithFilter() { // Wait for one generation to finish. 
<-checkDoneChan status := waitForTasks() + + fmt.Printf("----- finished generation %d\n", verifier.generation) suite.Require().Greater(status.CompletedTasks, 1) suite.Require().Greater(status.TotalTasks, 1) suite.Require().Zero(status.FailedTasks, "there should be no failed tasks") From b5ccf3025a7766fbf56d6ada2d43d837800e8113 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 13:41:14 -0500 Subject: [PATCH 061/130] lint --- internal/verifier/migration_verifier.go | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index b26baa5b..65ed4e39 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -407,19 +407,11 @@ func validateChangeReaderOpt( method string, clusterInfo util.ClusterInfo, ) error { - if method != ChangeReaderOptOplog { - return nil - } - - var whyNoOplog string - - switch { - case clusterInfo.Topology == util.TopologySharded: - whyNoOplog = "sharded" - } - - if whyNoOplog != "" { - return fmt.Errorf("cannot read oplog (%s)", whyNoOplog) + switch method { + case ChangeReaderOptOplog: + if clusterInfo.Topology == util.TopologySharded { + return fmt.Errorf("cannot read oplog from sharded cluster") + } } return nil From 77ea0d79bdcc083983a741cb0b839ca87c2920b8 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 13:44:27 -0500 Subject: [PATCH 062/130] remove prints --- internal/verifier/compare.go | 2 -- internal/verifier/oplog_reader.go | 1 - internal/verifier/recheck.go | 1 - 3 files changed, 4 deletions(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index f31b34f7..ae851523 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -536,8 +536,6 @@ func iterateCursorToChannel( for cursor.Next(sctx) { state.NoteSuccess("received a document") - fmt.Printf("----- received a document: %+v\n\n", cursor.Current) - /* clusterTime, err := util.GetClusterTimeFromSession(sess) if err != nil { diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index cff4aaaa..bd150348 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -387,7 +387,6 @@ func (o *OplogReader) readAndHandleOneBatch( } func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Timestamp) ([]ParsedEvent, bson.Timestamp, error) { - fmt.Printf("--------------- parseRawOps\n\n\n") var latestTS bson.Timestamp parseOneDocumentOp := func(opName string, ts bson.Timestamp, rawDoc bson.Raw) error { diff --git a/internal/verifier/recheck.go b/internal/verifier/recheck.go index ebf7620f..70cbe09b 100644 --- a/internal/verifier/recheck.go +++ b/internal/verifier/recheck.go @@ -86,7 +86,6 @@ func (verifier *Verifier) insertRecheckDocs( insertThreads := 0 sendRechecks := func(rechecks []bson.Raw) { - fmt.Printf("----- inserting rechecks: %+v\n\n", rechecks) insertThreads++ eg.Go(func() error { From 43a588f5b7596c23a5ccbd5906b1403df21fd2da Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 14:08:13 -0500 Subject: [PATCH 063/130] no snapshot --- internal/verifier/compare.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index ae851523..8feb0bd0 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -662,7 +662,6 @@ func (verifier *Verifier) getDocumentsCursor(sctx context.Context, collection *m if startAtTs != 
nil { readConcern := bson.D{ - {"level", "snapshot"}, {"afterClusterTime", *startAtTs}, } From 36ddf24b64680373b7f1071228e9176f1345befc Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 14:08:46 -0500 Subject: [PATCH 064/130] majority --- internal/verifier/compare.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 8feb0bd0..62037cb5 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -662,6 +662,7 @@ func (verifier *Verifier) getDocumentsCursor(sctx context.Context, collection *m if startAtTs != nil { readConcern := bson.D{ + {"level", "majority"}, {"afterClusterTime", *startAtTs}, } From aa86c4fb37869fb3269e2c19f3ceb079b4869a9b Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 14:34:39 -0500 Subject: [PATCH 065/130] kill txns --- internal/testutil/testutil.go | 54 +++++++++++++++++++++ internal/verifier/integration_test_suite.go | 4 ++ 2 files changed, 58 insertions(+) diff --git a/internal/testutil/testutil.go b/internal/testutil/testutil.go index e0a680df..3bb68854 100644 --- a/internal/testutil/testutil.go +++ b/internal/testutil/testutil.go @@ -10,6 +10,7 @@ import ( "github.com/10gen/migration-verifier/internal/util" "github.com/pkg/errors" "github.com/samber/lo" + "github.com/stretchr/testify/require" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" @@ -90,6 +91,59 @@ func convertDocsToAnys(docs []bson.D) []any { return anys } +func KillTransactions( + ctx context.Context, + t *testing.T, + client *mongo.Client, +) { + cursor, err := client.Database("admin").Aggregate( + ctx, + mongo.Pipeline{ + {{"$currentOp", bson.D{}}}, + {{"$match", bson.D{ + {"transaction.parameters.txnNumber", bson.D{ + {"$exists", true}, + }}, + }}}, + }, + ) + require.NoError(t, err) + + type txn struct { + LSID struct { + ID bson.Binary + } + } + + var txns []txn + require.NoError(t, cursor.All(ctx, &txns)) + + if len(txns) == 0 { + return + } + + t.Logf("Killing %d transaction(s) via killSessions …", len(txns)) + + sessionsToKill := lo.Map( + txns, + func(t txn, _ int) bson.D { + return bson.D{{"id", t.LSID.ID}} + }, + ) + + require.NoError( + t, + client.Database("admin").RunCommand( + ctx, + bson.D{ + {"killSessions", sessionsToKill}, + }, + ).Err(), + ) + + return +} + func KillApplicationChangeStreams( ctx context.Context, t *testing.T, diff --git a/internal/verifier/integration_test_suite.go b/internal/verifier/integration_test_suite.go index 19291208..3bd22fed 100644 --- a/internal/verifier/integration_test_suite.go +++ b/internal/verifier/integration_test_suite.go @@ -9,6 +9,7 @@ import ( "github.com/10gen/migration-verifier/contextplus" "github.com/10gen/migration-verifier/internal/logger" + "github.com/10gen/migration-verifier/internal/testutil" "github.com/10gen/migration-verifier/internal/util" mapset "github.com/deckarep/golang-set/v2" "github.com/pkg/errors" @@ -118,6 +119,9 @@ func (suite *IntegrationTestSuite) SetupTest() { suite.initialDbNames.Add(dbName) } } + + testutil.KillTransactions(ctx, suite.T(), suite.srcMongoClient) + testutil.KillTransactions(ctx, suite.T(), suite.dstMongoClient) } func (suite *IntegrationTestSuite) TearDownTest() { From 9677fddf90555fb737dc772555424233f4773a74 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 14:49:45 -0500 Subject: [PATCH 066/130] local rc --- internal/testutil/testutil.go | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/internal/testutil/testutil.go b/internal/testutil/testutil.go index 3bb68854..93ac5ecb 100644 --- a/internal/testutil/testutil.go +++ b/internal/testutil/testutil.go @@ -96,7 +96,10 @@ func KillTransactions( t *testing.T, client *mongo.Client, ) { - cursor, err := client.Database("admin").Aggregate( + cursor, err := client.Database( + "admin", + options.Database().SetReadConcern(readconcern.Local()), + ).Aggregate( ctx, mongo.Pipeline{ {{"$currentOp", bson.D{}}}, From 683b055b7240e75f7dffebae72bc51bdf47739c6 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 14:51:23 -0500 Subject: [PATCH 067/130] redundant --- internal/testutil/testutil.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/testutil/testutil.go b/internal/testutil/testutil.go index 93ac5ecb..79abf1c7 100644 --- a/internal/testutil/testutil.go +++ b/internal/testutil/testutil.go @@ -143,8 +143,6 @@ func KillTransactions( }, ).Err(), ) - - return } func KillApplicationChangeStreams( From b0f2aad02a067450d6b7e77b9d3df56a94fab5f3 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 14:56:19 -0500 Subject: [PATCH 068/130] invert order --- internal/verifier/migration_verifier_test.go | 32 ++++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 67524876..37ff306e 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -2231,21 +2231,14 @@ func (suite *IntegrationTestSuite) TestGenerationalRechecking() { } func (suite *IntegrationTestSuite) TestVerifierWithFilter() { + ctx := suite.Context() + zerolog.SetGlobalLevel(zerolog.DebugLevel) dbname1 := suite.DBNameForTest("1") dbname2 := suite.DBNameForTest("2") filter := bson.D{{"inFilter", bson.M{"$ne": false}}} - verifier := suite.BuildVerifier() - verifier.SetSrcNamespaces([]string{dbname1 + ".testColl1"}) - verifier.SetDstNamespaces([]string{dbname2 + ".testColl3"}) - verifier.SetNamespaceMap() - verifier.SetDocCompareMethod(DocCompareIgnoreOrder) - // Set this value low to test the verifier with multiple partitions. - verifier.partitionSizeInBytes = 50 - - ctx := suite.Context() srcColl := suite.srcMongoClient.Database(dbname1).Collection("testColl1") dstColl := suite.dstMongoClient.Database(dbname2).Collection("testColl3") @@ -2268,6 +2261,14 @@ func (suite *IntegrationTestSuite) TestVerifierWithFilter() { _, err = srcColl.InsertMany(ctx, docs) suite.Require().NoError(err) + verifier := suite.BuildVerifier() + verifier.SetSrcNamespaces([]string{dbname1 + ".testColl1"}) + verifier.SetDstNamespaces([]string{dbname2 + ".testColl3"}) + verifier.SetNamespaceMap() + verifier.SetDocCompareMethod(DocCompareIgnoreOrder) + // Set this value low to test the verifier with multiple partitions. 
+ verifier.partitionSizeInBytes = 50 + checkDoneChan := make(chan struct{}) checkContinueChan := make(chan struct{}) go func() { @@ -2297,11 +2298,22 @@ func (suite *IntegrationTestSuite) TestVerifierWithFilter() { <-checkDoneChan status := waitForTasks() - fmt.Printf("----- finished generation %d\n", verifier.generation) suite.Require().Greater(status.CompletedTasks, 1) suite.Require().Greater(status.TotalTasks, 1) suite.Require().Zero(status.FailedTasks, "there should be no failed tasks") + /* + // When reading the oplog, verifier often sees the “near past” + suite.Require().Eventually( + func() bool { + + }, + time.Minute, + time.Millisecond, + "verifier must reach stasis before continuing", + ) + */ + // Insert another document that is not in the filter. // This should trigger a recheck despite being outside the filter. _, err = srcColl.InsertOne(ctx, bson.M{"_id": 200, "x": 200, "inFilter": false}) From bc3ab8cf50c7b4b14c5dcdd05355a3511a9bc115 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 15:00:48 -0500 Subject: [PATCH 069/130] wait for catchup --- internal/verifier/migration_verifier_test.go | 26 ++++++++++++-------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 37ff306e..213df8dc 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -2302,17 +2302,23 @@ func (suite *IntegrationTestSuite) TestVerifierWithFilter() { suite.Require().Greater(status.TotalTasks, 1) suite.Require().Zero(status.FailedTasks, "there should be no failed tasks") - /* - // When reading the oplog, verifier often sees the “near past” - suite.Require().Eventually( - func() bool { + // When reading the oplog, verifier often sees the “near past”. + // Wait for it to do initial checks before continuing. + suite.Require().Eventually( + func() bool { + suite.T().Logf("Checking whether verifier has caught up to itself …") - }, - time.Minute, - time.Millisecond, - "verifier must reach stasis before continuing", - ) - */ + checkContinueChan <- struct{}{} + <-checkDoneChan + status, err = verifier.GetVerificationStatus(ctx) + suite.Require().NoError(err) + + return status.TotalTasks == 0 + }, + time.Minute, + time.Millisecond, + "verifier must reach stasis before continuing", + ) // Insert another document that is not in the filter. // This should trigger a recheck despite being outside the filter. From 11be48da95ff24af914f6e681dbd9710c7491770 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 15:07:45 -0500 Subject: [PATCH 070/130] clear out --- internal/verifier/migration_verifier_test.go | 56 ++++++++++++-------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 213df8dc..d2b74eec 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -2060,13 +2060,6 @@ func (suite *IntegrationTestSuite) TestMetadataMismatchAndPartitioning() { srcColl := suite.srcMongoClient.Database(suite.DBNameForTest()).Collection("coll") dstColl := suite.dstMongoClient.Database(suite.DBNameForTest()).Collection("coll") - verifier := suite.BuildVerifier() - - ns := srcColl.Database().Name() + "." 
+ srcColl.Name() - verifier.SetSrcNamespaces([]string{ns}) - verifier.SetDstNamespaces([]string{ns}) - verifier.SetNamespaceMap() - for _, coll := range mslices.Of(srcColl, dstColl) { _, err := coll.InsertOne(ctx, bson.M{"_id": 1, "x": 42}) suite.Require().NoError(err) @@ -2080,6 +2073,13 @@ func (suite *IntegrationTestSuite) TestMetadataMismatchAndPartitioning() { ) suite.Require().NoError(err) + verifier := suite.BuildVerifier() + + ns := srcColl.Database().Name() + "." + srcColl.Name() + verifier.SetSrcNamespaces([]string{ns}) + verifier.SetDstNamespaces([]string{ns}) + verifier.SetNamespaceMap() + runner := RunVerifierCheck(ctx, suite.T(), verifier) suite.Require().NoError(runner.AwaitGenerationEnd()) @@ -2108,23 +2108,37 @@ func (suite *IntegrationTestSuite) TestMetadataMismatchAndPartitioning() { suite.Require().Equal(verificationTaskVerifyCollection, tasks[1].Type) suite.Require().Equal(verificationTaskMetadataMismatch, tasks[1].Status) - suite.Require().NoError(runner.StartNextGeneration()) - suite.Require().NoError(runner.AwaitGenerationEnd()) + // When tailing the oplog sometimes the verifier starts up “in the past”, + // which can cause extra rechecks that we wouldn’t normally expect. This + // waits for any of those to clear out. + suite.Assert().Eventually( + func() bool { + suite.Require().NoError(runner.StartNextGeneration()) + suite.Require().NoError(runner.AwaitGenerationEnd()) - cursor, err = verifier.verificationTaskCollection().Aggregate( - ctx, - append( - mongo.Pipeline{ - bson.D{{"$match", bson.D{{"generation", 1}}}}, - }, - testutil.SortByListAgg("type", sortedTaskTypes)..., - ), - ) - suite.Require().NoError(err) + cursor, err = verifier.verificationTaskCollection().Aggregate( + ctx, + append( + mongo.Pipeline{ + bson.D{{"$match", bson.D{{"generation", 1}}}}, + }, + testutil.SortByListAgg("type", sortedTaskTypes)..., + ), + ) + suite.Require().NoError(err) - suite.Require().NoError(cursor.All(ctx, &tasks)) + suite.Require().NoError(cursor.All(ctx, &tasks)) + + suite.Require().GreaterOrEqual(len(tasks), 1, "we always expect >=1 task") + + return len(tasks) == 1 + }, + time.Minute, + time.Millisecond, + "wait until verifier has caught up with itself", + ) - suite.Require().Len(tasks, 1, "generation 1 should only have done 1 task") + suite.Require().Len(tasks, 1, "generation 1 should only have done 1 task; tasks=%+v", tasks) suite.Require().Equal(verificationTaskVerifyCollection, tasks[0].Type) suite.Require().Equal(verificationTaskMetadataMismatch, tasks[0].Status) } From 8c7578fae3f7738238a0c68f6640f6cea96831be Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 15:25:35 -0500 Subject: [PATCH 071/130] fix?? --- internal/verifier/migration_verifier_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index d2b74eec..8fe2ca96 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -2111,7 +2111,7 @@ func (suite *IntegrationTestSuite) TestMetadataMismatchAndPartitioning() { // When tailing the oplog sometimes the verifier starts up “in the past”, // which can cause extra rechecks that we wouldn’t normally expect. This // waits for any of those to clear out. 
- suite.Assert().Eventually( + suite.Require().Eventually( func() bool { suite.Require().NoError(runner.StartNextGeneration()) suite.Require().NoError(runner.AwaitGenerationEnd()) @@ -2120,7 +2120,7 @@ func (suite *IntegrationTestSuite) TestMetadataMismatchAndPartitioning() { ctx, append( mongo.Pipeline{ - bson.D{{"$match", bson.D{{"generation", 1}}}}, + bson.D{{"$match", bson.D{{"generation", verifier.generation}}}}, }, testutil.SortByListAgg("type", sortedTaskTypes)..., ), From 1784a7b2a45cac11e64aa3f9df15947dbd974c3c Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 15:29:26 -0500 Subject: [PATCH 072/130] tweak --- internal/verifier/migration_verifier_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 8fe2ca96..4b6a7f4f 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -2138,7 +2138,7 @@ func (suite *IntegrationTestSuite) TestMetadataMismatchAndPartitioning() { "wait until verifier has caught up with itself", ) - suite.Require().Len(tasks, 1, "generation 1 should only have done 1 task; tasks=%+v", tasks) + suite.Require().Len(tasks, 1, "should eventually only have 1 task; tasks=%+v", tasks) suite.Require().Equal(verificationTaskVerifyCollection, tasks[0].Type) suite.Require().Equal(verificationTaskMetadataMismatch, tasks[0].Status) } From 3160dc63c4e4bdc0208daac0fa64aa5182d49b19 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 15:44:28 -0500 Subject: [PATCH 073/130] switch over --- internal/verifier/change_reader.go | 28 ++++++++++++++-- internal/verifier/change_stream.go | 43 ++++++++++++++++-------- internal/verifier/check.go | 50 ++++++++-------------------- internal/verifier/compare.go | 38 +++++++++++---------- internal/verifier/recheck_persist.go | 3 -- 5 files changed, 89 insertions(+), 73 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 143f5f04..8fae036c 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -35,6 +35,7 @@ type changeReader interface { getWhichCluster() whichCluster getReadChannel() <-chan changeEventBatch getStartTimestamp() option.Option[bson.Timestamp] + getLatestTimestamp() option.Option[bson.Timestamp] getEventsPerSecond() option.Option[float64] getLag() option.Option[time.Duration] getBufferSaturation() float64 @@ -48,9 +49,8 @@ type changeReader interface { type ChangeReaderCommon struct { readerType whichCluster - lastChangeEventTime *bson.Timestamp - logger *logger.Logger - namespaces []string + logger *logger.Logger + namespaces []string metaDB *mongo.Database watcherClient *mongo.Client @@ -62,6 +62,8 @@ type ChangeReaderCommon struct { changeEventBatchChan chan changeEventBatch writesOffTs *util.Eventual[bson.Timestamp] + lastChangeEventTime *msync.TypedAtomic[option.Option[bson.Timestamp]] + startAtTs *bson.Timestamp lag *msync.TypedAtomic[option.Option[time.Duration]] @@ -70,6 +72,22 @@ type ChangeReaderCommon struct { onDDLEvent ddlEventHandling } +func newChangeReaderCommon(clusterName whichCluster) ChangeReaderCommon { + return ChangeReaderCommon{ + readerType: clusterName, + changeEventBatchChan: make(chan changeEventBatch, batchChanBufferSize), + writesOffTs: util.NewEventual[bson.Timestamp](), + lag: msync.NewTypedAtomic(option.None[time.Duration]()), + lastChangeEventTime: msync.NewTypedAtomic(option.None[bson.Timestamp]()), + 
batchSizeHistory: history.New[int](time.Minute), + onDDLEvent: lo.Ternary( + clusterName == dst, + onDDLEventAllow, + "", + ), + } +} + func (rc *ChangeReaderCommon) getWhichCluster() whichCluster { return rc.readerType } @@ -90,6 +108,10 @@ func (rc *ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { return rc.changeEventBatchChan } +func (rc *ChangeReaderCommon) getLatestTimestamp() option.Option[bson.Timestamp] { + return rc.lastChangeEventTime.Load() +} + // getBufferSaturation returns the reader’s internal buffer’s saturation level // as a fraction. If saturation rises, that means we’re reading events faster // than we can persist them. diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 76529256..86f59a4a 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -47,6 +47,28 @@ type ChangeStreamReader struct { var _ changeReader = &ChangeStreamReader{} +func (v *Verifier) newChangeStreamReader( + namespaces []string, + cluster whichCluster, + client *mongo.Client, + clusterInfo util.ClusterInfo, +) *ChangeStreamReader { + common := newChangeReaderCommon(cluster) + common.namespaces = namespaces + common.readerType = cluster + common.watcherClient = client + common.clusterInfo = clusterInfo + + common.logger = v.logger + common.metaDB = v.metaClient.Database(v.metaDBName) + + common.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken + + csr := &ChangeStreamReader{ChangeReaderCommon: common} + + return csr +} + // GetChangeStreamFilter returns an aggregation pipeline that filters // namespaces as per configuration. // @@ -193,11 +215,9 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( return errors.Errorf("Change event lacks a namespace: %+v", changeEvents[eventsRead]) } - if changeEvents[eventsRead].ClusterTime != nil && - (csr.lastChangeEventTime == nil || - csr.lastChangeEventTime.Before(*changeEvents[eventsRead].ClusterTime)) { - - csr.lastChangeEventTime = changeEvents[eventsRead].ClusterTime + eventTime := changeEvents[eventsRead].ClusterTime + if eventTime != nil && csr.lastChangeEventTime.Load().OrZero().Before(*eventTime) { + csr.lastChangeEventTime.Store(option.Some(*eventTime)) latestEvent = option.Some(changeEvents[eventsRead]) } @@ -230,9 +250,6 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( events: changeEvents, resumeToken: cs.ResumeToken(), - - // NB: We know by now that OperationTime is non-nil. - clusterTime: *sess.OperationTime(), }: } @@ -314,8 +331,8 @@ func (csr *ChangeStreamReader) iterateChangeStream( if gotwritesOffTimestamp { csr.running = false - if csr.lastChangeEventTime != nil { - csr.startAtTs = csr.lastChangeEventTime + if ts, has := csr.lastChangeEventTime.Load().Get(); has { + csr.startAtTs = &ts } break @@ -323,10 +340,10 @@ func (csr *ChangeStreamReader) iterateChangeStream( } infoLog := csr.logger.Info() - if csr.lastChangeEventTime == nil { - infoLog = infoLog.Str("lastEventTime", "none") + if ts, has := csr.lastChangeEventTime.Load().Get(); has { + infoLog = infoLog.Any("lastEventTime", ts) } else { - infoLog = infoLog.Any("lastEventTime", *csr.lastChangeEventTime) + infoLog = infoLog.Str("lastEventTime", "none") } infoLog. 
diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 529c3e3f..0405ffcc 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -6,13 +6,9 @@ import ( "time" "github.com/10gen/migration-verifier/contextplus" - "github.com/10gen/migration-verifier/history" "github.com/10gen/migration-verifier/internal/logger" "github.com/10gen/migration-verifier/internal/retry" - "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/mslices" - "github.com/10gen/migration-verifier/msync" - "github.com/10gen/migration-verifier/option" mapset "github.com/deckarep/golang-set/v2" "github.com/goaux/timer" "github.com/pkg/errors" @@ -600,36 +596,18 @@ func (verifier *Verifier) work(ctx context.Context, workerNum int) error { } } -func (verifier *Verifier) initializeChangeReaders() { - srcReader := &ChangeStreamReader{ - ChangeReaderCommon: ChangeReaderCommon{ - readerType: src, - namespaces: verifier.srcNamespaces, - watcherClient: verifier.srcClient, - clusterInfo: *verifier.srcClusterInfo, - }, - } - verifier.srcChangeReader = srcReader - - dstReader := &ChangeStreamReader{ - ChangeReaderCommon: ChangeReaderCommon{ - readerType: dst, - namespaces: verifier.dstNamespaces, - watcherClient: verifier.dstClient, - clusterInfo: *verifier.dstClusterInfo, - onDDLEvent: onDDLEventAllow, - }, - } - verifier.dstChangeReader = dstReader - - // Common elements in both readers: - for _, csr := range mslices.Of(srcReader, dstReader) { - csr.logger = verifier.logger - csr.metaDB = verifier.metaClient.Database(verifier.metaDBName) - csr.changeEventBatchChan = make(chan changeEventBatch, batchChanBufferSize) - csr.writesOffTs = util.NewEventual[bson.Timestamp]() - csr.lag = msync.NewTypedAtomic(option.None[time.Duration]()) - csr.batchSizeHistory = history.New[int](time.Minute) - csr.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken - } +func (v *Verifier) initializeChangeReaders() { + v.srcChangeReader = v.newChangeStreamReader( + v.srcNamespaces, + src, + v.srcClient, + *v.srcClusterInfo, + ) + + v.dstChangeReader = v.newChangeStreamReader( + v.dstNamespaces, + dst, + v.dstClient, + *v.dstClusterInfo, + ) } diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 85fbfca0..d5b8b94d 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -18,7 +18,6 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" - "go.mongodb.org/mongo-driver/v2/mongo/readpref" "golang.org/x/exp/slices" ) @@ -467,7 +466,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.srcClientCollection(task), verifier.srcClusterInfo, - verifier.srcChangeReader.getStartTimestamp().ToPointer(), + verifier.srcChangeReader.getLatestTimestamp(), task, ) @@ -500,7 +499,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.dstClientCollection(task), verifier.dstClusterInfo, - verifier.dstChangeReader.getStartTimestamp().ToPointer(), + verifier.dstChangeReader.getLatestTimestamp(), task, ) @@ -573,8 +572,13 @@ func getMapKey(docKeyValues []bson.RawValue) string { return keyBuffer.String() } -func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mongo.Collection, clusterInfo *util.ClusterInfo, - startAtTs *bson.Timestamp, task *VerificationTask) (*mongo.Cursor, error) { +func (verifier *Verifier) getDocumentsCursor( + ctx context.Context, + collection *mongo.Collection, + clusterInfo 
*util.ClusterInfo, + readConcernTS option.Option[bson.Timestamp], + task *VerificationTask, +) (*mongo.Cursor, error) { var findOptions bson.D runCommandOptions := options.RunCmd() var andPredicates bson.A @@ -656,20 +660,18 @@ func (verifier *Verifier) getDocumentsCursor(ctx context.Context, collection *mo ) } - if verifier.readPreference.Mode() != readpref.PrimaryMode { - runCommandOptions = runCommandOptions.SetReadPreference(verifier.readPreference) - if startAtTs != nil { - readConcern := bson.D{ - {"afterClusterTime", *startAtTs}, - } - - // We never want to read before the change stream start time, - // or for the last generation, the change stream end time. - cmd = append( - cmd, - bson.E{"readConcern", readConcern}, - ) + runCommandOptions = runCommandOptions.SetReadPreference(verifier.readPreference) + if ts, has := readConcernTS.Get(); has { + readConcern := bson.D{ + {"afterClusterTime", ts}, } + + // We never want to read before the change stream start time, + // or for the last generation, the change stream end time. + cmd = append( + cmd, + bson.E{"readConcern", readConcern}, + ) } // Suppress this log for recheck tasks because the list of IDs can be diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 564e33a2..82db17fc 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -13,7 +13,6 @@ import ( type changeEventBatch struct { events []ParsedEvent resumeToken bson.Raw - clusterTime bson.Timestamp } // RunChangeEventPersistor persists rechecks from change event batches. @@ -166,14 +165,12 @@ func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeE } latestTimestampTime := time.Unix(int64(latestTimestamp.T), 0) - lag := time.Unix(int64(batch.clusterTime.T), 0).Sub(latestTimestampTime) verifier.logger.Trace(). Str("origin", string(eventOrigin)). Int("count", len(docIDs)). Any("latestTimestamp", latestTimestamp). Time("latestTimestampTime", latestTimestampTime). - Stringer("lag", lag). 
Msg("Persisting rechecks for change events.") return verifier.insertRecheckDocs(ctx, dbNames, collNames, docIDs, dataSizes) From 263c9e70a125048835a03641312b11e24c0e8225 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 15:57:41 -0500 Subject: [PATCH 074/130] fall back to start ts as needed --- internal/verifier/change_reader.go | 4 ++-- internal/verifier/compare.go | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 8fae036c..c9021ecf 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -35,7 +35,7 @@ type changeReader interface { getWhichCluster() whichCluster getReadChannel() <-chan changeEventBatch getStartTimestamp() option.Option[bson.Timestamp] - getLatestTimestamp() option.Option[bson.Timestamp] + getLastSeenClusterTime() option.Option[bson.Timestamp] getEventsPerSecond() option.Option[float64] getLag() option.Option[time.Duration] getBufferSaturation() float64 @@ -108,7 +108,7 @@ func (rc *ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { return rc.changeEventBatchChan } -func (rc *ChangeReaderCommon) getLatestTimestamp() option.Option[bson.Timestamp] { +func (rc *ChangeReaderCommon) getLastSeenClusterTime() option.Option[bson.Timestamp] { return rc.lastChangeEventTime.Load() } diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index d5b8b94d..5e6103f8 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -2,6 +2,7 @@ package verifier import ( "bytes" + "cmp" "context" "fmt" "time" @@ -466,7 +467,10 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.srcClientCollection(task), verifier.srcClusterInfo, - verifier.srcChangeReader.getLatestTimestamp(), + cmp.Or( + verifier.srcChangeReader.getLastSeenClusterTime(), + verifier.srcChangeReader.getStartTimestamp(), + ), task, ) @@ -499,7 +503,10 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.dstClientCollection(task), verifier.dstClusterInfo, - verifier.dstChangeReader.getLatestTimestamp(), + cmp.Or( + verifier.dstChangeReader.getLastSeenClusterTime(), + verifier.dstChangeReader.getStartTimestamp(), + ), task, ) From d009954211cbea466c72d228434dd988ace663ce Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 16:42:08 -0500 Subject: [PATCH 075/130] agg polish --- agg/agg.go | 48 ++++++++++++++++++++----------- internal/verifier/oplog_reader.go | 33 ++++++++++----------- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/agg/agg.go b/agg/agg.go index f1202cf7..c41ec28b 100644 --- a/agg/agg.go +++ b/agg/agg.go @@ -4,38 +4,48 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" ) -func Eq(comparands ...any) bson.D { - return bson.D{{"$eq", comparands}} +type Eq []any + +var _ bson.Marshaler = Eq{} + +func (e Eq) MarshalBSON() ([]byte, error) { + return bson.Marshal(bson.D{{"$eq", e}}) } -func In[T any](needle any, haystack ...T) bson.D { +// --------------------------------------------- + +func In[T any](needle any, haystack []T) bson.D { return bson.D{{"$in", bson.A{needle, haystack}}} } -func BSONSize(ref any) bson.D { - return bson.D{{"$bsonSize", ref}} -} +// --------------------------------------------- -func Type(ref any) bson.D { - return bson.D{{"$type", ref}} -} +type BSONSize [1]any -func Concat(refs ...any) bson.D { - return bson.D{{"$concat", refs}} +var _ bson.Marshaler = BSONSize{} + +func (b BSONSize) MarshalBSON() ([]byte, 
error) { + return bson.Marshal(bson.D{{"$bsonSize", b[0]}}) } // --------------------------------------------- -type Not struct { - Ref any +type Type [1]any + +var _ bson.Marshaler = Type{} + +func (t Type) MarshalBSON() ([]byte, error) { + return bson.Marshal(bson.D{{"$type", t[0]}}) } -var _ bson.Marshaler = Not{} +// --------------------------------------------- + +type Not [1]any + +var _ bson.Marshaler = Type{} func (n Not) MarshalBSON() ([]byte, error) { - return bson.Marshal(bson.D{ - {"$not", n.Ref}, - }) + return bson.Marshal(bson.D{{"$not", n[0]}}) } // --------------------------------------------- @@ -103,6 +113,8 @@ type Switch struct { Default any } +var _ bson.Marshaler = Switch{} + type SwitchCase struct { Case any Then any @@ -126,6 +138,8 @@ type ArrayElemAt struct { Index int } +var _ bson.Marshaler = ArrayElemAt{} + func (a ArrayElemAt) D() bson.D { return bson.D{{"$arrayElemAt", bson.A{ a.Array, diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index bd150348..93bfdbc8 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -13,6 +13,7 @@ import ( "github.com/10gen/migration-verifier/internal/verifier/oplog" "github.com/10gen/migration-verifier/mbson" "github.com/10gen/migration-verifier/mmongo" + "github.com/10gen/migration-verifier/mslices" "github.com/10gen/migration-verifier/option" "github.com/pkg/errors" "github.com/samber/lo" @@ -153,16 +154,16 @@ func (o *OplogReader) createCursor( bson.D{{"$expr", agg.Or{ // plain ops: one write per op append( - agg.And{agg.In("$op", "d", "i", "u")}, + agg.And{agg.In("$op", mslices.Of("d", "i", "u"))}, o.getNSFilter("$$ROOT")..., ), // op=n is for no-ops, so we stay up-to-date. - agg.Eq("$op", "n"), + agg.Eq{"$op", "n"}, // op=c is for applyOps, and also to detect forbidden DDL. 
agg.And{ - agg.Eq("$op", "c"), + agg.Eq{"$op", "c"}, agg.Not{helpers.StringHasPrefix{ FieldRef: "$ns", Prefix: "config.", @@ -206,7 +207,7 @@ func (o *OplogReader) getExprProjection() bson.D { {"docID", getOplogDocIDExpr("$$ROOT")}, {"cmdName", agg.Cond{ - If: agg.Eq("$op", "c"), + If: agg.Eq{"$op", "c"}, Then: agg.ArrayElemAt{ Array: agg.Map{ Input: bson.D{ @@ -222,8 +223,8 @@ func (o *OplogReader) getExprProjection() bson.D { {"o", agg.Cond{ If: agg.And{ - agg.Eq("$op", "c"), - agg.Eq("missing", agg.Type("$o.applyOps")), + agg.Eq{"$op", "c"}, + agg.Eq{"missing", agg.Type{"$o.applyOps"}}, }, Then: "$o", Else: "$$REMOVE", @@ -231,8 +232,8 @@ func (o *OplogReader) getExprProjection() bson.D { {"ops", agg.Cond{ If: agg.And{ - agg.Eq("$op", "c"), - agg.Eq(agg.Type("$o.applyOps"), "array"), + agg.Eq{"$op", "c"}, + agg.Eq{agg.Type{"$o.applyOps"}, "array"}, }, Then: agg.Map{ Input: agg.Filter{ @@ -635,7 +636,7 @@ func (o *OplogReader) getNSFilter(docroot string) agg.And { if len(o.namespaces) > 0 { filter = append( filter, - agg.In(docroot+".ns", o.namespaces...), + agg.In(docroot+".ns", o.namespaces), ) } @@ -645,13 +646,13 @@ func (o *OplogReader) getNSFilter(docroot string) agg.And { func getOplogDocLenExpr(docroot string) any { return agg.Cond{ If: agg.Or{ - agg.Eq(docroot+".op", "i"), + agg.Eq{docroot + ".op", "i"}, agg.And{ - agg.Eq(docroot+".op", "u"), - agg.Not{agg.Eq("missing", docroot+".o._id")}, + agg.Eq{docroot + ".op", "u"}, + agg.Not(agg.Eq{"missing", docroot + ".o._id"}), }, }, - Then: agg.BSONSize(docroot + ".o"), + Then: agg.BSONSize{docroot + ".o"}, Else: "$$REMOVE", } } @@ -661,15 +662,15 @@ func getOplogDocIDExpr(docroot string) any { return agg.Switch{ Branches: []agg.SwitchCase{ { - Case: agg.Eq(docroot+".op", "c"), + Case: agg.Eq{docroot + ".op", "c"}, Then: "$$REMOVE", }, { - Case: agg.In(docroot+".op", "i", "d"), + Case: agg.In(docroot+".op", mslices.Of("i", "d")), Then: docroot + ".o._id", }, { - Case: agg.In(docroot+".op", "u"), + Case: agg.Eq{docroot + ".op", "u"}, Then: docroot + ".o2._id", }, }, From f10c9077969acc0ffd056df3e1bf1be964793ba2 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 16:52:51 -0500 Subject: [PATCH 076/130] fix Eq --- agg/agg.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agg/agg.go b/agg/agg.go index c41ec28b..0216d091 100644 --- a/agg/agg.go +++ b/agg/agg.go @@ -9,7 +9,7 @@ type Eq []any var _ bson.Marshaler = Eq{} func (e Eq) MarshalBSON() ([]byte, error) { - return bson.Marshal(bson.D{{"$eq", e}}) + return bson.Marshal(bson.D{{"$eq", []any(e)}}) } // --------------------------------------------- From f05a4c68585d42bcb5a630a8aa843827e251db83 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 17:06:41 -0500 Subject: [PATCH 077/130] use driver API --- internal/verifier/compare.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 5e6103f8..02b509e3 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -667,18 +667,17 @@ func (verifier *Verifier) getDocumentsCursor( ) } + sess := mongo.SessionFromContext(ctx) + + if sess == nil { + panic("No session?!?") + } + runCommandOptions = runCommandOptions.SetReadPreference(verifier.readPreference) if ts, has := readConcernTS.Get(); has { - readConcern := bson.D{ - {"afterClusterTime", ts}, + if err := sess.AdvanceOperationTime(&ts); err != nil { + return nil, errors.Wrapf(err, "advancing session operation time to %v", ts) } 
- - // We never want to read before the change stream start time, - // or for the last generation, the change stream end time. - cmd = append( - cmd, - bson.E{"readConcern", readConcern}, - ) } // Suppress this log for recheck tasks because the list of IDs can be From cf08833fdf5e3b99971499060aab7a63c88fddff Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 17:07:32 -0500 Subject: [PATCH 078/130] sc --- internal/verifier/compare.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 0d45d9c0..c1256552 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -670,7 +670,7 @@ func (verifier *Verifier) getDocumentsCursor( ) } - sess := mongo.SessionFromContext(ctx) + sess := mongo.SessionFromContext(sctx) if sess == nil { panic("No session?!?") From 22d577525e627c20859f8766bd57491388559844 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 17:15:23 -0500 Subject: [PATCH 079/130] revert --- internal/verifier/compare.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 02b509e3..f2e17b68 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -587,7 +587,6 @@ func (verifier *Verifier) getDocumentsCursor( task *VerificationTask, ) (*mongo.Cursor, error) { var findOptions bson.D - runCommandOptions := options.RunCmd() var andPredicates bson.A var aggOptions bson.D @@ -673,11 +672,19 @@ func (verifier *Verifier) getDocumentsCursor( panic("No session?!?") } - runCommandOptions = runCommandOptions.SetReadPreference(verifier.readPreference) + runCommandOptions := options.RunCmd().SetReadPreference(verifier.readPreference) + if ts, has := readConcernTS.Get(); has { - if err := sess.AdvanceOperationTime(&ts); err != nil { - return nil, errors.Wrapf(err, "advancing session operation time to %v", ts) + readConcern := bson.D{ + {"afterClusterTime", ts}, } + + // We never want to read before the change stream start time, + // or for the last generation, the change stream end time. 
+ cmd = append( + cmd, + bson.E{"readConcern", readConcern}, + ) } // Suppress this log for recheck tasks because the list of IDs can be From f2ca1e6050c552ed51cb8b5df3d65a5fa8815917 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 20:35:31 -0500 Subject: [PATCH 080/130] compulsory timestamp --- internal/verifier/change_reader.go | 10 ++++++--- internal/verifier/change_stream_test.go | 18 +++++---------- internal/verifier/compare.go | 29 ++++++++++--------------- 3 files changed, 24 insertions(+), 33 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index c9021ecf..a3d261f9 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -34,7 +34,7 @@ const ( type changeReader interface { getWhichCluster() whichCluster getReadChannel() <-chan changeEventBatch - getStartTimestamp() option.Option[bson.Timestamp] + getStartTimestamp() bson.Timestamp getLastSeenClusterTime() option.Option[bson.Timestamp] getEventsPerSecond() option.Option[float64] getLag() option.Option[time.Duration] @@ -92,8 +92,12 @@ func (rc *ChangeReaderCommon) getWhichCluster() whichCluster { return rc.readerType } -func (rc *ChangeReaderCommon) getStartTimestamp() option.Option[bson.Timestamp] { - return option.FromPointer(rc.startAtTs) +func (rc *ChangeReaderCommon) getStartTimestamp() bson.Timestamp { + if rc.startAtTs == nil { + panic("no start timestamp yet?!?") + } + + return *rc.startAtTs } func (rc *ChangeReaderCommon) setWritesOff(ts bson.Timestamp) { diff --git a/internal/verifier/change_stream_test.go b/internal/verifier/change_stream_test.go index 5f8e8395..62a39bca 100644 --- a/internal/verifier/change_stream_test.go +++ b/internal/verifier/change_stream_test.go @@ -441,9 +441,7 @@ func (suite *IntegrationTestSuite) TestChangeStreamResumability() { suite.startSrcChangeStreamReaderAndHandler(ctx, verifier2) - startAtTs, hasStartAtTs := verifier2.srcChangeReader.getStartTimestamp().Get() - - suite.Require().True(hasStartAtTs) + startAtTs := verifier2.srcChangeReader.getStartTimestamp() suite.Assert().False( startAtTs.After(newTime), @@ -631,14 +629,13 @@ func (suite *IntegrationTestSuite) TestStartAtTimeNoChanges() { eg := suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) - startAtTs, hasStartAtTs := verifier.srcChangeReader.getStartTimestamp().Get() - suite.Require().True(hasStartAtTs, "startAtTs should be set") + startAtTs := verifier.srcChangeReader.getStartTimestamp() verifier.srcChangeReader.setWritesOff(insertTs) suite.Require().NoError(eg.Wait()) - startAtTs2 := verifier.srcChangeReader.getStartTimestamp().MustGet() + startAtTs2 := verifier.srcChangeReader.getStartTimestamp() suite.Require().False( startAtTs2.Before(startAtTs), @@ -663,8 +660,7 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { suite.Require().NotNil(origSessionTime) eg := suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) - startAtTs, hasStartAtTs := verifier.srcChangeReader.getStartTimestamp().Get() - suite.Require().True(hasStartAtTs, "startAtTs should be set") + startAtTs := verifier.srcChangeReader.getStartTimestamp() // srcStartAtTs derives from the change stream’s resume token, which can // postdate our session time but should not precede it. 
@@ -697,8 +693,7 @@ func (suite *IntegrationTestSuite) TestStartAtTimeWithChanges() { suite.Require().NoError(eg.Wait()) - startAtTs, hasStartAtTs = verifier.srcChangeReader.getStartTimestamp().Get() - suite.Require().True(hasStartAtTs, "startAtTs should be set") + startAtTs = verifier.srcChangeReader.getStartTimestamp() suite.Assert().Equal( *postEventsSessionTime, @@ -720,8 +715,7 @@ func (suite *IntegrationTestSuite) TestNoStartAtTime() { suite.Require().NotNil(origStartTs) suite.startSrcChangeStreamReaderAndHandler(ctx, verifier) - startAtTs, hasStartAtTs := verifier.srcChangeReader.getStartTimestamp().Get() - suite.Require().True(hasStartAtTs, "startAtTs should be set") + startAtTs := verifier.srcChangeReader.getStartTimestamp() suite.Require().NotNil(startAtTs) suite.Require().LessOrEqual(origStartTs.Compare(startAtTs), 0) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index f2e17b68..da9988a1 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -2,7 +2,6 @@ package verifier import ( "bytes" - "cmp" "context" "fmt" "time" @@ -467,8 +466,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.srcClientCollection(task), verifier.srcClusterInfo, - cmp.Or( - verifier.srcChangeReader.getLastSeenClusterTime(), + verifier.srcChangeReader.getLastSeenClusterTime().OrElse( verifier.srcChangeReader.getStartTimestamp(), ), task, @@ -503,8 +501,7 @@ func (verifier *Verifier) getFetcherChannelsAndCallbacks( sctx, verifier.dstClientCollection(task), verifier.dstClusterInfo, - cmp.Or( - verifier.dstChangeReader.getLastSeenClusterTime(), + verifier.dstChangeReader.getLastSeenClusterTime().OrElse( verifier.dstChangeReader.getStartTimestamp(), ), task, @@ -583,7 +580,7 @@ func (verifier *Verifier) getDocumentsCursor( ctx context.Context, collection *mongo.Collection, clusterInfo *util.ClusterInfo, - readConcernTS option.Option[bson.Timestamp], + readConcernTS bson.Timestamp, task *VerificationTask, ) (*mongo.Cursor, error) { var findOptions bson.D @@ -674,18 +671,14 @@ func (verifier *Verifier) getDocumentsCursor( runCommandOptions := options.RunCmd().SetReadPreference(verifier.readPreference) - if ts, has := readConcernTS.Get(); has { - readConcern := bson.D{ - {"afterClusterTime", ts}, - } - - // We never want to read before the change stream start time, - // or for the last generation, the change stream end time. - cmd = append( - cmd, - bson.E{"readConcern", readConcern}, - ) - } + // We never want to read before the change stream start time, + // or for the last generation, the change stream end time. + cmd = append( + cmd, + bson.E{"readConcern", bson.D{ + {"afterClusterTime", readConcernTS}, + }}, + ) // Suppress this log for recheck tasks because the list of IDs can be // quite long. 
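A minimal, illustrative sketch of the read pattern the preceding patches converge on: run `find` as a raw command and attach readConcern.afterClusterTime, taken from the change reader's last-seen (or start) cluster time, so a document read can never observe state older than the change events already handled. This is not code from the series; the function and parameter names (findAfterClusterTime, readTS) are assumptions, while options.RunCmd, SetReadPreference, and Database.RunCommandCursor are the v2 Go driver's public API.

package example

import (
	"context"

	"go.mongodb.org/mongo-driver/v2/bson"
	"go.mongodb.org/mongo-driver/v2/mongo"
	"go.mongodb.org/mongo-driver/v2/mongo/options"
	"go.mongodb.org/mongo-driver/v2/mongo/readpref"
)

// findAfterClusterTime issues `find` as a raw command so an explicit
// readConcern can be attached. afterClusterTime makes the server wait until
// its view of the data includes readTS; a later patch in this series also
// sets level: majority, which is reflected here.
func findAfterClusterTime(
	ctx context.Context,
	db *mongo.Database,
	collName string,
	filter bson.D,
	readTS bson.Timestamp,
) (*mongo.Cursor, error) {
	cmd := bson.D{
		{"find", collName},
		{"filter", filter},
		{"readConcern", bson.D{
			{"level", "majority"},
			{"afterClusterTime", readTS},
		}},
	}

	// Read preference rides on the runCommand options, mirroring
	// getDocumentsCursor; secondaryPreferred here is only an example.
	opts := options.RunCmd().SetReadPreference(readpref.SecondaryPreferred())

	return db.RunCommandCursor(ctx, cmd, opts)
}

Compared with the session-based AdvanceOperationTime approach tried in patch 077 and reverted in patch 079, the explicit readConcern keeps the causal guarantee visible in the command itself.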
From 22e5072efbfa531684515287578fdb5eb76980e7 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 20:47:14 -0500 Subject: [PATCH 081/130] start handling --- internal/verifier/check.go | 48 ++++++++++++++++++------------- internal/verifier/compare_test.go | 3 +- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 0405ffcc..b606ee84 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -251,28 +251,10 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh verifier.phase = Idle }() - changeReaderGroup, groupCtx := contextplus.ErrGroup(ctx) - for _, changeReader := range mslices.Of(verifier.srcChangeReader, verifier.dstChangeReader) { - if changeReader.isRunning() { - verifier.logger.Debug().Msgf("Check: %s already running.", changeReader) - } else { - verifier.logger.Debug().Msgf("%s not running; starting change reader", changeReader) - - err = changeReader.start(groupCtx, changeReaderGroup) - if err != nil { - return errors.Wrapf(err, "failed to start %s", changeReader) - } - changeReaderGroup.Go(func() error { - return verifier.RunChangeEventPersistor(groupCtx, changeReader) - }) - } + if err := verifier.startChangeHandling(ctx); err != nil { + return err } - changeHandlingErr := verifier.changeHandlingErr - go func() { - changeHandlingErr.Set(changeReaderGroup.Wait()) - }() - // Log the verification status when initially booting up so it's easy to see the current state verificationStatus, err := verifier.GetVerificationStatus(ctx) if err != nil { @@ -405,6 +387,32 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh } } +func (verifier *Verifier) startChangeHandling(ctx context.Context) error { + changeReaderGroup, groupCtx := contextplus.ErrGroup(ctx) + for _, changeReader := range mslices.Of(verifier.srcChangeReader, verifier.dstChangeReader) { + if changeReader.isRunning() { + verifier.logger.Debug().Msgf("Check: %s already running.", changeReader) + } else { + verifier.logger.Debug().Msgf("%s not running; starting change reader", changeReader) + + err := changeReader.start(groupCtx, changeReaderGroup) + if err != nil { + return errors.Wrapf(err, "failed to start %s", changeReader) + } + changeReaderGroup.Go(func() error { + return verifier.RunChangeEventPersistor(groupCtx, changeReader) + }) + } + } + + changeHandlingErr := verifier.changeHandlingErr + go func() { + changeHandlingErr.Set(changeReaderGroup.Wait()) + }() + + return nil +} + func (verifier *Verifier) setupAllNamespaceList(ctx context.Context) error { // We want to check all user collections on both source and dest. srcNamespaces, err := ListAllUserNamespaces(ctx, verifier.logger, verifier.srcClient, verifier.metaDBName) diff --git a/internal/verifier/compare_test.go b/internal/verifier/compare_test.go index 884e38a6..15686ba6 100644 --- a/internal/verifier/compare_test.go +++ b/internal/verifier/compare_test.go @@ -14,7 +14,7 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" ) -// TestFetchAndCompareDocuments_ContextCancellation ensures that nothing hangs +// TestFetchAndCompareDocuments_Context ensures that nothing hangs // when a context is canceled during FetchAndCompareDocuments(). 
func (s *IntegrationTestSuite) TestFetchAndCompareDocuments_Context() { ctx := s.Context() @@ -49,6 +49,7 @@ func (s *IntegrationTestSuite) TestFetchAndCompareDocuments_Context() { } verifier := s.BuildVerifier() + s.Require().NoError(verifier.startChangeHandling(ctx)) for range 100 { cancelableCtx, cancel := contextplus.WithCancelCause(ctx) From 88b065708c2291e0442ec3a846ea43ee591a3a7f Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 20:51:55 -0500 Subject: [PATCH 082/130] another --- internal/verifier/migration_verifier_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 396077bf..631ea2d4 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -126,6 +126,8 @@ func (suite *IntegrationTestSuite) TestProcessVerifyTask_Failure() { ctx := suite.Context() t := suite.T() + suite.Require().NoError(verifier.startChangeHandling(ctx)) + dbName := suite.DBNameForTest() collName := "coll" From ae85756fe4c4ed3b635f62cb99ce58a0e6cc1f71 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 20:56:50 -0500 Subject: [PATCH 083/130] fix another --- internal/verifier/migration_verifier_test.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 631ea2d4..0e4050bb 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -379,8 +379,6 @@ func (suite *IntegrationTestSuite) TestVerifier_DocFilter_ObjectID() { } func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { - verifier := suite.BuildVerifier() - ctx := suite.Context() task := &VerificationTask{ PrimaryKey: bson.NewObjectID(), @@ -396,20 +394,25 @@ func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { }, } - _, err := verifier.srcClient.Database("keyhole").Collection("dealers").InsertMany(ctx, []any{ + _, err := suite.srcMongoClient.Database("keyhole").Collection("dealers").InsertMany(ctx, []any{ bson.D{{"_id", nil}}, bson.D{{"_id", int32(123)}}, bson.D{{"_id", bson.Symbol("oh yeah")}}, }) suite.Require().NoError(err) - _, err = verifier.dstClient.Database("keyhole").Collection("dealers").InsertMany(ctx, []any{ + _, err = suite.dstMongoClient.Database("keyhole").Collection("dealers").InsertMany(ctx, []any{ bson.D{{"_id", nil}}, bson.D{{"_id", int32(123)}}, bson.D{{"_id", "oh yeah"}}, }) suite.Require().NoError(err) + verifier := suite.BuildVerifier() + ctx := suite.Context() + + suite.Require().NoError(verifier.startChangeHandling(ctx)) + cases := []struct { label string lower, upper any From 37373ac1f0c373a8c87b53852db0ef31eb7c22d2 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 20:57:11 -0500 Subject: [PATCH 084/130] fix --- internal/verifier/migration_verifier_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 0e4050bb..b67e86cc 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -379,6 +379,7 @@ func (suite *IntegrationTestSuite) TestVerifier_DocFilter_ObjectID() { } func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { + ctx := suite.Context() task := &VerificationTask{ PrimaryKey: bson.NewObjectID(), From b30e9a898c26f32251b60b0af88106d8431fbda0 Mon Sep 17 00:00:00 
2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 20:59:57 -0500 Subject: [PATCH 085/130] fix again --- internal/verifier/migration_verifier_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index b67e86cc..40f51f26 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -410,7 +410,6 @@ func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { suite.Require().NoError(err) verifier := suite.BuildVerifier() - ctx := suite.Context() suite.Require().NoError(verifier.startChangeHandling(ctx)) From 0546f156ea3b50dbd418eaa8376dfbf79fc953a8 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 21:09:54 -0500 Subject: [PATCH 086/130] fix test --- internal/verifier/migration_verifier_test.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 40f51f26..c83de5c7 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -490,12 +490,12 @@ func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { } func (suite *IntegrationTestSuite) TestVerifierFetchDocuments() { - verifier := suite.BuildVerifier() + ctx := suite.Context() drop := func() { - err := verifier.srcClient.Database("keyhole").Drop(ctx) + err := suite.srcMongoClient.Database("keyhole").Drop(ctx) suite.Require().NoError(err) - err = verifier.dstClient.Database("keyhole").Drop(ctx) + err = suite.dstMongoClient.Database("keyhole").Drop(ctx) suite.Require().NoError(err) } drop() @@ -511,12 +511,12 @@ func (suite *IntegrationTestSuite) TestVerifierFetchDocuments() { } id := rand.Intn(1000) - _, err := verifier.srcClient.Database("keyhole").Collection("dealers").InsertMany(ctx, []any{ + _, err := suite.srcMongoClient.Database("keyhole").Collection("dealers").InsertMany(ctx, []any{ bson.D{{"_id", id}, {"num", 99}, {"name", "srcTest"}}, bson.D{{"_id", id + 1}, {"num", 101}, {"name", "srcTest"}}, }) suite.Require().NoError(err) - _, err = verifier.dstClient.Database("keyhole").Collection("dealers").InsertMany(ctx, []any{ + _, err = suite.dstMongoClient.Database("keyhole").Collection("dealers").InsertMany(ctx, []any{ bson.D{{"_id", id}, {"num", 99}, {"name", "dstTest"}}, bson.D{{"_id", id + 1}, {"num", 101}, {"name", "dstTest"}}, }) @@ -528,6 +528,9 @@ func (suite *IntegrationTestSuite) TestVerifierFetchDocuments() { QueryFilter: basicQueryFilter("keyhole.dealers"), } + verifier := suite.BuildVerifier() + suite.Require().NoError(verifier.startChangeHandling(ctx)) + // Test fetchDocuments without global filter. 
verifier.globalFilter = nil results, docCount, byteCount, err := verifier.FetchAndCompareDocuments(ctx, 0, task) From 1ffb5f69a205b7a0407b58244f14c4b524e7ba93 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 21:16:32 -0500 Subject: [PATCH 087/130] more --- internal/verifier/migration_verifier_test.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index c83de5c7..deb77e4f 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -332,15 +332,15 @@ func getShardIds(t *testing.T, client *mongo.Client) []string { } func (suite *IntegrationTestSuite) TestVerifier_DocFilter_ObjectID() { - verifier := suite.BuildVerifier() + ctx := suite.Context() t := suite.T() dbName := suite.DBNameForTest() collName := "coll" - srcColl := verifier.srcClient.Database(dbName).Collection(collName) - dstColl := verifier.dstClient.Database(dbName).Collection(collName) + srcColl := suite.srcMongoClient.Database(dbName).Collection(collName) + dstColl := suite.dstMongoClient.Database(dbName).Collection(collName) id1 := bson.NewObjectID() _, err := srcColl.InsertOne(ctx, bson.D{{"_id", id1}}) @@ -364,6 +364,9 @@ func (suite *IntegrationTestSuite) TestVerifier_DocFilter_ObjectID() { }, } + verifier := suite.BuildVerifier() + suite.Require().NoError(verifier.startChangeHandling(ctx)) + verifier.globalFilter = bson.D{{"_id", id1}} results, docCount, _, err := verifier.FetchAndCompareDocuments(ctx, 0, task) From a463151398b210223f266f00fd70c0f8582a5d91 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 21:21:35 -0500 Subject: [PATCH 088/130] move --- internal/verifier/migration_verifier_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index deb77e4f..c8a4ddd7 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -384,6 +384,10 @@ func (suite *IntegrationTestSuite) TestVerifier_DocFilter_ObjectID() { func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { ctx := suite.Context() + verifier := suite.BuildVerifier() + + suite.Require().NoError(verifier.startChangeHandling(ctx)) + task := &VerificationTask{ PrimaryKey: bson.NewObjectID(), QueryFilter: QueryFilter{ @@ -412,10 +416,6 @@ func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { }) suite.Require().NoError(err) - verifier := suite.BuildVerifier() - - suite.Require().NoError(verifier.startChangeHandling(ctx)) - cases := []struct { label string lower, upper any From a5ab5031dd82d1df451a8b3f0fe6acef5d0f89e3 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 21:22:55 -0500 Subject: [PATCH 089/130] save --- internal/verifier/migration_verifier_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index c8a4ddd7..6f51bb70 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -493,8 +493,11 @@ func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { } func (suite *IntegrationTestSuite) TestVerifierFetchDocuments() { - ctx := suite.Context() + + verifier := suite.BuildVerifier() + suite.Require().NoError(verifier.startChangeHandling(ctx)) + drop := func() { err := 
suite.srcMongoClient.Database("keyhole").Drop(ctx) suite.Require().NoError(err) @@ -531,9 +534,6 @@ func (suite *IntegrationTestSuite) TestVerifierFetchDocuments() { QueryFilter: basicQueryFilter("keyhole.dealers"), } - verifier := suite.BuildVerifier() - suite.Require().NoError(verifier.startChangeHandling(ctx)) - // Test fetchDocuments without global filter. verifier.globalFilter = nil results, docCount, byteCount, err := verifier.FetchAndCompareDocuments(ctx, 0, task) From fe2224230a352f6f237b80ef2ef2930aeef2acf9 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Fri, 14 Nov 2025 21:31:36 -0500 Subject: [PATCH 090/130] ctx --- internal/verifier/migration_verifier_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 6f51bb70..4d70ccd8 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -304,6 +304,7 @@ func (suite *IntegrationTestSuite) TestVerifier_Dotted_Shard_Key() { } verifier := suite.BuildVerifier() + suite.Require().NoError(verifier.startChangeHandling(ctx)) results, docCount, _, err := verifier.FetchAndCompareDocuments(ctx, 0, task) require.NoError(err, "should fetch & compare") assert.EqualValues(suite.T(), len(docs), docCount, "expected # of docs") From 474ae718a56bf382941208ad87741893eb67d262 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Sun, 16 Nov 2025 12:58:23 -0500 Subject: [PATCH 091/130] comment --- internal/verifier/check.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/verifier/check.go b/internal/verifier/check.go index b606ee84..be003ba6 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -387,6 +387,12 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter bson.D, testCh } } +// startChangeHandling starts the goroutines that read changes +// from the source & destination and that persist those changes +// to the metadata. +// +// As part of this, it sets the change readers’ start timestamps. +// (It blocks until those are set.) 
func (verifier *Verifier) startChangeHandling(ctx context.Context) error { changeReaderGroup, groupCtx := contextplus.ErrGroup(ctx) for _, changeReader := range mslices.Of(verifier.srcChangeReader, verifier.dstChangeReader) { From d0385824d58797530ed964f9f99533ed717cbbfc Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 17 Nov 2025 23:40:57 -0500 Subject: [PATCH 092/130] try 3.6 here --- .github/workflows/all.yml | 118 ++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 56 deletions(-) diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index 280ee945..c30e94a2 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -28,40 +28,46 @@ jobs: - mongodb_versions: [ '4.2.5', '6.0' ] topology: replset - - mongodb_versions: [ '4.0', '8.0' ] - topology: replset - srcChangeReader: tailOplog - dstChangeReader: tailOplog - - - mongodb_versions: [ '4.2', '8.0' ] - topology: replset - srcChangeReader: tailOplog - dstChangeReader: tailOplog - - - mongodb_versions: [ '4.4', '8.0' ] - topology: replset - srcChangeReader: tailOplog - dstChangeReader: tailOplog - - - mongodb_versions: [ '5.0', '8.0' ] - topology: replset - srcChangeReader: tailOplog - dstChangeReader: tailOplog - - - mongodb_versions: [ '6.0', '8.0' ] - topology: replset - srcChangeReader: tailOplog - dstChangeReader: tailOplog - - - mongodb_versions: [ '7.0', '8.0' ] - topology: replset - srcChangeReader: tailOplog - dstChangeReader: tailOplog - - - mongodb_versions: [ '8.0', '8.0' ] + - mongodb_versions: [ '3.6', '8.0' ] topology: replset srcChangeReader: tailOplog dstChangeReader: tailOplog + toHashedIndexKey: false + +# - mongodb_versions: [ '4.0', '8.0' ] +# topology: replset +# srcChangeReader: tailOplog +# dstChangeReader: tailOplog +# +# - mongodb_versions: [ '4.2', '8.0' ] +# topology: replset +# srcChangeReader: tailOplog +# dstChangeReader: tailOplog +# +# - mongodb_versions: [ '4.4', '8.0' ] +# topology: replset +# srcChangeReader: tailOplog +# dstChangeReader: tailOplog +# +# - mongodb_versions: [ '5.0', '8.0' ] +# topology: replset +# srcChangeReader: tailOplog +# dstChangeReader: tailOplog +# +# - mongodb_versions: [ '6.0', '8.0' ] +# topology: replset +# srcChangeReader: tailOplog +# dstChangeReader: tailOplog +# +# - mongodb_versions: [ '7.0', '8.0' ] +# topology: replset +# srcChangeReader: tailOplog +# dstChangeReader: tailOplog +# +# - mongodb_versions: [ '8.0', '8.0' ] +# topology: replset +# srcChangeReader: tailOplog +# dstChangeReader: tailOplog exclude: - mongodb_versions: [ '4.2', '4.2' ] @@ -78,29 +84,29 @@ jobs: # versions are: source, destination mongodb_versions: - [ '4.2', '4.2' ] - - [ '4.2', '4.4' ] - - [ '4.2', '5.0' ] - - [ '4.2', '6.0' ] - - [ '4.2', '8.0' ] - - - [ '4.4', '4.4' ] - - [ '4.4', '5.0' ] - - [ '4.4', '6.0' ] - - [ '4.4', '8.0' ] - - - [ '5.0', '5.0' ] - - [ '5.0', '6.0' ] - - [ '5.0', '7.0' ] - - [ '5.0', '8.0' ] - - - [ '6.0', '6.0' ] - - [ '6.0', '7.0' ] - - [ '6.0', '8.0' ] - - - [ '7.0', '7.0' ] - - [ '7.0', '8.0' ] - - - [ '8.0', '8.0' ] +# - [ '4.2', '4.4' ] +# - [ '4.2', '5.0' ] +# - [ '4.2', '6.0' ] +# - [ '4.2', '8.0' ] +# +# - [ '4.4', '4.4' ] +# - [ '4.4', '5.0' ] +# - [ '4.4', '6.0' ] +# - [ '4.4', '8.0' ] +# +# - [ '5.0', '5.0' ] +# - [ '5.0', '6.0' ] +# - [ '5.0', '7.0' ] +# - [ '5.0', '8.0' ] +# +# - [ '6.0', '6.0' ] +# - [ '6.0', '7.0' ] +# - [ '6.0', '8.0' ] +# +# - [ '7.0', '7.0' ] +# - [ '7.0', '8.0' ] +# +# - [ '8.0', '8.0' ] toHashedIndexKey: [true, false] @@ -109,8 +115,8 @@ jobs: topology: - replset - - 
replset-to-sharded - - sharded +# - replset-to-sharded +# - sharded # Ubuntu 24 lacks OpenSSL 1.1.1’s libcrypto, which pre-v6 MongoDB # versions need. From ab63521cbcbe6ea2b8e77c175f1acaa98cd76408 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 17 Nov 2025 23:45:06 -0500 Subject: [PATCH 093/130] Revert "try 3.6 here" This reverts commit d0385824d58797530ed964f9f99533ed717cbbfc. --- .github/workflows/all.yml | 118 ++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 62 deletions(-) diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index c30e94a2..280ee945 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -28,46 +28,40 @@ jobs: - mongodb_versions: [ '4.2.5', '6.0' ] topology: replset - - mongodb_versions: [ '3.6', '8.0' ] + - mongodb_versions: [ '4.0', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '4.2', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '4.4', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '5.0', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '6.0', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '7.0', '8.0' ] + topology: replset + srcChangeReader: tailOplog + dstChangeReader: tailOplog + + - mongodb_versions: [ '8.0', '8.0' ] topology: replset srcChangeReader: tailOplog dstChangeReader: tailOplog - toHashedIndexKey: false - -# - mongodb_versions: [ '4.0', '8.0' ] -# topology: replset -# srcChangeReader: tailOplog -# dstChangeReader: tailOplog -# -# - mongodb_versions: [ '4.2', '8.0' ] -# topology: replset -# srcChangeReader: tailOplog -# dstChangeReader: tailOplog -# -# - mongodb_versions: [ '4.4', '8.0' ] -# topology: replset -# srcChangeReader: tailOplog -# dstChangeReader: tailOplog -# -# - mongodb_versions: [ '5.0', '8.0' ] -# topology: replset -# srcChangeReader: tailOplog -# dstChangeReader: tailOplog -# -# - mongodb_versions: [ '6.0', '8.0' ] -# topology: replset -# srcChangeReader: tailOplog -# dstChangeReader: tailOplog -# -# - mongodb_versions: [ '7.0', '8.0' ] -# topology: replset -# srcChangeReader: tailOplog -# dstChangeReader: tailOplog -# -# - mongodb_versions: [ '8.0', '8.0' ] -# topology: replset -# srcChangeReader: tailOplog -# dstChangeReader: tailOplog exclude: - mongodb_versions: [ '4.2', '4.2' ] @@ -84,29 +78,29 @@ jobs: # versions are: source, destination mongodb_versions: - [ '4.2', '4.2' ] -# - [ '4.2', '4.4' ] -# - [ '4.2', '5.0' ] -# - [ '4.2', '6.0' ] -# - [ '4.2', '8.0' ] -# -# - [ '4.4', '4.4' ] -# - [ '4.4', '5.0' ] -# - [ '4.4', '6.0' ] -# - [ '4.4', '8.0' ] -# -# - [ '5.0', '5.0' ] -# - [ '5.0', '6.0' ] -# - [ '5.0', '7.0' ] -# - [ '5.0', '8.0' ] -# -# - [ '6.0', '6.0' ] -# - [ '6.0', '7.0' ] -# - [ '6.0', '8.0' ] -# -# - [ '7.0', '7.0' ] -# - [ '7.0', '8.0' ] -# -# - [ '8.0', '8.0' ] + - [ '4.2', '4.4' ] + - [ '4.2', '5.0' ] + - [ '4.2', '6.0' ] + - [ '4.2', '8.0' ] + + - [ '4.4', '4.4' ] + - [ '4.4', '5.0' ] + - [ '4.4', '6.0' ] + - [ '4.4', '8.0' ] + + - [ '5.0', '5.0' ] + - [ '5.0', '6.0' ] + - [ '5.0', '7.0' ] + - [ '5.0', '8.0' ] + + - [ '6.0', '6.0' ] + - [ '6.0', '7.0' ] + - [ '6.0', '8.0' ] + + - [ '7.0', '7.0' ] + - [ '7.0', '8.0' ] + + - [ '8.0', '8.0' ] toHashedIndexKey: [true, false] @@ -115,8 +109,8 @@ jobs: topology: - replset -# - 
replset-to-sharded -# - sharded + - replset-to-sharded + - sharded # Ubuntu 24 lacks OpenSSL 1.1.1’s libcrypto, which pre-v6 MongoDB # versions need. From 1a70ae78a0df580656d85ad66e32bde426069ee0 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 17 Nov 2025 23:46:45 -0500 Subject: [PATCH 094/130] add --- .../v2/x/mongo/driver/auth/mongodbcr.go | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 vendor/go.mongodb.org/mongo-driver/v2/x/mongo/driver/auth/mongodbcr.go diff --git a/vendor/go.mongodb.org/mongo-driver/v2/x/mongo/driver/auth/mongodbcr.go b/vendor/go.mongodb.org/mongo-driver/v2/x/mongo/driver/auth/mongodbcr.go new file mode 100644 index 00000000..f8c1466d --- /dev/null +++ b/vendor/go.mongodb.org/mongo-driver/v2/x/mongo/driver/auth/mongodbcr.go @@ -0,0 +1,120 @@ +// Copyright (C) MongoDB, Inc. 2017-present. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. You may obtain +// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +package auth + +import ( + "context" + "fmt" + "io" + "net/http" + + // Ignore gosec warning "Blocklisted import crypto/md5: weak cryptographic primitive". We need + // to use MD5 here to implement the MONGODB-CR specification. + /* #nosec G501 */ + "crypto/md5" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/x/bsonx/bsoncore" + "go.mongodb.org/mongo-driver/v2/x/mongo/driver" + "go.mongodb.org/mongo-driver/v2/x/mongo/driver/operation" +) + +// MONGODBCR is the mechanism name for MONGODB-CR. +// +// The MONGODB-CR authentication mechanism is deprecated in MongoDB 3.6 and removed in +// MongoDB 4.0. +const MONGODBCR = "MONGODB-CR" + +func newMongoDBCRAuthenticator(cred *Cred, _ *http.Client) (Authenticator, error) { + source := cred.Source + if source == "" { + source = "admin" + } + return &MongoDBCRAuthenticator{ + DB: source, + Username: cred.Username, + Password: cred.Password, + }, nil +} + +// MongoDBCRAuthenticator uses the MONGODB-CR algorithm to authenticate a connection. +// +// The MONGODB-CR authentication mechanism is deprecated in MongoDB 3.6 and removed in +// MongoDB 4.0. +type MongoDBCRAuthenticator struct { + DB string + Username string + Password string +} + +// Auth authenticates the connection. +// +// The MONGODB-CR authentication mechanism is deprecated in MongoDB 3.6 and removed in +// MongoDB 4.0. +func (a *MongoDBCRAuthenticator) Auth(ctx context.Context, cfg *driver.AuthConfig) error { + + db := a.DB + if db == "" { + db = defaultAuthDB + } + + doc := bsoncore.BuildDocumentFromElements(nil, bsoncore.AppendInt32Element(nil, "getnonce", 1)) + cmd := operation.NewCommand(doc). + Database(db). + Deployment(driver.SingleConnectionDeployment{C: cfg.Connection}). + ClusterClock(cfg.ClusterClock). + ServerAPI(cfg.ServerAPI) + err := cmd.Execute(ctx) + if err != nil { + return newError(err, MONGODBCR) + } + rdr := cmd.Result() + + var getNonceResult struct { + Nonce string `bson:"nonce"` + } + + err = bson.Unmarshal(rdr, &getNonceResult) + if err != nil { + return newAuthError("unmarshal error", err) + } + + doc = bsoncore.BuildDocumentFromElements(nil, + bsoncore.AppendInt32Element(nil, "authenticate", 1), + bsoncore.AppendStringElement(nil, "user", a.Username), + bsoncore.AppendStringElement(nil, "nonce", getNonceResult.Nonce), + bsoncore.AppendStringElement(nil, "key", a.createKey(getNonceResult.Nonce)), + ) + cmd = operation.NewCommand(doc). + Database(db). 
+ Deployment(driver.SingleConnectionDeployment{C: cfg.Connection}). + ClusterClock(cfg.ClusterClock). + ServerAPI(cfg.ServerAPI) + err = cmd.Execute(ctx) + if err != nil { + return newError(err, MONGODBCR) + } + + return nil +} + +// Reauth reauthenticates the connection. +func (a *MongoDBCRAuthenticator) Reauth(_ context.Context, _ *driver.AuthConfig) error { + return newAuthError("MONGODB-CR does not support reauthentication", nil) +} + +func (a *MongoDBCRAuthenticator) createKey(nonce string) string { + // Ignore gosec warning "Use of weak cryptographic primitive". We need to use MD5 here to + // implement the MONGODB-CR specification. + /* #nosec G401 */ + h := md5.New() + + _, _ = io.WriteString(h, nonce) + _, _ = io.WriteString(h, a.Username) + _, _ = io.WriteString(h, mongoPasswordDigest(a.Username, a.Password)) + return fmt.Sprintf("%x", h.Sum(nil)) +} From b2390d841bab1ed95751ae366fa414a45af7d3f1 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 17 Nov 2025 23:47:13 -0500 Subject: [PATCH 095/130] Revert "add" This reverts commit 1a70ae78a0df580656d85ad66e32bde426069ee0. --- .../v2/x/mongo/driver/auth/mongodbcr.go | 120 ------------------ 1 file changed, 120 deletions(-) delete mode 100644 vendor/go.mongodb.org/mongo-driver/v2/x/mongo/driver/auth/mongodbcr.go diff --git a/vendor/go.mongodb.org/mongo-driver/v2/x/mongo/driver/auth/mongodbcr.go b/vendor/go.mongodb.org/mongo-driver/v2/x/mongo/driver/auth/mongodbcr.go deleted file mode 100644 index f8c1466d..00000000 --- a/vendor/go.mongodb.org/mongo-driver/v2/x/mongo/driver/auth/mongodbcr.go +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (C) MongoDB, Inc. 2017-present. -// -// Licensed under the Apache License, Version 2.0 (the "License"); you may -// not use this file except in compliance with the License. You may obtain -// a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - -package auth - -import ( - "context" - "fmt" - "io" - "net/http" - - // Ignore gosec warning "Blocklisted import crypto/md5: weak cryptographic primitive". We need - // to use MD5 here to implement the MONGODB-CR specification. - /* #nosec G501 */ - "crypto/md5" - - "go.mongodb.org/mongo-driver/v2/bson" - "go.mongodb.org/mongo-driver/v2/x/bsonx/bsoncore" - "go.mongodb.org/mongo-driver/v2/x/mongo/driver" - "go.mongodb.org/mongo-driver/v2/x/mongo/driver/operation" -) - -// MONGODBCR is the mechanism name for MONGODB-CR. -// -// The MONGODB-CR authentication mechanism is deprecated in MongoDB 3.6 and removed in -// MongoDB 4.0. -const MONGODBCR = "MONGODB-CR" - -func newMongoDBCRAuthenticator(cred *Cred, _ *http.Client) (Authenticator, error) { - source := cred.Source - if source == "" { - source = "admin" - } - return &MongoDBCRAuthenticator{ - DB: source, - Username: cred.Username, - Password: cred.Password, - }, nil -} - -// MongoDBCRAuthenticator uses the MONGODB-CR algorithm to authenticate a connection. -// -// The MONGODB-CR authentication mechanism is deprecated in MongoDB 3.6 and removed in -// MongoDB 4.0. -type MongoDBCRAuthenticator struct { - DB string - Username string - Password string -} - -// Auth authenticates the connection. -// -// The MONGODB-CR authentication mechanism is deprecated in MongoDB 3.6 and removed in -// MongoDB 4.0. -func (a *MongoDBCRAuthenticator) Auth(ctx context.Context, cfg *driver.AuthConfig) error { - - db := a.DB - if db == "" { - db = defaultAuthDB - } - - doc := bsoncore.BuildDocumentFromElements(nil, bsoncore.AppendInt32Element(nil, "getnonce", 1)) - cmd := operation.NewCommand(doc). 
- Database(db). - Deployment(driver.SingleConnectionDeployment{C: cfg.Connection}). - ClusterClock(cfg.ClusterClock). - ServerAPI(cfg.ServerAPI) - err := cmd.Execute(ctx) - if err != nil { - return newError(err, MONGODBCR) - } - rdr := cmd.Result() - - var getNonceResult struct { - Nonce string `bson:"nonce"` - } - - err = bson.Unmarshal(rdr, &getNonceResult) - if err != nil { - return newAuthError("unmarshal error", err) - } - - doc = bsoncore.BuildDocumentFromElements(nil, - bsoncore.AppendInt32Element(nil, "authenticate", 1), - bsoncore.AppendStringElement(nil, "user", a.Username), - bsoncore.AppendStringElement(nil, "nonce", getNonceResult.Nonce), - bsoncore.AppendStringElement(nil, "key", a.createKey(getNonceResult.Nonce)), - ) - cmd = operation.NewCommand(doc). - Database(db). - Deployment(driver.SingleConnectionDeployment{C: cfg.Connection}). - ClusterClock(cfg.ClusterClock). - ServerAPI(cfg.ServerAPI) - err = cmd.Execute(ctx) - if err != nil { - return newError(err, MONGODBCR) - } - - return nil -} - -// Reauth reauthenticates the connection. -func (a *MongoDBCRAuthenticator) Reauth(_ context.Context, _ *driver.AuthConfig) error { - return newAuthError("MONGODB-CR does not support reauthentication", nil) -} - -func (a *MongoDBCRAuthenticator) createKey(nonce string) string { - // Ignore gosec warning "Use of weak cryptographic primitive". We need to use MD5 here to - // implement the MONGODB-CR specification. - /* #nosec G401 */ - h := md5.New() - - _, _ = io.WriteString(h, nonce) - _, _ = io.WriteString(h, a.Username) - _, _ = io.WriteString(h, mongoPasswordDigest(a.Username, a.Password)) - return fmt.Sprintf("%x", h.Sum(nil)) -} From 4631895ab53e735e35c41d5b90b9bff62c5e1542 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 18 Nov 2025 15:41:15 -0500 Subject: [PATCH 096/130] add majority RC --- internal/verifier/compare.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index da9988a1..678cbf9b 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -676,6 +676,7 @@ func (verifier *Verifier) getDocumentsCursor( cmd = append( cmd, bson.E{"readConcern", bson.D{ + {"level", "majority"}, {"afterClusterTime", readConcernTS}, }}, ) From 40044f518bec9ed237d8be5b3c3d845da2ddde0d Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 18 Nov 2025 16:42:38 -0500 Subject: [PATCH 097/130] dummy From e9f534f4de34f91267f54d5ca3c3237eb0c6085c Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 18 Nov 2025 20:40:41 -0500 Subject: [PATCH 098/130] dummy From 858b7ff76c597221a62c18867be16d8cd9f21702 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 19 Nov 2025 11:41:37 -0500 Subject: [PATCH 099/130] add doc --- agg/agg.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/agg/agg.go b/agg/agg.go index 0216d091..3633a1c7 100644 --- a/agg/agg.go +++ b/agg/agg.go @@ -1,3 +1,12 @@ +// Package agg provides convenience types for aggregation operators. +// This yields two major advantages over using bson.D or bson.M: +// - simpler syntax +// - auto-completion (i.e., via gopls) +// +// Guiding principles are: +// - Prefer [1]any for unary operators (e.g., `$bsonSize`). +// - Prefer struct types for operators with named parameters. +// - Use functions sparingly, e.g., for “tuple” operators like `$in`. 
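To make the conventions above concrete, a unary operator under this scheme might look like the sketch below. `$bsonSize` is the example named in the comment, but this particular wrapper type is hypothetical and not necessarily part of agg; the struct style for named parameters is the one helpers.StringHasPrefix already uses.

package aggsketch

import "go.mongodb.org/mongo-driver/v2/bson"

// BSONSize wraps a single expression, per the [1]any convention for
// unary operators.
type BSONSize [1]any

func (b BSONSize) MarshalBSON() ([]byte, error) {
	// Marshals to {"$bsonSize": <expr>}, e.g. BSONSize{"$$ROOT"}.
	return bson.Marshal(bson.D{{"$bsonSize", b[0]}})
}

A caller can then write pipeline fragments such as bson.D{{"docLen", BSONSize{"$$ROOT"}}} instead of hand-nesting bson.D/bson.M documents.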
package agg import ( From 3211a9851a45245a046b2fcdac91298c15be86ba Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 19 Nov 2025 11:46:12 -0500 Subject: [PATCH 100/130] no pointer --- agg/helpers/string.go | 11 ----------- internal/verifier/change_stream.go | 4 ++-- internal/verifier/oplog_reader.go | 4 ++-- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/agg/helpers/string.go b/agg/helpers/string.go index 99502190..be44828d 100644 --- a/agg/helpers/string.go +++ b/agg/helpers/string.go @@ -21,15 +21,4 @@ func (sp StringHasPrefix) MarshalBSON() ([]byte, error) { }}}, }}, }) - - /* - return bson.Marshal(agg.Eq( - sp.Prefix, - agg.SubstrBytes{ - sp.FieldRef, - 0, - len(sp.Prefix), - }, - )) - */ } diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index f22dc8be..4371acaa 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -42,7 +42,7 @@ func (uee UnknownEventError) Error() string { type ChangeStreamReader struct { changeStream *mongo.ChangeStream - *ChangeReaderCommon + ChangeReaderCommon } var _ changeReader = &ChangeStreamReader{} @@ -64,7 +64,7 @@ func (v *Verifier) newChangeStreamReader( common.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken - csr := &ChangeStreamReader{ChangeReaderCommon: &common} + csr := &ChangeStreamReader{ChangeReaderCommon: common} common.createIteratorCb = csr.createChangeStream common.iterateCb = csr.iterateChangeStream diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 93bfdbc8..d39e474d 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -33,7 +33,7 @@ const ( // This significantly lightens server load and allows verification of heavier // workloads than change streams allow. It only works with replica sets. 
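Mechanically, "reading the oplog continually" boils down to following a tailable, awaitData cursor on local.oplog.rs. The sketch below is a minimal illustration of that idea only; it is not the verifier's reader, which adds namespace filtering, batching, and resume handling on top.

package oplogsketch

import (
	"context"
	"time"

	"go.mongodb.org/mongo-driver/v2/bson"
	"go.mongodb.org/mongo-driver/v2/mongo"
	"go.mongodb.org/mongo-driver/v2/mongo/options"
)

// tailOplog follows local.oplog.rs from startTS onward. It assumes client
// points at a replica set, since sharded clusters have no single oplog.
func tailOplog(ctx context.Context, client *mongo.Client, startTS bson.Timestamp) error {
	coll := client.Database("local").Collection("oplog.rs")

	opts := options.Find().
		SetCursorType(options.TailableAwait).
		SetMaxAwaitTime(time.Second)

	cursor, err := coll.Find(ctx, bson.D{{"ts", bson.D{{"$gte", startTS}}}}, opts)
	if err != nil {
		return err
	}
	defer cursor.Close(ctx)

	for cursor.Next(ctx) {
		// cursor.Current holds one raw oplog entry (op, ns, ts, o, ...).
		_ = cursor.Current
	}

	return cursor.Err()
}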
type OplogReader struct { - *ChangeReaderCommon + ChangeReaderCommon curDocs []bson.Raw scratch []byte @@ -59,7 +59,7 @@ func (v *Verifier) newOplogReader( common.resumeTokenTSExtractor = oplog.GetRawResumeTokenTimestamp - o := &OplogReader{ChangeReaderCommon: &common} + o := &OplogReader{ChangeReaderCommon: common} common.createIteratorCb = o.createCursor common.iterateCb = o.iterateCursor From 0dfe83244b3c37679aebeb295aaa47d66e2018d2 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 19 Nov 2025 12:01:53 -0500 Subject: [PATCH 101/130] revert some --- internal/verifier/compare.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index b7536ecc..041b75fe 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -535,27 +535,25 @@ func iterateCursorToChannel( ) error { defer close(writer) - //sess := mongo.SessionFromContext(sctx) + sess := mongo.SessionFromContext(sctx) for cursor.Next(sctx) { state.NoteSuccess("received a document") - /* - clusterTime, err := util.GetClusterTimeFromSession(sess) - if err != nil { - return errors.Wrap(err, "reading cluster time from session") - } - */ + clusterTime, err := util.GetClusterTimeFromSession(sess) + if err != nil { + return errors.Wrap(err, "reading cluster time from session") + } buf := pool.Get(len(cursor.Current)) copy(buf, cursor.Current) - err := chanutil.WriteWithDoneCheck( + err = chanutil.WriteWithDoneCheck( sctx, writer, docWithTs{ doc: buf, - //ts: clusterTime, + ts: clusterTime, }, ) From 6fb0682f3ec3a72f46b22d551badc10f9f14b3a7 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 19 Nov 2025 12:58:01 -0500 Subject: [PATCH 102/130] tweak --- internal/verifier/compare.go | 3 +- internal/verifier/integration_test_suite.go | 4 +- internal/verifier/migration_verifier.go | 4 +- internal/verifier/oplog/oplog.go | 90 ++++++++++++++------- main/migration_verifier.go | 4 +- 5 files changed, 67 insertions(+), 38 deletions(-) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index 041b75fe..c273f0ac 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -597,7 +597,6 @@ func (verifier *Verifier) getDocumentsCursor( case DocQueryFunctionFind: findOptions = bson.D{ bson.E{"filter", filter}, - //bson.E{"readConcern", readconcern.Majority()}, } case DocQueryFunctionAggregate: aggOptions = bson.D{ @@ -687,7 +686,7 @@ func (verifier *Verifier) getDocumentsCursor( if !task.IsRecheck() { if verifier.logger.Trace().Enabled() { - evt := verifier.logger.Debug(). + evt := verifier.logger.Trace(). 
Any("task", task.PrimaryKey) cmdStr, err := bson.MarshalExtJSON(cmd, true, false) diff --git a/internal/verifier/integration_test_suite.go b/internal/verifier/integration_test_suite.go index 3bd22fed..df86b3ff 100644 --- a/internal/verifier/integration_test_suite.go +++ b/internal/verifier/integration_test_suite.go @@ -201,14 +201,14 @@ func (suite *IntegrationTestSuite) BuildVerifier() *Verifier { os.Getenv("MVTEST_SRC_CHANGE_READER"), ChangeReaderOptChangeStream, ) - suite.Require().NoError(verifier.SetSrcChangeReader(envSrcChangeReader)) + suite.Require().NoError(verifier.SetSrcChangeReaderMethod(envSrcChangeReader)) envDstChangeReader := cmp.Or( os.Getenv("MVTEST_DST_CHANGE_READER"), ChangeReaderOptChangeStream, ) - suite.Require().NoError(verifier.SetDstChangeReader(envDstChangeReader)) + suite.Require().NoError(verifier.SetDstChangeReaderMethod(envDstChangeReader)) suite.Require().NoError(verifier.initializeChangeReaders()) diff --git a/internal/verifier/migration_verifier.go b/internal/verifier/migration_verifier.go index 65ed4e39..de97f8c3 100644 --- a/internal/verifier/migration_verifier.go +++ b/internal/verifier/migration_verifier.go @@ -381,7 +381,7 @@ func (verifier *Verifier) SetDocCompareMethod(method DocCompareMethod) { verifier.docCompareMethod = method } -func (verifier *Verifier) SetSrcChangeReader(method string) error { +func (verifier *Verifier) SetSrcChangeReaderMethod(method string) error { err := validateChangeReaderOpt(method, *verifier.srcClusterInfo) if err != nil { return errors.Wrap(err, "setting source change reader method") @@ -392,7 +392,7 @@ func (verifier *Verifier) SetSrcChangeReader(method string) error { return nil } -func (verifier *Verifier) SetDstChangeReader(method string) error { +func (verifier *Verifier) SetDstChangeReaderMethod(method string) error { err := validateChangeReaderOpt(method, *verifier.dstClusterInfo) if err != nil { return errors.Wrap(err, "setting source change reader method") diff --git a/internal/verifier/oplog/oplog.go b/internal/verifier/oplog/oplog.go index 91d31a8a..a3db7262 100644 --- a/internal/verifier/oplog/oplog.go +++ b/internal/verifier/oplog/oplog.go @@ -15,48 +15,40 @@ const ( rtBSONLength = 4 + 1 + 2 + 1 + 8 + 1 ) +// Op is an internal representation of the parts of an oplog entry that we +// care about. This struct is not meant to be marshaled/unmarshaled directly. type Op struct { - Op string - TS bson.Timestamp - Ns string - CmdName string - DocLen int32 - DocID bson.RawValue - Ops []Op -} + // Op holds the oplog entry’s `op`. + Op string -type ResumeToken struct { + // TS holds the oplog entry’s `ts`. TS bson.Timestamp -} - -func GetRawResumeTokenTimestamp(token bson.Raw) (bson.Timestamp, error) { - rv, err := token.LookupErr("ts") - if err != nil { - return bson.Timestamp{}, errors.Wrap(err, "getting ts") - } - return mbson.CastRawValue[bson.Timestamp](rv) -} - -func (rt ResumeToken) MarshalToBSON() []byte { - buf := make([]byte, 4, rtBSONLength) + // Ns holds the oplog entry’s `ns`. + Ns string - binary.LittleEndian.PutUint32(buf, uint32(cap(buf))) + // CmdName is the first field name in the oplog entry’s `o` document. + CmdName string - buf = bsoncore.AppendTimestampElement(buf, "ts", rt.TS.T, rt.TS.I) + // DocLen is the length, in bytes, of whatever document the oplog entry + // describes. This will only be meaningful for insert & replace entries. + DocLen int32 - buf = append(buf, 0) + // DocID is the `_id` of whatever document the oplog entry describes. 
+ // This won’t be populated for multi-op Op instances. + DocID bson.RawValue - if len(buf) != rtBSONLength { - panic(fmt.Sprintf("bad resume token BSON length: %d", len(buf))) - } + // Ops holds the ops in an `applyOps` oplog entry. + Ops []Op +} - return buf +func (*Op) UnmarshalBSON([]byte) error { + panic("Use UnmarshalFromBSON.") } +// UnmarshalFromBSON unmarshals an Op more efficiently than the standard +// bson.Unmarshal function. func (o *Op) UnmarshalFromBSON(in []byte) error { - //fmt.Printf("---- unmarshaling: %+v\n\n", bson.Raw(in)) - for el, err := range mbson.RawElements(bson.Raw(in)) { if err != nil { return errors.Wrap(err, "iterating BSON document") @@ -123,3 +115,41 @@ func (o *Op) UnmarshalFromBSON(in []byte) error { return nil } + +// ResumeToken is Migration Verifier’s internal format for storing the +// timestamp to resume an oplog reader. +type ResumeToken struct { + TS bson.Timestamp +} + +func (ResumeToken) MarshalBSON() ([]byte, error) { + panic("Use MarshalToBSON.") +} + +// MarshalToBSON marshals a ResumeToken to BSON. Unlike with the standard +// bson.Marshaler interface, this method never fails. +func (rt ResumeToken) MarshalToBSON() []byte { + buf := make([]byte, 4, rtBSONLength) + + binary.LittleEndian.PutUint32(buf, uint32(cap(buf))) + + buf = bsoncore.AppendTimestampElement(buf, "ts", rt.TS.T, rt.TS.I) + + buf = append(buf, 0) + + if len(buf) != rtBSONLength { + panic(fmt.Sprintf("bad resume token BSON length: %d", len(buf))) + } + + return buf +} + +// GetRawResumeTokenTimestamp extracts the timestamp from a given oplog entry. +func GetRawResumeTokenTimestamp(token bson.Raw) (bson.Timestamp, error) { + rv, err := token.LookupErr("ts") + if err != nil { + return bson.Timestamp{}, errors.Wrap(err, "getting ts") + } + + return mbson.CastRawValue[bson.Timestamp](rv) +} diff --git a/main/migration_verifier.go b/main/migration_verifier.go index f203c96e..29a3858b 100644 --- a/main/migration_verifier.go +++ b/main/migration_verifier.go @@ -366,7 +366,7 @@ func handleArgs(ctx context.Context, cCtx *cli.Context) (*verifier.Verifier, err if !slices.Contains(verifier.ChangeReaderOpts, srcChangeReaderVal) { return nil, errors.Errorf("invalid %#q (%s); valid values are: %#q", srcChangeReader, srcChangeReaderVal, verifier.ChangeReaderOpts) } - err = v.SetSrcChangeReader(srcChangeReaderVal) + err = v.SetSrcChangeReaderMethod(srcChangeReaderVal) if err != nil { return nil, err } @@ -375,7 +375,7 @@ func handleArgs(ctx context.Context, cCtx *cli.Context) (*verifier.Verifier, err if !slices.Contains(verifier.ChangeReaderOpts, dstChangeReaderVal) { return nil, errors.Errorf("invalid %#q (%s); valid values are: %#q", dstChangeReader, dstChangeReaderVal, verifier.ChangeReaderOpts) } - err = v.SetDstChangeReader(srcChangeReaderVal) + err = v.SetDstChangeReaderMethod(srcChangeReaderVal) if err != nil { return nil, err } From 1dc05767e18726f4c282689ad82ce886d78fef33 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 19 Nov 2025 13:03:08 -0500 Subject: [PATCH 103/130] comments --- internal/verifier/oplog/oplog.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/internal/verifier/oplog/oplog.go b/internal/verifier/oplog/oplog.go index a3db7262..11f2e142 100644 --- a/internal/verifier/oplog/oplog.go +++ b/internal/verifier/oplog/oplog.go @@ -47,7 +47,8 @@ func (*Op) UnmarshalBSON([]byte) error { } // UnmarshalFromBSON unmarshals an Op more efficiently than the standard -// bson.Unmarshal function. +// bson.Unmarshal function. 
This function is called for every oplog entry, +// so that efficiency is material. func (o *Op) UnmarshalFromBSON(in []byte) error { for el, err := range mbson.RawElements(bson.Raw(in)) { if err != nil { @@ -127,7 +128,9 @@ func (ResumeToken) MarshalBSON() ([]byte, error) { } // MarshalToBSON marshals a ResumeToken to BSON. Unlike with the standard -// bson.Marshaler interface, this method never fails. +// bson.Marshaler interface, this method never fails. It’s also faster/lighter +// because it avoids reflection, which is relevant because this is called for +// every batch of ops. func (rt ResumeToken) MarshalToBSON() []byte { buf := make([]byte, 4, rtBSONLength) From 627e7935e31dcfa9e4596c5e5884e9af98ccb42e Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 19 Nov 2025 14:04:43 -0500 Subject: [PATCH 104/130] fix --- internal/verifier/change_reader.go | 4 ++++ internal/verifier/change_stream.go | 4 ++-- internal/verifier/oplog_reader.go | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index 5dcb760a..f2960d74 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -194,6 +194,10 @@ func (rc *ChangeReaderCommon) start( return errors.Wrap(err, "failed to start session") } + if rc.createIteratorCb == nil { + panic("rc.createIteratorCb should be set") + } + startTs, err := rc.createIteratorCb(ctx, sess) if err != nil { logEvent := rc.logger.Debug(). diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 4371acaa..f22dc8be 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -42,7 +42,7 @@ func (uee UnknownEventError) Error() string { type ChangeStreamReader struct { changeStream *mongo.ChangeStream - ChangeReaderCommon + *ChangeReaderCommon } var _ changeReader = &ChangeStreamReader{} @@ -64,7 +64,7 @@ func (v *Verifier) newChangeStreamReader( common.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken - csr := &ChangeStreamReader{ChangeReaderCommon: common} + csr := &ChangeStreamReader{ChangeReaderCommon: &common} common.createIteratorCb = csr.createChangeStream common.iterateCb = csr.iterateChangeStream diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index d39e474d..93bfdbc8 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -33,7 +33,7 @@ const ( // This significantly lightens server load and allows verification of heavier // workloads than change streams allow. It only works with replica sets. 
type OplogReader struct { - ChangeReaderCommon + *ChangeReaderCommon curDocs []bson.Raw scratch []byte @@ -59,7 +59,7 @@ func (v *Verifier) newOplogReader( common.resumeTokenTSExtractor = oplog.GetRawResumeTokenTimestamp - o := &OplogReader{ChangeReaderCommon: common} + o := &OplogReader{ChangeReaderCommon: &common} common.createIteratorCb = o.createCursor common.iterateCb = o.iterateCursor From d001fdeec08e08520531b3dfb1d8254ac4dee0e0 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 19 Nov 2025 14:07:06 -0500 Subject: [PATCH 105/130] revert --- internal/verifier/change_stream.go | 8 ++++---- internal/verifier/oplog_reader.go | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index f22dc8be..7337eae9 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -42,7 +42,7 @@ func (uee UnknownEventError) Error() string { type ChangeStreamReader struct { changeStream *mongo.ChangeStream - *ChangeReaderCommon + ChangeReaderCommon } var _ changeReader = &ChangeStreamReader{} @@ -64,10 +64,10 @@ func (v *Verifier) newChangeStreamReader( common.resumeTokenTSExtractor = extractTSFromChangeStreamResumeToken - csr := &ChangeStreamReader{ChangeReaderCommon: &common} + csr := &ChangeStreamReader{ChangeReaderCommon: common} - common.createIteratorCb = csr.createChangeStream - common.iterateCb = csr.iterateChangeStream + csr.createIteratorCb = csr.createChangeStream + csr.iterateCb = csr.iterateChangeStream return csr } diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 93bfdbc8..3f1c69cc 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -33,7 +33,7 @@ const ( // This significantly lightens server load and allows verification of heavier // workloads than change streams allow. It only works with replica sets. type OplogReader struct { - *ChangeReaderCommon + ChangeReaderCommon curDocs []bson.Raw scratch []byte @@ -59,10 +59,10 @@ func (v *Verifier) newOplogReader( common.resumeTokenTSExtractor = oplog.GetRawResumeTokenTimestamp - o := &OplogReader{ChangeReaderCommon: &common} + o := &OplogReader{ChangeReaderCommon: common} - common.createIteratorCb = o.createCursor - common.iterateCb = o.iterateCursor + o.createIteratorCb = o.createCursor + o.iterateCb = o.iterateCursor return o } From bf557590e37aff8abc21622947b525f540ad8f7d Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 19 Nov 2025 14:08:00 -0500 Subject: [PATCH 106/130] =?UTF-8?q?don=E2=80=99t=20warn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/verifier/check.go | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/internal/verifier/check.go b/internal/verifier/check.go index 68711c5f..f310b3d0 100644 --- a/internal/verifier/check.go +++ b/internal/verifier/check.go @@ -620,16 +620,8 @@ func (verifier *Verifier) work(ctx context.Context, workerNum int) error { } func (v *Verifier) initializeChangeReaders() error { - warnAboutOplog := func(cluster whichCluster) { - v.logger.Warn(). - Str("cluster", string(cluster)). - Msg("Reading writes via oplog tailing. 
This feature is experimental.") - } - switch v.srcChangeReaderMethod { case ChangeReaderOptOplog: - warnAboutOplog(src) - v.srcChangeReader = v.newOplogReader( v.srcNamespaces, src, @@ -649,8 +641,6 @@ func (v *Verifier) initializeChangeReaders() error { switch v.dstChangeReaderMethod { case ChangeReaderOptOplog: - warnAboutOplog(dst) - v.dstChangeReader = v.newOplogReader( v.dstNamespaces, dst, From a81561cdda2dad74c15abff525d56e897d7c9af5 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 20 Nov 2025 16:02:41 -0500 Subject: [PATCH 107/130] fix --- internal/util/cluster_time.go | 4 ++++ internal/verifier/compare.go | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/internal/util/cluster_time.go b/internal/util/cluster_time.go index da9bef04..fb8cb940 100644 --- a/internal/util/cluster_time.go +++ b/internal/util/cluster_time.go @@ -10,6 +10,10 @@ import ( func GetClusterTimeFromSession(sess *mongo.Session) (bson.Timestamp, error) { clusterTimeRaw := sess.ClusterTime() + if clusterTimeRaw == nil { + panic("session has empty cluster time?!?") + } + ctrv, err := clusterTimeRaw.LookupErr("$clusterTime", "clusterTime") if err != nil { return bson.Timestamp{}, errors.Wrapf(err, "finding clusterTime in session cluster time document (%v)", clusterTimeRaw) diff --git a/internal/verifier/compare.go b/internal/verifier/compare.go index c273f0ac..e63adbd3 100644 --- a/internal/verifier/compare.go +++ b/internal/verifier/compare.go @@ -536,6 +536,9 @@ func iterateCursorToChannel( defer close(writer) sess := mongo.SessionFromContext(sctx) + if sess == nil { + panic("need a session") + } for cursor.Next(sctx) { state.NoteSuccess("received a document") @@ -703,7 +706,7 @@ func (verifier *Verifier) getDocumentsCursor( } return collection.Database().RunCommandCursor( - mongo.NewSessionContext(sctx, nil), + sctx, cmd, runCommandOptions, ) From 43e5a88cc02796a589840aa3bda63fa310a98d9a Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 20 Nov 2025 19:07:10 -0500 Subject: [PATCH 108/130] Printf --- internal/verifier/oplog_reader.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 3f1c69cc..b2cfd64c 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -172,8 +172,6 @@ func (o *OplogReader) createCursor( }}}, }}} - fmt.Printf("------ oplogFilter: %v\n\n", oplogFilter) - cursor, err := o.watcherClient. Database("local"). Collection( @@ -293,7 +291,12 @@ CursorLoop: for { if !o.lastChangeEventTime.Load().OrZero().Before(writesOffTS) { - fmt.Printf("----------- %s reached writes off ts %v\n", o, writesOffTS) + o.logger.Debug(). + Stringer("reader", o). + Any("lastChangeEventTS", o.lastChangeEventTime.Load()). + Any("writesOffTS", writesOffTS). 
+ Msg("Reached writes-off timestamp.") + break } From 9fe9613ddfa931775ab33f67c1018114780aa84d Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 20 Nov 2025 19:07:30 -0500 Subject: [PATCH 109/130] comments --- internal/verifier/oplog_reader.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index b2cfd64c..9cf2977c 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -394,8 +394,6 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti var latestTS bson.Timestamp parseOneDocumentOp := func(opName string, ts bson.Timestamp, rawDoc bson.Raw) error { - //fmt.Printf("---- got op: %+v\n\n", rawDoc) - nsStr, err := mbson.Lookup[string](rawDoc, "ns") if err != nil { return err @@ -557,7 +555,6 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore var latestTS bson.Timestamp for _, rawDoc := range o.curDocs { - //fmt.Printf("----- %s got op: %+v\n\n", o, rawDoc) var op oplog.Op if err := (&op).UnmarshalFromBSON(rawDoc); err != nil { From 88f7d229c4f589476bf4591e7b31c7ce35fefcd2 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 20 Nov 2025 19:39:44 -0500 Subject: [PATCH 110/130] remove TODO --- internal/verifier/oplog_reader.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 9cf2977c..86724b64 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -306,7 +306,6 @@ CursorLoop: } } - // TODO: deduplicate o.running = false infoLog := o.logger.Info() From b6692bcf8850b07d9a3b5eb48503ca71aab673ac Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 20 Nov 2025 20:07:10 -0500 Subject: [PATCH 111/130] panic clearer --- internal/util/cluster_time.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/util/cluster_time.go b/internal/util/cluster_time.go index fb8cb940..294d36b4 100644 --- a/internal/util/cluster_time.go +++ b/internal/util/cluster_time.go @@ -11,7 +11,7 @@ func GetClusterTimeFromSession(sess *mongo.Session) (bson.Timestamp, error) { clusterTimeRaw := sess.ClusterTime() if clusterTimeRaw == nil { - panic("session has empty cluster time?!?") + panic("found empty session cluster time but need nonempty") } ctrv, err := clusterTimeRaw.LookupErr("$clusterTime", "clusterTime") From b1c53902ddaf8fbb12360cf3341b291a4ede0622 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Thu, 20 Nov 2025 20:20:54 -0500 Subject: [PATCH 112/130] document limitation --- README.md | 16 ++++++++++++++-- internal/verifier/metadata.go | 3 ++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 48926001..1c9180c9 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,8 @@ The verifier will now check to completion to make sure that there are no inconsi | `--dstNamespace ` | destination namespaces to check | | `--metaDBName ` | name of the database in which to store verification metadata (default: "migration_verification_metadata") | | `--docCompareMethod` | How to compare documents. See below for details. | +| `--srcChangeReader` | How to read changes from the source. See below for details. | +| `--dstChangeReader` | How to read changes from the destination. See below for details. | | `--start` | Start checking documents right away rather than waiting for a `/check` API request. 
| | `--verifyAll` | If set, verify all user namespaces | | `--clean` | If set, drop all previous verification metadata before starting | @@ -198,8 +200,6 @@ connection strings in the following environment variables: The migration-verifier has two steps: - - 1. The initial check 1. The verifier partitions up the data into 400MB (configurable) chunks and spins up many worker goroutines (threads) to read from both the source and destination. 2. The verifier compares the documents on the source and destination by bytes and if they are different, it then checks field by field in case the field ordering has changed (since field ordering isn't required to be the same for the migration to be a success) @@ -392,6 +392,18 @@ Full-document verification methods allow migration-verifier to diagnose mismatch Additionally, because the amount of data sent to migration-verifier doesn’t actually reflect the documents’ size, no meaningful statistics are shown concerning the collection data size. Document counts, of course, are still shown. +# Change reading methods + +(**NOTE:** If the verifier restarts, it **MUST** use the same change reader options as before.) + +## `changeStream` + +The default. The verifier will read a change stream, which works seamlessly on sharded or unsharded clusters. + +## `tailOplog` + +The verifier will read the oplog continually instead of reading a change stream. This is generally faster, but it doesn’t work in sharded clusters. + # Known Issues - The verifier may report missing documents on the destination that don’t actually appear to be missing (i.e., a nonexistent problem). This has been hard to reproduce. If missing documents are reported, it is good practice to check for false positives. diff --git a/internal/verifier/metadata.go b/internal/verifier/metadata.go index 906117c9..310048a6 100644 --- a/internal/verifier/metadata.go +++ b/internal/verifier/metadata.go @@ -6,5 +6,6 @@ package verifier // 3: Enqueued rechecks now reference the generation in which they’ll be // rechecked rather than the generation during which they were enqueued. // 4: Use “changeReader” instead of “changeStream” collection name. +// 5: Metadata now stores source & destination change reader options. -const verifierMetadataVersion = 4 +const verifierMetadataVersion = 5 From a7294d066f3a71ccc7e62ab3a3e9e7d73be033d4 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 24 Nov 2025 11:46:00 -0500 Subject: [PATCH 113/130] Forbid changing change reader opts. 
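Because the persisted options are compared on startup, a restarted verifier must be configured with the same reader methods as before. In code, that configuration is just the two setters renamed earlier in this series; the sketch below assumes it lives in package verifier alongside an already-constructed Verifier, and the CLI equivalent is the --srcChangeReader and --dstChangeReader flags documented in the README.

// Sketch only: pick a reader method for each cluster before starting a check.
func configureReaders(v *Verifier) error {
	if err := v.SetSrcChangeReaderMethod(ChangeReaderOptOplog); err != nil {
		return err
	}

	return v.SetDstChangeReaderMethod(ChangeReaderOptChangeStream)
}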
--- internal/verifier/generation.go | 44 ++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/internal/verifier/generation.go b/internal/verifier/generation.go index 9f267d82..0f5f8f52 100644 --- a/internal/verifier/generation.go +++ b/internal/verifier/generation.go @@ -12,13 +12,14 @@ import ( ) const ( - generationCollName = "generation" - generationFieldName = "generation" + generationCollName = "generation" ) type generationDoc struct { - Generation int - MetadataVersion int + Generation int + MetadataVersion int + SourceChangeReaderOpt string + DestinationChangeReaderOpt string } type metadataMismatchErr struct { @@ -32,6 +33,20 @@ func (mme metadataMismatchErr) Error() string { ) } +type changeReaderOptMismatchErr struct { + reader whichCluster + persistedOpt string + currentOpt string +} + +func (crme changeReaderOptMismatchErr) Error() string { + return fmt.Sprintf("new %s change reader opt is %#q, but %#q was used previously; either use the old option, or restart verification", + crme.reader, + crme.currentOpt, + crme.persistedOpt, + ) +} + func (v *Verifier) persistGenerationWhileLocked(ctx context.Context) error { generation, _ := v.getGenerationWhileLocked() @@ -41,8 +56,10 @@ func (v *Verifier) persistGenerationWhileLocked(ctx context.Context) error { ctx, bson.D{}, generationDoc{ - Generation: generation, - MetadataVersion: verifierMetadataVersion, + Generation: generation, + MetadataVersion: verifierMetadataVersion, + SourceChangeReaderOpt: v.srcChangeReaderMethod, + DestinationChangeReaderOpt: v.dstChangeReaderMethod, }, options.Replace().SetUpsert(true), ) @@ -78,7 +95,22 @@ func (v *Verifier) readGeneration(ctx context.Context) (option.Option[int], erro if parsed.MetadataVersion != verifierMetadataVersion { return option.None[int](), metadataMismatchErr{parsed.MetadataVersion} + } + if parsed.SourceChangeReaderOpt != v.srcChangeReaderMethod { + return option.None[int](), changeReaderOptMismatchErr{ + reader: src, + persistedOpt: parsed.SourceChangeReaderOpt, + currentOpt: v.srcChangeReaderMethod, + } + } + + if parsed.DestinationChangeReaderOpt != v.dstChangeReaderMethod { + return option.None[int](), changeReaderOptMismatchErr{ + reader: dst, + persistedOpt: parsed.DestinationChangeReaderOpt, + currentOpt: v.dstChangeReaderMethod, + } } return option.Some(parsed.Generation), nil From 93b67d8baa7e2c4dbc99ec3a2f77bd3a2f7035b1 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 24 Nov 2025 12:06:23 -0500 Subject: [PATCH 114/130] confirm failure --- internal/verifier/change_reader_test.go | 48 +++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 internal/verifier/change_reader_test.go diff --git a/internal/verifier/change_reader_test.go b/internal/verifier/change_reader_test.go new file mode 100644 index 00000000..47d91d12 --- /dev/null +++ b/internal/verifier/change_reader_test.go @@ -0,0 +1,48 @@ +package verifier + +// TestFailChangeReaderOptChange confirms that verifier fails if it restarts +// with different change-reader settings. 
+func (suite *IntegrationTestSuite) TestFailChangeReaderOptChange() { + ctx := suite.Context() + + v1 := suite.BuildVerifier() + suite.Require().NoError( + v1.SetSrcChangeReaderMethod(ChangeReaderOptChangeStream), + ) + suite.Require().NoError( + v1.SetDstChangeReaderMethod(ChangeReaderOptChangeStream), + ) + + v1Runner := RunVerifierCheck(ctx, suite.T(), v1) + suite.Require().NoError( + v1Runner.AwaitGenerationEnd(), + ) + + badSrcOptVerifier := suite.BuildVerifier() + suite.Require().NoError( + badSrcOptVerifier.SetSrcChangeReaderMethod(ChangeReaderOptOplog), + ) + suite.Require().NoError( + badSrcOptVerifier.SetDstChangeReaderMethod(ChangeReaderOptChangeStream), + ) + + err := RunVerifierCheck(ctx, suite.T(), badSrcOptVerifier). + AwaitGenerationEnd() + + suite.Require().Error(err, "wrong source opt should fail") + suite.Assert().ErrorAs(err, &changeReaderOptMismatchErr{}) + + badDstOptVerifier := suite.BuildVerifier() + suite.Require().NoError( + badSrcOptVerifier.SetSrcChangeReaderMethod(ChangeReaderOptChangeStream), + ) + suite.Require().NoError( + badSrcOptVerifier.SetDstChangeReaderMethod(ChangeReaderOptOplog), + ) + + err = RunVerifierCheck(ctx, suite.T(), badDstOptVerifier). + AwaitGenerationEnd() + + suite.Require().Error(err, "wrong destination opt should fail") + suite.Assert().ErrorAs(err, &changeReaderOptMismatchErr{}) +} From dbb15a0bc6df5660ebba7896368a86267363d823 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 24 Nov 2025 12:11:40 -0500 Subject: [PATCH 115/130] moar doc --- agg/helpers/string.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agg/helpers/string.go b/agg/helpers/string.go index be44828d..a84523bf 100644 --- a/agg/helpers/string.go +++ b/agg/helpers/string.go @@ -1,9 +1,12 @@ +// Package helpers exposes functions that express common operations +// that don’t map to a single aggregation operator. package helpers import ( "go.mongodb.org/mongo-driver/v2/bson" ) +// StringHasPrefix parallels Go’s strings.HasPrefix. type StringHasPrefix struct { FieldRef any Prefix string From c2f422c6e38e728e44020d37737525c6a316f01c Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 24 Nov 2025 12:41:49 -0500 Subject: [PATCH 116/130] ofix test --- internal/verifier/change_reader_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/verifier/change_reader_test.go b/internal/verifier/change_reader_test.go index 47d91d12..36727584 100644 --- a/internal/verifier/change_reader_test.go +++ b/internal/verifier/change_reader_test.go @@ -34,10 +34,10 @@ func (suite *IntegrationTestSuite) TestFailChangeReaderOptChange() { badDstOptVerifier := suite.BuildVerifier() suite.Require().NoError( - badSrcOptVerifier.SetSrcChangeReaderMethod(ChangeReaderOptChangeStream), + badDstOptVerifier.SetSrcChangeReaderMethod(ChangeReaderOptChangeStream), ) suite.Require().NoError( - badSrcOptVerifier.SetDstChangeReaderMethod(ChangeReaderOptOplog), + badDstOptVerifier.SetDstChangeReaderMethod(ChangeReaderOptOplog), ) err = RunVerifierCheck(ctx, suite.T(), badDstOptVerifier). 
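As a usage note for the agg/helpers package documented in patch 115: these types are meant to be dropped into $expr contexts. A namespace-prefix predicate for a $match stage might be built like the sketch below; the "$ns" field path is illustrative, and the real oplog-reader filter is assembled from its excluded-prefix list instead.

package sketch

import (
	"github.com/10gen/migration-verifier/agg"
	"github.com/10gen/migration-verifier/agg/helpers"
	"go.mongodb.org/mongo-driver/v2/bson"
)

// excludeConfigNS builds a $match stage that keeps only documents whose
// ns does not start with "config.".
func excludeConfigNS() bson.D {
	pred := agg.Not{helpers.StringHasPrefix{
		FieldRef: "$ns",
		Prefix:   "config.",
	}}

	return bson.D{{"$match", bson.D{{"$expr", pred}}}}
}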
From 7e917b3ad97e8da830da843b04a8178dcca6edc8 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Mon, 24 Nov 2025 13:01:06 -0500 Subject: [PATCH 117/130] avoid sharded w/ oplog --- internal/verifier/change_reader_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/internal/verifier/change_reader_test.go b/internal/verifier/change_reader_test.go index 36727584..83482a1e 100644 --- a/internal/verifier/change_reader_test.go +++ b/internal/verifier/change_reader_test.go @@ -1,8 +1,14 @@ package verifier +import "github.com/10gen/migration-verifier/internal/util" + // TestFailChangeReaderOptChange confirms that verifier fails if it restarts // with different change-reader settings. func (suite *IntegrationTestSuite) TestFailChangeReaderOptChange() { + if suite.GetTopology(suite.srcMongoClient) == util.TopologySharded { + suite.T().Skipf("sharded source can only read changes via change stream") + } + ctx := suite.Context() v1 := suite.BuildVerifier() @@ -32,6 +38,10 @@ func (suite *IntegrationTestSuite) TestFailChangeReaderOptChange() { suite.Require().Error(err, "wrong source opt should fail") suite.Assert().ErrorAs(err, &changeReaderOptMismatchErr{}) + if suite.GetTopology(suite.dstMongoClient) == util.TopologySharded { + return + } + badDstOptVerifier := suite.BuildVerifier() suite.Require().NoError( badDstOptVerifier.SetSrcChangeReaderMethod(ChangeReaderOptChangeStream), From fc1d80f98393f31a7a973590ab3df23ba902e16b Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 26 Nov 2025 19:38:08 -0500 Subject: [PATCH 118/130] rename --- internal/verifier/change_stream.go | 3 +- internal/verifier/list_namespaces.go | 29 +++----------------- internal/verifier/migration_verifier_test.go | 3 +- internal/verifier/namespaces/exclude.go | 26 ++++++++++++++++++ internal/verifier/namespaces/meta.go | 10 ------- internal/verifier/oplog_reader.go | 3 +- 6 files changed, 36 insertions(+), 38 deletions(-) create mode 100644 internal/verifier/namespaces/exclude.go delete mode 100644 internal/verifier/namespaces/meta.go diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index 97772b0c..ccd833d8 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -8,6 +8,7 @@ import ( "github.com/10gen/migration-verifier/internal/keystring" "github.com/10gen/migration-verifier/internal/retry" "github.com/10gen/migration-verifier/internal/util" + "github.com/10gen/migration-verifier/internal/verifier/namespaces" "github.com/10gen/migration-verifier/mbson" "github.com/10gen/migration-verifier/option" mapset "github.com/deckarep/golang-set/v2" @@ -90,7 +91,7 @@ func (csr *ChangeStreamReader) GetChangeStreamFilter() (pipeline mongo.Pipeline) {{"$match", util.ExcludePrefixesQuery( "ns.db", append( - slices.Clone(ExcludedDBPrefixes), + slices.Clone(namespaces.ExcludedDBPrefixes), csr.metaDB.Name(), ), )}}, diff --git a/internal/verifier/list_namespaces.go b/internal/verifier/list_namespaces.go index 7608469f..ebb28fa0 100644 --- a/internal/verifier/list_namespaces.go +++ b/internal/verifier/list_namespaces.go @@ -5,6 +5,7 @@ import ( "github.com/10gen/migration-verifier/internal/logger" "github.com/10gen/migration-verifier/internal/util" + "github.com/10gen/migration-verifier/internal/verifier/namespaces" "github.com/10gen/migration-verifier/mmongo" "github.com/10gen/migration-verifier/mslices" "go.mongodb.org/mongo-driver/v2/bson" @@ -12,28 +13,6 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo/options" ) -const ( - // 
ExcludedSystemCollPrefix is the prefix of system collections, - // which we ignore. - ExcludedSystemCollPrefix = "system." - - // MongoDBInternalDBPrefix is the prefix for MongoDB-internal databases. - // (e.g., Atlas’s availability canary) - MongoDBInternalDBPrefix = "__mdb_internal" -) - -var ( - ExcludedDBPrefixes = mslices.Of( - // mongosync metadata: - "mongosync_internal_", - "mongosync_reserved_", - MongoDBInternalDBPrefix, - ) - - // ExcludedSystemDBs are system databases that are excluded from verification. - ExcludedSystemDBs = []string{"admin", "config", "local"} -) - // ListAllUserNamespaces lists all the user collections on a cluster, // in addition to time-series “system.buckets.*” collections. // @@ -48,7 +27,7 @@ func ListAllUserNamespaces( ) ([]string, error) { excludedDBs := []string{} excludedDBs = append(excludedDBs, additionalExcludedDBs...) - excludedDBs = append(excludedDBs, ExcludedSystemDBs...) + excludedDBs = append(excludedDBs, namespaces.ExcludedSystemDBs...) var excluded []any for _, e := range excludedDBs { @@ -58,7 +37,7 @@ func ListAllUserNamespaces( dbNames, err := client.ListDatabaseNames(ctx, bson.D{ {"$and", []bson.D{ {{"name", bson.D{{"$nin", excluded}}}}, - util.ExcludePrefixesQuery("name", ExcludedDBPrefixes), + util.ExcludePrefixesQuery("name", namespaces.ExcludedDBPrefixes), }}, }) @@ -77,7 +56,7 @@ func ListAllUserNamespaces( {"$or", []bson.D{ util.ExcludePrefixesQuery( "name", - mslices.Of(ExcludedSystemCollPrefix), + mslices.Of(namespaces.ExcludedSystemCollPrefix), ), { {"$expr", mmongo.StartsWithAgg("$name", timeseriesBucketsPrefix)}, diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index c276eac4..92a2e693 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -25,6 +25,7 @@ import ( "github.com/10gen/migration-verifier/internal/testutil" "github.com/10gen/migration-verifier/internal/types" "github.com/10gen/migration-verifier/internal/util" + "github.com/10gen/migration-verifier/internal/verifier/namespaces" "github.com/10gen/migration-verifier/internal/verifier/recheck" "github.com/10gen/migration-verifier/mbson" "github.com/10gen/migration-verifier/mslices" @@ -2259,7 +2260,7 @@ func (suite *IntegrationTestSuite) TestGenerationalRechecking() { func (suite *IntegrationTestSuite) TestMongoDBInternalDB() { ctx := suite.Context() - dbName := MongoDBInternalDBPrefix + "internalDBTest" + dbName := namespaces.MongoDBInternalDBPrefix + "internalDBTest" _, err := suite.srcMongoClient. Database(dbName). diff --git a/internal/verifier/namespaces/exclude.go b/internal/verifier/namespaces/exclude.go new file mode 100644 index 00000000..d80cc40f --- /dev/null +++ b/internal/verifier/namespaces/exclude.go @@ -0,0 +1,26 @@ +package namespaces + +import ( + "github.com/10gen/migration-verifier/mslices" +) + +const ( + // ExcludedSystemCollPrefix is the prefix of system collections, + // which we ignore. + ExcludedSystemCollPrefix = "system." + + // MongoDBInternalDBPrefix is the prefix for MongoDB-internal databases. + // (e.g., Atlas’s availability canary) + MongoDBInternalDBPrefix = "__mdb_internal" +) + +var ( + ExcludedDBPrefixes = mslices.Of( + "mongosync_internal_", + "mongosync_reserved_", + MongoDBInternalDBPrefix, + ) + + // ExcludedSystemDBs are system databases that are excluded from verification. 
+ ExcludedSystemDBs = []string{"admin", "config", "local"} +) diff --git a/internal/verifier/namespaces/meta.go b/internal/verifier/namespaces/meta.go deleted file mode 100644 index 97cf09af..00000000 --- a/internal/verifier/namespaces/meta.go +++ /dev/null @@ -1,10 +0,0 @@ -package namespaces - -import "github.com/10gen/migration-verifier/mslices" - -var ( - MongosyncMetaDBPrefixes = mslices.Of( - "mongosync_internal_", - "mongosync_reserved_", - ) -) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 86724b64..6026d83b 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -616,7 +616,8 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore func (o *OplogReader) getNSFilter(docroot string) agg.And { prefixes := append( - slices.Clone(namespaces.MongosyncMetaDBPrefixes), + slices.Clone(namespaces.ExcludedDBPrefixes), + o.metaDB.Name()+".", "config.", "admin.", From 6da099499a4e9783424b94843f09784a82781e41 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 2 Dec 2025 21:11:57 -0500 Subject: [PATCH 119/130] Withhold pending review - fixes --- internal/verifier/oplog_reader.go | 23 +++++++++++++++--- internal/verifier/recheck_persist.go | 2 +- mbson/raw_value_test.go | 24 +++++++++++++++++++ mmongo/cursor.go | 35 +++++++++++++--------------- 4 files changed, 61 insertions(+), 23 deletions(-) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 6026d83b..9c14e5f3 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -3,6 +3,7 @@ package verifier import ( "context" "fmt" + "strings" "github.com/10gen/migration-verifier/agg" "github.com/10gen/migration-verifier/agg/helpers" @@ -392,12 +393,26 @@ func (o *OplogReader) readAndHandleOneBatch( func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Timestamp) ([]ParsedEvent, bson.Timestamp, error) { var latestTS bson.Timestamp + nsPrefixesToExclude := o.getExcludedNSPrefixes() + parseOneDocumentOp := func(opName string, ts bson.Timestamp, rawDoc bson.Raw) error { nsStr, err := mbson.Lookup[string](rawDoc, "ns") if err != nil { return err } + // Things we always ignore: + for _, prefix := range nsPrefixesToExclude { + if strings.HasPrefix(nsStr, prefix) { + return nil + } + } + + // Honor namespace filtering: + if len(o.namespaces) > 0 && !slices.Contains(o.namespaces, nsStr) { + return nil + } + var docID bson.RawValue var docLength types.ByteCount var docField string @@ -614,17 +629,19 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore return events, latestTS, nil } -func (o *OplogReader) getNSFilter(docroot string) agg.And { - prefixes := append( +func (o *OplogReader) getExcludedNSPrefixes() []string { + return append( slices.Clone(namespaces.ExcludedDBPrefixes), o.metaDB.Name()+".", "config.", "admin.", ) +} +func (o *OplogReader) getNSFilter(docroot string) agg.And { filter := agg.And(lo.Map( - prefixes, + o.getExcludedNSPrefixes(), func(prefix string, _ int) any { return agg.Not{helpers.StringHasPrefix{ FieldRef: docroot + ".ns", diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 27b0d395..9840ecae 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -127,7 +127,7 @@ func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeE srcDBName = changeEvent.Ns.DB srcCollName = changeEvent.Ns.Coll } else { - if 
changeEvent.Ns.DB == "VERIFIER_TEST_META" { + if changeEvent.Ns.DB == verifier.metaDBName { continue } diff --git a/mbson/raw_value_test.go b/mbson/raw_value_test.go index 3c35c32c..718b7c1c 100644 --- a/mbson/raw_value_test.go +++ b/mbson/raw_value_test.go @@ -6,6 +6,7 @@ import ( "github.com/samber/lo" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.mongodb.org/mongo-driver/v2/bson" ) @@ -109,6 +110,29 @@ func TestRaw(t *testing.T) { } } +func TestRawArray(t *testing.T) { + vals := lo.Map( + []bson.RawArray{ + lo.Must(bson.Marshal(bson.D{})), + lo.Must(bson.Marshal(bson.D{{"0", nil}})), + lo.Must(bson.Marshal(bson.D{{"0", 1.2}, {"1", "abc"}})), + }, + func(ra bson.RawArray, _ int) bson.RawValue { + return bson.RawValue{ + Type: bson.TypeArray, + Value: []byte(ra), + } + }, + ) + + for _, cur := range vals { + ra, err := CastRawValue[bson.RawArray](cur) + require.NoError(t, err) + + assert.Equal(t, cur.Value, []byte(ra), "expect same bytes") + } +} + func TestTimestamp(t *testing.T) { vals := []bson.Timestamp{ {0, 0}, diff --git a/mmongo/cursor.go b/mmongo/cursor.go index c76b8a18..d9a4c7f0 100644 --- a/mmongo/cursor.go +++ b/mmongo/cursor.go @@ -2,6 +2,8 @@ package mmongo import ( "context" + "fmt" + "slices" "github.com/pkg/errors" "go.mongodb.org/mongo-driver/v2/bson" @@ -16,6 +18,8 @@ func GetBatch( docs []bson.Raw, buffer []byte, ) ([]bson.Raw, []byte, error) { + var docsCount, expectedCount int + for hasDocs := true; hasDocs; hasDocs = cursor.RemainingBatchLength() > 0 { got := cursor.TryNext(ctx) @@ -24,32 +28,25 @@ func GetBatch( } if !got { + if docsCount != 0 { + panic(fmt.Sprintf("Docs batch ended after %d but expected %d", docsCount, expectedCount)) + } + break } + // This ensures we only reallocate once (if at all): + if docsCount == 0 { + expectedCount = 1 + cursor.RemainingBatchLength() + docs = slices.Grow(docs, expectedCount) + } + + docsCount++ + docPos := len(buffer) buffer = append(buffer, cursor.Current...) docs = append(docs, buffer[docPos:]) } - /* - batchLen := cursor.RemainingBatchLength() - - docs = slices.Grow(docs, batchLen) - - for range batchLen { - if !cursor.Next(ctx) { - return nil, nil, mcmp.Or( - errors.Wrap(cursor.Err(), "iterating cursor mid-batch"), - fmt.Errorf("expected %d docs from cursor but only saw %d", batchLen, len(docs)), - ) - } - - docPos := len(buffer) - buffer = append(buffer, cursor.Current...) - docs = append(docs, buffer[docPos:]) - } - */ - return docs, buffer, nil } From 58f384650737541e32b2719524ee8595f0f8f2e7 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 3 Dec 2025 12:33:07 -0500 Subject: [PATCH 120/130] =?UTF-8?q?renames=20&=20such=20from=20Jian?= =?UTF-8?q?=E2=80=99s=20review?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 3 +- internal/verifier/change_reader.go | 29 ++++---- internal/verifier/change_stream.go | 2 +- internal/verifier/migration_verifier_test.go | 6 +- internal/verifier/oplog/oplog.go | 74 ++++++++++++-------- internal/verifier/oplog_reader.go | 2 +- internal/verifier/recheck_persist.go | 4 +- internal/verifier/recheck_test.go | 2 +- 8 files changed, 67 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 1c9180c9..e1c0afa5 100644 --- a/README.md +++ b/README.md @@ -394,7 +394,8 @@ Additionally, because the amount of data sent to migration-verifier doesn’t ac # Change reading methods -(**NOTE:** If the verifier restarts, it **MUST** use the same change reader options as before.) 
+NB: If the verifier restarts, it **MUST** use the same change reader options +as before, or it will fail immediately. ## `changeStream` diff --git a/internal/verifier/change_reader.go b/internal/verifier/change_reader.go index f2960d74..3e0273fd 100644 --- a/internal/verifier/change_reader.go +++ b/internal/verifier/change_reader.go @@ -35,7 +35,7 @@ const ( type changeReader interface { getWhichCluster() whichCluster - getReadChannel() <-chan changeEventBatch + getReadChannel() <-chan eventBatch getStartTimestamp() bson.Timestamp getLastSeenClusterTime() option.Option[bson.Timestamp] getEventsPerSecond() option.Option[float64] @@ -60,9 +60,9 @@ type ChangeReaderCommon struct { resumeTokenTSExtractor func(bson.Raw) (bson.Timestamp, error) - running bool - changeEventBatchChan chan changeEventBatch - writesOffTs *util.Eventual[bson.Timestamp] + running bool + eventBatchChan chan eventBatch + writesOffTs *util.Eventual[bson.Timestamp] lastChangeEventTime *msync.TypedAtomic[option.Option[bson.Timestamp]] @@ -79,12 +79,12 @@ type ChangeReaderCommon struct { func newChangeReaderCommon(clusterName whichCluster) ChangeReaderCommon { return ChangeReaderCommon{ - readerType: clusterName, - changeEventBatchChan: make(chan changeEventBatch, batchChanBufferSize), - writesOffTs: util.NewEventual[bson.Timestamp](), - lag: msync.NewTypedAtomic(option.None[time.Duration]()), - lastChangeEventTime: msync.NewTypedAtomic(option.None[bson.Timestamp]()), - batchSizeHistory: history.New[int](time.Minute), + readerType: clusterName, + eventBatchChan: make(chan eventBatch, batchChanBufferSize), + writesOffTs: util.NewEventual[bson.Timestamp](), + lag: msync.NewTypedAtomic(option.None[time.Duration]()), + lastChangeEventTime: msync.NewTypedAtomic(option.None[bson.Timestamp]()), + batchSizeHistory: history.New[int](time.Minute), onDDLEvent: lo.Ternary( clusterName == dst, onDDLEventAllow, @@ -113,8 +113,8 @@ func (rc *ChangeReaderCommon) isRunning() bool { return rc.running } -func (rc *ChangeReaderCommon) getReadChannel() <-chan changeEventBatch { - return rc.changeEventBatchChan +func (rc *ChangeReaderCommon) getReadChannel() <-chan eventBatch { + return rc.eventBatchChan } func (rc *ChangeReaderCommon) getLastSeenClusterTime() option.Option[bson.Timestamp] { @@ -125,7 +125,7 @@ func (rc *ChangeReaderCommon) getLastSeenClusterTime() option.Option[bson.Timest // as a fraction. If saturation rises, that means we’re reading events faster // than we can persist them. func (rc *ChangeReaderCommon) getBufferSaturation() float64 { - return util.DivideToF64(len(rc.changeEventBatchChan), cap(rc.changeEventBatchChan)) + return util.DivideToF64(len(rc.eventBatchChan), cap(rc.eventBatchChan)) } // getLag returns the observed change stream lag (i.e., the delta between @@ -160,7 +160,6 @@ func (rc *ChangeReaderCommon) getEventsPerSecond() option.Option[float64] { return option.None[float64]() } -// start starts the change reader func (rc *ChangeReaderCommon) start( ctx context.Context, eg *errgroup.Group, @@ -180,7 +179,7 @@ func (rc *ChangeReaderCommon) start( Str("reader", string(rc.readerType)). 
Msg("Finished.") - close(rc.changeEventBatchChan) + close(rc.eventBatchChan) }() retryer := retry.New().WithErrorCodes(util.CursorKilledErrCode) diff --git a/internal/verifier/change_stream.go b/internal/verifier/change_stream.go index ccd833d8..e4549841 100644 --- a/internal/verifier/change_stream.go +++ b/internal/verifier/change_stream.go @@ -251,7 +251,7 @@ func (csr *ChangeStreamReader) readAndHandleOneChangeEventBatch( select { case <-sctx.Done(): return util.WrapCtxErrWithCause(sctx) - case csr.changeEventBatchChan <- changeEventBatch{ + case csr.eventBatchChan <- eventBatch{ events: changeEvents, resumeToken: cs.ResumeToken(), }: diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index 92a2e693..f799af29 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -697,7 +697,7 @@ func (suite *IntegrationTestSuite) TestGetPersistedNamespaceStatistics_Recheck() err := verifier.PersistChangeEvents( ctx, - changeEventBatch{ + eventBatch{ events: []ParsedEvent{{ OpType: "insert", Ns: &Namespace{DB: "mydb", Coll: "coll2"}, @@ -713,7 +713,7 @@ func (suite *IntegrationTestSuite) TestGetPersistedNamespaceStatistics_Recheck() err = verifier.PersistChangeEvents( ctx, - changeEventBatch{ + eventBatch{ events: []ParsedEvent{{ OpType: "insert", Ns: &Namespace{DB: "mydb", Coll: "coll1"}, @@ -982,7 +982,7 @@ func (suite *IntegrationTestSuite) TestFailedVerificationTaskInsertions() { }, } - batch := changeEventBatch{ + batch := eventBatch{ events: mslices.Of(event), } diff --git a/internal/verifier/oplog/oplog.go b/internal/verifier/oplog/oplog.go index 11f2e142..5b4dccd5 100644 --- a/internal/verifier/oplog/oplog.go +++ b/internal/verifier/oplog/oplog.go @@ -15,8 +15,7 @@ const ( rtBSONLength = 4 + 1 + 2 + 1 + 8 + 1 ) -// Op is an internal representation of the parts of an oplog entry that we -// care about. This struct is not meant to be marshaled/unmarshaled directly. +// Op represents the parts of an oplog entry that we care about. type Op struct { // Op holds the oplog entry’s `op`. Op string @@ -47,8 +46,8 @@ func (*Op) UnmarshalBSON([]byte) error { } // UnmarshalFromBSON unmarshals an Op more efficiently than the standard -// bson.Unmarshal function. This function is called for every oplog entry, -// so that efficiency is material. +// bson.Unmarshal function. When verifier reads a v4.4+ server, this function +// is called for every oplog entry, so that efficiency is material. 
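For contrast with the hand-rolled parser that follows, the conventional reflection-based decode it replaces would look roughly like the sketch below (field set trimmed to three fields; purely illustrative):

package sketch

import "go.mongodb.org/mongo-driver/v2/bson"

// decodeViaReflection shows the conventional approach: bson.Unmarshal walks
// struct tags via reflection on every call, which is the cost the hand-rolled
// version avoids.
func decodeViaReflection(raw bson.Raw) (string, bson.Timestamp, error) {
	var decoded struct {
		Op string         `bson:"op"`
		TS bson.Timestamp `bson:"ts"`
		Ns string         `bson:"ns"`
	}

	if err := bson.Unmarshal(raw, &decoded); err != nil {
		return "", bson.Timestamp{}, err
	}

	return decoded.Op, decoded.TS, nil
}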
func (o *Op) UnmarshalFromBSON(in []byte) error { for el, err := range mbson.RawElements(bson.Raw(in)) { if err != nil { @@ -62,55 +61,68 @@ func (o *Op) UnmarshalFromBSON(in []byte) error { switch key { case "op": - err = mbson.UnmarshalElementValue(el, &o.Op) + err := mbson.UnmarshalElementValue(el, &o.Op) + if err != nil { + return errors.Wrapf(err, "parsing %#q", key) + } case "ts": - err = mbson.UnmarshalElementValue(el, &o.TS) + err := mbson.UnmarshalElementValue(el, &o.TS) + if err != nil { + return errors.Wrapf(err, "parsing %#q", key) + } case "ns": - err = mbson.UnmarshalElementValue(el, &o.Ns) + err := mbson.UnmarshalElementValue(el, &o.Ns) + if err != nil { + return errors.Wrapf(err, "parsing %#q", key) + } case "cmdName": - err = mbson.UnmarshalElementValue(el, &o.CmdName) + err := mbson.UnmarshalElementValue(el, &o.CmdName) + if err != nil { + return errors.Wrapf(err, "parsing %#q", key) + } case "docLen": - err = mbson.UnmarshalElementValue(el, &o.DocLen) + err := mbson.UnmarshalElementValue(el, &o.DocLen) + if err != nil { + return errors.Wrapf(err, "parsing %#q", key) + } case "docID": o.DocID, err = el.ValueErr() if err != nil { - err = errors.Wrapf(err, "parsing %#q value", key) + return errors.Wrapf(err, "parsing %#q value", key) } o.DocID.Value = slices.Clone(o.DocID.Value) case "ops": var arr bson.RawArray - err = errors.Wrapf( + err := errors.Wrapf( mbson.UnmarshalElementValue(el, &arr), "parsing ops", ) - if err == nil { - vals, err := arr.Values() - if err != nil { - return errors.Wrap(err, "parsing applyOps") - } + if err != nil { + return err + } - o.Ops = make([]Op, len(vals)) + vals, err := arr.Values() + if err != nil { + return errors.Wrap(err, "parsing applyOps") + } - for i, val := range vals { + o.Ops = make([]Op, len(vals)) - var opRaw bson.Raw - err := mbson.UnmarshalRawValue(val, &opRaw) - if err != nil { - return errors.Wrapf(err, "parsing applyOps field") - } + for i, val := range vals { - if err := (&o.Ops[i]).UnmarshalFromBSON(opRaw); err != nil { - return errors.Wrapf(err, "parsing applyOps value") - } + var opRaw bson.Raw + err := mbson.UnmarshalRawValue(val, &opRaw) + if err != nil { + return errors.Wrapf(err, "parsing applyOps field") + } + + if err := (&o.Ops[i]).UnmarshalFromBSON(opRaw); err != nil { + return errors.Wrapf(err, "parsing applyOps[%d]", i) } } default: - err = errors.Wrapf(err, "unexpected field %#q", key) - } - - if err != nil { - return err + return errors.Wrapf(err, "unexpected field %#q", key) } } diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 9c14e5f3..01a00b97 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -379,7 +379,7 @@ func (o *OplogReader) readAndHandleOneBatch( select { case <-sctx.Done(): return err - case o.changeEventBatchChan <- changeEventBatch{ + case o.eventBatchChan <- eventBatch{ events: events, resumeToken: resumeToken, }: diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 9840ecae..dfb90c7c 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -10,7 +10,7 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" ) -type changeEventBatch struct { +type eventBatch struct { events []ParsedEvent resumeToken bson.Raw } @@ -87,7 +87,7 @@ HandlerLoop: } // PersistChangeEvents performs the necessary work for change events after receiving a batch. 
-func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch changeEventBatch, eventOrigin whichCluster) error { +func (verifier *Verifier) PersistChangeEvents(ctx context.Context, batch eventBatch, eventOrigin whichCluster) error { if len(batch.events) == 0 { return nil } diff --git a/internal/verifier/recheck_test.go b/internal/verifier/recheck_test.go index 0da76289..f9e93f86 100644 --- a/internal/verifier/recheck_test.go +++ b/internal/verifier/recheck_test.go @@ -64,7 +64,7 @@ func (suite *IntegrationTestSuite) TestFailedCompareThenReplace() { err := verifier.PersistChangeEvents( ctx, - changeEventBatch{events: mslices.Of(event)}, + eventBatch{events: mslices.Of(event)}, src, ) suite.Require().NoError(err) From 0d284a33add82bb7ec2f77302d255f1e41bda7a4 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 3 Dec 2025 15:11:04 -0500 Subject: [PATCH 121/130] fix & add more tests --- agg/helpers/exist.go | 12 + internal/retry/retry.go | 24 +- internal/verifier/oplog/oplog.go | 3 +- internal/verifier/oplog_reader.go | 18 +- internal/verifier/oplog_reader_test.go | 299 +++++++++++++++++++++++++ mbson/raw_value.go | 7 +- 6 files changed, 347 insertions(+), 16 deletions(-) create mode 100644 agg/helpers/exist.go create mode 100644 internal/verifier/oplog_reader_test.go diff --git a/agg/helpers/exist.go b/agg/helpers/exist.go new file mode 100644 index 00000000..0d507066 --- /dev/null +++ b/agg/helpers/exist.go @@ -0,0 +1,12 @@ +package helpers + +import ( + "github.com/10gen/migration-verifier/agg" + "go.mongodb.org/mongo-driver/v2/bson" +) + +type Exists [1]any + +func (e Exists) MarshalBSON() ([]byte, error) { + return bson.Marshal(agg.Not{agg.Eq{"missing", agg.Type{e[0]}}}) +} diff --git a/internal/retry/retry.go b/internal/retry/retry.go index 8f7b260d..7ab5959b 100644 --- a/internal/retry/retry.go +++ b/internal/retry/retry.go @@ -3,6 +3,7 @@ package retry import ( "context" "fmt" + "slices" "time" "github.com/10gen/migration-verifier/contextplus" @@ -180,11 +181,7 @@ func (r *Retryer) runRetryLoop( // Not a transient error? Fail immediately. if !r.shouldRetryWithSleep(logger, sleepTime, descriptions, cbErr) { - if descr, has := r.description.Get(); has { - cbErr = errors.Wrap(cbErr, descr) - } - - return cbErr + return wrapErrWithDescriptions(cbErr, descriptions) } // Our error is transient. If we've exhausted the allowed time @@ -197,11 +194,7 @@ func (r *Retryer) runRetryLoop( lastErr: groupErr.errFromCallback, } - if descr, has := r.description.Get(); has { - err = errors.Wrap(err, descr) - } - - return err + return wrapErrWithDescriptions(err, descriptions) } // Sleep and increase the sleep time for the next retry, @@ -245,6 +238,17 @@ func (r *Retryer) addDescriptionToEvent(event *zerolog.Event) *zerolog.Event { return event } +func wrapErrWithDescriptions(err error, descriptions []string) error { + reversed := slices.Clone(descriptions) + slices.Reverse(reversed) + + for _, d := range reversed { + err = errors.Wrap(err, d) + } + + return err +} + // // For the above function, there have historically been concerns regarding majority write concern // upon retrying a write operation to the server. 
Mongomirror explicitly handled this: diff --git a/internal/verifier/oplog/oplog.go b/internal/verifier/oplog/oplog.go index 5b4dccd5..04771295 100644 --- a/internal/verifier/oplog/oplog.go +++ b/internal/verifier/oplog/oplog.go @@ -45,7 +45,8 @@ func (*Op) UnmarshalBSON([]byte) error { panic("Use UnmarshalFromBSON.") } -// UnmarshalFromBSON unmarshals an Op more efficiently than the standard +// UnmarshalFromBSON unmarshals an Op as transformed by the oplog reader’s +// projection of the oplog. It’s more efficient than the standard // bson.Unmarshal function. When verifier reads a v4.4+ server, this function // is called for every oplog entry, so that efficiency is material. func (o *Op) UnmarshalFromBSON(in []byte) error { diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 01a00b97..3c9c6eda 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -198,7 +198,14 @@ func (o *OplogReader) createCursor( func (o *OplogReader) getExprProjection() bson.D { return bson.D{ {"ts", 1}, - {"op", 1}, + {"op", agg.Cond{ + If: agg.And{ + agg.Eq{"$op", "u"}, + helpers.Exists{"$o._id"}, + }, + Then: "r", + Else: "$op", + }}, {"ns", 1}, {"docLen", getOplogDocLenExpr("$$ROOT")}, @@ -326,7 +333,8 @@ CursorLoop: var oplogOpToOperationType = map[string]string{ "i": "insert", - "u": "update", // don’t need to distinguish from replace + "r": "replace", // NB: This doesn’t happen in the oplog; we project it. + "u": "update", "d": "delete", } @@ -490,6 +498,7 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti switch opName { case "n": + // Ignore. case "c": oDoc, err := mbson.Lookup[bson.Raw](rawDoc, "o") if err != nil { @@ -579,6 +588,7 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore switch op.Op { case "n": + // Ignore. case "c": if op.CmdName != "applyOps" { if o.onDDLEvent == onDDLEventAllow { @@ -666,11 +676,11 @@ func getOplogDocLenExpr(docroot string) any { agg.Eq{docroot + ".op", "i"}, agg.And{ agg.Eq{docroot + ".op", "u"}, - agg.Not(agg.Eq{"missing", docroot + ".o._id"}), + helpers.Exists{docroot + ".o._id"}, }, }, Then: agg.BSONSize{docroot + ".o"}, - Else: "$$REMOVE", + Else: defaultUserDocumentSize, } } diff --git a/internal/verifier/oplog_reader_test.go b/internal/verifier/oplog_reader_test.go new file mode 100644 index 00000000..8904f8f9 --- /dev/null +++ b/internal/verifier/oplog_reader_test.go @@ -0,0 +1,299 @@ +package verifier + +import ( + "github.com/10gen/migration-verifier/contextplus" + "github.com/10gen/migration-verifier/internal/util" + "github.com/10gen/migration-verifier/mbson" + "github.com/10gen/migration-verifier/mslices" + "github.com/samber/lo" + "go.mongodb.org/mongo-driver/v2/bson" +) + +// TestOplogReader_DDL verifies that the oplog reader sees & publishes +// document changes on the source. 
+func (suite *IntegrationTestSuite) TestOplogReader_DDL() { + ctx := suite.Context() + + verifier := suite.BuildVerifier() + + var reader changeReader = verifier.newOplogReader( + nil, + src, + verifier.srcClient, + *verifier.srcClusterInfo, + ) + + dbName := suite.DBNameForTest() + + coll := verifier.srcClient.Database(dbName).Collection("coll") + + eg, egCtx := contextplus.ErrGroup(ctx) + suite.Require().NoError(reader.start(egCtx, eg)) + lo.Must(coll.InsertOne(ctx, bson.D{{"_id", "hey"}})) + + batchReceiver := reader.getReadChannel() + + select { + case <-ctx.Done(): + suite.Require().NoError(ctx.Err()) + case _, isOpen := <-batchReceiver: + suite.Assert().False(isOpen, "channel should close") + } + + err := eg.Wait() + suite.Assert().ErrorAs(err, &UnknownEventError{}) + + // Confirm that the error text is wrapped: + suite.Assert().Contains(err.Error(), "reading") +} + +// TestOplogReader_Documents verifies that the oplog reader sees & publishes +// document changes on the source. +func (suite *IntegrationTestSuite) TestOplogReader_Documents() { + ctx := suite.Context() + + verifier := suite.BuildVerifier() + + dbName := suite.DBNameForTest() + + coll := verifier.srcClient.Database(dbName).Collection("coll") + + outFilterColl := verifier.srcClient. + Database(suite.DBNameForTest()). + Collection("coll2") + + lo.Must(coll.InsertOne(ctx, bson.D{{"_id", "hey"}})) + lo.Must(outFilterColl.InsertOne(ctx, bson.D{{"_id", "hey"}})) + + if suite.GetTopology(verifier.srcClient) == util.TopologySharded { + suite.T().Skipf("oplog mode is only for unsharded clusters") + } + + var reader changeReader = verifier.newOplogReader( + mslices.Of(FullName(coll)), + src, + verifier.srcClient, + *verifier.srcClusterInfo, + ) + + batchReceiver := reader.getReadChannel() + + var lastResumeTokenTS bson.Timestamp + + getBatch := func() eventBatch { + batch, isOpen := <-batchReceiver + suite.Require().True(isOpen, "channel should still be open") + + rtTS, err := mbson.Lookup[bson.Timestamp](batch.resumeToken, "ts") + suite.Require().NoError(err) + + lastResumeTokenTS = rtTS + + suite.Require().False( + rtTS.Before(*lo.LastOrEmpty(batch.events).ClusterTime), + "resume token must not predate the last event", + ) + + return batch + } + + eg, egCtx := contextplus.ErrGroup(ctx) + suite.Require().NoError(reader.start(egCtx, eg)) + + // NB: This should be the first event we see because the most + // recent op before this was for an out-filter namespace. 
+ suite.Run( + "insert one", + func() { + + raw := lo.Must(bson.Marshal(bson.D{{"_id", "ho"}})) + lo.Must(coll.InsertOne(ctx, raw)) + batch := getBatch() + event := batch.events[0] + + suite.Assert().Equal( + NewNamespace(dbName, coll.Name()), + event.Ns, + ) + suite.Assert().Equal("insert", event.OpType) + suite.Assert().Equal("ho", lo.Must(mbson.CastRawValue[string](event.DocID))) + suite.Assert().EqualValues(len(raw), event.FullDocLen.MustGet(), "doc length") + }, + ) + + suite.Run( + "update one", + func() { + lo.Must(coll.UpdateOne( + ctx, + bson.D{{"_id", "hey"}}, + bson.D{{"$set", bson.D{{"foo", "bar"}}}}, + )) + + batch := getBatch() + event := batch.events[0] + + suite.Assert().Equal( + NewNamespace(dbName, coll.Name()), + event.Ns, + ) + suite.Assert().Equal("update", event.OpType) + suite.Assert().Equal("hey", lo.Must(mbson.CastRawValue[string](event.DocID))) + suite.Assert().EqualValues(defaultUserDocumentSize, event.FullDocLen.MustGet()) + }, + ) + + suite.Run( + "replace one", + func() { + raw := lo.Must(bson.Marshal(bson.D{{"_id", "ho"}, {"a", "b"}})) + + lo.Must(coll.ReplaceOne(ctx, bson.D{{"_id", "ho"}}, raw)) + batch := getBatch() + event := batch.events[0] + + suite.Assert().Equal( + NewNamespace(dbName, coll.Name()), + event.Ns, + ) + suite.Assert().Equal("replace", event.OpType) + suite.Assert().Equal("ho", lo.Must(mbson.CastRawValue[string](event.DocID))) + suite.Assert().EqualValues(len(raw), event.FullDocLen.MustGet(), "doc length") + }, + ) + + suite.Run( + "delete one", + func() { + // Now check that the reader understands bulk inserts. + lo.Must(coll.DeleteOne(ctx, bson.D{{"_id", "hey"}})) + batch := getBatch() + event := batch.events[0] + + suite.Assert().Equal( + NewNamespace(dbName, coll.Name()), + event.Ns, + ) + suite.Assert().Equal("delete", event.OpType) + suite.Assert().Equal("hey", lo.Must(mbson.CastRawValue[string](event.DocID))) + suite.Assert().EqualValues(defaultUserDocumentSize, event.FullDocLen.MustGet()) + }, + ) + + bulkDocs := []bson.D{ + {{"_id", 1.25}}, + {{"_id", 1.5}}, + {{"_id", 1.75}}, + {{"_id", 2.25}}, + } + + suite.Run( + "bulk insert", + func() { + lo.Must(coll.InsertMany(ctx, lo.ToAnySlice(bulkDocs))) + + docLen := len(lo.Must(bson.Marshal(bulkDocs[0]))) + + events := []ParsedEvent{} + + for len(events) < 4 { + batch := getBatch() + events = append(events, batch.events...) + } + + suite.Require().Len(events, 4) + + for i, event := range events { + suite.Assert().Equal("insert", event.OpType) + suite.Assert().EqualValues(docLen, event.FullDocLen.MustGet()) + + suite.Assert().Equal( + bulkDocs[i][0].Value, + lo.Must(mbson.CastRawValue[float64](event.DocID)), + "events[%d].DocID", i, + ) + } + }, + ) + + suite.Run( + "bulk update", + func() { + docIDs := lo.Map( + bulkDocs, + func(d bson.D, _ int) any { + return d[0].Value + }, + ) + + lo.Must(coll.UpdateMany( + ctx, + bson.D{{"_id", bson.D{{"$in", docIDs}}}}, + bson.D{{"$set", bson.D{{"aa", "bb"}}}}, + )) + + events := []ParsedEvent{} + + for len(events) < 4 { + batch := getBatch() + events = append(events, batch.events...) 
+ } + + suite.Require().Len(events, 4) + + for _, event := range events { + suite.Assert().Equal("update", event.OpType) + suite.Assert().EqualValues(defaultUserDocumentSize, event.FullDocLen.MustGet()) + } + + eventDocIDs := lo.Map( + events, + func(event ParsedEvent, _ int) any { + return lo.Must(mbson.CastRawValue[float64](event.DocID)) + }, + ) + + suite.Assert().ElementsMatch(docIDs, eventDocIDs) + }, + ) + + suite.Run( + "bulk delete", + func() { + docIDs := lo.Map( + bulkDocs, + func(d bson.D, _ int) any { + return d[0].Value + }, + ) + + lo.Must(coll.DeleteMany(ctx, bson.D{{"_id", bson.D{{"$in", docIDs}}}})) + + events := []ParsedEvent{} + + for len(events) < 4 { + batch := getBatch() + events = append(events, batch.events...) + } + + suite.Require().Len(events, 4) + + for _, event := range events { + suite.Assert().Equal("delete", event.OpType) + suite.Assert().EqualValues(defaultUserDocumentSize, event.FullDocLen.MustGet()) + } + + eventDocIDs := lo.Map( + events, + func(event ParsedEvent, _ int) any { + return lo.Must(mbson.CastRawValue[float64](event.DocID)) + }, + ) + + suite.Assert().ElementsMatch(docIDs, eventDocIDs) + }, + ) + + reader.setWritesOff(lastResumeTokenTS) + suite.Require().NoError(eg.Wait()) +} diff --git a/mbson/raw_value.go b/mbson/raw_value.go index 7674c56e..9993232f 100644 --- a/mbson/raw_value.go +++ b/mbson/raw_value.go @@ -9,7 +9,8 @@ import ( ) type bsonCastRecipient interface { - bson.Raw | bson.RawArray | bson.Timestamp | bson.ObjectID | string | int32 + bson.Raw | bson.RawArray | bson.Timestamp | bson.ObjectID | + string | int32 | float64 } type bsonSourceTypes interface { @@ -56,6 +57,10 @@ func CastRawValue[T bsonCastRecipient](in bson.RawValue) (T, error) { if val, ok := in.Int32OK(); ok { return any(val).(T), nil } + case float64: + if val, ok := in.DoubleOK(); ok { + return any(val).(T), nil + } default: panic(fmt.Sprintf("Unrecognized Go type: %T (maybe augment bsonType?)", in)) } From 542ebdfcfd3003d06a5f2df9d995bf41c62b7484 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 3 Dec 2025 15:31:04 -0500 Subject: [PATCH 122/130] add unit test --- internal/verifier/oplog/oplog.go | 13 +++++--- internal/verifier/oplog/oplog_test.go | 46 ++++++++++++++++++++++++++ internal/verifier/oplog/start_time.go | 4 ++- internal/verifier/oplog_reader.go | 17 ++++++---- internal/verifier/oplog_reader_test.go | 5 ++- 5 files changed, 69 insertions(+), 16 deletions(-) create mode 100644 internal/verifier/oplog/oplog_test.go diff --git a/internal/verifier/oplog/oplog.go b/internal/verifier/oplog/oplog.go index 04771295..a718e66a 100644 --- a/internal/verifier/oplog/oplog.go +++ b/internal/verifier/oplog/oplog.go @@ -6,6 +6,7 @@ import ( "slices" "github.com/10gen/migration-verifier/mbson" + "github.com/10gen/migration-verifier/option" "github.com/pkg/errors" "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/x/bsonx/bsoncore" @@ -27,18 +28,18 @@ type Op struct { Ns string // CmdName is the first field name in the oplog entry’s `o` document. - CmdName string + CmdName option.Option[string] `bson:",omitempty"` // DocLen is the length, in bytes, of whatever document the oplog entry // describes. This will only be meaningful for insert & replace entries. - DocLen int32 + DocLen int32 `bson:"docLen"` // DocID is the `_id` of whatever document the oplog entry describes. // This won’t be populated for multi-op Op instances. - DocID bson.RawValue + DocID bson.RawValue `bson:"docID"` // Ops holds the ops in an `applyOps` oplog entry. 
- Ops []Op + Ops []Op `bson:",omitempty"` } func (*Op) UnmarshalBSON([]byte) error { @@ -77,10 +78,12 @@ func (o *Op) UnmarshalFromBSON(in []byte) error { return errors.Wrapf(err, "parsing %#q", key) } case "cmdName": - err := mbson.UnmarshalElementValue(el, &o.CmdName) + var cmdName string + err := mbson.UnmarshalElementValue(el, &cmdName) if err != nil { return errors.Wrapf(err, "parsing %#q", key) } + o.CmdName = option.Some(cmdName) case "docLen": err := mbson.UnmarshalElementValue(el, &o.DocLen) if err != nil { diff --git a/internal/verifier/oplog/oplog_test.go b/internal/verifier/oplog/oplog_test.go new file mode 100644 index 00000000..8d4b6f03 --- /dev/null +++ b/internal/verifier/oplog/oplog_test.go @@ -0,0 +1,46 @@ +package oplog + +import ( + "testing" + + "github.com/10gen/migration-verifier/mbson" + "github.com/samber/lo" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.mongodb.org/mongo-driver/v2/bson" +) + +func TestOpUnmarshal(t *testing.T) { + op := Op{ + Op: "hey", + TS: bson.Timestamp{345, 456}, + Ns: "hohqohewoqhwe", + DocLen: 777, + DocID: mbson.ToRawValue("haha"), + } + + raw := bson.Raw(lo.Must(bson.Marshal(op))) + + rt := &Op{} + require.NoError(t, rt.UnmarshalFromBSON(raw)) + + assert.Equal(t, &op, rt, "Op should round-trip BSON (raw: %+v)", raw) +} + +func TestResumeTokenBSON(t *testing.T) { + token := ResumeToken{ + TS: bson.Timestamp{T: 234234, I: 11}, + } + + raw := token.MarshalToBSON() + + ts, err := GetRawResumeTokenTimestamp(raw) + require.NoError(t, err) + + assert.Equal(t, token.TS, ts, "extracted timestamp should match") + + var rt ResumeToken + require.NoError(t, bson.Unmarshal(raw, &rt)) + + assert.Equal(t, token, rt) +} diff --git a/internal/verifier/oplog/start_time.go b/internal/verifier/oplog/start_time.go index 494ac8a8..67ba3ca2 100644 --- a/internal/verifier/oplog/start_time.go +++ b/internal/verifier/oplog/start_time.go @@ -12,6 +12,8 @@ import ( "go.mongodb.org/mongo-driver/v2/mongo/readconcern" ) +// GetTailingStartTimes returns the earliest transaction timestamp and the +// latest op in the oplog. func GetTailingStartTimes( ctx context.Context, client *mongo.Client, @@ -132,7 +134,7 @@ func getOldestTransactionTime( coll := client.Database("config"). 
Collection( "transactions", - options.Collection().SetReadConcern(readconcern.Local()), + options.Collection().SetReadConcern(readconcern.Majority()), ) decoded := struct { diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 3c9c6eda..8d72cdc0 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -422,7 +422,7 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti } var docID bson.RawValue - var docLength types.ByteCount + var docLength option.Option[types.ByteCount] var docField string switch opName { @@ -456,7 +456,7 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti return errors.Wrap(err, "extracting doc from op") } - docLength = types.ByteCount(len(doc)) + docLength = option.Some(types.ByteCount(len(doc))) docID, err = doc.LookupErr("_id") if err != nil { return errors.Wrap(err, "extracting doc ID from op") @@ -465,8 +465,6 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti if docID.IsZero() { panic("zero doc ID!") } - - docLength = defaultUserDocumentSize } docID.Value = slices.Clone(docID.Value) @@ -477,7 +475,7 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti OpType: oplogOpToOperationType[opName], Ns: NewNamespace(SplitNamespace(nsStr)), DocID: docID, - FullDocLen: option.Some(docLength), + FullDocLen: docLength, ClusterTime: lo.ToPtr(ts), }, ) @@ -590,7 +588,12 @@ func (o *OplogReader) parseExprProjectedOps(events []ParsedEvent, allowDDLBefore case "n": // Ignore. case "c": - if op.CmdName != "applyOps" { + cmdName, has := op.CmdName.Get() + if !has { + return nil, bson.Timestamp{}, fmt.Errorf("no cmdname in op=c: %+v", op) + } + + if cmdName != "applyOps" { if o.onDDLEvent == onDDLEventAllow { o.logIgnoredDDL(rawDoc) continue @@ -680,7 +683,7 @@ func getOplogDocLenExpr(docroot string) any { }, }, Then: agg.BSONSize{docroot + ".o"}, - Else: defaultUserDocumentSize, + Else: "$$REMOVE", } } diff --git a/internal/verifier/oplog_reader_test.go b/internal/verifier/oplog_reader_test.go index 8904f8f9..391e46a6 100644 --- a/internal/verifier/oplog_reader_test.go +++ b/internal/verifier/oplog_reader_test.go @@ -9,9 +9,8 @@ import ( "go.mongodb.org/mongo-driver/v2/bson" ) -// TestOplogReader_DDL verifies that the oplog reader sees & publishes -// document changes on the source. -func (suite *IntegrationTestSuite) TestOplogReader_DDL() { +// TestOplogReader_SourceDDL verifies that source DDL crashes the oplog reader. +func (suite *IntegrationTestSuite) TestOplogReader_SourceDDL() { ctx := suite.Context() verifier := suite.BuildVerifier() From 32445e9fecf38b62e5844e670e2edc96307bad3f Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 3 Dec 2025 15:32:47 -0500 Subject: [PATCH 123/130] tweak docs --- internal/verifier/oplog/start_time.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/verifier/oplog/start_time.go b/internal/verifier/oplog/start_time.go index 67ba3ca2..cd2368fe 100644 --- a/internal/verifier/oplog/start_time.go +++ b/internal/verifier/oplog/start_time.go @@ -13,7 +13,7 @@ import ( ) // GetTailingStartTimes returns the earliest transaction timestamp and the -// latest op in the oplog. +// timestamp of the latest-visible op in the oplog. 
func GetTailingStartTimes( ctx context.Context, client *mongo.Client, From 4cfdbd683d330a1f408cbdd3406e14a54a93215e Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 3 Dec 2025 16:03:44 -0500 Subject: [PATCH 124/130] fix oplog tests --- internal/verifier/oplog_reader_test.go | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/internal/verifier/oplog_reader_test.go b/internal/verifier/oplog_reader_test.go index 391e46a6..0799743e 100644 --- a/internal/verifier/oplog_reader_test.go +++ b/internal/verifier/oplog_reader_test.go @@ -1,6 +1,8 @@ package verifier import ( + "time" + "github.com/10gen/migration-verifier/contextplus" "github.com/10gen/migration-verifier/internal/util" "github.com/10gen/migration-verifier/mbson" @@ -32,11 +34,21 @@ func (suite *IntegrationTestSuite) TestOplogReader_SourceDDL() { batchReceiver := reader.getReadChannel() - select { - case <-ctx.Done(): - suite.Require().NoError(ctx.Err()) - case _, isOpen := <-batchReceiver: - suite.Assert().False(isOpen, "channel should close") + timer := time.NewTimer(time.Minute) + + channelOpen := true + for channelOpen { + var batch eventBatch + select { + case <-ctx.Done(): + suite.Require().NoError(ctx.Err()) + case <-timer.C: + suite.Require().Fail("should read batch channel") + case batch, channelOpen = <-batchReceiver: + if channelOpen { + suite.T().Logf("got batch: %+v", batch) + } + } } err := eg.Wait() @@ -138,7 +150,6 @@ func (suite *IntegrationTestSuite) TestOplogReader_Documents() { ) suite.Assert().Equal("update", event.OpType) suite.Assert().Equal("hey", lo.Must(mbson.CastRawValue[string](event.DocID))) - suite.Assert().EqualValues(defaultUserDocumentSize, event.FullDocLen.MustGet()) }, ) @@ -175,7 +186,6 @@ func (suite *IntegrationTestSuite) TestOplogReader_Documents() { ) suite.Assert().Equal("delete", event.OpType) suite.Assert().Equal("hey", lo.Must(mbson.CastRawValue[string](event.DocID))) - suite.Assert().EqualValues(defaultUserDocumentSize, event.FullDocLen.MustGet()) }, ) @@ -242,7 +252,6 @@ func (suite *IntegrationTestSuite) TestOplogReader_Documents() { for _, event := range events { suite.Assert().Equal("update", event.OpType) - suite.Assert().EqualValues(defaultUserDocumentSize, event.FullDocLen.MustGet()) } eventDocIDs := lo.Map( @@ -279,7 +288,6 @@ func (suite *IntegrationTestSuite) TestOplogReader_Documents() { for _, event := range events { suite.Assert().Equal("delete", event.OpType) - suite.Assert().EqualValues(defaultUserDocumentSize, event.FullDocLen.MustGet()) } eventDocIDs := lo.Map( From 6b33e8cbbb7c1df25137689749c75225a59de5c4 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 3 Dec 2025 17:46:01 -0500 Subject: [PATCH 125/130] slip test --- internal/verifier/oplog_reader_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/verifier/oplog_reader_test.go b/internal/verifier/oplog_reader_test.go index 0799743e..26045d38 100644 --- a/internal/verifier/oplog_reader_test.go +++ b/internal/verifier/oplog_reader_test.go @@ -17,6 +17,10 @@ func (suite *IntegrationTestSuite) TestOplogReader_SourceDDL() { verifier := suite.BuildVerifier() + if suite.GetTopology(verifier.srcClient) == util.TopologySharded { + suite.T().Skipf("oplog mode is only for unsharded clusters") + } + var reader changeReader = verifier.newOplogReader( nil, src, From 9b75105dcf410c77e292db47d070ab1eb1d18a4b Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Wed, 3 Dec 2025 18:45:38 -0500 Subject: [PATCH 126/130] replace w/ 4.2 --- 
internal/verifier/oplog_reader.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 8d72cdc0..80203993 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -451,6 +451,10 @@ func (o *OplogReader) parseRawOps(events []ParsedEvent, allowDDLBeforeTS bson.Ti } if docField != "" { + if opName == "u" { + opName = "r" + } + doc, err := mbson.Lookup[bson.Raw](rawDoc, docField) if err != nil { return errors.Wrap(err, "extracting doc from op") From 81483679e17313e3b05041bbd8fefc209729eca1 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 16 Dec 2025 12:25:47 -0500 Subject: [PATCH 127/130] fix test --- internal/verifier/migration_verifier_test.go | 26 ++++++++++++++++++-- internal/verifier/oplog_reader.go | 12 +++++++++ internal/verifier/recheck_persist.go | 3 ++- mslices/slices.go | 17 +++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) diff --git a/internal/verifier/migration_verifier_test.go b/internal/verifier/migration_verifier_test.go index fd337660..df11add8 100644 --- a/internal/verifier/migration_verifier_test.go +++ b/internal/verifier/migration_verifier_test.go @@ -497,6 +497,9 @@ func (suite *IntegrationTestSuite) TestTypesBetweenBoundaries() { } func (suite *IntegrationTestSuite) TestMismatchTimePersistence() { + zerolog.SetGlobalLevel(zerolog.TraceLevel) + defer zerolog.SetGlobalLevel(zerolog.DebugLevel) + ctx := suite.Context() collName := "c" @@ -514,6 +517,20 @@ func (suite *IntegrationTestSuite) TestMismatchTimePersistence() { InsertOne(ctx, bson.D{{"_id", "a"}}) suite.Require().NoError(err) + // So that the insert above isn’t the last thing in the oplog: + _, err = suite.srcMongoClient. + Database(suite.DBNameForTest()). + Collection(collName). + InsertOne(ctx, bson.D{{"_id", "qwe"}}) + suite.Require().NoError(err) + _, err = suite.srcMongoClient. + Database(suite.DBNameForTest()). + Collection(collName). + DeleteOne(ctx, bson.D{{"_id", "qwe"}}) + suite.Require().NoError(err) + + testutil.KillTransactions(ctx, suite.T(), suite.srcMongoClient) + verifier := suite.BuildVerifier() verifier.SetVerifyAll(true) runner := RunVerifierCheck(ctx, suite.T(), verifier) @@ -583,7 +600,11 @@ func (suite *IntegrationTestSuite) TestMismatchTimePersistence() { suite.Require().NoError(err) suite.Require().NoError(cur.All(ctx, &tasks)) suite.Require().Len(tasks, 1) - suite.Require().Contains(tasks[0].FirstMismatchTime, int32(0)) + suite.Require().Contains( + tasks[0].FirstMismatchTime, + int32(0), + "tasks[0].first-mismatch-time map (task: %+v)", tasks[0], + ) suite.Assert().Equal( firstMismatchTime, @@ -655,7 +676,8 @@ func (suite *IntegrationTestSuite) TestMismatchTimePersistence() { suite.Assert().Equal( firstMismatchTime, tasks[0].FirstMismatchTime[0], - "task in new gen should have the original first mismatch time", + "task in new gen should have the original first mismatch time (task: %+v)", + tasks[0], ) lastMismatchDuration := mismatches[0].Detail.MismatchHistory.DurationMS diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go index 2f61aaf4..9f202bc2 100644 --- a/internal/verifier/oplog_reader.go +++ b/internal/verifier/oplog_reader.go @@ -358,6 +358,18 @@ func (o *OplogReader) readAndHandleOneBatch( return nil } + if o.logger.Trace().Enabled() { + o.logger.Trace(). + Str("changeReader", string(o.getWhichCluster())). + Strs("events", mslices.Map1( + o.curDocs, + bson.Raw.String, + )). + Int("batchEvents", len(o.curDocs)). 
+ Int("batchBytes", len(o.scratch)). + Msg("Received a batch of oplog events.") + } + var latestTS bson.Timestamp events := make([]ParsedEvent, 0, len(o.curDocs)) diff --git a/internal/verifier/recheck_persist.go b/internal/verifier/recheck_persist.go index 457ce290..5c3b5ad3 100644 --- a/internal/verifier/recheck_persist.go +++ b/internal/verifier/recheck_persist.go @@ -69,7 +69,8 @@ HandlerLoop: verifier.logger.Trace(). Str("changeReader", string(clusterName)). Int("batchSize", len(batch.events)). - Any("batch", batch). + Any("batch", batch.events). + Stringer("resumeToken", batch.resumeToken). Msg("Handling change event batch.") err = errors.Wrap( diff --git a/mslices/slices.go b/mslices/slices.go index 26a2be81..fe766466 100644 --- a/mslices/slices.go +++ b/mslices/slices.go @@ -5,6 +5,7 @@ import ( "slices" "github.com/10gen/migration-verifier/option" + "github.com/samber/lo" ) // This package complements the Go standard library’s package of the @@ -65,3 +66,19 @@ func FindFirstDupe[T comparable](items []T) option.Option[T] { return option.None[T]() } + +// Map1 is like lo.Map, but the callback accepts only a single parameter. +// This facilitates a lot of syntactic niceties that lo.Map makes difficult. +// For example, you can stringify a slice of `fmt.Stringer`s thus: +// +// strings := Map1( items, theType.String ) +// +// … which, with lo.Map, requires a wrapper callback. +func Map1[T any, V any](s []T, cb func(T) V) []V { + return lo.Map( + s, + func(d T, _ int) V { + return cb(d) + }, + ) +} From 10d3767af9e98cb0ff9b1202c18150cccb0d1a38 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 16 Dec 2025 20:59:03 -0500 Subject: [PATCH 128/130] fix race in getting start time --- internal/verifier/oplog/start_time.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/internal/verifier/oplog/start_time.go b/internal/verifier/oplog/start_time.go index cd2368fe..a219efe1 100644 --- a/internal/verifier/oplog/start_time.go +++ b/internal/verifier/oplog/start_time.go @@ -18,14 +18,16 @@ func GetTailingStartTimes( ctx context.Context, client *mongo.Client, ) (OpTime, OpTime, error) { - oldestTxn, err := getOldestTransactionTime(ctx, client) + // IMPORTANT: Fetch the latest time before getting the oldest transaction + // to avoid the race condition described in TOOLS-4015. + latestTime, err := getLatestVisibleOplogOpTime(ctx, client) if err != nil { - return OpTime{}, OpTime{}, errors.Wrapf(err, "finding oldest txn") + return OpTime{}, OpTime{}, errors.Wrapf(err, "finding latest optime") } - latestTime, err := getLatestVisibleOplogOpTime(ctx, client) + oldestTxn, err := getOldestTransactionTime(ctx, client) if err != nil { - return OpTime{}, OpTime{}, errors.Wrapf(err, "finding latest optime") + return OpTime{}, OpTime{}, errors.Wrapf(err, "finding oldest txn") } if oldestTime, has := oldestTxn.Get(); has { From 87b423374f4993f9a029464139b1c9b2731752a2 Mon Sep 17 00:00:00 2001 From: Felipe Gasper Date: Tue, 16 Dec 2025 21:17:55 -0500 Subject: [PATCH 129/130] =?UTF-8?q?Max=E2=80=99s=20review?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- internal/verifier/oplog_reader.go | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1063e57e..8984575e 100644 --- a/README.md +++ b/README.md @@ -401,7 +401,7 @@ The default. 
The verifier will read a change stream, which works seamlessly on s
 
 ## `tailOplog`
 
-The verifier will read the oplog continually instead of reading a change stream. This is generally faster, but it doesn’t work in sharded clusters.
+The verifier will read the oplog continually instead of reading a change stream. This is generally faster, but in sharded clusters it requires verifying shard-to-shard. (This also requires that the data migrate shard-to-shard.)
 
 # Known Issues
 
diff --git a/internal/verifier/oplog_reader.go b/internal/verifier/oplog_reader.go
index 9f202bc2..b03dcdbb 100644
--- a/internal/verifier/oplog_reader.go
+++ b/internal/verifier/oplog_reader.go
@@ -1,5 +1,16 @@
 package verifier
 
+// ------------------------------------------------------------------
+// NOTE: The oplog reader sometimes triggers “extra” rechecks:
+//   - The first events may reflect writes that were already finalized
+//     when verification started
+//   - If a multi-statement transaction aborts, the oplog reader will
+//     still broadcast change events for the relevant documents.
+//
+// This is OK, of course, because extra rechecks pose no durability concerns;
+// at worst, they’re just inefficient--and, we assume, trivially so.
+// ------------------------------------------------------------------
+
 import (
 	"context"
 	"fmt"
@@ -704,7 +715,6 @@ func getOplogDocLenExpr(docroot string) any {
 }
 
 func getOplogDocIDExpr(docroot string) any {
-	// $switch was new in MongoDB 4.4, so use $cond instead.
 	return agg.Switch{
 		Branches: []agg.SwitchCase{
 			{

From 5d5f6e38e92d4bb86975dce6d28a01aacf04a99f Mon Sep 17 00:00:00 2001
From: Felipe Gasper
Date: Wed, 17 Dec 2025 16:15:34 -0500
Subject: [PATCH 130/130] final reword

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8984575e..d5866e7e 100644
--- a/README.md
+++ b/README.md
@@ -401,7 +401,7 @@ The default. The verifier will read a change stream, which works seamlessly on s
 
 ## `tailOplog`
 
-The verifier will read the oplog continually instead of reading a change stream. This is generally faster, but in sharded clusters it requires verifying shard-to-shard. (This also requires that the data migrate shard-to-shard.)
+The verifier will read the oplog continually instead of reading a change stream. This is generally faster, but it only works when connecting to a replica set (i.e., not a mongos).
 
 # Known Issues