From ee70f41b61fe83ad4d9636deb35b0a8300c5d41d Mon Sep 17 00:00:00 2001 From: Ralf Haferkamp Date: Mon, 11 May 2026 15:47:58 +0200 Subject: [PATCH 1/2] chore: bump reva --- go.mod | 24 +- go.sum | 48 +- vendor/filippo.io/edwards25519/README.md | 6 +- vendor/filippo.io/edwards25519/doc.go | 6 +- vendor/filippo.io/edwards25519/extra.go | 67 +- vendor/filippo.io/edwards25519/field/fe.go | 34 +- .../filippo.io/edwards25519/field/fe_amd64.go | 3 +- .../filippo.io/edwards25519/field/fe_amd64.s | 203 +- .../edwards25519/field/fe_amd64_noasm.go | 3 +- .../filippo.io/edwards25519/field/fe_arm64.go | 16 - .../filippo.io/edwards25519/field/fe_arm64.s | 42 - .../edwards25519/field/fe_arm64_noasm.go | 12 - .../edwards25519/field/fe_generic.go | 170 +- vendor/filippo.io/edwards25519/pull.sh | 53 + vendor/filippo.io/edwards25519/scalar.go | 27 +- vendor/filippo.io/edwards25519/tables.go | 4 +- .../antithesis-sdk-go/assert/assert.go | 4 +- .../antithesis-sdk-go/internal/sdk_const.go | 2 +- vendor/github.com/go-sql-driver/mysql/AUTHORS | 10 +- .../go-sql-driver/mysql/CHANGELOG.md | 19 +- .../github.com/go-sql-driver/mysql/README.md | 7 +- vendor/github.com/go-sql-driver/mysql/auth.go | 2 +- .../go-sql-driver/mysql/conncheck.go | 1 - .../go-sql-driver/mysql/conncheck_dummy.go | 1 - .../go-sql-driver/mysql/connection.go | 277 +- .../go-sql-driver/mysql/connector.go | 12 +- .../github.com/go-sql-driver/mysql/const.go | 21 +- vendor/github.com/go-sql-driver/mysql/dsn.go | 9 +- .../github.com/go-sql-driver/mysql/fields.go | 38 +- .../github.com/go-sql-driver/mysql/infile.go | 5 +- .../github.com/go-sql-driver/mysql/packets.go | 341 ++- .../github.com/go-sql-driver/mysql/result.go | 6 +- vendor/github.com/go-sql-driver/mysql/rows.go | 10 +- .../go-sql-driver/mysql/statement.go | 34 +- .../github.com/go-sql-driver/mysql/utils.go | 151 +- .../nats-io/nats-server/v2/server/auth.go | 46 +- .../nats-server/v2/server/auth_callout.go | 14 +- .../nats-server/v2/server/avl/seqset.go | 4 +- .../nats-io/nats-server/v2/server/client.go | 313 +- .../nats-io/nats-server/v2/server/const.go | 2 +- .../nats-io/nats-server/v2/server/consumer.go | 667 ++++- .../nats-io/nats-server/v2/server/cron.go | 327 +++ .../nats-io/nats-server/v2/server/errors.json | 212 +- .../nats-io/nats-server/v2/server/events.go | 32 +- .../nats-server/v2/server/feature_flags.go | 130 + .../nats-server/v2/server/filestore.go | 2558 ++++++++++++----- .../nats-io/nats-server/v2/server/gateway.go | 20 +- .../nats-io/nats-server/v2/server/gsl/gsl.go | 60 + .../nats-server/v2/server/jetstream.go | 486 +--- .../nats-server/v2/server/jetstream_api.go | 502 ++-- .../v2/server/jetstream_batching.go | 503 +++- .../v2/server/jetstream_cluster.go | 931 ++++-- .../v2/server/jetstream_errors_generated.go | 300 ++ .../nats-server/v2/server/jetstream_events.go | 14 +- .../v2/server/jetstream_versioning.go | 12 +- .../nats-io/nats-server/v2/server/jwt.go | 58 +- .../nats-io/nats-server/v2/server/leafnode.go | 408 ++- .../nats-io/nats-server/v2/server/log.go | 8 + .../nats-io/nats-server/v2/server/memstore.go | 204 +- .../nats-io/nats-server/v2/server/monitor.go | 164 +- .../nats-io/nats-server/v2/server/mqtt.go | 163 +- .../nats-io/nats-server/v2/server/msgtrace.go | 133 +- .../nats-io/nats-server/v2/server/opts.go | 172 +- .../nats-io/nats-server/v2/server/parser.go | 59 +- .../nats-io/nats-server/v2/server/raft.go | 253 +- .../nats-io/nats-server/v2/server/reload.go | 744 +++-- .../nats-server/v2/server/scheduler.go | 139 +- .../nats-io/nats-server/v2/server/server.go | 
43 +- .../nats-io/nats-server/v2/server/store.go | 39 +- .../nats-io/nats-server/v2/server/stream.go | 2141 +++++++++++--- .../nats-io/nats-server/v2/server/sublist.go | 12 + .../nats-io/nats-server/v2/server/thw/thw.go | 7 +- vendor/github.com/onsi/gomega/CHANGELOG.md | 8 + vendor/github.com/onsi/gomega/gomega_dsl.go | 2 +- .../storageprovider/storageprovider.go | 10 - .../services/userprovider/userprovider.go | 14 +- .../usershareprovider/usershareprovider.go | 8 +- .../http/services/owncloud/ocdav/proppatch.go | 7 + .../opencloud-eu/reva/v2/pkg/appctx/appctx.go | 10 - .../reva/v2/pkg/errtypes/errtypes.go | 21 + .../reva/v2/pkg/events/raw/raw.go | 42 +- .../reva/v2/pkg/events/stream/nats.go | 36 +- .../reva/v2/pkg/rgrpc/status/status.go | 13 + .../v2/pkg/share/manager/jsoncs3/jsoncs3.go | 230 +- .../migrations/0001_import_spacemembers.go | 435 +++ .../manager/jsoncs3/migrations/migration.go | 353 +++ .../v2/pkg/share/manager/memory/memory.go | 3 +- .../v2/pkg/share/manager/registry/registry.go | 7 +- .../reva/v2/pkg/storage/cache/kv.go | 2 +- .../pkg/storage/fs/posix/idcache/idcache.go | 11 +- .../reva/v2/pkg/storage/fs/posix/posix.go | 12 +- .../reva/v2/pkg/storage/fs/posix/tree/tree.go | 15 - .../pkg/decomposedfs/node/permissions.go | 1 + .../pkg/storage/pkg/decomposedfs/tree/tree.go | 5 - .../storage/utils/decomposedfs/tree/tree.go | 5 - .../v2/pkg/storage/utils/metadata/disk.go | 39 +- .../reva/v2/pkg/utils/ldap/identity.go | 89 +- vendor/golang.org/x/crypto/ssh/cipher.go | 2 +- vendor/golang.org/x/crypto/ssh/client_auth.go | 10 +- .../golang.org/x/tools/go/packages/golist.go | 33 +- .../x/tools/go/packages/packages.go | 4 + vendor/modules.txt | 35 +- 102 files changed, 10776 insertions(+), 4234 deletions(-) delete mode 100644 vendor/filippo.io/edwards25519/field/fe_arm64.go delete mode 100644 vendor/filippo.io/edwards25519/field/fe_arm64.s delete mode 100644 vendor/filippo.io/edwards25519/field/fe_arm64_noasm.go create mode 100644 vendor/filippo.io/edwards25519/pull.sh create mode 100644 vendor/github.com/nats-io/nats-server/v2/server/cron.go create mode 100644 vendor/github.com/nats-io/nats-server/v2/server/feature_flags.go create mode 100644 vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations/0001_import_spacemembers.go create mode 100644 vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations/migration.go diff --git a/go.mod b/go.mod index 4dba424616..3c59fba5c2 100644 --- a/go.mod +++ b/go.mod @@ -55,17 +55,17 @@ require ( github.com/mitchellh/mapstructure v1.5.0 github.com/mna/pigeon v1.3.0 github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 - github.com/nats-io/nats-server/v2 v2.12.6 + github.com/nats-io/nats-server/v2 v2.14.0 github.com/nats-io/nats.go v1.51.0 github.com/oklog/run v1.2.0 github.com/olekukonko/tablewriter v1.1.4 github.com/onsi/ginkgo v1.16.5 github.com/onsi/ginkgo/v2 v2.28.1 - github.com/onsi/gomega v1.39.1 + github.com/onsi/gomega v1.40.0 github.com/open-policy-agent/opa v1.15.2 github.com/opencloud-eu/icap-client v0.0.0-20250930132611-28a2afe62d89 github.com/opencloud-eu/libre-graph-api-go v1.0.8-0.20260310090739-853d972b282d - github.com/opencloud-eu/reva/v2 v2.43.1-0.20260424125411-c5db28365753 + github.com/opencloud-eu/reva/v2 v2.43.1-0.20260512061040-cd4be86c66b0 github.com/opensearch-project/opensearch-go/v4 v4.6.0 github.com/orcaman/concurrent-map v1.0.0 github.com/pkg/errors v0.9.1 @@ -103,14 +103,14 @@ require ( go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 
go.opentelemetry.io/otel/sdk v1.43.0 go.opentelemetry.io/otel/trace v1.43.0 - golang.org/x/crypto v0.49.0 + golang.org/x/crypto v0.50.0 golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac golang.org/x/image v0.38.0 golang.org/x/net v0.52.0 golang.org/x/oauth2 v0.36.0 golang.org/x/sync v0.20.0 - golang.org/x/term v0.41.0 - golang.org/x/text v0.35.0 + golang.org/x/term v0.42.0 + golang.org/x/text v0.36.0 google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 google.golang.org/grpc v1.80.0 google.golang.org/protobuf v1.36.11 @@ -122,7 +122,7 @@ require ( require ( contrib.go.opencensus.io/exporter/prometheus v0.4.2 // indirect - filippo.io/edwards25519 v1.1.1 // indirect + filippo.io/edwards25519 v1.2.0 // indirect github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/Azure/go-ntlmssp v0.1.1 // indirect github.com/BurntSushi/toml v1.6.0 // indirect @@ -136,7 +136,7 @@ require ( github.com/ajg/form v1.5.1 // indirect github.com/alexedwards/argon2id v1.0.0 // indirect github.com/amoghe/go-crypt v0.0.0-20220222110647-20eada5f5964 // indirect - github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op // indirect + github.com/antithesishq/antithesis-sdk-go v0.7.0-default-no-op // indirect github.com/armon/go-radix v1.0.0 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/beorn7/perks v1.0.1 // indirect @@ -220,7 +220,7 @@ require ( github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect github.com/go-redis/redis/v8 v8.11.5 // indirect - github.com/go-sql-driver/mysql v1.9.3 // indirect + github.com/go-sql-driver/mysql v1.10.0 // indirect github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/go-test/deep v1.1.0 // indirect @@ -286,7 +286,7 @@ require ( github.com/miekg/dns v1.1.68 // indirect github.com/mileusna/useragent v1.3.5 // indirect github.com/minio/crc64nvme v1.1.1 // indirect - github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 // indirect + github.com/minio/highwayhash v1.0.4 // indirect github.com/minio/md5-simd v1.1.2 // indirect github.com/minio/minio-go/v7 v7.0.99 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect @@ -388,10 +388,10 @@ require ( go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/mod v0.33.0 // indirect + golang.org/x/mod v0.34.0 // indirect golang.org/x/sys v0.43.0 // indirect golang.org/x/time v0.15.0 // indirect - golang.org/x/tools v0.42.0 // indirect + golang.org/x/tools v0.43.0 // indirect google.golang.org/genproto v0.0.0-20260128011058-8636f8732409 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d // indirect gopkg.in/cenkalti/backoff.v1 v1.1.0 // indirect diff --git a/go.sum b/go.sum index 12b0e931f5..940e127551 100644 --- a/go.sum +++ b/go.sum @@ -39,8 +39,8 @@ contrib.go.opencensus.io/exporter/prometheus v0.4.2/go.mod h1:dvEHbiKmgvbr5pjaF9 dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -filippo.io/edwards25519 v1.1.1 h1:YpjwWWlNmGIDyXOn8zLzqiD+9TyIlPhGFG96P39uBpw= -filippo.io/edwards25519 v1.1.1/go.mod 
h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= +filippo.io/edwards25519 v1.2.0 h1:crnVqOiS4jqYleHd9vaKZ+HKtHfllngJIiOpNpoJsjo= +filippo.io/edwards25519 v1.2.0/go.mod h1:xzAOLCNug/yB62zG1bQ8uziwrIqIuxhctzJT18Q77mc= github.com/Acconut/go-httptest-recorder v1.0.0 h1:TAv2dfnqp/l+SUvIaMAUK4GeN4+wqb6KZsFFFTGhoJg= github.com/Acconut/go-httptest-recorder v1.0.0/go.mod h1:CwQyhTH1kq/gLyWiRieo7c0uokpu3PXeyF/nZjUNtmM= github.com/AdaLogics/go-fuzz-headers v0.0.0-20240806141605-e8a1dd7889d6 h1:He8afgbRMd7mFxO99hRNu+6tazq8nFF9lIwo9JFroBk= @@ -117,8 +117,8 @@ github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883 h1:bvNMNQO63//z+xNg github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4= -github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op h1:kpBdlEPbRvff0mDD1gk7o9BhI16b9p5yYAXRlidpqJE= -github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E= +github.com/antithesishq/antithesis-sdk-go v0.7.0-default-no-op h1:Z/MZK75wC/NSrkgqeNIa7jexam9uWzhLmFTSCPI/kn0= +github.com/antithesishq/antithesis-sdk-go v0.7.0-default-no-op/go.mod h1:FQyySiasQQM8735Ddel3MRojmy4dA1IqCeyJ5jmPMbI= github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= @@ -456,8 +456,8 @@ github.com/go-redis/redis/v8 v8.11.5/go.mod h1:gREzHqY1hg6oD9ngVRbLStwAWKhA0FEgq github.com/go-resty/resty/v2 v2.1.1-0.20191201195748-d7b97669fe48/go.mod h1:dZGr0i9PLlaaTD4H/hoZIDjQ+r6xq8mgbRzHZf7f2J8= github.com/go-resty/resty/v2 v2.17.2 h1:FQW5oHYcIlkCNrMD2lloGScxcHJ0gkjshV3qcQAyHQk= github.com/go-resty/resty/v2 v2.17.2/go.mod h1:kCKZ3wWmwJaNc7S29BRtUhJwy7iqmn+2mLtQrOyQlVA= -github.com/go-sql-driver/mysql v1.9.3 h1:U/N249h2WzJ3Ukj8SowVFjdtZKfu9vlLZxjPXV1aweo= -github.com/go-sql-driver/mysql v1.9.3/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU= +github.com/go-sql-driver/mysql v1.10.0 h1:Q+1LV8DkHJvSYAdR83XzuhDaTykuDx0l6fkXxoWCWfw= +github.com/go-sql-driver/mysql v1.10.0/go.mod h1:M+cqaI7+xxXGG9swrdeUIoPG3Y3KCkF0pZej+SK+nWk= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= @@ -839,8 +839,8 @@ github.com/mileusna/useragent v1.3.5 h1:SJM5NzBmh/hO+4LGeATKpaEX9+b4vcGg2qXGLiNG github.com/mileusna/useragent v1.3.5/go.mod h1:3d8TOmwL/5I8pJjyVDteHtgDGcefrFUX4ccGOMKNYYc= github.com/minio/crc64nvme v1.1.1 h1:8dwx/Pz49suywbO+auHCBpCtlW1OfpcLN7wYgVR6wAI= github.com/minio/crc64nvme v1.1.1/go.mod h1:eVfm2fAzLlxMdUGc0EEBGSMmPwmXD5XiNRpnu9J3bvg= -github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk= -github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= +github.com/minio/highwayhash v1.0.4 h1:asJizugGgchQod2ja9NJlGOWq4s7KsAWr5XUc9Clgl4= 
+github.com/minio/highwayhash v1.0.4/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= github.com/minio/minio-go/v7 v7.0.99 h1:2vH/byrwUkIpFQFOilvTfaUpvAX3fEFhEzO+DR3DlCE= @@ -900,8 +900,8 @@ github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRW github.com/namedotcom/go v0.0.0-20180403034216-08470befbe04/go.mod h1:5sN+Lt1CaY4wsPvgQH/jsuJi4XO2ssZbdsIizr4CVC8= github.com/nats-io/jwt/v2 v2.8.1 h1:V0xpGuD/N8Mi+fQNDynXohVvp7ZztevW5io8CUWlPmU= github.com/nats-io/jwt/v2 v2.8.1/go.mod h1:nWnOEEiVMiKHQpnAy4eXlizVEtSfzacZ1Q43LIRavZg= -github.com/nats-io/nats-server/v2 v2.12.6 h1:Egbx9Vl7Ch8wTtpXPGqbehkZ+IncKqShUxvrt1+Enc8= -github.com/nats-io/nats-server/v2 v2.12.6/go.mod h1:4HPlrvtmSO3yd7KcElDNMx9kv5EBJBnJJzQPptXlheo= +github.com/nats-io/nats-server/v2 v2.14.0 h1:+8q0HrDFotwLLcGH/legOEOnowunhK+aZ4GYBIWpQlM= +github.com/nats-io/nats-server/v2 v2.14.0/go.mod h1:ImVUUDvfClJbb6cuJQRc1VmgDCXKM5ds0OoiG9MVOKo= github.com/nats-io/nats.go v1.51.0 h1:ByW84XTz6W03GSSsygsZcA+xgKK8vPGaa/FCAAEHnAI= github.com/nats-io/nats.go v1.51.0/go.mod h1:26HypzazeOkyO3/mqd1zZd53STJN0EjCYF9Uy2ZOBno= github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4= @@ -940,8 +940,8 @@ github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsx github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= -github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= -github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= +github.com/onsi/gomega v1.40.0 h1:Vtol0e1MghCD2ZVIilPDIg44XSL9l2QAn8ZNaljWcJc= +github.com/onsi/gomega v1.40.0/go.mod h1:M/Uqpu/8qTjtzCLUA2zJHX9Iilrau25x1PdoSRbWh5A= github.com/open-policy-agent/opa v1.15.2 h1:dS9q+0Yvruq/VNvWJc5qCvCchn715OWc3HLHXn/UCCc= github.com/open-policy-agent/opa v1.15.2/go.mod h1:c6SN+7jSsUcKJLQc5P4yhwx8YYDRbjpAiGkBOTqxaa4= github.com/opencloud-eu/go-micro-plugins/v4/store/nats-js-kv v0.0.0-20250512152754-23325793059a h1:Sakl76blJAaM6NxylVkgSzktjo2dS504iDotEFJsh3M= @@ -952,8 +952,8 @@ github.com/opencloud-eu/inotifywaitgo v0.0.0-20251111171128-a390bae3c5e9 h1:dIft github.com/opencloud-eu/inotifywaitgo v0.0.0-20251111171128-a390bae3c5e9/go.mod h1:JWyDC6H+5oZRdUJUgKuaye+8Ph5hEs6HVzVoPKzWSGI= github.com/opencloud-eu/libre-graph-api-go v1.0.8-0.20260310090739-853d972b282d h1:JcqGDiyrcaQwVyV861TUyQgO7uEmsjkhfm7aQd84dOw= github.com/opencloud-eu/libre-graph-api-go v1.0.8-0.20260310090739-853d972b282d/go.mod h1:pzatilMEHZFT3qV7C/X3MqOa3NlRQuYhlRhZTL+hN6Q= -github.com/opencloud-eu/reva/v2 v2.43.1-0.20260424125411-c5db28365753 h1:/FpQdybaNb3OAISHmHRrh/4aWYQep3nVSYKzYt2F+jE= -github.com/opencloud-eu/reva/v2 v2.43.1-0.20260424125411-c5db28365753/go.mod h1:msu4TkFw7Jxog0QRbGPxyQOJG9sago5nc+f//y+bbpI= +github.com/opencloud-eu/reva/v2 v2.43.1-0.20260512061040-cd4be86c66b0 h1:e4w34sW1gXixTKi9z+odF6IKGyvisvu97xfYEXOvRGE= +github.com/opencloud-eu/reva/v2 v2.43.1-0.20260512061040-cd4be86c66b0/go.mod h1:SoRYtNJ9ha83YdUUep5wYF7F5/OIhgED7ZSgqudhpNo= github.com/opencloud-eu/secure v0.0.0-20260312082735-b6f5cb2244e4 h1:l2oB/RctH+t8r7QBj5p8thfEHCM/jF35aAY3WQ3hADI= github.com/opencloud-eu/secure v0.0.0-20260312082735-b6f5cb2244e4/go.mod 
h1:BmF5hyM6tXczk3MpQkFf1hpKSRqCyhqcbiQtiAF7+40= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= @@ -1358,8 +1358,8 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= -golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= -golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= +golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= +golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -1397,8 +1397,8 @@ golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= -golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/mod v0.34.0 h1:xIHgNUUnW6sYkcM5Jleh05DvLOtwc6RitGHbDk4akRI= +golang.org/x/mod v0.34.0/go.mod h1:ykgH52iCZe79kzLLMhyCUzhMci+nQj+0XkbXpNYtVjY= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1566,8 +1566,8 @@ golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= -golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= -golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= +golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= +golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1580,8 +1580,8 @@ golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= -golang.org/x/text v0.35.0/go.mod 
h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= +golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= +golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1642,8 +1642,8 @@ golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools v0.0.0-20210112230658-8b4aab62c064/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= -golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= +golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s= +golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= golang.org/x/tools/godoc v0.1.0-deprecated h1:o+aZ1BOj6Hsx/GBdJO/s815sqftjSnrZZwyYTHODvtk= golang.org/x/tools/godoc v0.1.0-deprecated/go.mod h1:qM63CriJ961IHWmnWa9CjZnBndniPt4a3CK0PVB9bIg= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/vendor/filippo.io/edwards25519/README.md b/vendor/filippo.io/edwards25519/README.md index 24e2457d87..dcdd8d85fc 100644 --- a/vendor/filippo.io/edwards25519/README.md +++ b/vendor/filippo.io/edwards25519/README.md @@ -7,8 +7,10 @@ import "filippo.io/edwards25519" This library implements the edwards25519 elliptic curve, exposing the necessary APIs to build a wide array of higher-level primitives. Read the docs at [pkg.go.dev/filippo.io/edwards25519](https://pkg.go.dev/filippo.io/edwards25519). -The code is originally derived from Adam Langley's internal implementation in the Go standard library, and includes George Tankersley's [performance improvements](https://golang.org/cl/71950). It was then further developed by Henry de Valence for use in ristretto255, and was finally [merged back into the Go standard library](https://golang.org/cl/276272) as of Go 1.17. It now tracks the upstream codebase and extends it with additional functionality. +The package tracks the upstream standard library package `crypto/internal/fips140/edwards25519` and extends it with additional functionality. -Most users don't need this package, and should instead use `crypto/ed25519` for signatures, `golang.org/x/crypto/curve25519` for Diffie-Hellman, or `github.com/gtank/ristretto255` for prime order group logic. However, for anyone currently using a fork of `crypto/internal/edwards25519`/`crypto/ed25519/internal/edwards25519` or `github.com/agl/edwards25519`, this package should be a safer, faster, and more powerful alternative. +The code is originally derived from Adam Langley's internal implementation in the Go standard library, and includes George Tankersley's [performance improvements](https://golang.org/cl/71950). It was then further developed by Henry de Valence for use in ristretto255, and was finally [merged back into the Go standard library](https://golang.org/cl/276272) as of Go 1.17. 
+ +Most users don't need this package, and should instead use `crypto/ed25519` for signatures, `crypto/ecdh` for Diffie-Hellman, or `github.com/gtank/ristretto255` for prime order group logic. However, for anyone currently using a fork of the internal `edwards25519` package or of `github.com/agl/edwards25519`, this package should be a safer, faster, and more powerful alternative. Since this package is meant to curb proliferation of edwards25519 implementations in the Go ecosystem, it welcomes requests for new APIs or reviewable performance improvements. diff --git a/vendor/filippo.io/edwards25519/doc.go b/vendor/filippo.io/edwards25519/doc.go index ab6aaebc0f..dd2deb6449 100644 --- a/vendor/filippo.io/edwards25519/doc.go +++ b/vendor/filippo.io/edwards25519/doc.go @@ -10,11 +10,11 @@ // the curve used by the Ed25519 signature scheme. // // Most users don't need this package, and should instead use crypto/ed25519 for -// signatures, golang.org/x/crypto/curve25519 for Diffie-Hellman, or -// github.com/gtank/ristretto255 for prime order group logic. +// signatures, crypto/ecdh for Diffie-Hellman, or github.com/gtank/ristretto255 +// for prime order group logic. // // However, developers who do need to interact with low-level edwards25519 // operations can use this package, which is an extended version of -// crypto/internal/edwards25519 from the standard library repackaged as +// crypto/internal/fips140/edwards25519 from the standard library repackaged as // an importable module. package edwards25519 diff --git a/vendor/filippo.io/edwards25519/extra.go b/vendor/filippo.io/edwards25519/extra.go index ab2e44a518..ee9b5ca5bf 100644 --- a/vendor/filippo.io/edwards25519/extra.go +++ b/vendor/filippo.io/edwards25519/extra.go @@ -9,6 +9,7 @@ package edwards25519 import ( "errors" + "slices" "filippo.io/edwards25519/field" ) @@ -100,13 +101,15 @@ func (v *Point) bytesMontgomery(buf *[32]byte) []byte { // // u = (1 + y) / (1 - y) // - // where y = Y / Z. + // where y = Y / Z and therefore + // + // u = (Z + Y) / (Z - Y) - var y, recip, u field.Element + var n, r, u field.Element - y.Multiply(&v.y, y.Invert(&v.z)) // y = Y / Z - recip.Invert(recip.Subtract(feOne, &y)) // r = 1/(1 - y) - u.Multiply(u.Add(feOne, &y), &recip) // u = (1 + y)*r + n.Add(&v.z, &v.y) // n = Z + Y + r.Invert(r.Subtract(&v.z, &v.y)) // r = 1 / (Z - Y) + u.Multiply(&n, &r) // u = n * r return copyFieldElement(buf, &u) } @@ -124,7 +127,7 @@ func (v *Point) MultByCofactor(p *Point) *Point { return v.fromP1xP1(&result) } -// Given k > 0, set s = s**(2*i). +// Given k > 0, set s = s**(2*k). func (s *Scalar) pow2k(k int) { for i := 0; i < k; i++ { s.Multiply(s, s) @@ -250,12 +253,14 @@ func (v *Point) MultiScalarMult(scalars []*Scalar, points []*Point) *Point { // between each point in the multiscalar equation. 
// Build lookup tables for each point - tables := make([]projLookupTable, len(points)) + tables := make([]projLookupTable, 0, 2) // avoid allocation for small sizes + tables = slices.Grow(tables, len(points))[:len(points)] for i := range tables { tables[i].FromP3(points[i]) } // Compute signed radix-16 digits for each scalar - digits := make([][64]int8, len(scalars)) + digits := make([][64]int8, 0, 2) // avoid allocation for small sizes + digits = slices.Grow(digits, len(scalars))[:len(scalars)] for i := range digits { digits[i] = scalars[i].signedRadix16() } @@ -348,3 +353,49 @@ func (v *Point) VarTimeMultiScalarMult(scalars []*Scalar, points []*Point) *Poin v.fromP2(tmp2) return v } + +// Select sets v to a if cond == 1 and to b if cond == 0. +func (v *Point) Select(a, b *Point, cond int) *Point { + checkInitialized(a, b) + v.x.Select(&a.x, &b.x, cond) + v.y.Select(&a.y, &b.y, cond) + v.z.Select(&a.z, &b.z, cond) + v.t.Select(&a.t, &b.t, cond) + return v +} + +// Double sets v = p + p, and returns v. +func (v *Point) Double(p *Point) *Point { + checkInitialized(p) + + pp := new(projP2).FromP3(p) + p1 := new(projP1xP1).Double(pp) + return v.fromP1xP1(p1) +} + +func (v *Point) addCached(p *Point, qCached *projCached) *Point { + result := new(projP1xP1).Add(p, qCached) + return v.fromP1xP1(result) +} + +// ScalarMultSlow sets v = x * q, and returns v. It doesn't precompute a large +// table, so it is considerably slower, but requires less memory. +// +// The scalar multiplication is done in constant time. +func (v *Point) ScalarMultSlow(x *Scalar, q *Point) *Point { + checkInitialized(q) + + s := x.Bytes() + qCached := new(projCached).FromP3(q) + v.Set(NewIdentityPoint()) + t := new(Point) + + for i := 255; i >= 0; i-- { + v.Double(v) + t.addCached(v, qCached) + cond := (s[i/8] >> (i % 8)) & 1 + v.Select(t, v, int(cond)) + } + + return v +} diff --git a/vendor/filippo.io/edwards25519/field/fe.go b/vendor/filippo.io/edwards25519/field/fe.go index 5518ef2b90..4d52cc10d1 100644 --- a/vendor/filippo.io/edwards25519/field/fe.go +++ b/vendor/filippo.io/edwards25519/field/fe.go @@ -90,11 +90,7 @@ func (v *Element) Add(a, b *Element) *Element { v.l2 = a.l2 + b.l2 v.l3 = a.l3 + b.l3 v.l4 = a.l4 + b.l4 - // Using the generic implementation here is actually faster than the - // assembly. Probably because the body of this function is so simple that - // the compiler can figure out better optimizations by inlining the carry - // propagation. - return v.carryPropagateGeneric() + return v.carryPropagate() } // Subtract sets v = a - b, and returns v. 
@@ -232,18 +228,22 @@ func (v *Element) bytes(out *[32]byte) []byte { t := *v t.reduce() - var buf [8]byte - for i, l := range [5]uint64{t.l0, t.l1, t.l2, t.l3, t.l4} { - bitsOffset := i * 51 - binary.LittleEndian.PutUint64(buf[:], l<= len(out) { - break - } - out[off] |= bb - } - } + // Pack five 51-bit limbs into four 64-bit words: + // + // 255 204 153 102 51 0 + // ├──l4──┼──l3──┼──l2──┼──l1──┼──l0──┤ + // ├───u3───┼───u2───┼───u1───┼───u0───┤ + // 256 192 128 64 0 + + u0 := t.l1<<51 | t.l0 + u1 := t.l2<<(102-64) | t.l1>>(64-51) + u2 := t.l3<<(153-128) | t.l2>>(128-102) + u3 := t.l4<<(204-192) | t.l3>>(192-153) + + binary.LittleEndian.PutUint64(out[0*8:], u0) + binary.LittleEndian.PutUint64(out[1*8:], u1) + binary.LittleEndian.PutUint64(out[2*8:], u2) + binary.LittleEndian.PutUint64(out[3*8:], u3) return out[:] } diff --git a/vendor/filippo.io/edwards25519/field/fe_amd64.go b/vendor/filippo.io/edwards25519/field/fe_amd64.go index edcf163c4e..00bf8f4479 100644 --- a/vendor/filippo.io/edwards25519/field/fe_amd64.go +++ b/vendor/filippo.io/edwards25519/field/fe_amd64.go @@ -1,7 +1,6 @@ // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. -//go:build amd64 && gc && !purego -// +build amd64,gc,!purego +//go:build !purego package field diff --git a/vendor/filippo.io/edwards25519/field/fe_amd64.s b/vendor/filippo.io/edwards25519/field/fe_amd64.s index 293f013c94..5e06e242ed 100644 --- a/vendor/filippo.io/edwards25519/field/fe_amd64.s +++ b/vendor/filippo.io/edwards25519/field/fe_amd64.s @@ -1,7 +1,6 @@ // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT. -//go:build amd64 && gc && !purego -// +build amd64,gc,!purego +//go:build !purego #include "textflag.h" @@ -17,32 +16,36 @@ TEXT ·feMul(SB), NOSPLIT, $0-24 MOVQ DX, SI // r0 += 19×a1×b4 - MOVQ 8(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, DI - ADCQ DX, SI + MOVQ 8(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 32(BX) + ADDQ AX, DI + ADCQ DX, SI // r0 += 19×a2×b3 - MOVQ 16(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, DI - ADCQ DX, SI + MOVQ 16(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 24(BX) + ADDQ AX, DI + ADCQ DX, SI // r0 += 19×a3×b2 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 16(BX) - ADDQ AX, DI - ADCQ DX, SI + MOVQ 24(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 16(BX) + ADDQ AX, DI + ADCQ DX, SI // r0 += 19×a4×b1 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 8(BX) - ADDQ AX, DI - ADCQ DX, SI + MOVQ 32(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 8(BX) + ADDQ AX, DI + ADCQ DX, SI // r1 = a0×b1 MOVQ (CX), AX @@ -57,25 +60,28 @@ TEXT ·feMul(SB), NOSPLIT, $0-24 ADCQ DX, R8 // r1 += 19×a2×b4 - MOVQ 16(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R9 - ADCQ DX, R8 + MOVQ 16(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 32(BX) + ADDQ AX, R9 + ADCQ DX, R8 // r1 += 19×a3×b3 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, R9 - ADCQ DX, R8 + MOVQ 24(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 24(BX) + ADDQ AX, R9 + ADCQ DX, R8 // r1 += 19×a4×b2 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 16(BX) - ADDQ AX, R9 - ADCQ DX, R8 + MOVQ 32(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 16(BX) + ADDQ AX, R9 + ADCQ DX, R8 // r2 = a0×b2 MOVQ (CX), AX @@ -96,18 +102,20 @@ TEXT ·feMul(SB), NOSPLIT, $0-24 ADCQ DX, R10 // r2 += 19×a3×b4 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX 
- MULQ 32(BX) - ADDQ AX, R11 - ADCQ DX, R10 + MOVQ 24(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 32(BX) + ADDQ AX, R11 + ADCQ DX, R10 // r2 += 19×a4×b3 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(BX) - ADDQ AX, R11 - ADCQ DX, R10 + MOVQ 32(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 24(BX) + ADDQ AX, R11 + ADCQ DX, R10 // r3 = a0×b3 MOVQ (CX), AX @@ -134,11 +142,12 @@ TEXT ·feMul(SB), NOSPLIT, $0-24 ADCQ DX, R12 // r3 += 19×a4×b4 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(BX) - ADDQ AX, R13 - ADCQ DX, R12 + MOVQ 32(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 32(BX) + ADDQ AX, R13 + ADCQ DX, R12 // r4 = a0×b4 MOVQ (CX), AX @@ -232,18 +241,22 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16 MOVQ DX, BX // r0 += 38×l1×l4 - MOVQ 8(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, SI - ADCQ DX, BX + MOVQ 8(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + SHLQ $0x01, AX + MULQ 32(CX) + ADDQ AX, SI + ADCQ DX, BX // r0 += 38×l2×l3 - MOVQ 16(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 24(CX) - ADDQ AX, SI - ADCQ DX, BX + MOVQ 16(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + SHLQ $0x01, AX + MULQ 24(CX) + ADDQ AX, SI + ADCQ DX, BX // r1 = 2×l0×l1 MOVQ (CX), AX @@ -253,18 +266,21 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16 MOVQ DX, DI // r1 += 38×l2×l4 - MOVQ 16(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, R8 - ADCQ DX, DI + MOVQ 16(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + SHLQ $0x01, AX + MULQ 32(CX) + ADDQ AX, R8 + ADCQ DX, DI // r1 += 19×l3×l3 - MOVQ 24(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 24(CX) - ADDQ AX, R8 - ADCQ DX, DI + MOVQ 24(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 24(CX) + ADDQ AX, R8 + ADCQ DX, DI // r2 = 2×l0×l2 MOVQ (CX), AX @@ -280,11 +296,13 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16 ADCQ DX, R9 // r2 += 38×l3×l4 - MOVQ 24(CX), AX - IMUL3Q $0x26, AX, AX - MULQ 32(CX) - ADDQ AX, R10 - ADCQ DX, R9 + MOVQ 24(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + SHLQ $0x01, AX + MULQ 32(CX) + ADDQ AX, R10 + ADCQ DX, R9 // r3 = 2×l0×l3 MOVQ (CX), AX @@ -294,18 +312,19 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16 MOVQ DX, R11 // r3 += 2×l1×l2 - MOVQ 8(CX), AX - IMUL3Q $0x02, AX, AX - MULQ 16(CX) - ADDQ AX, R12 - ADCQ DX, R11 + MOVQ 8(CX), AX + SHLQ $0x01, AX + MULQ 16(CX) + ADDQ AX, R12 + ADCQ DX, R11 // r3 += 19×l4×l4 - MOVQ 32(CX), AX - IMUL3Q $0x13, AX, AX - MULQ 32(CX) - ADDQ AX, R12 - ADCQ DX, R11 + MOVQ 32(CX), DX + LEAQ (DX)(DX*8), AX + LEAQ (DX)(AX*2), AX + MULQ 32(CX) + ADDQ AX, R12 + ADCQ DX, R11 // r4 = 2×l0×l4 MOVQ (CX), AX @@ -315,11 +334,11 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16 MOVQ DX, R13 // r4 += 2×l1×l3 - MOVQ 8(CX), AX - IMUL3Q $0x02, AX, AX - MULQ 24(CX) - ADDQ AX, R14 - ADCQ DX, R13 + MOVQ 8(CX), AX + SHLQ $0x01, AX + MULQ 24(CX) + ADDQ AX, R14 + ADCQ DX, R13 // r4 += l2×l2 MOVQ 16(CX), AX diff --git a/vendor/filippo.io/edwards25519/field/fe_amd64_noasm.go b/vendor/filippo.io/edwards25519/field/fe_amd64_noasm.go index ddb6c9b8f7..4b81f25d1d 100644 --- a/vendor/filippo.io/edwards25519/field/fe_amd64_noasm.go +++ b/vendor/filippo.io/edwards25519/field/fe_amd64_noasm.go @@ -2,8 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-//go:build !amd64 || !gc || purego -// +build !amd64 !gc purego +//go:build !amd64 || purego package field diff --git a/vendor/filippo.io/edwards25519/field/fe_arm64.go b/vendor/filippo.io/edwards25519/field/fe_arm64.go deleted file mode 100644 index af459ef515..0000000000 --- a/vendor/filippo.io/edwards25519/field/fe_arm64.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build arm64 && gc && !purego -// +build arm64,gc,!purego - -package field - -//go:noescape -func carryPropagate(v *Element) - -func (v *Element) carryPropagate() *Element { - carryPropagate(v) - return v -} diff --git a/vendor/filippo.io/edwards25519/field/fe_arm64.s b/vendor/filippo.io/edwards25519/field/fe_arm64.s deleted file mode 100644 index 3126a43419..0000000000 --- a/vendor/filippo.io/edwards25519/field/fe_arm64.s +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2020 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build arm64 && gc && !purego - -#include "textflag.h" - -// carryPropagate works exactly like carryPropagateGeneric and uses the -// same AND, ADD, and LSR+MADD instructions emitted by the compiler, but -// avoids loading R0-R4 twice and uses LDP and STP. -// -// See https://golang.org/issues/43145 for the main compiler issue. -// -// func carryPropagate(v *Element) -TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8 - MOVD v+0(FP), R20 - - LDP 0(R20), (R0, R1) - LDP 16(R20), (R2, R3) - MOVD 32(R20), R4 - - AND $0x7ffffffffffff, R0, R10 - AND $0x7ffffffffffff, R1, R11 - AND $0x7ffffffffffff, R2, R12 - AND $0x7ffffffffffff, R3, R13 - AND $0x7ffffffffffff, R4, R14 - - ADD R0>>51, R11, R11 - ADD R1>>51, R12, R12 - ADD R2>>51, R13, R13 - ADD R3>>51, R14, R14 - // R4>>51 * 19 + R10 -> R10 - LSR $51, R4, R21 - MOVD $19, R22 - MADD R22, R10, R21, R10 - - STP (R10, R11), 0(R20) - STP (R12, R13), 16(R20) - MOVD R14, 32(R20) - - RET diff --git a/vendor/filippo.io/edwards25519/field/fe_arm64_noasm.go b/vendor/filippo.io/edwards25519/field/fe_arm64_noasm.go deleted file mode 100644 index 234a5b2e5d..0000000000 --- a/vendor/filippo.io/edwards25519/field/fe_arm64_noasm.go +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (c) 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !arm64 || !gc || purego -// +build !arm64 !gc purego - -package field - -func (v *Element) carryPropagate() *Element { - return v.carryPropagateGeneric() -} diff --git a/vendor/filippo.io/edwards25519/field/fe_generic.go b/vendor/filippo.io/edwards25519/field/fe_generic.go index 86f5fd9553..ef1f15a5dc 100644 --- a/vendor/filippo.io/edwards25519/field/fe_generic.go +++ b/vendor/filippo.io/edwards25519/field/fe_generic.go @@ -12,20 +12,42 @@ type uint128 struct { lo, hi uint64 } -// mul64 returns a * b. -func mul64(a, b uint64) uint128 { +// mul returns a * b. +func mul(a, b uint64) uint128 { hi, lo := bits.Mul64(a, b) return uint128{lo, hi} } -// addMul64 returns v + a * b. -func addMul64(v uint128, a, b uint64) uint128 { +// addMul returns v + a * b. +func addMul(v uint128, a, b uint64) uint128 { hi, lo := bits.Mul64(a, b) lo, c := bits.Add64(lo, v.lo, 0) hi, _ = bits.Add64(hi, v.hi, c) return uint128{lo, hi} } +// mul19 returns v * 19. 
+func mul19(v uint64) uint64 { + // Using this approach seems to yield better optimizations than *19. + return v + (v+v<<3)<<1 +} + +// addMul19 returns v + 19 * a * b, where a and b are at most 52 bits. +func addMul19(v uint128, a, b uint64) uint128 { + hi, lo := bits.Mul64(mul19(a), b) + lo, c := bits.Add64(lo, v.lo, 0) + hi, _ = bits.Add64(hi, v.hi, c) + return uint128{lo, hi} +} + +// addMul38 returns v + 38 * a * b, where a and b are at most 52 bits. +func addMul38(v uint128, a, b uint64) uint128 { + hi, lo := bits.Mul64(mul19(a), b*2) + lo, c := bits.Add64(lo, v.lo, 0) + hi, _ = bits.Add64(hi, v.hi, c) + return uint128{lo, hi} +} + // shiftRightBy51 returns a >> 51. a is assumed to be at most 115 bits. func shiftRightBy51(a uint128) uint64 { return (a.hi << (64 - 51)) | (a.lo >> 51) @@ -76,45 +98,40 @@ func feMulGeneric(v, a, b *Element) { // // Finally we add up the columns into wide, overlapping limbs. - a1_19 := a1 * 19 - a2_19 := a2 * 19 - a3_19 := a3 * 19 - a4_19 := a4 * 19 - // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) - r0 := mul64(a0, b0) - r0 = addMul64(r0, a1_19, b4) - r0 = addMul64(r0, a2_19, b3) - r0 = addMul64(r0, a3_19, b2) - r0 = addMul64(r0, a4_19, b1) + r0 := mul(a0, b0) + r0 = addMul19(r0, a1, b4) + r0 = addMul19(r0, a2, b3) + r0 = addMul19(r0, a3, b2) + r0 = addMul19(r0, a4, b1) // r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2) - r1 := mul64(a0, b1) - r1 = addMul64(r1, a1, b0) - r1 = addMul64(r1, a2_19, b4) - r1 = addMul64(r1, a3_19, b3) - r1 = addMul64(r1, a4_19, b2) + r1 := mul(a0, b1) + r1 = addMul(r1, a1, b0) + r1 = addMul19(r1, a2, b4) + r1 = addMul19(r1, a3, b3) + r1 = addMul19(r1, a4, b2) // r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3) - r2 := mul64(a0, b2) - r2 = addMul64(r2, a1, b1) - r2 = addMul64(r2, a2, b0) - r2 = addMul64(r2, a3_19, b4) - r2 = addMul64(r2, a4_19, b3) + r2 := mul(a0, b2) + r2 = addMul(r2, a1, b1) + r2 = addMul(r2, a2, b0) + r2 = addMul19(r2, a3, b4) + r2 = addMul19(r2, a4, b3) // r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4 - r3 := mul64(a0, b3) - r3 = addMul64(r3, a1, b2) - r3 = addMul64(r3, a2, b1) - r3 = addMul64(r3, a3, b0) - r3 = addMul64(r3, a4_19, b4) + r3 := mul(a0, b3) + r3 = addMul(r3, a1, b2) + r3 = addMul(r3, a2, b1) + r3 = addMul(r3, a3, b0) + r3 = addMul19(r3, a4, b4) // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 - r4 := mul64(a0, b4) - r4 = addMul64(r4, a1, b3) - r4 = addMul64(r4, a2, b2) - r4 = addMul64(r4, a3, b1) - r4 = addMul64(r4, a4, b0) + r4 := mul(a0, b4) + r4 = addMul(r4, a1, b3) + r4 = addMul(r4, a2, b2) + r4 = addMul(r4, a3, b1) + r4 = addMul(r4, a4, b0) // After the multiplication, we need to reduce (carry) the five coefficients // to obtain a result with limbs that are at most slightly larger than 2⁵¹, @@ -149,7 +166,7 @@ func feMulGeneric(v, a, b *Element) { c3 := shiftRightBy51(r3) c4 := shiftRightBy51(r4) - rr0 := r0.lo&maskLow51Bits + c4*19 + rr0 := r0.lo&maskLow51Bits + mul19(c4) rr1 := r1.lo&maskLow51Bits + c0 rr2 := r2.lo&maskLow51Bits + c1 rr3 := r3.lo&maskLow51Bits + c2 @@ -158,8 +175,12 @@ func feMulGeneric(v, a, b *Element) { // Now all coefficients fit into 64-bit registers but are still too large to // be passed around as an Element. We therefore do one last carry chain, // where the carries will be small enough to fit in the wiggle room above 2⁵¹. 
- *v = Element{rr0, rr1, rr2, rr3, rr4} - v.carryPropagate() + + v.l0 = rr0&maskLow51Bits + mul19(rr4>>51) + v.l1 = rr1&maskLow51Bits + rr0>>51 + v.l2 = rr2&maskLow51Bits + rr1>>51 + v.l3 = rr3&maskLow51Bits + rr2>>51 + v.l4 = rr4&maskLow51Bits + rr3>>51 } func feSquareGeneric(v, a *Element) { @@ -190,44 +211,31 @@ func feSquareGeneric(v, a *Element) { // l0l4 19×l4l4 19×l3l4 19×l2l4 19×l1l4 = // -------------------------------------- // r4 r3 r2 r1 r0 - // - // With precomputed 2×, 19×, and 2×19× terms, we can compute each limb with - // only three Mul64 and four Add64, instead of five and eight. - - l0_2 := l0 * 2 - l1_2 := l1 * 2 - - l1_38 := l1 * 38 - l2_38 := l2 * 38 - l3_38 := l3 * 38 - - l3_19 := l3 * 19 - l4_19 := l4 * 19 // r0 = l0×l0 + 19×(l1×l4 + l2×l3 + l3×l2 + l4×l1) = l0×l0 + 19×2×(l1×l4 + l2×l3) - r0 := mul64(l0, l0) - r0 = addMul64(r0, l1_38, l4) - r0 = addMul64(r0, l2_38, l3) + r0 := mul(l0, l0) + r0 = addMul38(r0, l1, l4) + r0 = addMul38(r0, l2, l3) // r1 = l0×l1 + l1×l0 + 19×(l2×l4 + l3×l3 + l4×l2) = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3 - r1 := mul64(l0_2, l1) - r1 = addMul64(r1, l2_38, l4) - r1 = addMul64(r1, l3_19, l3) + r1 := mul(l0*2, l1) + r1 = addMul38(r1, l2, l4) + r1 = addMul19(r1, l3, l3) // r2 = l0×l2 + l1×l1 + l2×l0 + 19×(l3×l4 + l4×l3) = 2×l0×l2 + l1×l1 + 19×2×l3×l4 - r2 := mul64(l0_2, l2) - r2 = addMul64(r2, l1, l1) - r2 = addMul64(r2, l3_38, l4) + r2 := mul(l0*2, l2) + r2 = addMul(r2, l1, l1) + r2 = addMul38(r2, l3, l4) // r3 = l0×l3 + l1×l2 + l2×l1 + l3×l0 + 19×l4×l4 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4 - r3 := mul64(l0_2, l3) - r3 = addMul64(r3, l1_2, l2) - r3 = addMul64(r3, l4_19, l4) + r3 := mul(l0*2, l3) + r3 = addMul(r3, l1*2, l2) + r3 = addMul19(r3, l4, l4) // r4 = l0×l4 + l1×l3 + l2×l2 + l3×l1 + l4×l0 = 2×l0×l4 + 2×l1×l3 + l2×l2 - r4 := mul64(l0_2, l4) - r4 = addMul64(r4, l1_2, l3) - r4 = addMul64(r4, l2, l2) + r4 := mul(l0*2, l4) + r4 = addMul(r4, l1*2, l3) + r4 = addMul(r4, l2, l2) c0 := shiftRightBy51(r0) c1 := shiftRightBy51(r1) @@ -235,32 +243,30 @@ func feSquareGeneric(v, a *Element) { c3 := shiftRightBy51(r3) c4 := shiftRightBy51(r4) - rr0 := r0.lo&maskLow51Bits + c4*19 + rr0 := r0.lo&maskLow51Bits + mul19(c4) rr1 := r1.lo&maskLow51Bits + c0 rr2 := r2.lo&maskLow51Bits + c1 rr3 := r3.lo&maskLow51Bits + c2 rr4 := r4.lo&maskLow51Bits + c3 - *v = Element{rr0, rr1, rr2, rr3, rr4} - v.carryPropagate() + v.l0 = rr0&maskLow51Bits + mul19(rr4>>51) + v.l1 = rr1&maskLow51Bits + rr0>>51 + v.l2 = rr2&maskLow51Bits + rr1>>51 + v.l3 = rr3&maskLow51Bits + rr2>>51 + v.l4 = rr4&maskLow51Bits + rr3>>51 } -// carryPropagateGeneric brings the limbs below 52 bits by applying the reduction +// carryPropagate brings the limbs below 52 bits by applying the reduction // identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry. -func (v *Element) carryPropagateGeneric() *Element { - c0 := v.l0 >> 51 - c1 := v.l1 >> 51 - c2 := v.l2 >> 51 - c3 := v.l3 >> 51 - c4 := v.l4 >> 51 - - // c4 is at most 64 - 51 = 13 bits, so c4*19 is at most 18 bits, and +func (v *Element) carryPropagate() *Element { + // (l4>>51) is at most 64 - 51 = 13 bits, so (l4>>51)*19 is at most 18 bits, and // the final l0 will be at most 52 bits. Similarly for the rest. 
- v.l0 = v.l0&maskLow51Bits + c4*19 - v.l1 = v.l1&maskLow51Bits + c0 - v.l2 = v.l2&maskLow51Bits + c1 - v.l3 = v.l3&maskLow51Bits + c2 - v.l4 = v.l4&maskLow51Bits + c3 + l0 := v.l0 + v.l0 = v.l0&maskLow51Bits + mul19(v.l4>>51) + v.l4 = v.l4&maskLow51Bits + v.l3>>51 + v.l3 = v.l3&maskLow51Bits + v.l2>>51 + v.l2 = v.l2&maskLow51Bits + v.l1>>51 + v.l1 = v.l1&maskLow51Bits + l0>>51 return v } diff --git a/vendor/filippo.io/edwards25519/pull.sh b/vendor/filippo.io/edwards25519/pull.sh new file mode 100644 index 0000000000..f6217c96e2 --- /dev/null +++ b/vendor/filippo.io/edwards25519/pull.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +TAG="$1" +TMPDIR="$(mktemp -d)" + +cleanup() { + rm -rf "$TMPDIR" +} +trap cleanup EXIT + +command -v git >/dev/null +command -v git-filter-repo >/dev/null + +if [ -d "$HOME/go/.git" ]; then + REFERENCE=(--reference "$HOME/go" --dissociate) +else + REFERENCE=() +fi + +git -c advice.detachedHead=false clone --no-checkout "${REFERENCE[@]}" \ + -b "$TAG" https://go.googlesource.com/go.git "$TMPDIR" + +# Simplify the history graph by removing the dev.boringcrypto branches, whose +# merges end up empty after grafting anyway. This also fixes a weird quirk +# (maybe a git-filter-repo bug?) where only one file from an old path, +# src/crypto/ed25519/internal/edwards25519/const.go, would still exist in the +# filtered repo. +git -C "$TMPDIR" replace --graft f771edd7f9 99f1bf54eb +git -C "$TMPDIR" replace --graft 109c13b64f c2f96e686f +git -C "$TMPDIR" replace --graft aa4da4f189 912f075047 + +git -C "$TMPDIR" filter-repo --force \ + --paths-from-file /dev/stdin \ + --prune-empty always \ + --prune-degenerate always \ + --tag-callback 'tag.skip()' <<'EOF' +src/crypto/internal/fips140/edwards25519 +src/crypto/internal/edwards25519 +src/crypto/ed25519/internal/edwards25519 +EOF + +git fetch "$TMPDIR" +git update-ref "refs/heads/upstream/$TAG" FETCH_HEAD + +echo +echo "Fetched upstream history up to $TAG. Merge with:" +echo -e "\tgit merge --no-ff --no-commit --allow-unrelated-histories upstream/$TAG" diff --git a/vendor/filippo.io/edwards25519/scalar.go b/vendor/filippo.io/edwards25519/scalar.go index 3fd1653877..f08b26245c 100644 --- a/vendor/filippo.io/edwards25519/scalar.go +++ b/vendor/filippo.io/edwards25519/scalar.go @@ -7,6 +7,7 @@ package edwards25519 import ( "encoding/binary" "errors" + "math/bits" ) // A Scalar is an integer modulo @@ -179,15 +180,23 @@ func isReduced(s []byte) bool { return false } - for i := len(s) - 1; i >= 0; i-- { - switch { - case s[i] > scalarMinusOneBytes[i]: - return false - case s[i] < scalarMinusOneBytes[i]: - return true - } - } - return true + s0 := binary.LittleEndian.Uint64(s[:8]) + s1 := binary.LittleEndian.Uint64(s[8:16]) + s2 := binary.LittleEndian.Uint64(s[16:24]) + s3 := binary.LittleEndian.Uint64(s[24:]) + + l0 := binary.LittleEndian.Uint64(scalarMinusOneBytes[:8]) + l1 := binary.LittleEndian.Uint64(scalarMinusOneBytes[8:16]) + l2 := binary.LittleEndian.Uint64(scalarMinusOneBytes[16:24]) + l3 := binary.LittleEndian.Uint64(scalarMinusOneBytes[24:]) + + // Do a constant time subtraction chain scalarMinusOneBytes - s. If there is + // a borrow at the end, then s > scalarMinusOneBytes. 
+ _, b := bits.Sub64(l0, s0, 0) + _, b = bits.Sub64(l1, s1, b) + _, b = bits.Sub64(l2, s2, b) + _, b = bits.Sub64(l3, s3, b) + return b == 0 } // SetBytesWithClamping applies the buffer pruning described in RFC 8032, diff --git a/vendor/filippo.io/edwards25519/tables.go b/vendor/filippo.io/edwards25519/tables.go index 83234bbc0f..4a2b54ebad 100644 --- a/vendor/filippo.io/edwards25519/tables.go +++ b/vendor/filippo.io/edwards25519/tables.go @@ -4,9 +4,7 @@ package edwards25519 -import ( - "crypto/subtle" -) +import "crypto/subtle" // A dynamic lookup table for variable-base, constant-time scalar muls. type projLookupTable struct { diff --git a/vendor/github.com/antithesishq/antithesis-sdk-go/assert/assert.go b/vendor/github.com/antithesishq/antithesis-sdk-go/assert/assert.go index 3ede0101e4..eff6fa96bd 100644 --- a/vendor/github.com/antithesishq/antithesis-sdk-go/assert/assert.go +++ b/vendor/github.com/antithesishq/antithesis-sdk-go/assert/assert.go @@ -14,8 +14,8 @@ // // [Antithesis Go SDK]: https://antithesis.com/docs/using_antithesis/sdk/go/ // [Antithesis platform]: https://antithesis.com -// [test properties]: https://antithesis.com/docs/using_antithesis/properties/ -// [workload]: https://antithesis.com/docs/getting_started/first_test/ +// [test properties]: https://antithesis.com/docs/properties_assertions/properties/ +// [workload]: https://antithesis.com/docs/test_templates/first_test/ // [antithesis-go-generator]: https://antithesis.com/docs/using_antithesis/sdk/go/instrumentor/ // [triage report]: https://antithesis.com/docs/reports/ // [here]: https://antithesis.com/docs/using_antithesis/sdk/fallback/ diff --git a/vendor/github.com/antithesishq/antithesis-sdk-go/internal/sdk_const.go b/vendor/github.com/antithesishq/antithesis-sdk-go/internal/sdk_const.go index 5eccf1b58c..e520f15202 100644 --- a/vendor/github.com/antithesishq/antithesis-sdk-go/internal/sdk_const.go +++ b/vendor/github.com/antithesishq/antithesis-sdk-go/internal/sdk_const.go @@ -3,7 +3,7 @@ package internal // -------------------------------------------------------------------------------- // Versions // -------------------------------------------------------------------------------- -const SDK_Version = "0.6.0" +const SDK_Version = "0.7.0" const Protocol_Version = "1.1.0" // -------------------------------------------------------------------------------- diff --git a/vendor/github.com/go-sql-driver/mysql/AUTHORS b/vendor/github.com/go-sql-driver/mysql/AUTHORS index ec346e203b..42c7f02c0b 100644 --- a/vendor/github.com/go-sql-driver/mysql/AUTHORS +++ b/vendor/github.com/go-sql-driver/mysql/AUTHORS @@ -18,8 +18,8 @@ Alex Snast Alexey Palazhchenko Andrew Reid Animesh Ray -Arne Hormann Ariel Mashraki +Arne Hormann Artur Melanchyk Asta Xie B Lamarche @@ -38,6 +38,7 @@ Daniel Montoya Daniel Nichter Daniël van Eeden Dave Protasowski +Demouth Diego Dupin Dirkjan Bussink DisposaBoy @@ -66,6 +67,7 @@ Jeff Hodges Jeffrey Charles Jennifer Purevsuren Jerome Meyer +Jiabin Zhang Jiajia Zhong Jian Zhen Joe Mann @@ -85,10 +87,12 @@ Linh Tran Tuan Lion Yang Luca Looz Lucas Liu -Lunny Xiao Luke Scott +Lunny Xiao Maciej Zimnoch Michael Woolnough +Minh Quang +Morgan Tocker Nao Yokotsuka Nathanial Murphy Nicola Peduzzi @@ -99,7 +103,6 @@ Paul Bonser Paulius Lozys Peter Schultz Phil Porada -Minh Quang Rebecca Chin Reed Allman Richard Wilkes @@ -134,6 +137,7 @@ Ziheng Lyu # Organizations Barracuda Networks, Inc. +Block, Inc. Counting Ltd. Defined Networking Inc. DigitalOcean Inc. 
diff --git a/vendor/github.com/go-sql-driver/mysql/CHANGELOG.md b/vendor/github.com/go-sql-driver/mysql/CHANGELOG.md index 75674b6039..b24af9bed6 100644 --- a/vendor/github.com/go-sql-driver/mysql/CHANGELOG.md +++ b/vendor/github.com/go-sql-driver/mysql/CHANGELOG.md @@ -1,13 +1,26 @@ # Changelog +## v1.10.0 (2026-04-28) + +* Fix `getSystemVar("max_allowed_packet")` potentially returned wrong value. (#1754) + This affects only when `maxAllowedPacket=0` is set. + +* Bump filippo.io/edwards25519 from 1.1.1 to 1.2.0. (#1756) + While older versions have reported CVEs, they do not affect go-mysql. + +* Update Go versions to 1.24-1.26. (#1763) + +* Enhance interpolateParams to correctly handle placeholders. (#1732) + The question mark (?) within strings and comments will no longer be treated as a placeholder. + + ## v1.9.3 (2025-06-13) * `tx.Commit()` and `tx.Rollback()` returned `ErrInvalidConn` always. Now they return cached real error if present. (#1690) -* Optimize reading small resultsets to fix performance regression - introduced by compression protocol support. (#1707) - +* Optimize reading small result sets to fix a performance regression + introduced by compression protocol support. (`#1707`) * Fix `db.Ping()` on compressed connection. (#1723) diff --git a/vendor/github.com/go-sql-driver/mysql/README.md b/vendor/github.com/go-sql-driver/mysql/README.md index da4593ccf8..3da0538c79 100644 --- a/vendor/github.com/go-sql-driver/mysql/README.md +++ b/vendor/github.com/go-sql-driver/mysql/README.md @@ -1,5 +1,8 @@ # Go-MySQL-Driver +[![DeepWiki](https://img.shields.io/badge/DeepWiki-go--sql--driver%2Fmysql-blue.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACwAAAAyCAYAAAAnWDnqAAAAAXNSR0IArs4c6QAAA05JREFUaEPtmUtyEzEQhtWTQyQLHNak2AB7ZnyXZMEjXMGeK/AIi+QuHrMnbChYY7MIh8g01fJoopFb0uhhEqqcbWTp06/uv1saEDv4O3n3dV60RfP947Mm9/SQc0ICFQgzfc4CYZoTPAswgSJCCUJUnAAoRHOAUOcATwbmVLWdGoH//PB8mnKqScAhsD0kYP3j/Yt5LPQe2KvcXmGvRHcDnpxfL2zOYJ1mFwrryWTz0advv1Ut4CJgf5uhDuDj5eUcAUoahrdY/56ebRWeraTjMt/00Sh3UDtjgHtQNHwcRGOC98BJEAEymycmYcWwOprTgcB6VZ5JK5TAJ+fXGLBm3FDAmn6oPPjR4rKCAoJCal2eAiQp2x0vxTPB3ALO2CRkwmDy5WohzBDwSEFKRwPbknEggCPB/imwrycgxX2NzoMCHhPkDwqYMr9tRcP5qNrMZHkVnOjRMWwLCcr8ohBVb1OMjxLwGCvjTikrsBOiA6fNyCrm8V1rP93iVPpwaE+gO0SsWmPiXB+jikdf6SizrT5qKasx5j8ABbHpFTx+vFXp9EnYQmLx02h1QTTrl6eDqxLnGjporxl3NL3agEvXdT0WmEost648sQOYAeJS9Q7bfUVoMGnjo4AZdUMQku50McDcMWcBPvr0SzbTAFDfvJqwLzgxwATnCgnp4wDl6Aa+Ax283gghmj+vj7feE2KBBRMW3FzOpLOADl0Isb5587h/U4gGvkt5v60Z1VLG8BhYjbzRwyQZemwAd6cCR5/XFWLYZRIMpX39AR0tjaGGiGzLVyhse5C9RKC6ai42ppWPKiBagOvaYk8lO7DajerabOZP46Lby5wKjw1HCRx7p9sVMOWGzb/vA1hwiWc6jm3MvQDTogQkiqIhJV0nBQBTU+3okKCFDy9WwferkHjtxib7t3xIUQtHxnIwtx4mpg26/HfwVNVDb4oI9RHmx5WGelRVlrtiw43zboCLaxv46AZeB3IlTkwouebTr1y2NjSpHz68WNFjHvupy3q8TFn3Hos2IAk4Ju5dCo8B3wP7VPr/FGaKiG+T+v+TQqIrOqMTL1VdWV1DdmcbO8KXBz6esmYWYKPwDL5b5FA1a0hwapHiom0r/cKaoqr+27/XcrS5UwSMbQAAAABJRU5ErkJggg==)](https://deepwiki.com/go-sql-driver/mysql) + + A MySQL-Driver for Go's [database/sql](https://golang.org/pkg/database/sql/) package ![Go-MySQL-Driver logo](https://raw.github.com/wiki/go-sql-driver/mysql/gomysql_m.png "Golang Gopher holding the MySQL Dolphin") @@ -42,8 +45,8 @@ A MySQL-Driver for Go's [database/sql](https://golang.org/pkg/database/sql/) pac ## Requirements -* Go 1.21 or higher. We aim to support the 3 latest versions of Go. -* MySQL (5.7+) and MariaDB (10.5+) are supported. +* Go 1.24 or higher. We aim to support the 3 latest versions of Go. +* MySQL (5.7+) and MariaDB (10.5+) are supported by maintainers. 
* [TiDB](https://github.com/pingcap/tidb) is supported by PingCAP. * Do not ask questions about TiDB in our issue tracker or forum. * [Document](https://docs.pingcap.com/tidb/v6.1/dev-guide-sample-application-golang) diff --git a/vendor/github.com/go-sql-driver/mysql/auth.go b/vendor/github.com/go-sql-driver/mysql/auth.go index 74e1bd03ed..610044fc16 100644 --- a/vendor/github.com/go-sql-driver/mysql/auth.go +++ b/vendor/github.com/go-sql-driver/mysql/auth.go @@ -305,7 +305,7 @@ func (mc *mysqlConn) auth(authData []byte, plugin string) ([]byte, error) { if !mc.cfg.AllowNativePasswords { return nil, ErrNativePassword } - // https://dev.mysql.com/doc/internals/en/secure-password-authentication.html + // https://dev.mysql.com/doc/dev/mysql-server/8.4.5/page_protocol_connection_phase_authentication_methods_native_password_authentication.html // Native password authentication only need and will need 20-byte challenge. authResp := scramblePassword(authData[:20], mc.cfg.Passwd) return authResp, nil diff --git a/vendor/github.com/go-sql-driver/mysql/conncheck.go b/vendor/github.com/go-sql-driver/mysql/conncheck.go index 0ea721720c..f9c5cb65cb 100644 --- a/vendor/github.com/go-sql-driver/mysql/conncheck.go +++ b/vendor/github.com/go-sql-driver/mysql/conncheck.go @@ -7,7 +7,6 @@ // You can obtain one at http://mozilla.org/MPL/2.0/. //go:build linux || darwin || dragonfly || freebsd || netbsd || openbsd || solaris || illumos -// +build linux darwin dragonfly freebsd netbsd openbsd solaris illumos package mysql diff --git a/vendor/github.com/go-sql-driver/mysql/conncheck_dummy.go b/vendor/github.com/go-sql-driver/mysql/conncheck_dummy.go index a56c138f2d..0ebf05c213 100644 --- a/vendor/github.com/go-sql-driver/mysql/conncheck_dummy.go +++ b/vendor/github.com/go-sql-driver/mysql/conncheck_dummy.go @@ -7,7 +7,6 @@ // You can obtain one at http://mozilla.org/MPL/2.0/. //go:build !linux && !darwin && !dragonfly && !freebsd && !netbsd && !openbsd && !solaris && !illumos -// +build !linux,!darwin,!dragonfly,!freebsd,!netbsd,!openbsd,!solaris,!illumos package mysql diff --git a/vendor/github.com/go-sql-driver/mysql/connection.go b/vendor/github.com/go-sql-driver/mysql/connection.go index 3e455a3ff0..65204e2d21 100644 --- a/vendor/github.com/go-sql-driver/mysql/connection.go +++ b/vendor/github.com/go-sql-driver/mysql/connection.go @@ -33,7 +33,8 @@ type mysqlConn struct { connector *connector maxAllowedPacket int maxWriteSize int - flags clientFlag + capabilities capabilityFlag + extCapabilities extendedCapabilityFlag status statusFlag sequence uint8 compressSequence uint8 @@ -171,7 +172,7 @@ func (mc *mysqlConn) close() { } // Closes the network connection and unsets internal variables. Do not call this -// function after successfully authentication, call Close instead. This function +// function after successful authentication, call Close instead. This function // is called before auth or on auth failure because MySQL will have already // closed the network connection. 
func (mc *mysqlConn) cleanup() { @@ -223,13 +224,21 @@ func (mc *mysqlConn) Prepare(query string) (driver.Stmt, error) { columnCount, err := stmt.readPrepareResultPacket() if err == nil { if stmt.paramCount > 0 { - if err = mc.readUntilEOF(); err != nil { + if err = mc.skipColumns(stmt.paramCount); err != nil { return nil, err } } if columnCount > 0 { - err = mc.readUntilEOF() + if mc.extCapabilities&clientCacheMetadata != 0 { + if stmt.columns, err = mc.readColumns(int(columnCount), nil); err != nil { + return nil, err + } + } else { + if err = mc.skipColumns(int(columnCount)); err != nil { + return nil, err + } + } } } @@ -237,100 +246,184 @@ func (mc *mysqlConn) Prepare(query string) (driver.Stmt, error) { } func (mc *mysqlConn) interpolateParams(query string, args []driver.Value) (string, error) { - // Number of ? should be same to len(args) - if strings.Count(query, "?") != len(args) { - return "", driver.ErrSkip - } + noBackslashEscapes := (mc.status & statusNoBackslashEscapes) != 0 + const ( + stateNormal = iota + stateString + stateEscape + stateEOLComment + stateSlashStarComment + stateBacktick + ) + + const ( + QUOTE_BYTE = byte('\'') + DBL_QUOTE_BYTE = byte('"') + BACKSLASH_BYTE = byte('\\') + QUESTION_MARK_BYTE = byte('?') + SLASH_BYTE = byte('/') + STAR_BYTE = byte('*') + HASH_BYTE = byte('#') + MINUS_BYTE = byte('-') + LINE_FEED_BYTE = byte('\n') + BACKTICK_BYTE = byte('`') + ) buf, err := mc.buf.takeCompleteBuffer() if err != nil { - // can not take the buffer. Something must be wrong with the connection mc.cleanup() - // interpolateParams would be called before sending any query. - // So its safe to retry. return "", driver.ErrBadConn } buf = buf[:0] + state := stateNormal + singleQuotes := false + lastChar := byte(0) argPos := 0 - - for i := 0; i < len(query); i++ { - q := strings.IndexByte(query[i:], '?') - if q == -1 { - buf = append(buf, query[i:]...) - break - } - buf = append(buf, query[i:i+q]...) - i += q - - arg := args[argPos] - argPos++ - - if arg == nil { - buf = append(buf, "NULL"...) + lenQuery := len(query) + lastIdx := 0 + + for i := range lenQuery { + currentChar := query[i] + if state == stateEscape && !((currentChar == QUOTE_BYTE && singleQuotes) || (currentChar == DBL_QUOTE_BYTE && !singleQuotes)) { + state = stateString + lastChar = currentChar continue } - - switch v := arg.(type) { - case int64: - buf = strconv.AppendInt(buf, v, 10) - case uint64: - // Handle uint64 explicitly because our custom ConvertValue emits unsigned values - buf = strconv.AppendUint(buf, v, 10) - case float64: - buf = strconv.AppendFloat(buf, v, 'g', -1, 64) - case bool: - if v { - buf = append(buf, '1') - } else { - buf = append(buf, '0') + switch currentChar { + case STAR_BYTE: + if state == stateNormal && lastChar == SLASH_BYTE { + state = stateSlashStarComment } - case time.Time: - if v.IsZero() { - buf = append(buf, "'0000-00-00'"...) - } else { - buf = append(buf, '\'') - buf, err = appendDateTime(buf, v.In(mc.cfg.Loc), mc.cfg.timeTruncate) - if err != nil { - return "", err - } - buf = append(buf, '\'') + case SLASH_BYTE: + if state == stateSlashStarComment && lastChar == STAR_BYTE { + state = stateNormal + // Clear lastChar so the '/' that closed the comment isn't + // reused to start a new comment with a following '*'. 
+ lastChar = 0 + continue } - case json.RawMessage: - buf = append(buf, '\'') - if mc.status&statusNoBackslashEscapes == 0 { - buf = escapeBytesBackslash(buf, v) - } else { - buf = escapeBytesQuotes(buf, v) + case HASH_BYTE: + if state == stateNormal { + state = stateEOLComment } - buf = append(buf, '\'') - case []byte: - if v == nil { - buf = append(buf, "NULL"...) - } else { - buf = append(buf, "_binary'"...) - if mc.status&statusNoBackslashEscapes == 0 { - buf = escapeBytesBackslash(buf, v) + case MINUS_BYTE: + if state == stateNormal && lastChar == MINUS_BYTE { + // -- only starts a comment if followed by whitespace or control char + if i+1 < lenQuery { + nextChar := query[i+1] + if nextChar == ' ' || nextChar == '\t' || nextChar == '\n' || nextChar == '\r' { + state = stateEOLComment + } } else { - buf = escapeBytesQuotes(buf, v) + state = stateEOLComment } - buf = append(buf, '\'') } - case string: - buf = append(buf, '\'') - if mc.status&statusNoBackslashEscapes == 0 { - buf = escapeStringBackslash(buf, v) - } else { - buf = escapeStringQuotes(buf, v) + case LINE_FEED_BYTE: + if state == stateEOLComment { + state = stateNormal } - buf = append(buf, '\'') - default: - return "", driver.ErrSkip - } + case DBL_QUOTE_BYTE: + if state == stateNormal { + state = stateString + singleQuotes = false + } else if state == stateString && !singleQuotes { + state = stateNormal + } else if state == stateEscape { + state = stateString + } + case QUOTE_BYTE: + if state == stateNormal { + state = stateString + singleQuotes = true + } else if state == stateString && singleQuotes { + state = stateNormal + } else if state == stateEscape { + state = stateString + } + case BACKSLASH_BYTE: + if state == stateString && !noBackslashEscapes { + state = stateEscape + } + case QUESTION_MARK_BYTE: + if state == stateNormal { + if argPos >= len(args) { + return "", driver.ErrSkip + } + buf = append(buf, query[lastIdx:i]...) + arg := args[argPos] + argPos++ + + if arg == nil { + buf = append(buf, "NULL"...) + lastIdx = i + 1 + break + } - if len(buf)+4 > mc.maxAllowedPacket { - return "", driver.ErrSkip + switch v := arg.(type) { + case int64: + buf = strconv.AppendInt(buf, v, 10) + case uint64: + buf = strconv.AppendUint(buf, v, 10) + case float64: + buf = strconv.AppendFloat(buf, v, 'g', -1, 64) + case bool: + if v { + buf = append(buf, '1') + } else { + buf = append(buf, '0') + } + case time.Time: + if v.IsZero() { + buf = append(buf, "'0000-00-00'"...) + } else { + buf = append(buf, '\'') + buf, err = appendDateTime(buf, v.In(mc.cfg.Loc), mc.cfg.timeTruncate) + if err != nil { + return "", err + } + buf = append(buf, '\'') + } + case json.RawMessage: + if noBackslashEscapes { + buf = escapeBytesQuotes(buf, v, false) + } else { + buf = escapeBytesBackslash(buf, v, false) + } + case []byte: + if v == nil { + buf = append(buf, "NULL"...) + } else { + if noBackslashEscapes { + buf = escapeBytesQuotes(buf, v, true) + } else { + buf = escapeBytesBackslash(buf, v, true) + } + } + case string: + if noBackslashEscapes { + buf = escapeStringQuotes(buf, v) + } else { + buf = escapeStringBackslash(buf, v) + } + default: + return "", driver.ErrSkip + } + + if len(buf)+4 > mc.maxAllowedPacket { + return "", driver.ErrSkip + } + lastIdx = i + 1 + } + case BACKTICK_BYTE: + if state == stateBacktick { + state = stateNormal + } else if state == stateNormal { + state = stateBacktick + } } + lastChar = currentChar } + buf = append(buf, query[lastIdx:]...) 
if argPos != len(args) { return "", driver.ErrSkip } @@ -370,19 +463,19 @@ func (mc *mysqlConn) exec(query string) error { } // Read Result - resLen, err := handleOk.readResultSetHeaderPacket() + resLen, _, err := handleOk.readResultSetHeaderPacket() if err != nil { return err } if resLen > 0 { // columns - if err := mc.readUntilEOF(); err != nil { + if err := mc.skipColumns(resLen); err != nil { return err } // rows - if err := mc.readUntilEOF(); err != nil { + if err := mc.skipRows(); err != nil { return err } } @@ -419,7 +512,7 @@ func (mc *mysqlConn) query(query string, args []driver.Value) (*textRows, error) // Read Result var resLen int - resLen, err = handleOk.readResultSetHeaderPacket() + resLen, _, err = handleOk.readResultSetHeaderPacket() if err != nil { return nil, err } @@ -439,21 +532,20 @@ func (mc *mysqlConn) query(query string, args []driver.Value) (*textRows, error) } // Columns - rows.rs.columns, err = mc.readColumns(resLen) + rows.rs.columns, err = mc.readColumns(resLen, nil) return rows, err } // Gets the value of the given MySQL System Variable -// The returned byte slice is only valid until the next read -func (mc *mysqlConn) getSystemVar(name string) ([]byte, error) { +func (mc *mysqlConn) getSystemVar(name string) (string, error) { // Send command handleOk := mc.clearResult() if err := mc.writeCommandPacketStr(comQuery, "SELECT @@"+name); err != nil { - return nil, err + return "", err } // Read Result - resLen, err := handleOk.readResultSetHeaderPacket() + resLen, _, err := handleOk.readResultSetHeaderPacket() if err == nil { rows := new(textRows) rows.mc = mc @@ -461,17 +553,20 @@ func (mc *mysqlConn) getSystemVar(name string) ([]byte, error) { if resLen > 0 { // Columns - if err := mc.readUntilEOF(); err != nil { - return nil, err + if err := mc.skipColumns(resLen); err != nil { + return "", err } } dest := make([]driver.Value, resLen) if err = rows.readRow(dest); err == nil { - return dest[0].([]byte), mc.readUntilEOF() + // Convert to string before skipRows, which may + // overwrite the read buffer that dest[0] points into. + val := string(dest[0].([]byte)) + return val, mc.skipRows() } } - return nil, err + return "", err } // cancel is called when the query has canceled. 
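The rewritten interpolateParams above replaces the simple strings.Count/IndexByte scan with a small state machine, so a literal '?' inside a quoted string, a backtick-quoted identifier, or a comment is no longer mistaken for a placeholder. A hedged usage sketch (server address, credentials, and table name are made up; it assumes the DSN option interpolateParams=true, which routes queries through this code path):

package main

import (
	"database/sql"
	"log"

	_ "github.com/go-sql-driver/mysql"
)

func main() {
	// interpolateParams=true makes the driver expand placeholders client-side,
	// which is the code path rewritten above. Credentials, host, and table are
	// placeholders for illustration.
	db, err := sql.Open("mysql", "user:pass@tcp(127.0.0.1:3306)/test?interpolateParams=true")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Only the trailing '?' is a real placeholder. The '?' inside the string
	// literal and the /* */ comment are ignored by the new state machine;
	// previously the mismatched '?' count forced a fallback to a server-side
	// prepared statement.
	var n int
	err = db.QueryRow(
		"SELECT COUNT(*) FROM notes WHERE body = 'why?' /* really? */ AND id = ?", 42,
	).Scan(&n)
	if err != nil {
		log.Fatal(err)
	}
	log.Println("rows:", n)
}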
diff --git a/vendor/github.com/go-sql-driver/mysql/connector.go b/vendor/github.com/go-sql-driver/mysql/connector.go index bc1d46afc6..3d37604775 100644 --- a/vendor/github.com/go-sql-driver/mysql/connector.go +++ b/vendor/github.com/go-sql-driver/mysql/connector.go @@ -42,7 +42,7 @@ func encodeConnectionAttributes(cfg *Config) string { } // user-defined connection attributes - for _, connAttr := range strings.Split(cfg.ConnectionAttributes, ",") { + for connAttr := range strings.SplitSeq(cfg.ConnectionAttributes, ",") { k, v, found := strings.Cut(connAttr, ":") if !found { continue @@ -131,7 +131,7 @@ func (c *connector) Connect(ctx context.Context) (driver.Conn, error) { mc.buf = newBuffer() // Reading Handshake Initialization Packet - authData, plugin, err := mc.readHandshakePacket() + authData, serverCapabilities, serverExtCapabilities, plugin, err := mc.readHandshakePacket() if err != nil { mc.cleanup() return nil, err @@ -153,6 +153,7 @@ func (c *connector) Connect(ctx context.Context) (driver.Conn, error) { return nil, err } } + mc.initCapabilities(serverCapabilities, serverExtCapabilities, mc.cfg) if err = mc.writeHandshakeResponsePacket(authResp, plugin); err != nil { mc.cleanup() return nil, err @@ -161,13 +162,14 @@ func (c *connector) Connect(ctx context.Context) (driver.Conn, error) { // Handle response to auth packet, switch methods if possible if err = mc.handleAuthResult(authData, plugin); err != nil { // Authentication failed and MySQL has already closed the connection - // (https://dev.mysql.com/doc/internals/en/authentication-fails.html). + // (https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_connection_phase.html#sect_protocol_connection_phase_fast_path_fails). // Do not send COM_QUIT, just cleanup and return the error. mc.cleanup() return nil, err } - if mc.cfg.compress && mc.flags&clientCompress == clientCompress { + // compression is enabled after auth, not right after sending handshake response. 
+ if mc.capabilities&clientCompress > 0 { mc.compress = true mc.compIO = newCompIO(mc) } @@ -180,7 +182,7 @@ func (c *connector) Connect(ctx context.Context) (driver.Conn, error) { mc.Close() return nil, err } - n, err := strconv.Atoi(string(maxap)) + n, err := strconv.Atoi(maxap) if err != nil { mc.Close() return nil, fmt.Errorf("invalid max_allowed_packet value (%q): %w", maxap, err) diff --git a/vendor/github.com/go-sql-driver/mysql/const.go b/vendor/github.com/go-sql-driver/mysql/const.go index 4aadcd6422..6f0cdf3032 100644 --- a/vendor/github.com/go-sql-driver/mysql/const.go +++ b/vendor/github.com/go-sql-driver/mysql/const.go @@ -32,7 +32,7 @@ const ( ) // MySQL constants documentation: -// http://dev.mysql.com/doc/internals/en/client-server-protocol.html +// https://dev.mysql.com/doc/dev/mysql-server/latest/PAGE_PROTOCOL.html const ( iOK byte = 0x00 @@ -42,11 +42,12 @@ const ( iERR byte = 0xff ) -// https://dev.mysql.com/doc/internals/en/capability-flags.html#packet-Protocol::CapabilityFlags -type clientFlag uint32 +// https://dev.mysql.com/doc/dev/mysql-server/latest/group__group__cs__capabilities__flags.html +// https://mariadb.com/kb/en/connection/#capabilities +type capabilityFlag uint32 const ( - clientLongPassword clientFlag = 1 << iota + clientMySQL capabilityFlag = 1 << iota clientFoundRows clientLongFlag clientConnectWithDB @@ -73,6 +74,18 @@ const ( clientDeprecateEOF ) +// https://mariadb.com/kb/en/connection/#capabilities +type extendedCapabilityFlag uint32 + +const ( + progressIndicator extendedCapabilityFlag = 1 << iota + clientComMulti + clientStmtBulkOperations + clientExtendedMetadata + clientCacheMetadata + clientUnitBulkResult +) + const ( comQuit byte = iota + 1 comInitDB diff --git a/vendor/github.com/go-sql-driver/mysql/dsn.go b/vendor/github.com/go-sql-driver/mysql/dsn.go index ecf62567a6..491e10f371 100644 --- a/vendor/github.com/go-sql-driver/mysql/dsn.go +++ b/vendor/github.com/go-sql-driver/mysql/dsn.go @@ -15,6 +15,7 @@ import ( "crypto/tls" "errors" "fmt" + "maps" "math/big" "net" "net/url" @@ -157,9 +158,7 @@ func (cfg *Config) Clone() *Config { } if len(cp.Params) > 0 { cp.Params = make(map[string]string, len(cfg.Params)) - for k, v := range cfg.Params { - cp.Params[k] = v - } + maps.Copy(cp.Params, cfg.Params) } if cfg.pubKey != nil { cp.pubKey = &rsa.PublicKey{ @@ -414,7 +413,7 @@ func ParseDSN(dsn string) (cfg *Config, err error) { if dsn[j] == '@' { // username[:password] // Find the first ':' in dsn[:j] - for k = 0; k < j; k++ { + for k = 0; k < j; k++ { // We cannot use k = range j here, because we use dsn[:k] below if dsn[k] == ':' { cfg.Passwd = dsn[k+1 : j] break @@ -477,7 +476,7 @@ func ParseDSN(dsn string) (cfg *Config, err error) { // parseDSNParams parses the DSN "query string" // Values must be url.QueryEscape'ed func parseDSNParams(cfg *Config, params string) (err error) { - for _, v := range strings.Split(params, "&") { + for v := range strings.SplitSeq(params, "&") { key, value, found := strings.Cut(v, "=") if !found { continue diff --git a/vendor/github.com/go-sql-driver/mysql/fields.go b/vendor/github.com/go-sql-driver/mysql/fields.go index be5cd809a6..ee9d964171 100644 --- a/vendor/github.com/go-sql-driver/mysql/fields.go +++ b/vendor/github.com/go-sql-driver/mysql/fields.go @@ -120,23 +120,24 @@ func (mf *mysqlField) typeDatabaseName() string { } var ( - scanTypeFloat32 = reflect.TypeOf(float32(0)) - scanTypeFloat64 = reflect.TypeOf(float64(0)) - scanTypeInt8 = reflect.TypeOf(int8(0)) - scanTypeInt16 = reflect.TypeOf(int16(0)) - 
scanTypeInt32 = reflect.TypeOf(int32(0)) - scanTypeInt64 = reflect.TypeOf(int64(0)) - scanTypeNullFloat = reflect.TypeOf(sql.NullFloat64{}) - scanTypeNullInt = reflect.TypeOf(sql.NullInt64{}) - scanTypeNullTime = reflect.TypeOf(sql.NullTime{}) - scanTypeUint8 = reflect.TypeOf(uint8(0)) - scanTypeUint16 = reflect.TypeOf(uint16(0)) - scanTypeUint32 = reflect.TypeOf(uint32(0)) - scanTypeUint64 = reflect.TypeOf(uint64(0)) - scanTypeString = reflect.TypeOf("") - scanTypeNullString = reflect.TypeOf(sql.NullString{}) - scanTypeBytes = reflect.TypeOf([]byte{}) - scanTypeUnknown = reflect.TypeOf(new(any)) + scanTypeFloat32 = reflect.TypeFor[float32]() + scanTypeFloat64 = reflect.TypeFor[float64]() + scanTypeInt8 = reflect.TypeFor[int8]() + scanTypeInt16 = reflect.TypeFor[int16]() + scanTypeInt32 = reflect.TypeFor[int32]() + scanTypeInt64 = reflect.TypeFor[int64]() + scanTypeNullFloat = reflect.TypeFor[sql.NullFloat64]() + scanTypeNullInt = reflect.TypeFor[sql.NullInt64]() + scanTypeNullUint = reflect.TypeFor[sql.Null[uint64]]() + scanTypeNullTime = reflect.TypeFor[sql.NullTime]() + scanTypeUint8 = reflect.TypeFor[uint8]() + scanTypeUint16 = reflect.TypeFor[uint16]() + scanTypeUint32 = reflect.TypeFor[uint32]() + scanTypeUint64 = reflect.TypeFor[uint64]() + scanTypeString = reflect.TypeFor[string]() + scanTypeNullString = reflect.TypeFor[sql.NullString]() + scanTypeBytes = reflect.TypeFor[[]byte]() + scanTypeUnknown = reflect.TypeFor[*any]() ) type mysqlField struct { @@ -185,6 +186,9 @@ func (mf *mysqlField) scanType() reflect.Type { } return scanTypeInt64 } + if mf.flags&flagUnsigned != 0 { + return scanTypeNullUint + } return scanTypeNullInt case fieldTypeFloat: diff --git a/vendor/github.com/go-sql-driver/mysql/infile.go b/vendor/github.com/go-sql-driver/mysql/infile.go index 453ae091e5..597b5e7f67 100644 --- a/vendor/github.com/go-sql-driver/mysql/infile.go +++ b/vendor/github.com/go-sql-driver/mysql/infile.go @@ -95,10 +95,7 @@ const defaultPacketSize = 16 * 1024 // 16KB is small enough for disk readahead a func (mc *okHandler) handleInFileRequest(name string) (err error) { var rdr io.Reader - packetSize := defaultPacketSize - if mc.maxWriteSize < packetSize { - packetSize = mc.maxWriteSize - } + packetSize := min(mc.maxWriteSize, defaultPacketSize) if idx := strings.Index(name, "Reader::"); idx == 0 || (idx > 0 && name[idx-1] == '/') { // io.Reader // The server might return an an absolute path. See issue #355. 
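The fields.go change above adds scanTypeNullUint, so ColumnType.ScanType() for a nullable unsigned integer column now reports sql.Null[uint64] (the generic type added in Go 1.22) rather than sql.NullInt64, which cannot represent the full uint64 range. A sketch of scanning such a column (DSN, table, and column names are hypothetical):

package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/go-sql-driver/mysql"
)

// readQuantity scans a nullable BIGINT UNSIGNED column. With this patch the
// driver's scan type for such a column is sql.Null[uint64], so scanning into
// that type round-trips both NULL and values above math.MaxInt64.
func readQuantity(db *sql.DB, id int) {
	var qty sql.Null[uint64]
	// "inventory" and "quantity" are made-up names for illustration.
	if err := db.QueryRow("SELECT quantity FROM inventory WHERE id = ?", id).Scan(&qty); err != nil {
		log.Fatal(err)
	}
	if qty.Valid {
		fmt.Println("quantity:", qty.V)
	} else {
		fmt.Println("quantity is NULL")
	}
}

func main() {
	db, err := sql.Open("mysql", "user:pass@tcp(127.0.0.1:3306)/test")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	readQuantity(db, 7)
}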
diff --git a/vendor/github.com/go-sql-driver/mysql/packets.go b/vendor/github.com/go-sql-driver/mysql/packets.go index 831fca6ca9..d0b21b06c9 100644 --- a/vendor/github.com/go-sql-driver/mysql/packets.go +++ b/vendor/github.com/go-sql-driver/mysql/packets.go @@ -179,20 +179,22 @@ func (mc *mysqlConn) writePacket(data []byte) error { ******************************************************************************/ // Handshake Initialization Packet -// http://dev.mysql.com/doc/internals/en/connection-phase-packets.html#packet-Protocol::Handshake -func (mc *mysqlConn) readHandshakePacket() (data []byte, plugin string, err error) { +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_connection_phase_packets_protocol_handshake_v10.html +// https://mariadb.com/kb/en/connection/#initial-handshake-packet +func (mc *mysqlConn) readHandshakePacket() (data []byte, capabilities capabilityFlag, extendedCapabilities extendedCapabilityFlag, plugin string, err error) { data, err = mc.readPacket() if err != nil { return } if data[0] == iERR { - return nil, "", mc.handleErrorPacket(data) + err = mc.handleErrorPacket(data) + return } // protocol version [1 byte] if data[0] < minProtocolVersion { - return nil, "", fmt.Errorf( + return nil, 0, 0, "", fmt.Errorf( "unsupported protocol version %d. Version %d or higher is required", data[0], minProtocolVersion, @@ -210,15 +212,15 @@ func (mc *mysqlConn) readHandshakePacket() (data []byte, plugin string, err erro pos += 8 + 1 // capability flags (lower 2 bytes) [2 bytes] - mc.flags = clientFlag(binary.LittleEndian.Uint16(data[pos : pos+2])) - if mc.flags&clientProtocol41 == 0 { - return nil, "", ErrOldProtocol + capabilities = capabilityFlag(binary.LittleEndian.Uint16(data[pos : pos+2])) + if capabilities&clientProtocol41 == 0 { + return nil, capabilities, 0, "", ErrOldProtocol } - if mc.flags&clientSSL == 0 && mc.cfg.TLS != nil { + if capabilities&clientSSL == 0 && mc.cfg.TLS != nil { if mc.cfg.AllowFallbackToPlaintext { mc.cfg.TLS = nil } else { - return nil, "", ErrNoTLS + return nil, capabilities, 0, "", ErrNoTLS } } pos += 2 @@ -228,11 +230,16 @@ func (mc *mysqlConn) readHandshakePacket() (data []byte, plugin string, err erro // status flags [2 bytes] pos += 3 // capability flags (upper 2 bytes) [2 bytes] - mc.flags |= clientFlag(binary.LittleEndian.Uint16(data[pos:pos+2])) << 16 + capabilities |= capabilityFlag(binary.LittleEndian.Uint16(data[pos:pos+2])) << 16 pos += 2 // length of auth-plugin-data [1 byte] - // reserved (all [00]) [10 bytes] - pos += 11 + // reserved (all [00]) [6 bytes] + pos += 7 + if capabilities&clientMySQL == 0 { + // MariaDB server extended flag + extendedCapabilities = extendedCapabilityFlag(binary.LittleEndian.Uint32(data[pos : pos+4])) + } + pos += 4 // second part of the password cipher [minimum 13 bytes], // where len=MAX(13, length of auth-plugin-data - 8) @@ -260,82 +267,72 @@ func (mc *mysqlConn) readHandshakePacket() (data []byte, plugin string, err erro // make a memory safe copy of the cipher slice var b [20]byte copy(b[:], authData) - return b[:], plugin, nil + return b[:], capabilities, extendedCapabilities, plugin, nil } // make a memory safe copy of the cipher slice var b [8]byte copy(b[:], authData) - return b[:], plugin, nil + return b[:], capabilities, 0, plugin, nil } -// Client Authentication Packet -// http://dev.mysql.com/doc/internals/en/connection-phase-packets.html#packet-Protocol::HandshakeResponse -func (mc *mysqlConn) writeHandshakeResponsePacket(authResp []byte, plugin string) error { - // 
Adjust client flags based on server support - clientFlags := clientProtocol41 | - clientSecureConn | - clientLongPassword | - clientTransactions | - clientLocalFiles | - clientPluginAuth | - clientMultiResults | - mc.flags&clientConnectAttrs | - mc.flags&clientLongFlag - - sendConnectAttrs := mc.flags&clientConnectAttrs != 0 - - if mc.cfg.ClientFoundRows { - clientFlags |= clientFoundRows +// initCapabilities initializes the capabilities based on server support and configuration +func (mc *mysqlConn) initCapabilities(serverCapabilities capabilityFlag, serverExtCapabilities extendedCapabilityFlag, cfg *Config) { + clientCapabilities := + clientMySQL | + clientLongFlag | + clientProtocol41 | + clientSecureConn | + clientTransactions | + clientPluginAuthLenEncClientData | + clientLocalFiles | + clientPluginAuth | + clientMultiResults | + clientConnectAttrs | + clientDeprecateEOF + + if cfg.ClientFoundRows { + clientCapabilities |= clientFoundRows } - if mc.cfg.compress && mc.flags&clientCompress == clientCompress { - clientFlags |= clientCompress + if cfg.compress { + clientCapabilities |= clientCompress } // To enable TLS / SSL if mc.cfg.TLS != nil { - clientFlags |= clientSSL + clientCapabilities |= clientSSL } if mc.cfg.MultiStatements { - clientFlags |= clientMultiStatements + clientCapabilities |= clientMultiStatements } - - // encode length of the auth plugin data - var authRespLEIBuf [9]byte - authRespLen := len(authResp) - authRespLEI := appendLengthEncodedInteger(authRespLEIBuf[:0], uint64(authRespLen)) - if len(authRespLEI) > 1 { - // if the length can not be written in 1 byte, it must be written as a - // length encoded integer - clientFlags |= clientPluginAuthLenEncClientData + if n := len(cfg.DBName); n > 0 { + clientCapabilities |= clientConnectWithDB } - pktLen := 4 + 4 + 1 + 23 + len(mc.cfg.User) + 1 + len(authRespLEI) + len(authResp) + 21 + 1 + // only keep client capabilities that server have + mc.capabilities = clientCapabilities & serverCapabilities - // To specify a db name - if n := len(mc.cfg.DBName); n > 0 { - clientFlags |= clientConnectWithDB - pktLen += n + 1 - } - - // encode length of the connection attributes - var connAttrsLEI []byte - if sendConnectAttrs { - var connAttrsLEIBuf [9]byte - connAttrsLen := len(mc.connector.encodedAttributes) - connAttrsLEI = appendLengthEncodedInteger(connAttrsLEIBuf[:0], uint64(connAttrsLen)) - pktLen += len(connAttrsLEI) + len(mc.connector.encodedAttributes) - } + // set MariaDB extended clientCacheMetadata capability if server support it + mc.extCapabilities = clientCacheMetadata & serverExtCapabilities +} - // Calculate packet length and get buffer with that size - data, err := mc.buf.takeBuffer(pktLen + 4) +// Client Authentication Packet +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_connection_phase_packets_protocol_handshake_response.html +func (mc *mysqlConn) writeHandshakeResponsePacket(authResp []byte, plugin string) error { + // packet header 4 + // capabilities 4 + // maxPacketSize 4 + // collation id 1 + // filler 23 + data, err := mc.buf.takeSmallBuffer(4*3 + 24) if err != nil { mc.cleanup() return err } + _ = data[4*3+23] // boundery check - // ClientFlags [32 bit] - binary.LittleEndian.PutUint32(data[4:], uint32(clientFlags)) + // clientCapabilities [32 bit] + binary.LittleEndian.PutUint32(data[4:], uint32(mc.capabilities)) // MaxPacketSize [32 bit] (none) binary.LittleEndian.PutUint32(data[8:], 0) @@ -353,16 +350,26 @@ func (mc *mysqlConn) writeHandshakeResponsePacket(authResp []byte, plugin 
string } // Filler [23 bytes] (all 0x00) + // or filler 19bytes + mariadb extCapabilities pos := 13 - for ; pos < 13+23; pos++ { - data[pos] = 0 + if mc.capabilities&clientMySQL == 0 { + for ; pos < 13+19; pos++ { + data[pos] = 0 + } + // MariaDB Extended Capabilities + binary.LittleEndian.PutUint32(data[13+19:], uint32(mc.extCapabilities)) + } else { + for ; pos < 13+23; pos++ { + data[pos] = 0 + } } // SSL Connection Request Packet - // http://dev.mysql.com/doc/internals/en/connection-phase-packets.html#packet-Protocol::SSLRequest + // https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_connection_phase_packets_protocol_ssl_request.html + // https://mariadb.com/kb/en/connection/#sslrequest-packet if mc.cfg.TLS != nil { // Send TLS / SSL request packet - if err := mc.writePacket(data[:(4+4+1+23)+4]); err != nil { + if err := mc.writePacket(data); err != nil { return err } @@ -379,37 +386,35 @@ func (mc *mysqlConn) writeHandshakeResponsePacket(authResp []byte, plugin string // User [null terminated string] if len(mc.cfg.User) > 0 { - pos += copy(data[pos:], mc.cfg.User) + data = append(data, mc.cfg.User...) } - data[pos] = 0x00 - pos++ + data = append(data, 0) // Auth Data [length encoded integer] - pos += copy(data[pos:], authRespLEI) - pos += copy(data[pos:], authResp) + data = appendLengthEncodedInteger(data, uint64(len(authResp))) + data = append(data, authResp...) - // Databasename [null terminated string] - if len(mc.cfg.DBName) > 0 { - pos += copy(data[pos:], mc.cfg.DBName) - data[pos] = 0x00 - pos++ + // Database name [null terminated string] + if mc.capabilities&clientConnectWithDB != 0 { + data = append(data, mc.cfg.DBName...) + data = append(data, 0) } - pos += copy(data[pos:], plugin) - data[pos] = 0x00 - pos++ + data = append(data, plugin...) + data = append(data, 0) // Connection Attributes - if sendConnectAttrs { - pos += copy(data[pos:], connAttrsLEI) - pos += copy(data[pos:], []byte(mc.connector.encodedAttributes)) + if mc.capabilities&clientConnectAttrs != 0 { + connAttrsLen := len(mc.connector.encodedAttributes) + data = appendLengthEncodedInteger(data, uint64(connAttrsLen)) + data = append(data, mc.connector.encodedAttributes...) } // Send Auth packet - return mc.writePacket(data[:pos]) + return mc.writePacket(data) } -// http://dev.mysql.com/doc/internals/en/connection-phase-packets.html#packet-Protocol::AuthSwitchResponse +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_connection_phase_packets_protocol_auth_switch_response.html func (mc *mysqlConn) writeAuthSwitchPacket(authData []byte) error { pktLen := 4 + len(authData) data, err := mc.buf.takeBuffer(pktLen) @@ -511,7 +516,7 @@ func (mc *mysqlConn) readAuthResult() ([]byte, string, error) { case iEOF: if len(data) == 1 { - // https://dev.mysql.com/doc/internals/en/connection-phase-packets.html#packet-Protocol::OldAuthSwitchRequest + // https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_connection_phase_packets_protocol_old_auth_switch_request.html return nil, "mysql_old_password", nil } pluginEndIndex := bytes.IndexByte(data, 0x00) @@ -545,36 +550,41 @@ func (mc *okHandler) readResultOK() error { // Result Set Header Packet // https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_com_query_response.html -func (mc *okHandler) readResultSetHeaderPacket() (int, error) { +func (mc *okHandler) readResultSetHeaderPacket() (int, bool, error) { // handleOkPacket replaces both values; other cases leave the values unchanged. 
mc.result.affectedRows = append(mc.result.affectedRows, 0) mc.result.insertIds = append(mc.result.insertIds, 0) data, err := mc.conn().readPacket() if err != nil { - return 0, err + return 0, false, err } switch data[0] { case iOK: - return 0, mc.handleOkPacket(data) + return 0, false, mc.handleOkPacket(data) case iERR: - return 0, mc.conn().handleErrorPacket(data) + return 0, false, mc.conn().handleErrorPacket(data) case iLocalInFile: - return 0, mc.handleInFileRequest(string(data[1:])) + return 0, false, mc.handleInFileRequest(string(data[1:])) } // column count // https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_com_query_response_text_resultset.html - num, _, _ := readLengthEncodedInteger(data) + // https://mariadb.com/kb/en/result-set-packets/#column-count-packet + num, _, len := readLengthEncodedInteger(data) + + if mc.extCapabilities&clientCacheMetadata != 0 { + return int(num), data[len] == 0x01, nil + } // ignore remaining data in the packet. see #1478. - return int(num), nil + return int(num), true, nil } // Error Packet -// http://dev.mysql.com/doc/internals/en/generic-response-packets.html#packet-ERR_Packet +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_basic_err_packet.html func (mc *mysqlConn) handleErrorPacket(data []byte) error { if data[0] != iERR { return ErrMalformPkt @@ -656,7 +666,7 @@ func (mc *mysqlConn) clearResult() *okHandler { } // Ok Packet -// http://dev.mysql.com/doc/internals/en/generic-response-packets.html#packet-OK_Packet +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_basic_ok_packet.html func (mc *okHandler) handleOkPacket(data []byte) error { var n, m int var affectedRows, insertId uint64 @@ -690,24 +700,19 @@ func (mc *okHandler) handleOkPacket(data []byte) error { } // Read Packets as Field Packets until EOF-Packet or an Error appears -// http://dev.mysql.com/doc/internals/en/com-query-response.html#packet-Protocol::ColumnDefinition41 -func (mc *mysqlConn) readColumns(count int) ([]mysqlField, error) { +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_com_query_response_text_resultset_column_definition.html#sect_protocol_com_query_response_text_resultset_column_definition_41 +func (mc *mysqlConn) readColumns(count int, old []mysqlField) ([]mysqlField, error) { columns := make([]mysqlField, count) + if len(old) != count { + old = nil + } - for i := 0; ; i++ { + for i := range count { data, err := mc.readPacket() if err != nil { return nil, err } - // EOF Packet - if data[0] == iEOF && (len(data) == 5 || len(data) == 1) { - if i == count { - return columns, nil - } - return nil, fmt.Errorf("column count mismatch n:%d len:%d", count, len(columns)) - } - // Catalog pos, err := skipLengthEncodedString(data) if err != nil { @@ -728,7 +733,12 @@ func (mc *mysqlConn) readColumns(count int) ([]mysqlField, error) { return nil, err } pos += n - columns[i].tableName = string(tableName) + if old != nil && old[i].tableName == string(tableName) { + // avoid allocating new string + columns[i].tableName = old[i].tableName + } else { + columns[i].tableName = string(tableName) + } } else { n, err = skipLengthEncodedString(data[pos:]) if err != nil { @@ -749,7 +759,12 @@ func (mc *mysqlConn) readColumns(count int) ([]mysqlField, error) { if err != nil { return nil, err } - columns[i].name = string(name) + if old != nil && old[i].name == string(name) { + // avoid allocating new string + columns[i].name = old[i].name + } else { + columns[i].name = string(name) + } pos += n // Original name [len coded 
string] @@ -780,17 +795,17 @@ func (mc *mysqlConn) readColumns(count int) ([]mysqlField, error) { // Decimals [uint8] columns[i].decimals = data[pos] - //pos++ + } - // Default value [len coded binary] - //if pos < len(data) { - // defaultVal, _, err = bytesToLengthCodedBinary(data[pos:]) - //} + // skip EOF packet if client does not support deprecateEOF + if err := mc.skipEof(); err != nil { + return nil, err } + return columns, nil } // Read Packets as Field Packets until EOF-Packet or an Error appears -// http://dev.mysql.com/doc/internals/en/com-query-response.html#packet-ProtocolText::ResultsetRow +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_com_query_response_text_resultset_row.html func (rows *textRows) readRow(dest []driver.Value) error { mc := rows.mc @@ -804,9 +819,20 @@ func (rows *textRows) readRow(dest []driver.Value) error { } // EOF Packet - if data[0] == iEOF && len(data) == 5 { - // server_status [2 bytes] - rows.mc.status = readStatus(data[3:]) + // text row packets may starts with LengthEncodedString. + // In such case, 0xFE can mean string larger than 0xffffff. + // https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_basic_dt_integers.html#sect_protocol_basic_dt_int_le + if data[0] == iEOF && len(data) <= 0xffffff { + if mc.capabilities&clientDeprecateEOF == 0 { + // Deprecated EOF packet + // https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_basic_eof_packet.html + mc.status = readStatus(data[3:]) + } else { + // Ok Packet with an 0xFE header + _, _, n := readLengthEncodedInteger(data[1:]) // affected_rows + _, _, m := readLengthEncodedInteger(data[1+n:]) // last_insert_id + mc.status = readStatus(data[1+n+m:]) + } rows.rs.done = true if !rows.HasNextResultSet() { rows.mc = nil @@ -880,8 +906,34 @@ func (rows *textRows) readRow(dest []driver.Value) error { return nil } -// Reads Packets until EOF-Packet or an Error appears. Returns count of Packets read -func (mc *mysqlConn) readUntilEOF() error { +func (mc *mysqlConn) skipPackets(n int) error { + for range n { + if _, err := mc.readPacket(); err != nil { + return err + } + } + return nil +} + +// skips EOF packet after n * ColumnDefinition packets when clientDeprecateEOF is not set +func (mc *mysqlConn) skipEof() error { + if mc.capabilities&clientDeprecateEOF == 0 { + if _, err := mc.readPacket(); err != nil { + return err + } + } + return nil +} + +func (mc *mysqlConn) skipColumns(n int) error { + if err := mc.skipPackets(n); err != nil { + return err + } + return mc.skipEof() +} + +// Reads Packets until EOF-Packet or an Error appears. +func (mc *mysqlConn) skipRows() error { for { data, err := mc.readPacket() if err != nil { @@ -892,10 +944,20 @@ func (mc *mysqlConn) readUntilEOF() error { case iERR: return mc.handleErrorPacket(data) case iEOF: - if len(data) == 5 { - mc.status = readStatus(data[3:]) + // text row packets may starts with LengthEncodedString. + // In such case, 0xFE can mean string larger than 0xffffff. 
+ if len(data) <= 0xffffff { + if mc.capabilities&clientDeprecateEOF == 0 { + // EOF packet + mc.status = readStatus(data[3:]) + } else { + // OK packet with an 0xFE header + _, _, n := readLengthEncodedInteger(data[1:]) // affected_rows + _, _, m := readLengthEncodedInteger(data[1+n:]) // last_insert_id + mc.status = readStatus(data[1+n+m:]) + } + return nil } - return nil } } } @@ -905,7 +967,7 @@ func (mc *mysqlConn) readUntilEOF() error { ******************************************************************************/ // Prepare Result Packets -// http://dev.mysql.com/doc/internals/en/com-stmt-prepare-response.html +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_com_stmt_prepare.html#sect_protocol_com_stmt_prepare_response func (stmt *mysqlStmt) readPrepareResultPacket() (uint16, error) { data, err := stmt.mc.readPacket() if err == nil { @@ -932,7 +994,7 @@ func (stmt *mysqlStmt) readPrepareResultPacket() (uint16, error) { return 0, err } -// http://dev.mysql.com/doc/internals/en/com-stmt-send-long-data.html +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_com_stmt_send_long_data.html func (stmt *mysqlStmt) writeCommandLongData(paramID int, arg []byte) error { maxLen := stmt.mc.maxAllowedPacket - 1 pktLen := maxLen @@ -979,7 +1041,7 @@ func (stmt *mysqlStmt) writeCommandLongData(paramID int, arg []byte) error { } // Execute Prepared Statement -// http://dev.mysql.com/doc/internals/en/com-stmt-execute.html +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_com_stmt_execute.html func (stmt *mysqlStmt) writeExecutePacket(args []driver.Value) error { if len(args) != stmt.paramCount { return fmt.Errorf( @@ -993,10 +1055,7 @@ func (stmt *mysqlStmt) writeExecutePacket(args []driver.Value) error { mc := stmt.mc // Determine threshold dynamically to avoid packet size shortage. - longDataSize := mc.maxAllowedPacket / (stmt.paramCount + 1) - if longDataSize < 64 { - longDataSize = 64 - } + longDataSize := max(mc.maxAllowedPacket/(stmt.paramCount+1), 64) // Reset packet-sequence mc.resetSequence() @@ -1185,17 +1244,17 @@ func (stmt *mysqlStmt) writeExecutePacket(args []driver.Value) error { // mc.affectedRows and mc.insertIds. 
func (mc *okHandler) discardResults() error { for mc.status&statusMoreResultsExists != 0 { - resLen, err := mc.readResultSetHeaderPacket() + resLen, _, err := mc.readResultSetHeaderPacket() if err != nil { return err } if resLen > 0 { // columns - if err := mc.conn().readUntilEOF(); err != nil { + if err := mc.conn().skipColumns(resLen); err != nil { return err } // rows - if err := mc.conn().readUntilEOF(); err != nil { + if err := mc.conn().skipRows(); err != nil { return err } } @@ -1203,7 +1262,7 @@ func (mc *okHandler) discardResults() error { return nil } -// http://dev.mysql.com/doc/internals/en/binary-protocol-resultset-row.html +// https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_binary_resultset.html#sect_protocol_binary_resultset_row func (rows *binaryRows) readRow(dest []driver.Value) error { data, err := rows.mc.readPacket() if err != nil { @@ -1212,9 +1271,17 @@ func (rows *binaryRows) readRow(dest []driver.Value) error { // packet indicator [1 byte] if data[0] != iOK { - // EOF Packet - if data[0] == iEOF && len(data) == 5 { - rows.mc.status = readStatus(data[3:]) + // EOF/OK Packet + if data[0] == iEOF { + if rows.mc.capabilities&clientDeprecateEOF == 0 { + // EOF packet + rows.mc.status = readStatus(data[3:]) + } else { + // OK Packet with an 0xFE header + _, _, n := readLengthEncodedInteger(data[1:]) + _, _, m := readLengthEncodedInteger(data[1+n:]) + rows.mc.status = readStatus(data[1+n+m:]) + } rows.rs.done = true if !rows.HasNextResultSet() { rows.mc = nil diff --git a/vendor/github.com/go-sql-driver/mysql/result.go b/vendor/github.com/go-sql-driver/mysql/result.go index d516314683..82dc0f9b6c 100644 --- a/vendor/github.com/go-sql-driver/mysql/result.go +++ b/vendor/github.com/go-sql-driver/mysql/result.go @@ -8,6 +8,8 @@ package mysql +import "slices" + import "database/sql/driver" // Result exposes data not available through *connection.Result. @@ -42,9 +44,9 @@ func (res *mysqlResult) RowsAffected() (int64, error) { } func (res *mysqlResult) AllLastInsertIds() []int64 { - return append([]int64{}, res.insertIds...) // defensive copy + return slices.Clone(res.insertIds) // defensive copy } func (res *mysqlResult) AllRowsAffected() []int64 { - return append([]int64{}, res.affectedRows...) // defensive copy + return slices.Clone(res.affectedRows) // defensive copy } diff --git a/vendor/github.com/go-sql-driver/mysql/rows.go b/vendor/github.com/go-sql-driver/mysql/rows.go index df98417b8d..190e75f9bf 100644 --- a/vendor/github.com/go-sql-driver/mysql/rows.go +++ b/vendor/github.com/go-sql-driver/mysql/rows.go @@ -113,7 +113,7 @@ func (rows *mysqlRows) Close() (err error) { // Remove unread packets from stream if !rows.rs.done { - err = mc.readUntilEOF() + err = mc.skipRows() } if err == nil { handleOk := mc.clearResult() @@ -143,7 +143,7 @@ func (rows *mysqlRows) nextResultSet() (int, error) { // Remove unread packets from stream if !rows.rs.done { - if err := rows.mc.readUntilEOF(); err != nil { + if err := rows.mc.skipRows(); err != nil { return 0, err } rows.rs.done = true @@ -156,7 +156,7 @@ func (rows *mysqlRows) nextResultSet() (int, error) { rows.rs = resultSet{} // rows.mc.affectedRows and rows.mc.insertIds accumulate on each call to // nextResultSet. 
- resLen, err := rows.mc.resultUnchanged().readResultSetHeaderPacket() + resLen, _, err := rows.mc.resultUnchanged().readResultSetHeaderPacket() if err != nil { // Clean up about multi-results flag rows.rs.done = true @@ -186,7 +186,7 @@ func (rows *binaryRows) NextResultSet() error { return err } - rows.rs.columns, err = rows.mc.readColumns(resLen) + rows.rs.columns, err = rows.mc.readColumns(resLen, nil) return err } @@ -208,7 +208,7 @@ func (rows *textRows) NextResultSet() (err error) { return err } - rows.rs.columns, err = rows.mc.readColumns(resLen) + rows.rs.columns, err = rows.mc.readColumns(resLen, nil) return err } diff --git a/vendor/github.com/go-sql-driver/mysql/statement.go b/vendor/github.com/go-sql-driver/mysql/statement.go index 35df854570..0261903b9d 100644 --- a/vendor/github.com/go-sql-driver/mysql/statement.go +++ b/vendor/github.com/go-sql-driver/mysql/statement.go @@ -20,6 +20,7 @@ type mysqlStmt struct { mc *mysqlConn id uint32 paramCount int + columns []mysqlField } func (stmt *mysqlStmt) Close() error { @@ -64,19 +65,26 @@ func (stmt *mysqlStmt) Exec(args []driver.Value) (driver.Result, error) { handleOk := stmt.mc.clearResult() // Read Result - resLen, err := handleOk.readResultSetHeaderPacket() + resLen, metadataFollows, err := handleOk.readResultSetHeaderPacket() if err != nil { return nil, err } if resLen > 0 { // Columns - if err = mc.readUntilEOF(); err != nil { - return nil, err + if metadataFollows && stmt.mc.extCapabilities&clientCacheMetadata != 0 { + // we can not skip column metadata because next stmt.Query() may use it. + if stmt.columns, err = mc.readColumns(resLen, stmt.columns); err != nil { + return nil, err + } + } else { + if err = mc.skipColumns(resLen); err != nil { + return nil, err + } } // Rows - if err := mc.readUntilEOF(); err != nil { + if err = mc.skipRows(); err != nil { return nil, err } } @@ -107,7 +115,7 @@ func (stmt *mysqlStmt) query(args []driver.Value) (*binaryRows, error) { // Read Result handleOk := stmt.mc.clearResult() - resLen, err := handleOk.readResultSetHeaderPacket() + resLen, metadataFollows, err := handleOk.readResultSetHeaderPacket() if err != nil { return nil, err } @@ -116,7 +124,17 @@ func (stmt *mysqlStmt) query(args []driver.Value) (*binaryRows, error) { if resLen > 0 { rows.mc = mc - rows.rs.columns, err = mc.readColumns(resLen) + if metadataFollows { + if rows.rs.columns, err = mc.readColumns(resLen, stmt.columns); err != nil { + return nil, err + } + stmt.columns = rows.rs.columns + } else { + if err = mc.skipEof(); err != nil { + return nil, err + } + rows.rs.columns = stmt.columns + } } else { rows.rs.done = true @@ -131,7 +149,7 @@ func (stmt *mysqlStmt) query(args []driver.Value) (*binaryRows, error) { return rows, err } -var jsonType = reflect.TypeOf(json.RawMessage{}) +var jsonType = reflect.TypeFor[json.RawMessage]() type converter struct{} @@ -193,7 +211,7 @@ func (c converter) ConvertValue(v any) (driver.Value, error) { return nil, fmt.Errorf("unsupported type %T, a %s", v, rv.Kind()) } -var valuerReflectType = reflect.TypeOf((*driver.Valuer)(nil)).Elem() +var valuerReflectType = reflect.TypeFor[driver.Valuer]() // callValuerValue returns vr.Value(), with one exception: // If vr.Value is an auto-generated method on a pointer type and the diff --git a/vendor/github.com/go-sql-driver/mysql/utils.go b/vendor/github.com/go-sql-driver/mysql/utils.go index 8716c26c52..2dccb7d53e 100644 --- a/vendor/github.com/go-sql-driver/mysql/utils.go +++ b/vendor/github.com/go-sql-driver/mysql/utils.go @@ -182,7 +182,7 
@@ func parseDateTime(b []byte, loc *time.Location) (time.Time, error) { func parseByteYear(b []byte) (int, error) { year, n := 0, 1000 - for i := 0; i < 4; i++ { + for i := range 4 { v, err := bToi(b[i]) if err != nil { return 0, err @@ -207,7 +207,7 @@ func parseByte2Digits(b1, b2 byte) (int, error) { func parseByteNanoSec(b []byte) (int, error) { ns, digit := 0, 100000 // max is 6-digits - for i := 0; i < len(b); i++ { + for i := range b { v, err := bToi(b[i]) if err != nil { return 0, err @@ -625,108 +625,80 @@ func reserveBuffer(buf []byte, appendSize int) []byte { return buf[:newSize] } -// escapeBytesBackslash escapes []byte with backslashes (\) -// This escapes the contents of a string (provided as []byte) by adding backslashes before special -// characters, and turning others into specific escape sequences, such as -// turning newlines into \n and null bytes into \0. -// https://github.com/mysql/mysql-server/blob/mysql-5.7.5/mysys/charset.c#L823-L932 -func escapeBytesBackslash(buf, v []byte) []byte { - pos := len(buf) - buf = reserveBuffer(buf, len(v)*2) +// Lookup table for backslash escapes (used for both string and bytes) +var backslashEscapeTable [256]byte - for _, c := range v { - switch c { - case '\x00': - buf[pos+1] = '0' - buf[pos] = '\\' - pos += 2 - case '\n': - buf[pos+1] = 'n' - buf[pos] = '\\' - pos += 2 - case '\r': - buf[pos+1] = 'r' - buf[pos] = '\\' - pos += 2 - case '\x1a': - buf[pos+1] = 'Z' - buf[pos] = '\\' - pos += 2 - case '\'': - buf[pos+1] = '\'' - buf[pos] = '\\' - pos += 2 - case '"': - buf[pos+1] = '"' - buf[pos] = '\\' - pos += 2 - case '\\': - buf[pos+1] = '\\' +func init() { + backslashEscapeTable['\x00'] = '0' + backslashEscapeTable['\n'] = 'n' + backslashEscapeTable['\r'] = 'r' + backslashEscapeTable['\x1a'] = 'Z' + backslashEscapeTable['\''] = '\'' + backslashEscapeTable['"'] = '"' + backslashEscapeTable['\\'] = '\\' +} + +// escapeStringBackslash is similar to escapeBytesBackslash but for string. +func escapeStringBackslash(buf []byte, v string) []byte { + pos := len(buf) + buf = reserveBuffer(buf, len(v)*2+2) + buf[pos] = '\'' + pos++ + for i := 0; i < len(v); i++ { + c := v[i] + if esc := backslashEscapeTable[c]; esc != 0 { + buf[pos+1] = esc buf[pos] = '\\' pos += 2 - default: + } else { buf[pos] = c pos++ } } - + buf[pos] = '\'' + pos++ return buf[:pos] } -// escapeStringBackslash is similar to escapeBytesBackslash but for string. -func escapeStringBackslash(buf []byte, v string) []byte { +// escapeBytesBackslash appends _binary'...' or '...' with backslash escaping for bytes. 
+func escapeBytesBackslash(buf, v []byte, binary bool) []byte { pos := len(buf) - buf = reserveBuffer(buf, len(v)*2) - - for i := 0; i < len(v); i++ { - c := v[i] - switch c { - case '\x00': - buf[pos+1] = '0' - buf[pos] = '\\' - pos += 2 - case '\n': - buf[pos+1] = 'n' - buf[pos] = '\\' - pos += 2 - case '\r': - buf[pos+1] = 'r' - buf[pos] = '\\' - pos += 2 - case '\x1a': - buf[pos+1] = 'Z' - buf[pos] = '\\' - pos += 2 - case '\'': - buf[pos+1] = '\'' - buf[pos] = '\\' - pos += 2 - case '"': - buf[pos+1] = '"' - buf[pos] = '\\' - pos += 2 - case '\\': - buf[pos+1] = '\\' + if binary { + buf = reserveBuffer(buf, len(v)*2+9) + copy(buf[pos:], []byte("_binary'")) + pos += 8 + } else { + buf = reserveBuffer(buf, len(v)*2+2) + buf[pos] = '\'' + pos++ + } + for _, c := range v { + if esc := backslashEscapeTable[c]; esc != 0 { + buf[pos+1] = esc buf[pos] = '\\' pos += 2 - default: + } else { buf[pos] = c pos++ } } - + buf[pos] = '\'' + pos++ return buf[:pos] } -// escapeBytesQuotes escapes apostrophes in []byte by doubling them up. -// This escapes the contents of a string by doubling up any apostrophes that -// it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in -// effect on the server. -// https://github.com/mysql/mysql-server/blob/mysql-5.7.5/mysys/charset.c#L963-L1038 -func escapeBytesQuotes(buf, v []byte) []byte { +// escapeBytesQuotes appends _binary'...' or '...' with single-quote escaping for bytes. +func escapeBytesQuotes(buf, v []byte, binary bool) []byte { pos := len(buf) - buf = reserveBuffer(buf, len(v)*2) - + if binary { + buf = reserveBuffer(buf, len(v)*2+9) + copy(buf[pos:], []byte("_binary'")) + pos += 8 + } else { + buf = reserveBuffer(buf, len(v)*2+2) + buf[pos] = '\'' + pos++ + } for _, c := range v { if c == '\'' { buf[pos+1] = '\'' @@ -737,16 +709,18 @@ func escapeBytesQuotes(buf, v []byte) []byte { pos++ } } - + buf[pos] = '\'' + pos++ return buf[:pos] } // escapeStringQuotes is similar to escapeBytesQuotes but for string. func escapeStringQuotes(buf []byte, v string) []byte { pos := len(buf) - buf = reserveBuffer(buf, len(v)*2) - - for i := 0; i < len(v); i++ { + buf = reserveBuffer(buf, len(v)*2+2) + buf[pos] = '\'' + pos++ + for i := range len(v) { c := v[i] if c == '\'' { buf[pos+1] = '\'' @@ -757,7 +731,8 @@ func escapeStringQuotes(buf []byte, v string) []byte { pos++ } } - + buf[pos] = '\'' + pos++ return buf[:pos] } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/auth.go b/vendor/github.com/nats-io/nats-server/v2/server/auth.go index b8cf69783a..09fad33460 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/auth.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/auth.go @@ -696,7 +696,7 @@ func (s *Server) processClientOrLeafAuthentication(c *client, opts *Options) (au // If we are here we have an auth callout defined and we have failed auth so far // so we will callout to our auth backend for processing. if !skip { - authorized, reason = s.processClientOrLeafCallout(c, opts, proxyRequired, trustedProxy) + authorized, reason = s.processClientOrLeafCallout(c, opts, proxyRequired, trustedProxy, ujwt) } // Check if we are authorized and in the auth callout account, and if so add in deny publish permissions for the auth subject. if authorized { @@ -797,26 +797,42 @@ func (s *Server) processClientOrLeafAuthentication(c *client, opts *Options) (au token = opts.Authorization } + // MQTT can carry JWTs in the password field. 
Reconstruct it here for auth + // processing and auth callout, but do not populate c.opts.JWT yet or it would + // be exposed through monitoring and advisory paths even when the password is + // not actually a JWT. + if ujwt == _EMPTY_ && c.isMqtt() && c.opts.JWT == _EMPTY_ { + // Don't set juc here, leave that to the next s.trustedKeys != nil block, + // so that we don't try to trust a JWT when we aren't in operator mode. We + // will allow it to be passed through auth callout though. + if _, err := jwt.DecodeUserClaims(c.opts.Password); err == nil { + ujwt = c.opts.Password + } + } + // Check if we have trustedKeys defined in the server. If so we require a user jwt. if s.trustedKeys != nil { - ujwt = c.opts.JWT - if ujwt == _EMPTY_ && c.isMqtt() { - // For MQTT, we pass the password as the JWT too, but do so here so it's not - // publicly exposed in the client options if it isn't a JWT. - ujwt = c.opts.Password + if ujwt == _EMPTY_ { + // Need to be sure that it's a NATS JWT, otherwise we will not correctly + // attempt the default sentinel below. + if _, err = jwt.DecodeUserClaims(c.opts.JWT); err == nil { + ujwt = c.opts.JWT + } } - if ujwt == _EMPTY_ && opts.DefaultSentinel != _EMPTY_ { - c.opts.JWT = opts.DefaultSentinel - ujwt = c.opts.JWT + if ujwt == _EMPTY_ { + // Didn't fall through with a valid NATS JWT, so try the default sentinel + // if configured. + if opts.DefaultSentinel != _EMPTY_ { + c.opts.JWT = opts.DefaultSentinel + ujwt = c.opts.JWT + } } if ujwt == _EMPTY_ { s.mu.Unlock() c.Debugf("Authentication requires a user JWT") return false } - // So we have a valid user jwt here. - juc, err = jwt.DecodeUserClaims(ujwt) - if err != nil { + if juc, err = jwt.DecodeUserClaims(ujwt); err != nil { s.mu.Unlock() c.Debugf("User JWT not valid: %v", err) return false @@ -1015,8 +1031,10 @@ func (s *Server) processClientOrLeafAuthentication(c *client, opts *Options) (au c.Debugf("Connection type not allowed") return false } - // skip validation of nonce when presented with a bearer token - // FIXME: if BearerToken is only for WSS, need check for server with that port enabled + // Skip validation of nonce when presented with a bearer token. + // While support for bearer tokens was added for WebSockets, there is no + // security benefit in restricting their use to that client protocol: the + // client can just go use the other protocol. if !juc.BearerToken { // Verify the signature against the nonce. if c.opts.Sig == _EMPTY_ { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/auth_callout.go b/vendor/github.com/nats-io/nats-server/v2/server/auth_callout.go index df903b3f25..b44fa1f8d7 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/auth_callout.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/auth_callout.go @@ -41,7 +41,7 @@ func titleCase(m string) string { } // Process a callout on this client's behalf. -func (s *Server) processClientOrLeafCallout(c *client, opts *Options, proxyRequired, trustedProxy bool) (authorized bool, errStr string) { +func (s *Server) processClientOrLeafCallout(c *client, opts *Options, proxyRequired, trustedProxy bool, ujwt string) (authorized bool, errStr string) { isOperatorMode := len(opts.TrustedKeys) > 0 // this is the account the user connected in, or the one running the callout @@ -374,7 +374,7 @@ func (s *Server) processClientOrLeafCallout(c *client, opts *Options, proxyRequi // Grab client info for the request. 
c.mu.Lock() c.fillClientInfo(&claim.ClientInformation) - c.fillConnectOpts(&claim.ConnectOptions) + c.fillConnectOpts(&claim.ConnectOptions, ujwt) // If we have a sig in the client opts, fill in nonce. if claim.ConnectOptions.SignedNonce != _EMPTY_ { claim.ClientInformation.Nonce = string(c.nonce) @@ -474,16 +474,22 @@ func (c *client) fillClientInfo(ci *jwt.ClientInformation) { // Fill in client options. // Lock should be held. -func (c *client) fillConnectOpts(opts *jwt.ConnectOptions) { +func (c *client) fillConnectOpts(opts *jwt.ConnectOptions, ujwt string) { if c == nil || (c.kind != CLIENT && c.kind != LEAF && c.kind != JETSTREAM && c.kind != ACCOUNT) { return } o := c.opts + if ujwt == _EMPTY_ { + // The caller may supply a reconstructed JWT that should be sent to auth + // callout without storing it in c.opts.JWT. If not, fall back to the client + // option as before. + ujwt = o.JWT + } // Do it this way to fail to compile if fields are added to jwt.ClientInformation. *opts = jwt.ConnectOptions{ - JWT: o.JWT, + JWT: ujwt, Nkey: o.Nkey, SignedNonce: o.Sig, Token: o.Token, diff --git a/vendor/github.com/nats-io/nats-server/v2/server/avl/seqset.go b/vendor/github.com/nats-io/nats-server/v2/server/avl/seqset.go index de281d0304..f4fa127df6 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/avl/seqset.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/avl/seqset.go @@ -239,7 +239,7 @@ func (ss SequenceSet) EncodeLen() int { return minLen + (ss.Nodes() * ((numBuckets+1)*8 + 2)) } -func (ss SequenceSet) Encode(buf []byte) ([]byte, error) { +func (ss SequenceSet) Encode(buf []byte) []byte { nn, encLen := ss.Nodes(), ss.EncodeLen() if cap(buf) < encLen { @@ -268,7 +268,7 @@ func (ss SequenceSet) Encode(buf []byte) ([]byte, error) { le.PutUint16(buf[i:], uint16(n.h)) i += 2 }) - return buf[:i], nil + return buf[:i] } // ErrBadEncoding is returned when we can not decode properly. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/client.go b/vendor/github.com/nats-io/nats-server/v2/server/client.go index 0abfd21800..162c235007 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/client.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/client.go @@ -1061,18 +1061,19 @@ func (c *client) setPermissions(perms *Permissions) { return } c.perms = &permissions{} + slcache := c.srv != nil && !c.srv.getOpts().NoSublistCache // Loop over publish permissions if perms.Publish != nil { if perms.Publish.Allow != nil { - c.perms.pub.allow = NewSublistWithCache() + c.perms.pub.allow = NewSublist(slcache) } for _, pubSubject := range perms.Publish.Allow { sub := &subscription{subject: []byte(pubSubject)} c.perms.pub.allow.Insert(sub) } if len(perms.Publish.Deny) > 0 { - c.perms.pub.deny = NewSublistWithCache() + c.perms.pub.deny = NewSublist(slcache) } for _, pubSubject := range perms.Publish.Deny { sub := &subscription{subject: []byte(pubSubject)} @@ -1091,7 +1092,7 @@ func (c *client) setPermissions(perms *Permissions) { if perms.Subscribe != nil { var err error if len(perms.Subscribe.Allow) > 0 { - c.perms.sub.allow = NewSublistWithCache() + c.perms.sub.allow = NewSublist(slcache) } for _, subSubject := range perms.Subscribe.Allow { sub := &subscription{} @@ -1103,7 +1104,7 @@ func (c *client) setPermissions(perms *Permissions) { c.perms.sub.allow.Insert(sub) } if len(perms.Subscribe.Deny) > 0 { - c.perms.sub.deny = NewSublistWithCache() + c.perms.sub.deny = NewSublist(slcache) // Also hold onto this array for later. 
c.darray = perms.Subscribe.Deny } @@ -1200,6 +1201,7 @@ func (c *client) mergeDenyPermissions(what denyType, denyPubs []string) { if c.perms == nil { c.perms = &permissions{} } + slcache := c.srv != nil && !c.srv.getOpts().NoSublistCache var perms []*perm switch what { case pub: @@ -1211,7 +1213,7 @@ func (c *client) mergeDenyPermissions(what denyType, denyPubs []string) { } for _, p := range perms { if p.deny == nil { - p.deny = NewSublistWithCache() + p.deny = NewSublist(slcache) } FOR_DENY: for _, subj := range denyPubs { @@ -2254,12 +2256,20 @@ func (c *client) processConnect(arg []byte) error { // least ClientProtoInfo, we need to increment the following counter. // This is decremented when client is removed from the server's // clients map. - if kind == CLIENT && proto >= ClientProtoInfo { + if kind == CLIENT && proto >= ClientProtoInfo && firstConnect { srv.mu.Lock() srv.cproto++ srv.mu.Unlock() } + // A second CONNECT may move the client into a different account via + // checkAuthentication. Drop any previously-registered subscriptions + // from the current account first so they don't leak in that account's + // sublist after the client switches. + if !firstConnect { + c.clearAccountSubs(false) + } + // Check for Auth if ok := srv.checkAuthentication(c); !ok { // We may fail here because we reached max limits on an account. @@ -3273,19 +3283,20 @@ func (c *client) canSubscribe(subject string, optQueue ...string) bool { r := c.perms.sub.deny.Match(subject) allowed = len(r.psubs) == 0 - if queue != _EMPTY_ && len(r.qsubs) > 0 { + if allowed && queue != _EMPTY_ && len(r.qsubs) > 0 { // If the queue appears in the deny list, then DO NOT allow. allowed = !queueMatches(queue, r.qsubs) } // We use the actual subscription to signal us to spin up the deny mperms - // and cache. We check if the subject is a wildcard that contains any of + // and cache. We check if the subject is a wildcard that intersects any of // the deny clauses. // FIXME(dlc) - We could be smarter and track when these go away and remove. if allowed && c.mperms == nil && subjectHasWildcard(subject) { - // Whip through the deny array and check if this wildcard subject is within scope. + // Whip through the deny array and check if this wildcard subject can + // overlap with any denied deliveries. for _, sub := range c.darray { - if subjectIsSubsetMatch(sub, subject) { + if SubjectsCollide(sub, subject) { c.loadMsgDenyFilter() break } @@ -3658,14 +3669,7 @@ func (c *client) deliverMsg(prodIsMQTT bool, sub *subscription, acc *Account, su // Check if we are a leafnode and have perms to check. 
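
canSubscribe above now spins up the deny filter when a wildcard subscription merely intersects a deny entry (SubjectsCollide), not only when the deny entry is a subset of it. A simplified overlap check under the usual '*' and '>' wildcard semantics; this is an illustration, not the server's SubjectsCollide implementation:

package main

import (
	"fmt"
	"strings"
)

// subjectsOverlap reports whether two subjects (which may contain the '*' and
// '>' wildcards) can both match at least one concrete subject.
func subjectsOverlap(a, b string) bool {
	at, bt := strings.Split(a, "."), strings.Split(b, ".")
	for i := 0; ; i++ {
		switch {
		case i == len(at) && i == len(bt):
			return true // both ended together: they intersect
		case i == len(at) || i == len(bt):
			return false // one subject ran out of tokens
		case at[i] == ">" || bt[i] == ">":
			return true // full wildcard swallows the remainder
		case at[i] != bt[i] && at[i] != "*" && bt[i] != "*":
			return false // literal tokens differ
		}
	}
}

func main() {
	// Deny "foo.*" and subscription "*.bar" are not subsets of each other,
	// but both match "foo.bar", so the deny filter must still be loaded.
	fmt.Println(subjectsOverlap("foo.*", "*.bar")) // true
	fmt.Println(subjectsOverlap("foo.*", "bar.>")) // false
}
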
if client.kind == LEAF && client.perms != nil { - var subjectToCheck []byte - if subject[0] == '_' && bytes.HasPrefix(subject, []byte(gwReplyPrefix)) { - subjectToCheck = subject[gwSubjectOffset:] - } else if subject[0] == '$' && bytes.HasPrefix(subject, []byte(oldGWReplyPrefix)) { - subjectToCheck = subject[oldGWReplyStart:] - } else { - subjectToCheck = subject - } + subjectToCheck, _ := getGWRoutedSubjectOrSelf(subject) if !client.pubAllowedFullCheck(string(subjectToCheck), true, true) { mt.addEgressEvent(client, sub, errMsgTracePubViolation) client.mu.Unlock() @@ -4068,7 +4072,7 @@ func (c *client) allowedMsgTraceDest(hdr []byte, hasLock bool) (string, bool) { return _EMPTY_, true } td := sliceHeader(MsgTraceDest, hdr) - if len(td) == 0 { + if len(td) == 0 || bytes.Equal(td, traceDestDisabledAsBytes) { return _EMPTY_, true } dest := bytesToString(td) @@ -4131,17 +4135,7 @@ func (c *client) pubAllowedFullCheck(subject string, fullCheck, hasLock bool) bo if !hasLock { c.mu.Lock() } - if resp := c.replies[subject]; resp != nil { - resp.n++ - // Check if we have sent too many responses. - if c.perms.resp.MaxMsgs > 0 && resp.n > c.perms.resp.MaxMsgs { - delete(c.replies, subject) - } else if c.perms.resp.Expires > 0 && time.Since(resp.t) > c.perms.resp.Expires { - delete(c.replies, subject) - } else { - allowed = true - } - } + allowed = c.responseAllowed(subject) if !hasLock { c.mu.Unlock() } @@ -4155,6 +4149,25 @@ func (c *client) pubAllowedFullCheck(subject string, fullCheck, hasLock bool) bo return allowed } +// Returns true if this subject matches a tracked dynamic reply permission. +// Lock must be held. +func (c *client) responseAllowed(subject string) bool { + if c.perms == nil || c.perms.resp == nil { + return false + } + if resp := c.replies[subject]; resp != nil { + resp.n++ + if c.perms.resp.MaxMsgs > 0 && resp.n > c.perms.resp.MaxMsgs { + delete(c.replies, subject) + } else if c.perms.resp.Expires > 0 && time.Since(resp.t) > c.perms.resp.Expires { + delete(c.replies, subject) + } else { + return true + } + } + return false +} + // Test whether a reply subject is a service import reply. func isServiceReply(reply []byte) bool { // This function is inlined and checking this way is actually faster @@ -4162,16 +4175,51 @@ func isServiceReply(reply []byte) bool { return len(reply) > 3 && bytesToString(reply[:4]) == replyPrefix } +// Test whether a subject is a JetStream ACK. +func isJSAckSubject(subject []byte) bool { + return len(subject) > jsAckPreLen && bytesToString(subject[:jsAckPreLen]) == jsAckPre +} + +// jsAckDeliverIdx returns the byte offset of the `@` separator in an encoded +// `$JS.ACK....@` reply, or -1 if reply is not in that form. Stream, +// consumer, and subject tokens may legally contain `@`, so we accept only the +// first `@` that follows the eight dots of the JS ACK token: +// +// $JS.ACK.......@ +func jsAckDeliverIdx(reply []byte) int { + if !isJSAckSubject(reply) { + return -1 + } + dots := 0 + for i, b := range reply { + switch b { + case '.': + dots++ + case '@': + if dots >= 8 { + return i + } + } + } + return -1 +} + +// replyHasJSAckSuffix reports whether reply is already in `$JS.ACK....@` +// form, so callers don't double-append the suffix on a re-entrant pass +// (service-import or chained JS push). +func replyHasJSAckSuffix(reply []byte) bool { + return jsAckDeliverIdx(reply) != -1 +} + // Test whether a reply subject is a service import or a gateway routed reply. 
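
jsAckDeliverIdx above only accepts an '@' separator once at least eight '.' separators have been seen, because stream, consumer, and subject tokens may legally contain '@'. A standalone sketch that applies the same rule to split a reply into its ack and deliver parts (all subject values below are made up):

package main

import (
	"fmt"
	"strings"
)

// splitJSAckReply splits an encoded "ack-subject@deliver-subject" reply using
// the same rule as jsAckDeliverIdx: an '@' only counts after eight dots.
func splitJSAckReply(reply string) (ack, deliver string, ok bool) {
	if !strings.HasPrefix(reply, "$JS.ACK.") {
		return reply, "", false
	}
	dots := 0
	for i := 0; i < len(reply); i++ {
		switch reply[i] {
		case '.':
			dots++
		case '@':
			if dots >= 8 {
				return reply[:i], reply[i+1:], true
			}
		}
	}
	return reply, "", false
}

func main() {
	// The consumer name "pull@1" contains an '@' that must not be mistaken
	// for the deliver-subject separator.
	r := "$JS.ACK.ORDERS.pull@1.1.1.1.1700000000.0@deliver.inbox"
	ack, deliver, ok := splitJSAckReply(r)
	fmt.Println(ok)      // true
	fmt.Println(ack)     // $JS.ACK.ORDERS.pull@1.1.1.1.1700000000.0
	fmt.Println(deliver) // deliver.inbox
}
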
func isReservedReply(reply []byte) bool { if isServiceReply(reply) { return true } - rLen := len(reply) // Faster to check with string([:]) than byte-by-byte - if rLen > jsAckPreLen && bytesToString(reply[:jsAckPreLen]) == jsAckPre { + if isJSAckSubject(reply) { return true - } else if rLen > gwReplyPrefixLen && bytesToString(reply[:gwReplyPrefixLen]) == gwReplyPrefix { + } else if len(reply) > gwReplyPrefixLen && bytesToString(reply[:gwReplyPrefixLen]) == gwReplyPrefix { return true } return false @@ -4370,7 +4418,7 @@ func (c *client) processInboundClientMsg(msg []byte) (bool, bool) { // Now deal with gateways if c.srv.gateway.enabled { reply := c.pa.reply - if len(c.pa.deliver) > 0 && c.kind == JETSTREAM && len(c.pa.reply) > 0 { + if len(c.pa.deliver) > 0 && c.kind == JETSTREAM && len(reply) > 0 && !replyHasJSAckSuffix(reply) { reply = append(reply, '@') reply = append(reply, c.pa.deliver...) } @@ -4418,7 +4466,7 @@ func (c *client) handleGWReplyMap(msg []byte) bool { } if c.srv.gateway.enabled { reply := c.pa.reply - if len(c.pa.deliver) > 0 && c.kind == JETSTREAM && len(c.pa.reply) > 0 { + if len(c.pa.deliver) > 0 && c.kind == JETSTREAM && len(reply) > 0 && !replyHasJSAckSuffix(reply) { reply = append(reply, '@') reply = append(reply, c.pa.deliver...) } @@ -4531,7 +4579,8 @@ func (c *client) setHeader(key, value string, msg []byte) []byte { // Write original header if present. if c.pa.hdr > LEN_CR_LF { omi = c.pa.hdr - hdr := removeHeaderIfPresent(msg[:c.pa.hdr-LEN_CR_LF], key) + // Need to copy since we're removing the header in place. + hdr := removeHeaderIfPresent(copyBytes(msg[:c.pa.hdr-LEN_CR_LF]), key) if len(hdr) == 0 { bb.WriteString(hdrLine) } else { @@ -4825,6 +4874,12 @@ func (c *client) processServiceImport(si *serviceImport, acc *Account, msg []byt // but the local server must replace it with the identity of the // authenticated leaf connection instead of trusting forwarded values. ci = c.getClientInfo(share) + if hadPrevSi && cis != nil && cis.Reply != _EMPTY_ { + ci.Reply = cis.Reply + } else if bytes.HasSuffix(c.pa.reply, []byte(FastBatchSuffix)) { + // Fast batch requires knowledge of the original reply subject. + ci.Reply = bytesToString(c.pa.reply) + } if hadPrevSi { ci.Service = acc.Name if !share && (si.share || isSysImport) { @@ -4843,6 +4898,10 @@ func (c *client) processServiceImport(si *serviceImport, acc *Account, msg []byt } } else if c.kind != LEAF || c.pa.hdr < 0 || len(sliceHeader(ClientInfoHdr, msg[:c.pa.hdr])) == 0 { ci = c.getClientInfo(share) + // Fast batch requires knowledge of the original reply subject. + if bytes.HasSuffix(c.pa.reply, []byte(FastBatchSuffix)) { + ci.Reply = bytesToString(c.pa.reply) + } // If we did not share but the imports destination is the system account add in the server and cluster info. if !share && isSysImport { c.addServerAndClusterInfo(ci) @@ -4902,8 +4961,7 @@ func (c *client) processServiceImport(si *serviceImport, acc *Account, msg []byt // We also need to disable the message trace headers so that // if the message is routed, it does not initialize tracing in the // remote. - positions := disableTraceHeaders(c, msg) - defer enableTraceHeaders(msg, positions) + msg = c.setHeader(MsgTraceDest, MsgTraceDestDisabled, msg) } } } @@ -5037,21 +5095,9 @@ func (c *client) processMsgResults(acc *Account, r *SublistResult, msg, deliver, // Check for JetStream encoded reply subjects. // For now these will only be on $JS.ACK prefixed reply subjects. 
var remapped bool - if len(creply) > 0 && c.kind != CLIENT && !isInternalClient(c.kind) && bytes.HasPrefix(creply, []byte(jsAckPre)) { + if len(creply) > 0 && c.kind != CLIENT && !isInternalClient(c.kind) { // We need to rewrite the subject and the reply. - // But, we must be careful that the stream name, consumer name, and subject can contain '@' characters. - // JS ACK contains at least 8 dots, find the first @ after this prefix. - // - $JS.ACK....... - counter := 0 - li := bytes.IndexFunc(creply, func(rn rune) bool { - if rn == '.' { - counter++ - } else if rn == '@' { - return counter >= 8 - } - return false - }) - if li != -1 && li < len(creply)-1 { + if li := jsAckDeliverIdx(creply); li != -1 && li < len(creply)-1 { remapped = true subj, creply = creply[li+1:], creply[:li] } @@ -5471,7 +5517,7 @@ sendToRoutesOrLeafs: // at the end of the reply subject if it exists. But only if this wasn't // already performed, otherwise we'd end up with a duplicate '@' suffix // resulting in a protocol error. - if len(deliver) > 0 && len(reply) > 0 && !remapped { + if len(deliver) > 0 && len(reply) > 0 && !remapped && !replyHasJSAckSuffix(reply) { reply = append(reply, '@') reply = append(reply, deliver...) } @@ -5754,11 +5800,12 @@ func (c *client) clearAuthTimer() bool { return stopped } -// We may reuse atmr for expiring user jwts, -// so check connectReceived. +// Track whether the parser should still enforce pre-CONNECT rules. +// This is handshake state, not timer state, since some handshakes +// use a different timer while still expecting CONNECT. // Lock assume held on entry. func (c *client) awaitingAuth() bool { - return !c.flags.isSet(connectReceived) && c.atmr != nil + return c.flags.isSet(expectConnect) && !c.flags.isSet(connectReceived) } // This will set the atmr for the JWT expiration time. @@ -5987,37 +6034,12 @@ func (c *client) closeConnection(reason ClosedState) { srv = c.srv noReconnect = c.flags.isSet(noReconnect) acc = c.acc - spoke bool ) - - // Snapshot for use if we are a client connection. - // FIXME(dlc) - we can just stub in a new one for client - // and reference existing one. - var subs []*subscription - if kind == CLIENT || kind == LEAF || kind == JETSTREAM { - var _subs [32]*subscription - subs = _subs[:0] - // Do not set c.subs to nil or delete the sub from c.subs here because - // it will be needed in saveClosedClient (which has been started as a - // go routine in markConnAsClosed). Cleanup will be done there. - for _, sub := range c.subs { - // Auto-unsubscribe subscriptions must be unsubscribed forcibly. - sub.max = 0 - sub.close() - subs = append(subs, sub) - } - spoke = c.isSpokeLeafNode() - } - c.mu.Unlock() - // Remove client's or leaf node or jetstream subscriptions. - if acc != nil && (kind == CLIENT || kind == LEAF || kind == JETSTREAM) { - acc.sl.RemoveBatch(subs) - } else if kind == ROUTER { + if kind == ROUTER { c.removeRemoteSubs() } - if srv != nil { // Unregister srv.removeClient(c) @@ -6025,45 +6047,11 @@ func (c *client) closeConnection(reason ClosedState) { if acc != nil { // Update remote subscriptions. if kind == CLIENT || kind == LEAF || kind == JETSTREAM { - qsubs := map[string]*qsub{} - for _, sub := range subs { - // Call unsubscribe here to cleanup shadow subscriptions and such. - c.unsubscribe(acc, sub, true, false) - // Update route as normal for a normal subscriber. 
- if sub.queue == nil { - if !spoke { - srv.updateRouteSubscriptionMap(acc, sub, -1) - if srv.gateway.enabled { - srv.gatewayUpdateSubInterest(acc.Name, sub, -1) - } - } - acc.updateLeafNodes(sub, -1) - } else { - // We handle queue subscribers special in case we - // have a bunch we can just send one update to the - // connected routes. - num := int32(1) - if kind == LEAF { - num = sub.qw - } - key := keyFromSub(sub) - if esub, ok := qsubs[key]; ok { - esub.n += num - } else { - qsubs[key] = &qsub{sub, num} - } - } - } - // Process any qsubs here. - for _, esub := range qsubs { - if !spoke { - srv.updateRouteSubscriptionMap(acc, esub.sub, -(esub.n)) - if srv.gateway.enabled { - srv.gatewayUpdateSubInterest(acc.Name, esub.sub, -(esub.n)) - } - } - acc.updateLeafNodes(esub.sub, -(esub.n)) - } + // Remove client's subscriptions from the account and unregister + // client from that account. Keep c.subs populated because + // saveClosedClient (started as a goroutine in markConnAsClosed) + // still needs to read it. + c.clearAccountSubs(true) } // Always remove from the account, otherwise we can leak clients. // Note that SYSTEM and ACCOUNT types from above cleanup their own subs. @@ -6090,6 +6078,87 @@ func (c *client) closeConnection(reason ClosedState) { c.reconnect() } +// clearAccountSubs removes the client's subscriptions from its current account +// and unregisters it from that account. If close is true, c.subs is left +// populated for saveClosedClient; otherwise c.subs is cleared and c.acc +// registered back to the global account. +// Client lock MUST NOT be held on entry. +func (c *client) clearAccountSubs(close bool) { + c.mu.Lock() + kind := c.kind + srv := c.srv + acc := c.acc + if acc == nil || (kind != CLIENT && kind != LEAF && kind != JETSTREAM) { + c.mu.Unlock() + return + } + var _subs [32]*subscription + subs := _subs[:0] + // Do not set c.subs to nil or delete the sub from c.subs here because + // it will be needed in saveClosedClient (which has been started as a + // go routine in markConnAsClosed). Cleanup will be done there. + for _, sub := range c.subs { + // Auto-unsubscribe subscriptions must be unsubscribed forcibly. + sub.max = 0 + sub.close() + subs = append(subs, sub) + if !close { + delete(c.subs, string(sub.sid)) + } + } + spoke := c.isSpokeLeafNode() + c.mu.Unlock() + + acc.sl.RemoveBatch(subs) + + if srv != nil { + qsubs := map[string]*qsub{} + for _, sub := range subs { + // Call unsubscribe here to cleanup shadow subscriptions and such. + c.unsubscribe(acc, sub, true, false) + // Update route as normal for a normal subscriber. + if sub.queue == nil { + if !spoke { + srv.updateRouteSubscriptionMap(acc, sub, -1) + if srv.gateway.enabled { + srv.gatewayUpdateSubInterest(acc.Name, sub, -1) + } + } + acc.updateLeafNodes(sub, -1) + } else { + // We handle queue subscribers special in case we + // have a bunch we can just send one update to the + // connected routes. + num := int32(1) + if kind == LEAF { + num = sub.qw + } + key := keyFromSub(sub) + if esub, ok := qsubs[key]; ok { + esub.n += num + } else { + qsubs[key] = &qsub{sub, num} + } + } + } + // Process any qsubs here. + for _, esub := range qsubs { + if !spoke { + srv.updateRouteSubscriptionMap(acc, esub.sub, -(esub.n)) + if srv.gateway.enabled { + srv.gatewayUpdateSubInterest(acc.Name, esub.sub, -(esub.n)) + } + } + acc.updateLeafNodes(esub.sub, -(esub.n)) + } + } + + if !close { + // Register back to global account, mimicking the state after client initialization. 
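
The subscription-removal code above sends one interest update per plain subscription but coalesces queue subscriptions by subject and queue group, so connected routes receive a single weighted update per group. A toy version of that grouping; the types and key format here are illustrative only:

package main

import "fmt"

// sub is a toy subscription: subject plus optional queue group and weight.
type sub struct {
	subject, queue string
	weight         int32
}

// aggregateQueueRemovals separates plain subscriptions (individual -1 updates)
// from queue subscriptions, which are summed per subject and queue group.
func aggregateQueueRemovals(subs []sub) (plain []sub, queued map[string]int32) {
	queued = make(map[string]int32)
	for _, s := range subs {
		if s.queue == "" {
			plain = append(plain, s)
			continue
		}
		queued[s.subject+" "+s.queue] += s.weight
	}
	return plain, queued
}

func main() {
	subs := []sub{
		{"orders.created", "", 1},
		{"orders.created", "workers", 1},
		{"orders.created", "workers", 1},
		{"audit.>", "loggers", 4},
	}
	plain, queued := aggregateQueueRemovals(subs)
	fmt.Println(len(plain)) // 1
	fmt.Println(queued)     // map[audit.> loggers:4 orders.created workers:2]
}
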
+ c.registerWithAccount(srv.globalAccount()) + } +} + // Depending on the kind of connections, this may attempt to recreate a connection. // The actual reconnect attempt will be started in a go routine. func (c *client) reconnect() { @@ -6180,7 +6249,7 @@ func (c *client) reconnect() { srv.Debugf("Gateway %q not in configuration, not attempting reconnect", gwName) } } else if leafCfg != nil { - // Check if this is a solicited leaf node. Start up a reconnect. + // This is a solicited leaf node. Start up a reconnect. srv.startGoRoutine(func() { srv.reConnectToRemoteLeafNode(leafCfg) }) } } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/const.go b/vendor/github.com/nats-io/nats-server/v2/server/const.go index 410fc5e601..b173aa2a78 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/const.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/const.go @@ -66,7 +66,7 @@ func init() { const ( // VERSION is the current version for the server. - VERSION = "2.12.6" + VERSION = "2.14.0" // PROTO is the currently supported protocol. // 0 was the original diff --git a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go index 82233d926f..dba9517413 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/consumer.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/consumer.go @@ -125,7 +125,8 @@ type ConsumerConfig struct { MemoryStorage bool `json:"mem_storage,omitempty"` // Don't add to general clients. - Direct bool `json:"direct,omitempty"` + Direct bool `json:"direct,omitempty"` + Sourcing bool `json:"sourcing,omitempty"` // Metadata is additional metadata for the Consumer. Metadata map[string]string `json:"metadata,omitempty"` @@ -336,6 +337,8 @@ const ( AckAll // AckExplicit requires ack or nack for all messages. AckExplicit + // AckFlowControl functions like AckAll, but acks based on responses to flow control. + AckFlowControl ) func (a AckPolicy) String() string { @@ -344,6 +347,8 @@ func (a AckPolicy) String() string { return "none" case AckAll: return "all" + case AckFlowControl: + return "flow_control" default: return "explicit" } @@ -436,17 +441,27 @@ type consumer struct { lss *lastSeqSkipList rlimit *rate.Limiter reqSub *subscription + resetSub *subscription + ackSubOld *subscription + ackReplyOldT string + ackSubjOld string ackSub *subscription ackReplyT string ackSubj string + fcPreOld string + fcSubjOld string + fcPre string + fcSubj string nextMsgSubj string nextMsgReqs *ipQueue[*nextMsgReq] + resetSubj string maxp int pblimit int maxpb int pbytes int fcsz int fcid string + fcSubOld *subscription fcSub *subscription outq *jsOutQ pending map[uint64]*Pending @@ -465,6 +480,7 @@ type consumer struct { store ConsumerStore active bool replay bool + useV2Ack bool dtmr *time.Timer uptmr *time.Timer // Unpause timer gwdtmr *time.Timer @@ -489,6 +505,7 @@ type consumer struct { infoSub *subscription lqsent time.Time prm map[string]struct{} + rsm map[string]bool // Reset requests that need to be responded to on the internal sys account (if true). prOk bool uch chan struct{} retention RetentionPolicy @@ -649,7 +666,7 @@ func setConsumerConfigDefaults(config *ConsumerConfig, streamCfg *StreamConfig, config.InactiveThreshold = streamCfg.ConsumerLimits.InactiveThreshold } // Set proper default for max ack pending if we are ack explicit and none has been set. 
- if (config.AckPolicy == AckExplicit || config.AckPolicy == AckAll) && config.MaxAckPending == 0 { + if config.MaxAckPending == 0 && config.AckPolicy != AckNone { ackPending := JsDefaultMaxAckPending if lim.MaxAckPending > 0 && lim.MaxAckPending < ackPending { ackPending = lim.MaxAckPending @@ -671,6 +688,12 @@ func setConsumerConfigDefaults(config *ConsumerConfig, streamCfg *StreamConfig, if config.PriorityPolicy == PriorityPinnedClient && config.PinnedTTL == 0 { config.PinnedTTL = JsDefaultPinnedTTL } + + // Set default values for flow control policy. + if config.AckPolicy == AckFlowControl && !pedantic { + config.FlowControl = true + config.Heartbeat = sourceHealthHB + } return nil } @@ -684,6 +707,13 @@ func checkConsumerCfg( isRecovering bool, ) *ApiError { + if config.Name != _EMPTY_ && !isValidAssetName(config.Name) { + return NewJSStreamInvalidConfigError(errors.New("consumer name can not contain '.', '*', '>', '\\', '/'")) + } + if config.Durable != _EMPTY_ && !isValidAssetName(config.Durable) { + return NewJSStreamInvalidConfigError(errors.New("consumer durable name can not contain '.', '*', '>', '\\', '/'")) + } + // Check if replicas is defined but exceeds parent stream. if config.Replicas > 0 && config.Replicas > cfg.Replicas { return NewJSConsumerReplicasExceedsStreamError() @@ -720,6 +750,31 @@ func checkConsumerCfg( return NewJSConsumerAckWaitNegativeError() } + // Ack Flow Control policy requires push-based flow-controlled consumer. + if config.AckPolicy == AckFlowControl { + if config.DeliverSubject == _EMPTY_ { + return NewJSConsumerAckFCRequiresPushError() + } + if !config.FlowControl { + return NewJSConsumerAckFCRequiresFCError() + } + // We currently limit using heartbeat of 1s, since those are used for ephemeral sourcing consumers as well. + // We could decide to relax this in the future, but need to be careful to not allow a heartbeat larger + // than the stalled source timeout. + if config.Heartbeat != sourceHealthHB { + return NewJSStreamInvalidConfigError(fmt.Errorf("flow control ack policy heartbeat needs to be 1s")) + } + if config.MaxAckPending <= 0 { + return NewJSConsumerAckFCRequiresMaxAckPendingError() + } + if config.AckWait != 0 || len(config.BackOff) > 0 { + return NewJSConsumerAckFCRequiresNoAckWaitError() + } + if config.MaxDeliver > 0 { + return NewJSConsumerAckFCRequiresNoMaxDeliverError() + } + } + // Check if we have a BackOff defined that MaxDeliver is within range etc. if lbo := len(config.BackOff); lbo > 0 && config.MaxDeliver != -1 && lbo > config.MaxDeliver { return NewJSConsumerMaxDeliverBackoffError() @@ -962,7 +1017,7 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri } mset.mu.RLock() - s, js, jsa, cfg, acc := mset.srv, mset.js, mset.jsa, mset.cfg, mset.acc + s, js, jsa, cfg, acc, lseq := mset.srv, mset.js, mset.jsa, mset.cfg, mset.acc, mset.lseq mset.mu.RUnlock() // If we do not have the consumer currently assigned to us in cluster mode we will proceed but warn. @@ -1058,10 +1113,12 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri return nil, NewJSConsumerDoesNotExistError() } + standalone := !s.JetStreamIsClustered() && s.standAloneMode() + // If we're clustered we've already done this check, only do this if we're a standalone server. // But if we're standalone, only enforce if we're not recovering, since the MaxConsumers could've // been updated while we already had more consumers on disk. 
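
The new AckFlowControl policy above is only valid for a push consumer with flow control enabled, a 1s heartbeat (the sourceHealthHB value named in the error text), a positive MaxAckPending, and no AckWait, BackOff, or MaxDeliver. A compact sketch of just those checks over a toy config struct; the struct and function names are illustrative, not part of the patch:

package main

import (
	"errors"
	"fmt"
	"time"
)

// flowControlConsumerCfg carries only the fields checked for AckFlowControl.
type flowControlConsumerCfg struct {
	DeliverSubject string
	FlowControl    bool
	Heartbeat      time.Duration
	MaxAckPending  int
	AckWait        time.Duration
	BackOff        []time.Duration
	MaxDeliver     int
}

// checkAckFlowControl applies the same constraints as the checks above.
func checkAckFlowControl(c flowControlConsumerCfg) error {
	switch {
	case c.DeliverSubject == "":
		return errors.New("flow control ack policy requires a push consumer")
	case !c.FlowControl:
		return errors.New("flow control ack policy requires flow control")
	case c.Heartbeat != time.Second:
		return errors.New("flow control ack policy heartbeat needs to be 1s")
	case c.MaxAckPending <= 0:
		return errors.New("flow control ack policy requires max ack pending")
	case c.AckWait != 0 || len(c.BackOff) > 0:
		return errors.New("flow control ack policy does not allow ack wait or backoff")
	case c.MaxDeliver > 0:
		return errors.New("flow control ack policy does not allow max deliver")
	}
	return nil
}

func main() {
	cfg := flowControlConsumerCfg{
		DeliverSubject: "deliver.orders",
		FlowControl:    true,
		Heartbeat:      time.Second,
		MaxAckPending:  1000,
	}
	fmt.Println(checkAckFlowControl(cfg)) // <nil>
}
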
- if !s.JetStreamIsClustered() && s.standAloneMode() && !isRecovering { + if standalone && !isRecovering { // Check for any limits, if the config for the consumer sets a limit we check against that // but if not we use the value from account limits, if account limits is more restrictive // than stream config we prefer the account limits to handle cases where account limits are @@ -1070,21 +1127,21 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) { maxc = selectedLimits.MaxConsumers } - if maxc > 0 && mset.numPublicConsumers() >= maxc { + if maxc > 0 && mset.numLimitableConsumers() >= maxc { mset.mu.Unlock() return nil, NewJSMaximumConsumersLimitError() } } // Check on stream type conflicts with WorkQueues. - if cfg.Retention == WorkQueuePolicy && !config.Direct { + if cfg.Retention == WorkQueuePolicy && !config.Direct && !config.Sourcing { // Force explicit acks here. - if config.AckPolicy != AckExplicit { + if config.AckPolicy != AckExplicit && config.AckPolicy != AckFlowControl { mset.mu.Unlock() return nil, NewJSConsumerWQRequiresExplicitAckError() } - if len(mset.consumers) > 0 { + if mset.numLimitableConsumers() > 0 { subjects := gatherSubjectFilters(config.FilterSubject, config.FilterSubjects) if len(subjects) == 0 { mset.mu.Unlock() @@ -1191,7 +1248,7 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri o.nakEventT = JSAdvisoryConsumerMsgNakPre + "." + o.stream + "." + o.name o.deliveryExcEventT = JSAdvisoryConsumerMaxDeliveryExceedPre + "." + o.stream + "." + o.name - if !isValidName(o.name) { + if !isValidAssetName(o.name) { mset.mu.Unlock() o.deleteWithoutAdvisory() return nil, NewJSConsumerBadDurableNameError() @@ -1232,10 +1289,55 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri // Restore our saved state. o.mu.Lock() o.readStoredState() + + replicas := o.cfg.replicas(&mset.cfg) + + // Starting sequence represents the next sequence to be delivered, so decrement it + // since that's the minimum amount the stream should have as its last sequence. + sseq := o.sseq + if sseq > 0 { + sseq-- + } + o.mu.Unlock() - } else { - // Select starting sequence number - o.selectStartingSeqNo() + + // A stream observing data loss rolls back in its sequence. Check if we need to reconcile the consumer state + // to ensure new messages aren't skipped. + // Only performed for non-replicated consumers for now. + if replicas == 1 && lseq < sseq && isRecovering { + s.Warnf("JetStream consumer '%s > %s > %s' delivered sequence %d past last stream sequence of %d", + o.acc.Name, o.stream, o.name, sseq, lseq) + + o.mu.Lock() + o.reconcileStateWithStream(lseq) + + // Save the reconciled state + state := &ConsumerState{ + Delivered: SequencePair{ + Stream: o.sseq - 1, + Consumer: o.dseq - 1, + }, + AckFloor: SequencePair{ + Stream: o.asflr, + Consumer: o.adflr, + }, + Pending: o.pending, + Redelivered: o.rdc, + } + err := o.store.ForceUpdate(state) + o.mu.Unlock() + if err != nil { + s.Errorf("JetStream consumer '%s > %s > %s' errored while updating state: %v", o.acc.Name, o.stream, o.name, err) + mset.mu.Unlock() + return nil, NewJSConsumerStoreFailedError(err) + } + } + } else if config.Direct || standalone { + // Clustered non-direct consumers defer this to setLeader so the + // expensive store scans don't block the meta apply goroutine. 
+ if err := o.selectStartingSeqNo(); err != nil { + return nil, err + } } // Now register with mset and create the ack subscription. @@ -1271,10 +1373,32 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri // Escape '%' in consumer and stream names, as `pre` is used as a template later // in consumer.ackReply(), resulting in erroneous formatting of the ack subject. mn := strings.ReplaceAll(cfg.Name, "%", "%%") - pre := fmt.Sprintf(jsAckT, mn, strings.ReplaceAll(o.name, "%", "%%")) + on := strings.ReplaceAll(o.name, "%", "%%") + domain := strings.ReplaceAll(o.srv.getOpts().JetStreamDomain, "%", "%%") + if domain == _EMPTY_ { + domain = "_" + } + accHash := getHash(accName) + + o.useV2Ack = s.getOpts().getFeatureFlag(FeatureFlagJsAckFormatV2) + + // v1 format: $JS.(ACK|FC)...etc. + o.fcPreOld = jsFlowControlPre + o.fcSubjOld = fmt.Sprintf(jsFlowControl, cfg.Name, o.name) + preOld := fmt.Sprintf(jsAckT, mn, on) + o.ackReplyOldT = fmt.Sprintf("%s.%%d.%%d.%%d.%%d.%%d", preOld) + o.ackSubjOld = fmt.Sprintf("%s.*.*.*.*.*", preOld) + + // v2 format: $JS.(ACK|FC).....etc. + o.fcPre = fmt.Sprintf("%s%s.%s.", jsFlowControlPre, domain, accHash) + o.fcSubj = fmt.Sprintf(jsFlowControlV2, domain, accHash, cfg.Name, o.name) + pre := fmt.Sprintf(jsAckTv2, domain, accHash, mn, on) o.ackReplyT = fmt.Sprintf("%s.%%d.%%d.%%d.%%d.%%d", pre) - o.ackSubj = fmt.Sprintf("%s.*.*.*.*.*", pre) + // Subscribe on this ack subject for v2, we require 11 tokens, but allow for more tokens/extension. + o.ackSubj = fmt.Sprintf("%s.*.*.*.*.>", pre) + o.nextMsgSubj = fmt.Sprintf(JSApiRequestNextT, mn, o.name) + o.resetSubj = fmt.Sprintf(JSApiConsumerResetT, mn, o.name) // Check/update the inactive threshold o.updateInactiveThreshold(&o.cfg) @@ -1304,7 +1428,10 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri mset.setConsumer(o) mset.mu.Unlock() - if config.Direct || (!s.JetStreamIsClustered() && s.standAloneMode()) { + if config.Sourcing && standalone { + o.resetStartingSeq(0, _EMPTY_, false) + } + if config.Direct || standalone { o.setLeader(true) } @@ -1315,7 +1442,7 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri if !s.standAloneMode() && ca == nil { suppress = true } else if ca != nil { - suppress = ca.responded + suppress = ca.hasResponded() } if !suppress { o.sendCreateAdvisory() @@ -1472,18 +1599,39 @@ func (o *consumer) isLeader() bool { return o.leader.Load() } -func (o *consumer) setLeader(isLeader bool) { +func (o *consumer) setLeader(isLeader bool) error { o.mu.RLock() mset, closed := o.mset, o.closed movingToClustered := o.node != nil && o.pch == nil movingToNonClustered := o.node == nil && o.pch != nil wasLeader := o.leader.Swap(isLeader) + + // For clustered new consumers, starting seq selection was deferred from + // addConsumerWithAssignment so the scan wouldn't block the meta apply + // goroutine, run it here on leader-elect instead. + needsSelect := isLeader && !wasLeader && o.dseq == 0 && (o.store == nil || !o.store.HasState()) o.mu.RUnlock() // If we are here we have a change in leader status. 
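
The consumer now serves both reply layouts: the older form without domain and account tokens, and the newer one carrying them, which ackReplyInfo later distinguishes by token count (9 vs 11). A tiny illustration of the two layouts; the sample stream, consumer, account hash, and sequence values below are made up:

package main

import "fmt"

func main() {
	// v1: $JS.ACK.<stream>.<consumer>.<delivered>.<stream seq>.<consumer seq>.<timestamp>.<pending>                      -> 9 tokens
	// v2: $JS.ACK.<domain>.<account hash>.<stream>.<consumer>.<delivered>.<stream seq>.<consumer seq>.<timestamp>.<pending> -> 11 tokens
	v1 := fmt.Sprintf("$JS.ACK.%s.%s.%d.%d.%d.%d.%d",
		"ORDERS", "C1", 1, 22, 11, 1700000000, 0)
	v2 := fmt.Sprintf("$JS.ACK.%s.%s.%s.%s.%d.%d.%d.%d.%d",
		"hub", "S2NDUKS3", "ORDERS", "C1", 1, 22, 11, 1700000000, 0)
	fmt.Println(v1)
	fmt.Println(v2)
}
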
if isLeader { if closed || mset == nil { - return + return nil + } + + if needsSelect { + o.mu.Lock() + if err := o.selectStartingSeqNo(); err != nil { + o.srv.Errorf("JetStream consumer '%s > %s > %s' select starting seq failed: %v", + o.acc.Name, o.stream, o.name, err) + o.leader.Store(false) + node := o.node + o.mu.Unlock() + if node != nil { + _ = node.StepDown() + } + return err + } + o.mu.Unlock() } if wasLeader { @@ -1512,7 +1660,7 @@ func (o *consumer) setLeader(isLeader bool) { } o.mu.Unlock() } - return + return nil } mset.mu.RLock() @@ -1548,10 +1696,14 @@ func (o *consumer) setLeader(isLeader bool) { } var err error - if o.cfg.AckPolicy != AckNone { + if o.cfg.AckPolicy != AckNone && o.cfg.AckPolicy != AckFlowControl { + if o.ackSubOld, err = o.subscribeInternal(o.ackSubjOld, o.pushAck); err != nil { + o.mu.Unlock() + return nil + } if o.ackSub, err = o.subscribeInternal(o.ackSubj, o.pushAck); err != nil { o.mu.Unlock() - return + return nil } } @@ -1559,16 +1711,23 @@ func (o *consumer) setLeader(isLeader bool) { // Will error if wrong mode to provide feedback to users. if o.reqSub, err = o.subscribeInternal(o.nextMsgSubj, o.processNextMsgReq); err != nil { o.mu.Unlock() - return + return nil + } + if o.resetSub, err = o.subscribeInternal(o.resetSubj, o.processResetReq); err != nil { + o.mu.Unlock() + return nil } // Check on flow control settings. if o.cfg.FlowControl { o.setMaxPendingBytes(JsFlowControlMaxPending) - fcsubj := fmt.Sprintf(jsFlowControl, stream, o.name) - if o.fcSub, err = o.subscribeInternal(fcsubj, o.processFlowControl); err != nil { + if o.fcSubOld, err = o.subscribeInternal(o.fcSubjOld, o.processFlowControl); err != nil { o.mu.Unlock() - return + return nil + } + if o.fcSub, err = o.subscribeInternal(o.fcSubj, o.processFlowControl); err != nil { + o.mu.Unlock() + return nil } } @@ -1676,12 +1835,16 @@ func (o *consumer) setLeader(isLeader bool) { o.rdq = nil o.rdqi.Empty() o.pending = nil + o.rsm = nil o.resetPendingDeliveries() // ok if they are nil, we protect inside unsubscribe() + o.unsubscribe(o.ackSubOld) o.unsubscribe(o.ackSub) o.unsubscribe(o.reqSub) + o.unsubscribe(o.resetSub) + o.unsubscribe(o.fcSubOld) o.unsubscribe(o.fcSub) - o.ackSub, o.reqSub, o.fcSub = nil, nil, nil + o.ackSubOld, o.ackSub, o.reqSub, o.resetSub, o.fcSubOld, o.fcSub = nil, nil, nil, nil, nil, nil if o.infoSub != nil { o.srv.sysUnsubscribe(o.infoSub) o.infoSub = nil @@ -1704,6 +1867,7 @@ func (o *consumer) setLeader(isLeader bool) { } o.mu.Unlock() } + return nil } // This is coming on the wire so do not block here. @@ -2074,16 +2238,15 @@ func (o *consumer) deleteNotActive() { if !isDirect && s.JetStreamIsClustered() { js.mu.RLock() var ( - cca consumerAssignment meta RaftNode removeEntry []byte ) ca, cc := js.consumerAssignment(acc, stream, name), js.cluster if ca != nil && cc != nil { meta = cc.meta - cca = *ca + cca := ca.clone() cca.Reply = _EMPTY_ - removeEntry = encodeDeleteConsumerAssignment(&cca) + removeEntry = encodeDeleteConsumerAssignment(cca) meta.ForwardProposal(removeEntry) } js.mu.RUnlock() @@ -2452,6 +2615,9 @@ func (o *consumer) updateConfig(cfg *ConsumerConfig) error { // Allowed but considered no-op, [Description, SampleFrequency, MaxWaiting, HeadersOnly] o.cfg = *cfg + if cfg.Sourcing && (!o.srv.JetStreamIsClustered() && o.srv.standAloneMode()) { + o.resetStartingSeqLocked(0, _EMPTY_, false) + } if updatedFilters { // Cleanup messages that lost interest. 
if o.retention == InterestPolicy { @@ -2557,7 +2723,7 @@ func (o *consumer) processAck(subject, reply string, hdr int, rmsg []byte) { msg = rmsg } - sseq, dseq, dc := ackReplyInfo(subject) + sseq, dseq, dc, _, _ := ackReplyInfo(subject) skipAckReply := sseq == 0 @@ -2617,6 +2783,92 @@ func (o *consumer) updateSkipped(seq uint64) { o.propose(b[:]) } +func (o *consumer) resetStartingSeq(seq uint64, reply string, internal bool) (uint64, bool, error) { + o.mu.Lock() + defer o.mu.Unlock() + return o.resetStartingSeqLocked(seq, reply, internal) +} + +// Lock should be held. +func (o *consumer) resetStartingSeqLocked(seq uint64, reply string, internal bool) (uint64, bool, error) { + // Reset to a specific sequence, or back to the ack floor. + if seq == 0 { + seq = o.asflr + 1 + } else if o.cfg.DeliverPolicy == DeliverAll { + // Always allowed. + goto VALID + } else if o.cfg.DeliverPolicy == DeliverByStartSequence { + // Only allowed if not going below what's configured. + if seq < o.cfg.OptStartSeq { + return 0, false, errors.New("below start seq") + } + goto VALID + } else if o.cfg.DeliverPolicy == DeliverByStartTime && o.mset != nil { + // Only allowed if not going below what's configured. + nseq := o.mset.store.GetSeqFromTime(*o.cfg.OptStartTime) + if seq < nseq { + return 0, false, errors.New("below start time") + } + goto VALID + } else { + return 0, false, errors.New("not allowed") + } + +VALID: + // Must be a minimum of 1. + if seq <= 0 { + seq = 1 + } + // The replicated path requires quorum first before the reset actually takes effect. + if o.node != nil { + if !o.isLeader() { + return 0, false, nil + } + b := make([]byte, 1+8+len(reply)) + b[0] = byte(resetSeqOp) + var le = binary.LittleEndian + le.PutUint64(b[1:], seq) + copy(b[1+8:], reply) + o.propose(b[:]) + if reply != _EMPTY_ { + if o.rsm == nil { + o.rsm = make(map[string]bool, 1) + } + o.rsm[reply] = internal + } + return seq, false, nil + } + o.resetLocalStartingSeq(seq) + if o.store != nil { + o.store.Reset(seq - 1) + // Cleanup messages that lost interest. + if o.retention == InterestPolicy { + if mset := o.mset; mset != nil { + o.mu.Unlock() + ss := mset.state() + o.checkStateForInterestStream(&ss) + o.mu.Lock() + } + } + + // Recalculate pending, and re-trigger message delivery. + o.streamNumPending() + o.signalNewMessages() + return seq, true, nil + } + return seq, false, nil +} + +// Lock should be held. +func (o *consumer) resetLocalStartingSeq(seq uint64) { + o.pending, o.rdc = nil, nil + o.rdq = nil + o.rdqi.Empty() + o.sseq, o.dseq = seq, 1 + o.adflr, o.asflr = o.dseq-1, o.sseq-1 + o.ldt, o.lat = time.Time{}, time.Time{} +} + func (o *consumer) loopAndForwardProposals(qch chan struct{}) { // On exit make sure we nil out pch. defer func() { @@ -2948,11 +3200,17 @@ func (o *consumer) processNak(sseq, dseq, dc uint64, nak []byte) { // to the client, or `false` if there was an error or the ack is replicated (in which // case the reply will be sent later). func (o *consumer) processTerm(sseq, dseq, dc uint64, reason, reply string) bool { + return o.processTermLocked(sseq, dseq, dc, reason, reply, true) +} + +func (o *consumer) processTermLocked(sseq, dseq, dc uint64, reason, reply string, needLock bool) bool { // Treat like an ack to suppress redelivery. 
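
resetStartingSeqLocked above accepts a reset either back to the ack floor (seq == 0) or to an explicit sequence, but only when the deliver policy allows going that low. A compact sketch of just that policy gate, leaving out the replicated (proposal) and store paths; names and the omitted time-based policy are illustrative simplifications:

package main

import (
	"errors"
	"fmt"
)

type deliverPolicy int

const (
	deliverAll deliverPolicy = iota
	deliverByStartSequence
	deliverLast // anything else is rejected, as in the patch
)

// validateResetSeq returns the effective reset sequence: 0 means "back to the
// ack floor", otherwise the request may not go below the configured start.
func validateResetSeq(seq, ackFloor, optStartSeq uint64, pol deliverPolicy) (uint64, error) {
	if seq == 0 {
		return ackFloor + 1, nil
	}
	switch pol {
	case deliverAll:
		// Always allowed.
	case deliverByStartSequence:
		if seq < optStartSeq {
			return 0, errors.New("below start seq")
		}
	default:
		return 0, errors.New("not allowed")
	}
	return seq, nil
}

func main() {
	fmt.Println(validateResetSeq(0, 41, 0, deliverAll))               // 42 <nil>
	fmt.Println(validateResetSeq(10, 41, 25, deliverByStartSequence)) // 0 below start seq
}
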
- ackedInPlace := o.processAckMsg(sseq, dseq, dc, reply, false) + ackedInPlace := o.processAckMsgLocked(sseq, dseq, dc, reply, false, needLock) - o.mu.Lock() - defer o.mu.Unlock() + if needLock { + o.mu.Lock() + defer o.mu.Unlock() + } // Deliver an advisory e := JSConsumerDeliveryTerminatedAdvisory{ @@ -3161,6 +3419,14 @@ func (o *consumer) infoWithSnapAndReply(snap bool, reply string) *ConsumerInfo { return nil } + dseq, sseq := o.dseq, o.sseq + if dseq <= 0 { + dseq = 1 + } + if sseq <= 0 { + sseq = 1 + } + cfg := o.cfg info := &ConsumerInfo{ Stream: o.stream, @@ -3168,8 +3434,8 @@ func (o *consumer) infoWithSnapAndReply(snap bool, reply string) *ConsumerInfo { Created: o.created, Config: &cfg, Delivered: SequenceInfo{ - Consumer: o.dseq - 1, - Stream: o.sseq - 1, + Consumer: dseq - 1, + Stream: sseq - 1, }, AckFloor: SequenceInfo{ Consumer: o.adflr, @@ -3305,20 +3571,36 @@ func (o *consumer) sampleAck(sseq, dseq, dc uint64) { // to the client, or `false` if there was an error or the ack is replicated (in which // case the reply will be sent later). func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample bool) bool { - o.mu.Lock() + return o.processAckMsgLocked(sseq, dseq, dc, reply, doSample, true) +} + +func (o *consumer) processAckMsgLocked(sseq, dseq, dc uint64, reply string, doSample bool, needLock bool) bool { + lock := func() { + if needLock { + o.mu.Lock() + } + } + unlock := func() { + if needLock { + o.mu.Unlock() + } + } + lock() if o.closed { - o.mu.Unlock() + unlock() return false } mset := o.mset if mset == nil || mset.closed.Load() { - o.mu.Unlock() + unlock() return false } // Check if this ack is above the current pointer to our next to deliver. - if sseq >= o.sseq { + // Ignore if it's a flow-controlled consumer, its state could end up further ahead + // since its state is not replicated before delivery. + if sseq >= o.sseq && !o.cfg.FlowControl { // Let's make sure this is valid. // This is only received on the consumer leader, so should never be higher // than the last stream sequence. But could happen if we've just become @@ -3333,14 +3615,16 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample b // Even though another leader must have delivered a message with this sequence, we must not adjust // the current pointer. This could otherwise result in a stuck consumer, where messages below this // sequence can't be redelivered, and we'll have incorrect pending state and ack floors. - o.mu.Unlock() + unlock() return false } // Let the owning stream know if we are interest or workqueue retention based. // If this consumer is clustered (o.node != nil) this will be handled by // processReplicatedAck after the ack has propagated. - ackInPlace := o.node == nil && o.retention != LimitsPolicy + // If we're already holding the lock we can't ack in place, since that will + // violate lock ordering with respect to the stream. + ackInPlace := o.node == nil && o.retention != LimitsPolicy && needLock var sgap, floor uint64 var needSignal bool @@ -3376,10 +3660,10 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample b } delete(o.rdc, sseq) o.removeFromRedeliverQueue(sseq) - case AckAll: + case AckAll, AckFlowControl: // no-op if dseq <= o.adflr || sseq <= o.asflr { - o.mu.Unlock() + unlock() // Return true to let caller respond back to the client. 
return true } @@ -3412,7 +3696,7 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample b } case AckNone: // FIXME(dlc) - This is error but do we care? - o.mu.Unlock() + unlock() return ackInPlace } @@ -3423,7 +3707,7 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample b } // Update underlying store. o.updateAcks(dseq, sseq, reply) - o.mu.Unlock() + unlock() if ackInPlace { if sgap > 1 { @@ -3536,7 +3820,7 @@ func (o *consumer) needAck(sseq uint64, subj string) bool { } switch o.cfg.AckPolicy { - case AckNone, AckAll: + case AckNone, AckAll, AckFlowControl: needAck = sseq > asflr case AckExplicit: if sseq > asflr { @@ -4162,6 +4446,42 @@ func (o *consumer) processNextMsgReq(_ *subscription, c *client, _ *Account, _, o.nextMsgReqs.push(newNextMsgReq(reply, copyBytes(msg))) } +// processResetReq will reset a consumer to a new starting sequence. +func (o *consumer) processResetReq(_ *subscription, c *client, a *Account, _, reply string, rmsg []byte) { + if reply == _EMPTY_ { + return + } + + s := o.srv + var resp = JSApiConsumerResetResponse{ApiResponse: ApiResponse{Type: JSApiConsumerResetResponseType}} + + hdr, msg := c.msgParts(rmsg) + if errorOnRequiredApiLevel(hdr) { + resp.Error = NewJSRequiredApiLevelError() + s.sendInternalAccountMsg(a, reply, s.jsonResponse(&resp)) + return + } + + // An empty message resets back to the ack floor, otherwise a custom sequence can be used. + var req JSApiConsumerResetRequest + if len(msg) > 0 { + if err := json.Unmarshal(msg, &req); err != nil { + resp.Error = NewJSInvalidJSONError(err) + s.sendInternalAccountMsg(a, reply, s.jsonResponse(&resp)) + return + } + } + resetSeq, canRespond, err := o.resetStartingSeq(req.Seq, reply, false) + if err != nil { + resp.Error = NewJSConsumerInvalidResetError(err) + s.sendInternalAccountMsg(a, reply, s.jsonResponse(&resp)) + } else if canRespond { + resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.info()) + resp.ResetSeq = resetSeq + s.sendInternalAccountMsg(a, reply, s.jsonResponse(&resp)) + } +} + func (o *consumer) processNextMsgRequest(reply string, msg []byte) { o.mu.Lock() defer o.mu.Unlock() @@ -4451,7 +4771,7 @@ func (o *consumer) getNextMsg() (*jsPubMsg, uint64, error) { sm, err := o.mset.store.LoadMsg(seq, &pmsg.StoreMsg) if sm == nil || err != nil { pmsg.returnToPool() - pmsg, dc = nil, 0 + pmsg = nil // Adjust back deliver count. o.decDeliveryCount(seq) } @@ -4459,10 +4779,14 @@ func (o *consumer) getNextMsg() (*jsPubMsg, uint64, error) { if err == ErrStoreMsgNotFound || err == errDeletedMsg { // This is a race condition where the message is still in o.pending and // scheduled for redelivery, but it has been removed from the stream. - // o.processTerm is called in a goroutine so could run after we get here. + // o.processTerm is called in a goroutine so would normally run. However, + // if we get here this likely didn't fire, or we are replicated and changed leaders. // That will correct the pending state and delivery/ack floors, so just skip here. pmsg.returnToPool() pmsg = nil + if p, ok := o.pending[seq]; ok { + o.processTermLocked(seq, p.Sequence, dc-1, ackTermUnackedLimitsReason, _EMPTY_, false) + } continue } return pmsg, dc, err @@ -4968,19 +5292,9 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { wrn, wrb = wr.n, wr.b dsubj = wr.reply if o.cfg.PriorityPolicy == PriorityPinnedClient { - // FIXME(jrm): Can we make this prettier? 
- if len(pmsg.hdr) == 0 { - pmsg.hdr = genHeader(pmsg.hdr, JSPullRequestNatsPinId, o.currentPinId) - pmsg.buf = append(pmsg.hdr, pmsg.msg...) - } else { - pmsg.hdr = genHeader(pmsg.hdr, JSPullRequestNatsPinId, o.currentPinId) - bufLen := len(pmsg.hdr) + len(pmsg.msg) - pmsg.buf = make([]byte, bufLen) - pmsg.buf = append(pmsg.hdr, pmsg.msg...) - } - + pmsg.hdr = genHeader(pmsg.hdr, JSPullRequestNatsPinId, o.currentPinId) + pmsg.buf = append(pmsg.hdr, pmsg.msg...) sz = len(pmsg.subj) + len(ackReply) + len(pmsg.hdr) + len(pmsg.msg) - } if done := wr.recycleIfDone(); done && o.node != nil { o.removeClusterPendingRequest(dsubj) @@ -5097,9 +5411,13 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { o.mu.Unlock() case <-hbc: if o.isActive() { - o.mu.RLock() + o.mu.Lock() o.sendIdleHeartbeat(odsubj) - o.mu.RUnlock() + // Send flow control on EOS if it's used for acknowledgements. + if o.cfg.AckPolicy == AckFlowControl && len(o.pending) > 0 && o.fcid == _EMPTY_ { + o.sendFlowControl() + } + o.mu.Unlock() } // Reset our idle heartbeat timer. hb.Reset(hbd) @@ -5121,7 +5439,10 @@ func (o *consumer) sendIdleHeartbeat(subj string) { } func (o *consumer) ackReply(sseq, dseq, dc uint64, ts int64, pending uint64) string { - return fmt.Sprintf(o.ackReplyT, dc, sseq, dseq, ts, pending) + if o.useV2Ack { + return fmt.Sprintf(o.ackReplyT, dc, sseq, dseq, ts, pending) + } + return fmt.Sprintf(o.ackReplyOldT, dc, sseq, dseq, ts, pending) } // Used mostly for testing. Sets max pending bytes for flow control setups. @@ -5275,11 +5596,11 @@ func (o *consumer) deliverMsg(dsubj, ackReply string, pmsg *jsPubMsg, dc uint64, // Update delivered first. o.updateDelivered(dseq, seq, dc, ts) - if ap == AckExplicit || ap == AckAll { - o.trackPending(seq, dseq) - } else if ap == AckNone { + if ap == AckNone { o.adflr = dseq o.asflr = seq + } else { + o.trackPending(seq, dseq) } // Send message. @@ -5327,6 +5648,10 @@ func (o *consumer) needFlowControl(sz int) bool { if o.fcid == _EMPTY_ && o.pbytes > o.maxpb/2 { return true } + // Or, when acking based on flow control, we need to send it if we've hit the max pending limit earlier. + if o.fcid == _EMPTY_ && o.cfg.AckPolicy == AckFlowControl && o.maxp > 0 && len(o.pending) >= o.maxp { + return true + } // If we have an existing outstanding FC, check to see if we need to expand the o.fcsz if o.fcid != _EMPTY_ && (o.pbytes-o.fcsz) >= o.maxpb { o.fcsz += sz @@ -5334,12 +5659,12 @@ func (o *consumer) needFlowControl(sz int) bool { return false } -func (o *consumer) processFlowControl(_ *subscription, c *client, _ *Account, subj, _ string, _ []byte) { +func (o *consumer) processFlowControl(_ *subscription, c *client, _ *Account, subj, _ string, rmsg []byte) { o.mu.Lock() - defer o.mu.Unlock() // Ignore if not the latest we have sent out. if subj != o.fcid { + o.mu.Unlock() return } @@ -5359,12 +5684,35 @@ func (o *consumer) processFlowControl(_ *subscription, c *client, _ *Account, su o.fcid, o.fcsz = _EMPTY_, 0 o.signalNewMessages() + ackFlowControl := o.cfg.AckPolicy == AckFlowControl + o.mu.Unlock() + + if !ackFlowControl { + return + } + hdr, _ := c.msgParts(rmsg) + if len(hdr) > 0 { + ldseq := parseInt64(sliceHeader(JSLastConsumerSeq, hdr)) + lsseq := parseInt64(sliceHeader(JSLastStreamSeq, hdr)) + if lsseq > 0 { + // Delivered sequence is allowed to be zero as a response + // to flow control without any deliveries. + if ldseq <= 0 { + ldseq = 0 + } + o.processAckMsg(uint64(lsseq), uint64(ldseq), 1, _EMPTY_, false) + } + } } // Lock should be held. 
func (o *consumer) fcReply() string { var sb strings.Builder - sb.WriteString(jsFlowControlPre) + if o.useV2Ack { + sb.WriteString(o.fcPre) + } else { + sb.WriteString(o.fcPreOld) + } sb.WriteString(o.stream) sb.WriteByte(btsep) sb.WriteString(o.name) @@ -5542,8 +5890,9 @@ func (o *consumer) checkPending() { defer o.mu.Unlock() mset := o.mset + ttl := int64(o.cfg.AckWait) // On stop, mset and timer will be nil. - if o.closed || mset == nil || o.ptmr == nil { + if o.closed || mset == nil || o.ptmr == nil || ttl == 0 { o.stopAndClearPtmr() return } @@ -5554,7 +5903,6 @@ func (o *consumer) checkPending() { fseq := state.FirstSeq now := time.Now().UnixNano() - ttl := int64(o.cfg.AckWait) next := int64(o.ackWait(0)) // However, if there is backoff, initializes with the largest backoff. // It will be adjusted as needed. @@ -5657,13 +6005,13 @@ func (o *consumer) checkPending() { // SeqFromReply will extract a sequence number from a reply subject. func (o *consumer) seqFromReply(reply string) uint64 { - _, dseq, _ := ackReplyInfo(reply) + _, dseq, _, _, _ := ackReplyInfo(reply) return dseq } // StreamSeqFromReply will extract the stream sequence from the reply subject. func (o *consumer) streamSeqFromReply(reply string) uint64 { - sseq, _, _ := ackReplyInfo(reply) + sseq, _, _, _, _ := ackReplyInfo(reply) return sseq } @@ -5681,11 +6029,14 @@ func parseAckReplyNum(d string) (n int64) { return n } -const expectedNumReplyTokens = 9 +const ( + expectedNumReplyTokensV1 = 9 + expectedNumReplyTokensV2 = 11 +) // Grab encoded information in the reply subject for a delivered message. -func replyInfo(subject string) (sseq, dseq, dc uint64, ts int64, pending uint64) { - tsa := [expectedNumReplyTokens]string{} +func ackReplyInfo(subject string) (sseq, dseq, dc uint64, ts int64, pending uint64) { + tsa := [expectedNumReplyTokensV2]string{} start, tokens := 0, tsa[:0] for i := 0; i < len(subject); i++ { if subject[i] == btsep { @@ -5694,38 +6045,23 @@ func replyInfo(subject string) (sseq, dseq, dc uint64, ts int64, pending uint64) } } tokens = append(tokens, subject[start:]) - if len(tokens) != expectedNumReplyTokens || tokens[0] != "$JS" || tokens[1] != "ACK" { + if (len(tokens) != expectedNumReplyTokensV1 && len(tokens) < expectedNumReplyTokensV2) || tokens[0] != "$JS" || tokens[1] != "ACK" { return 0, 0, 0, 0, 0 } + offset := 2 + if len(tokens) >= expectedNumReplyTokensV2 { + offset = 4 + } // TODO(dlc) - Should we error if we do not match consumer name? - // stream is tokens[2], consumer is 3. - dc = uint64(parseAckReplyNum(tokens[4])) - sseq, dseq = uint64(parseAckReplyNum(tokens[5])), uint64(parseAckReplyNum(tokens[6])) - ts = parseAckReplyNum(tokens[7]) - pending = uint64(parseAckReplyNum(tokens[8])) + // stream is tokens[offset], consumer is offset+1. 
+ dc = uint64(parseAckReplyNum(tokens[offset+2])) + sseq, dseq = uint64(parseAckReplyNum(tokens[offset+3])), uint64(parseAckReplyNum(tokens[offset+4])) + ts = parseAckReplyNum(tokens[offset+5]) + pending = uint64(parseAckReplyNum(tokens[offset+6])) return sseq, dseq, dc, ts, pending } -func ackReplyInfo(subject string) (sseq, dseq, dc uint64) { - tsa := [expectedNumReplyTokens]string{} - start, tokens := 0, tsa[:0] - for i := 0; i < len(subject); i++ { - if subject[i] == btsep { - tokens = append(tokens, subject[start:i]) - start = i + 1 - } - } - tokens = append(tokens, subject[start:]) - if len(tokens) != expectedNumReplyTokens || tokens[0] != "$JS" || tokens[1] != "ACK" { - return 0, 0, 0 - } - dc = uint64(parseAckReplyNum(tokens[4])) - sseq, dseq = uint64(parseAckReplyNum(tokens[5])), uint64(parseAckReplyNum(tokens[6])) - - return sseq, dseq, dc -} - // NextSeq returns the next delivered sequence number for this consumer. func (o *consumer) nextSeq() uint64 { o.mu.RLock() @@ -5746,8 +6082,69 @@ func (o *consumer) hasSkipListPending() bool { return o.lss != nil && len(o.lss.seqs) > 0 } +// reconcileStateWithStream reconciles consumer state when the stream has reverted +// due to data loss (e.g., VM crash). This handles the case where consumer state +// is ahead of the stream's last sequence. +// Lock should be held. +func (o *consumer) reconcileStateWithStream(streamLastSeq uint64) { + // If an ack floor is higher than stream last sequence, + // reset back down but keep the highest known sequences. + if o.asflr > streamLastSeq { + o.asflr = streamLastSeq + // Delivery floor is one below the delivered sequence, + // but if it is zero somehow, ensure we don't underflow. + o.adflr = o.dseq + if o.adflr > 0 { + o.adflr-- + } + o.pending = nil + o.rdc = nil + } + + // Remove pending entries that are beyond the stream's last sequence + if len(o.pending) > 0 { + for seq := range o.pending { + if seq > streamLastSeq { + delete(o.pending, seq) + } + } + } + + // Remove redelivered entries that are beyond the stream's last sequence + if len(o.rdc) > 0 { + for seq := range o.rdc { + if seq > streamLastSeq { + delete(o.rdc, seq) + } + } + } + + // Update starting sequence and delivery sequence based on pending state + if len(o.pending) == 0 { + o.sseq = o.asflr + 1 + o.dseq = o.adflr + 1 + } else { + // Find highest stream sequence in pending + var maxStreamSeq uint64 + var maxConsumerSeq uint64 + + for streamSeq, p := range o.pending { + if streamSeq > maxStreamSeq { + maxStreamSeq = streamSeq + } + if p.Sequence > maxConsumerSeq { + maxConsumerSeq = p.Sequence + } + } + + // Set next sequences based on highest pending + o.sseq = maxStreamSeq + 1 + o.dseq = maxConsumerSeq + 1 + } +} + // Will select the starting sequence. -func (o *consumer) selectStartingSeqNo() { +func (o *consumer) selectStartingSeqNo() error { if o.mset == nil || o.mset.store == nil { o.sseq = 1 } else { @@ -5762,7 +6159,10 @@ func (o *consumer) selectStartingSeqNo() { } else { // If we are partitioned here this will be properly set when we become leader. for _, filter := range o.subjf { - ss := o.mset.store.FilteredState(1, filter.subject) + ss, err := o.mset.store.FilteredState(1, filter.subject) + if err != nil { + return err + } if ss.Last > o.sseq { o.sseq = ss.Last } @@ -5808,7 +6208,10 @@ func (o *consumer) selectStartingSeqNo() { nseq := state.LastSeq for _, filter := range o.subjf { // Use first sequence since this is more optimized atm. 
- ss := o.mset.store.FilteredState(state.FirstSeq, filter.subject) + ss, err := o.mset.store.FilteredState(state.FirstSeq, filter.subject) + if err != nil { + return err + } if ss.First >= o.sseq && ss.First < nseq { nseq = ss.First } @@ -5850,8 +6253,11 @@ func (o *consumer) selectStartingSeqNo() { // Set our starting sequence state. // But only if we're not clustered, if clustered we propose upon becoming leader. if o.store != nil && o.sseq > 0 && o.cfg.replicas(&o.mset.cfg) == 1 { - o.store.SetStarting(o.sseq - 1) + if err := o.store.SetStarting(o.sseq - 1); err != nil { + return err + } } + return nil } // Test whether a config represents a durable subscriber. @@ -5884,6 +6290,41 @@ func createConsumerName() string { return getHash(nuid.Next()) } +// Lock should be held. +func (mset *stream) createStableConsumerHash() string { + id := fmt.Sprintf("%s %s", mset.cfg.Name, mset.acc.Name) + if domain := mset.srv.getOpts().JetStreamDomain; domain != _EMPTY_ { + id = fmt.Sprintf("%s %s", id, domain) + } + return getHash(id) +} + +// Lock should be held. +func (mset *stream) createSourcingConsumerHash(ssi *StreamSource, sources []*StreamSource) string { + id := mset.createStableConsumerHash() + + // If the stream sources contain the same stream at least twice, we use a more strict hash of + // an ID that also contains filter subjects etc. If the stream name is only used once, we can + // support the stable identifier above. + var once bool + for _, src := range sources { + if src.Name == ssi.Name { + if once { + if ssi.iname == _EMPTY_ { + ssi.setIndexName() + } + // Append identifying information of the filter subjects, etc. to make it unique + id = fmt.Sprintf("%s %s", id, ssi.iname) + break + } else { + once = true + } + } + } + + return getHash(id) +} + // deleteConsumer will delete the consumer from this stream. func (mset *stream) deleteConsumer(o *consumer) error { return o.delete() @@ -6108,11 +6549,17 @@ func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error { mset := o.mset o.mset = nil o.active = false + o.unsubscribe(o.ackSubOld) o.unsubscribe(o.ackSub) o.unsubscribe(o.reqSub) + o.unsubscribe(o.resetSub) + o.unsubscribe(o.fcSubOld) o.unsubscribe(o.fcSub) + o.ackSubOld = nil o.ackSub = nil o.reqSub = nil + o.resetSub = nil + o.fcSubOld = nil o.fcSub = nil if o.infoSub != nil { o.srv.sysUnsubscribe(o.infoSub) @@ -6577,6 +7024,12 @@ func (o *consumer) checkStateForInterestStream(ss *StreamState) error { } func (o *consumer) resetPtmr(delay time.Duration) { + // A delay of zero means it should be stopped. + if delay == 0 { + o.stopAndClearPtmr() + return + } + if o.ptmr == nil { o.ptmr = time.AfterFunc(delay, o.checkPending) } else { @@ -6586,6 +7039,10 @@ func (o *consumer) resetPtmr(delay time.Duration) { } func (o *consumer) stopAndClearPtmr() { + // If the end time is unset, short-circuit since the timer will already be stopped. + if o.ptmrEnd.IsZero() { + return + } stopAndClearTimer(&o.ptmr) o.ptmrEnd = time.Time{} } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/cron.go b/vendor/github.com/nats-io/nats-server/v2/server/cron.go new file mode 100644 index 0000000000..558e2a7904 --- /dev/null +++ b/vendor/github.com/nats-io/nats-server/v2/server/cron.go @@ -0,0 +1,327 @@ +// Copyright 2025 The NATS Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Based on code from https://github.com/robfig/cron +// Copyright (C) 2012 Rob Figueiredo +// All Rights Reserved. +// +// MIT LICENSE +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software is furnished to do so, +// subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +package server + +import ( + "errors" + "fmt" + "math" + "strconv" + "strings" + "time" +) + +// parseCron parses the given cron pattern and returns the next time it will fire based on the provided ts. +func parseCron(pattern string, loc *time.Location, ts int64) (time.Time, error) { + fields := strings.Fields(pattern) + if len(fields) != 6 { + return time.Time{}, fmt.Errorf("pattern requires 6 fields, got %d", len(fields)) + } + + // If no time zone is passed, default to UTC. + if loc == nil { + loc = time.UTC + } + + // Parse each field. + var err error + var second, minute, hour, dayOfMonth, month, dayOfWeek uint64 + if second, err = getField(fields[0], seconds); err != nil { + return time.Time{}, err + } + if minute, err = getField(fields[1], minutes); err != nil { + return time.Time{}, err + } + if hour, err = getField(fields[2], hours); err != nil { + return time.Time{}, err + } + if dayOfMonth, err = getField(fields[3], dom); err != nil { + return time.Time{}, err + } + if month, err = getField(fields[4], months); err != nil { + return time.Time{}, err + } + if dayOfWeek, err = getField(fields[5], dow); err != nil { + return time.Time{}, err + } + + // General approach + // + // For Month, Day, Hour, Minute, Second: + // Check if the time value matches. If yes, continue to the next field. + // If the field doesn't match the schedule, then increment the field until it matches. + // While incrementing the field, a wrap-around brings it back to the beginning + // of the field list (since it is necessary to re-verify previous field values) + next := time.Unix(0, ts).In(loc) + + // Start at the earliest possible time (the upcoming second). + next = next.Truncate(time.Second).Add(time.Second) + + // This flag indicates whether a field has been truncated at one point. + truncated := false + + // If no time is found within five years, return error. 
+ yearLimit := next.Year() + 5 + +WRAP: + if next.Year() > yearLimit { + return time.Time{}, errors.New("pattern exceeds maximum range") + } + for 1< 1 { + extra = 0 + } + default: + return 0, fmt.Errorf("too many slashes: %s", expr) + } + + if start < r.min { + return 0, fmt.Errorf("beginning of range (%d) below minimum (%d): %s", start, r.min, expr) + } + if end > r.max { + return 0, fmt.Errorf("end of range (%d) above maximum (%d): %s", end, r.max, expr) + } + if start > end { + return 0, fmt.Errorf("beginning of range (%d) beyond end of range (%d): %s", start, end, expr) + } + if step == 0 { + return 0, fmt.Errorf("step of range should be a positive number: %s", expr) + } + return getBits(start, end, step) | extra, nil +} + +// parseIntOrName returns the (possibly-named) integer contained in expr. +func parseIntOrName(expr string, names map[string]uint) (uint, error) { + if names != nil { + if namedInt, ok := names[strings.ToLower(expr)]; ok { + return namedInt, nil + } + } + return mustParseInt(expr) +} + +// mustParseInt parses the given expression as an int or returns an error. +func mustParseInt(expr string) (uint, error) { + num, err := strconv.Atoi(expr) + if err != nil { + return 0, fmt.Errorf("failed to parse int from %s: %s", expr, err) + } + if num < 0 { + return 0, fmt.Errorf("negative number (%d) not allowed: %s", num, expr) + } + return uint(num), nil +} + +// getBits sets all bits in the range [min, max], modulo the given step size. +func getBits(min, max, step uint) uint64 { + var bits uint64 + + // If step is 1, use shifts. + if step == 1 { + return ^(math.MaxUint64 << (max + 1)) & (math.MaxUint64 << min) + } + + // Else, use a simple loop. + for i := min; i <= max; i += step { + bits |= 1 << i + } + return bits +} + +// bounds provides a range of acceptable values (plus a map of name to value). +type bounds struct { + min, max uint + names map[string]uint +} + +// The bounds for each field. +var ( + seconds = bounds{0, 59, nil} + minutes = bounds{0, 59, nil} + hours = bounds{0, 23, nil} + dom = bounds{1, 31, nil} + months = bounds{1, 12, map[string]uint{ + "jan": 1, + "feb": 2, + "mar": 3, + "apr": 4, + "may": 5, + "jun": 6, + "jul": 7, + "aug": 8, + "sep": 9, + "oct": 10, + "nov": 11, + "dec": 12, + }} + dow = bounds{0, 6, map[string]uint{ + "sun": 0, + "mon": 1, + "tue": 2, + "wed": 3, + "thu": 4, + "fri": 5, + "sat": 6, + }} +) + +const ( + // Set the top bit if a star was included in the expression. + starBit = 1 << 63 +) + +// dayMatches returns true if the schedule's day-of-week and day-of-month +// restrictions are satisfied by the given time. 
+func dayMatches(dayOfMonth, dayOfWeek uint64, t time.Time) bool {
+ var (
+ domMatch = 1<<uint(t.Day())&dayOfMonth > 0
+ dowMatch = 1<<uint(t.Weekday())&dayOfWeek > 0
+ )
+ if dayOfMonth&starBit > 0 || dayOfWeek&starBit > 0 {
+ return domMatch && dowMatch
+ }
+ return domMatch || dowMatch
+}
diff --git a/vendor/github.com/nats-io/nats-server/v2/server/errors.json b/vendor/github.com/nats-io/nats-server/v2/server/errors.json
index 97c21c7eef..49c45a705e 100644
--- a/vendor/github.com/nats-io/nats-server/v2/server/errors.json
+++ b/vendor/github.com/nats-io/nats-server/v2/server/errors.json
@@ -2008,5 +2008,215 @@
 "help": "",
 "url": "",
 "deprecates": ""
+ },
+ {
+ "constant": "JSMessageSchedulesSourceInvalidErr",
+ "code": 400,
+ "error_code": 10203,
+ "description": "message schedules source is invalid",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSConsumerInvalidResetErr",
+ "code": 400,
+ "error_code": 10204,
+ "description": "invalid reset: {err}",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSBatchPublishDisabledErr",
+ "code": 400,
+ "error_code": 10205,
+ "description": "batch publish is disabled",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSBatchPublishInvalidPatternErr",
+ "code": 400,
+ "error_code": 10206,
+ "description": "batch publish pattern is invalid",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSBatchPublishInvalidBatchIDErr",
+ "code": 400,
+ "error_code": 10207,
+ "description": "batch publish ID is invalid",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSBatchPublishUnknownBatchIDErr",
+ "code": 400,
+ "error_code": 10208,
+ "description": "batch publish ID unknown",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSMirrorWithBatchPublishErr",
+ "code": 400,
+ "error_code": 10209,
+ "description": "stream mirrors can not also use batch publishing",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSAtomicPublishTooManyInflight",
+ "code": 429,
+ "error_code": 10210,
+ "description": "atomic publish too many inflight",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSBatchPublishTooManyInflight",
+ "code": 429,
+ "error_code": 10211,
+ "description": "batch publish too many inflight",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSMessageSchedulesSchedulerInvalidErr",
+ "code": 400,
+ "error_code": 10212,
+ "description": "message schedules invalid scheduler",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSMirrorDurableConsumerCfgInvalid",
+ "code": 400,
+ "error_code": 10213,
+ "description": "stream mirror consumer config is invalid",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSMirrorConsumerRequiresAckFCErr",
+ "code": 400,
+ "error_code": 10214,
+ "description": "stream mirror consumer requires flow control ack policy",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSSourceDurableConsumerCfgInvalid",
+ "code": 400,
+ "error_code": 10215,
+ "description": "stream source consumer config is invalid",
+ "comment": "",
+ "help": "",
+ "url": "",
+ "deprecates": ""
+ },
+ {
+ "constant": "JSSourceDurableConsumerDuplicateDetected",
+ "code": 400,
+ "error_code": 10216,
+ "description": "duplicate stream source consumer detected",
+ 
"comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSSourceConsumerRequiresAckFCErr", + "code": 400, + "error_code": 10217, + "description": "stream source consumer requires flow control ack policy", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerAckFCRequiresPushErr", + "code": 400, + "error_code": 10218, + "description": "flow control ack policy requires a push based consumer", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerAckFCRequiresFCErr", + "code": 400, + "error_code": 10219, + "description": "flow control ack policy requires flow control", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerAckFCRequiresMaxAckPendingErr", + "code": 400, + "error_code": 10220, + "description": "flow control ack policy requires max ack pending", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerAckFCRequiresNoAckWaitErr", + "code": 400, + "error_code": 10221, + "description": "flow control ack policy requires unset ack wait", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSConsumerAckFCRequiresNoMaxDeliverErr", + "code": 400, + "error_code": 10222, + "description": "flow control ack policy requires unset max deliver", + "comment": "", + "help": "", + "url": "", + "deprecates": "" + }, + { + "constant": "JSMessageSchedulesTimeZoneInvalidErr", + "code": 400, + "error_code": 10223, + "description": "message schedules time zone is invalid", + "comment": "", + "help": "", + "url": "", + "deprecates": "" } -] +] \ No newline at end of file diff --git a/vendor/github.com/nats-io/nats-server/v2/server/events.go b/vendor/github.com/nats-io/nats-server/v2/server/events.go index bfe190cd00..8bc9bcd511 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/events.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/events.go @@ -247,14 +247,15 @@ type ServerCapability uint64 // ServerInfo identifies remote servers. type ServerInfo struct { - Name string `json:"name"` - Host string `json:"host"` - ID string `json:"id"` - Cluster string `json:"cluster,omitempty"` - Domain string `json:"domain,omitempty"` - Version string `json:"ver"` - Tags []string `json:"tags,omitempty"` - Metadata map[string]string `json:"metadata,omitempty"` + Name string `json:"name"` + Host string `json:"host"` + ID string `json:"id"` + Cluster string `json:"cluster,omitempty"` + Domain string `json:"domain,omitempty"` + Version string `json:"ver"` + Tags []string `json:"tags,omitempty"` + Metadata map[string]string `json:"metadata,omitempty"` + FeatureFlags map[string]bool `json:"feature_flags,omitempty"` // Whether JetStream is enabled (deprecated in favor of the `ServerCapability`). JetStream bool `json:"jetstream"` // Generic capability flags @@ -328,6 +329,7 @@ type ClientInfo struct { ClientType string `json:"client_type,omitempty"` MQTTClient string `json:"client_id,omitempty"` // This is the MQTT client ID Nonce string `json:"nonce,omitempty"` + Reply string `json:"reply,omitempty"` // Original reply subject after a service import (only when needed). } // forAssignmentSnap returns the minimum amount of ClientInfo we need for assignment snapshots. @@ -518,7 +520,7 @@ RESET: // Grab tags and metadata. 
opts := s.getOpts() - tags, metadata := opts.Tags, opts.Metadata + tags, metadata, featureFlags := opts.Tags, opts.Metadata, opts.getMergedFeatureFlags() for s.eventsRunning() { select { @@ -536,6 +538,7 @@ RESET: si.Time = time.Now().UTC() si.Tags = tags si.Metadata = metadata + si.FeatureFlags = featureFlags si.Flags = 0 if js { // New capability based flags. @@ -1052,8 +1055,15 @@ func (s *Server) sendStatsz(subj string) { Size: mg.ClusterSize(), } } - if ipq := s.jsAPIRoutedReqs; ipq != nil && jStat.Meta != nil { - jStat.Meta.Pending = ipq.len() + if jStat.Meta != nil { + if ipq := s.jsAPIRoutedReqs; ipq != nil { + jStat.Meta.PendingRequests = ipq.len() + } + if ipq := s.jsAPIRoutedInfoReqs; ipq != nil { + jStat.Meta.PendingInfos = ipq.len() + } + jStat.Meta.Pending = jStat.Meta.PendingRequests + jStat.Meta.PendingInfos + jStat.Meta.Snapshot = s.metaClusterSnapshotStats(js, mg) } } jStat.Limits = &s.getOpts().JetStreamLimits diff --git a/vendor/github.com/nats-io/nats-server/v2/server/feature_flags.go b/vendor/github.com/nats-io/nats-server/v2/server/feature_flags.go new file mode 100644 index 0000000000..a0744adc5f --- /dev/null +++ b/vendor/github.com/nats-io/nats-server/v2/server/feature_flags.go @@ -0,0 +1,130 @@ +// Copyright 2026 The NATS Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package server + +import ( + "maps" + "slices" + "strings" +) + +const ( + FeatureFlagJsAckFormatV2 = "js_ack_fc_v2" + FeatureFlagJsRaftDeleteRange = "js_raft_delete_range" +) + +var featureFlags = map[string]bool{ + // Use v2 format for `$JS.ACK.>` and `$JS.FC.>`. + // - Introduced: 2.14.0, both v1 and v2 supported, only using v1. + // - Enabled: TBD, both supported, v2 becomes the default. + // + // - v1: $JS.ACK....... + // - v2: $JS.ACK......... + // See also: https://github.com/nats-io/nats-architecture-and-design/blob/main/adr/ADR-15.md#jsack + FeatureFlagJsAckFormatV2: false, + + // Propose delete range gaps as a single `deleteRangeOp` Raft append entry + // instead of one entry per deleted sequence. Dramatically reduces Raft cost + // on mirrors whose origin has a large number of interior deletes. + // - Introduced: 2.14.0, apply-side always supports receiving `deleteRangeOp`. + // - Enabled: TBD, once all supported versions carry the apply-side. + // + // WARNING: Only enable once every peer in the cluster is on a version that + // supports receiving `deleteRangeOp`. Older peers panic on apply of an + // unknown stream entry operation. + FeatureFlagJsRaftDeleteRange: false, +} + +// getFeatureFlag is used to retrieve either the default or overwritten value for a feature flag. +// The user's value takes precedence over the system's default. However, if the flag doesn't exist, it's disabled. +// The *Options returned by Server.getOpts() is treated as immutable, mutations go through setOpts, +// so no lock is required on the map read here. +func (o *Options) getFeatureFlag(k string) bool { + defaultValue, ok := featureFlags[k] + if !ok { + return false // Not supported. 
+ } + if userValue, ok := o.FeatureFlags[k]; ok { + return userValue + } + return defaultValue +} + +// getMergedFeatureFlags returns a merged map of feature flags, with the user's values taking precedence. +func (o *Options) getMergedFeatureFlags() map[string]bool { + merged := make(map[string]bool) + for k, v := range featureFlags { + merged[k] = v + } + for k, v := range o.FeatureFlags { + if _, ok := featureFlags[k]; !ok { + continue + } + merged[k] = v + } + return merged +} + +// printFeatureFlags logs the currently used feature flags on server startup. +func (s *Server) printFeatureFlags(o *Options) { + if len(o.FeatureFlags) == 0 { + return + } + keys := slices.Sorted(maps.Keys(o.FeatureFlags)) + + var ( + configured strings.Builder + unsupported strings.Builder + ) + + for _, k := range keys { + // Unsupported + defaultValue, ok := featureFlags[k] + if !ok { + if unsupported.Len() > 0 { + unsupported.WriteString(", ") + } + unsupported.WriteString(k) + continue + } + + v := o.FeatureFlags[k] + if configured.Len() > 0 { + configured.WriteString(", ") + } + configured.WriteString(k) + configured.WriteString(" (") + if defaultValue { + if v { + configured.WriteString("enabled") + } else { + configured.WriteString("opt-out") + } + } else if v { + configured.WriteString("opt-in") + } else { + configured.WriteString("disabled") + } + configured.WriteString(")") + } + if configured.Len() == 0 { + configured.WriteString("none") + } + + s.Noticef(" Feature flags:") + s.Noticef(" Configured: %s", configured.String()) + if unsupported.Len() > 0 { + s.Noticef(" Unsupported: %s", unsupported.String()) + } +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go index f68e5d7613..c6108ad68b 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/filestore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/filestore.go @@ -33,6 +33,7 @@ import ( "os" "path/filepath" "runtime" + "runtime/debug" "slices" "sort" "strings" @@ -40,6 +41,7 @@ import ( "sync/atomic" "time" + "github.com/antithesishq/antithesis-sdk-go/assert" "github.com/klauspost/compress/s2" "github.com/minio/highwayhash" "github.com/nats-io/nats-server/v2/server/ats" @@ -192,12 +194,15 @@ type fileStore struct { bim map[uint32]*msgBlock psim *stree.SubjectTree[psi] tsl int - adml int + wfsmu sync.Mutex // Only one writeFullState at a time to protect from overwrites. + wfsrun atomic.Int64 // Is writeFullState already running? For timer check only + wfsadml int // writeFullState average dmap length, protected by wfsmu. hh *highwayhash.Digest64 qch chan struct{} fsld chan struct{} cmu sync.RWMutex cfs []ConsumerStore + werr error sips int dirty int closing bool @@ -295,6 +300,8 @@ const ( msgDir = "msgs" // This is where we temporarily move the messages dir. purgeDir = "__msgs__" + // This is where we temporarily move the new message block during purge. + newMsgDir = "__new_msgs__" // used to scan blk file names. blkScan = "%d.blk" // suffix of a block file @@ -313,9 +320,6 @@ const ( consumerState = "o.dat" // The suffix that will be given to a new temporary block for compression or when rewriting the full file. blkTmpSuffix = ".tmp" - // This is where we keep state on templates. - // Deprecated: stream templates are deprecated and will be removed in a future version. 
- tmplsDir = "templates" // default cache buffer expiration defaultCacheBufferExpiration = 10 * time.Second // default sync interval @@ -566,7 +570,10 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim // Check if we have any left over tombstones to process. if len(fs.tombs) > 0 { for _, seq := range fs.tombs { - fs.removeMsg(seq, false, true, false) + _, err = fs.removeMsg(seq, false, true, false) + if err != nil && err != ErrStoreEOF && err != ErrStoreMsgNotFound { + return nil, err + } fs.removeFromLostData(seq) } // Not needed after this phase. @@ -574,13 +581,16 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim } // Limits checks and enforcement. - fs.enforceMsgLimit() - fs.enforceBytesLimit() + if err = fs.enforceMsgLimit(); err != nil { + return nil, err + } + if err = fs.enforceBytesLimit(); err != nil { + return nil, err + } // Do age checks too, make sure to call in place. if fs.cfg.MaxAge != 0 { - err := fs.expireMsgsOnRecover() - if isPermissionError(err) { + if err = fs.expireMsgsOnRecover(); err != nil { return nil, err } fs.startAgeChk() @@ -588,7 +598,9 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim // If we have max msgs per subject make sure the is also enforced. if fs.cfg.MaxMsgsPer > 0 { - fs.enforceMsgPerSubjectLimit(false) + if err = fs.enforceMsgPerSubjectLimit(false); err != nil { + return nil, err + } } // Grab first sequence for check below while we have lock. @@ -690,20 +702,32 @@ func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error { // Create or delete the THW if needed. if cfg.AllowMsgTTL && fs.ttls == nil { - fs.recoverTTLState() + if err := fs.recoverTTLState(); err != nil { + fs.mu.Unlock() + return err + } } else if !cfg.AllowMsgTTL && fs.ttls != nil { fs.ttls = nil } // Create or delete the message scheduling state if needed. if cfg.AllowMsgSchedules && fs.scheduling == nil { - fs.recoverMsgSchedulingState() + if err := fs.recoverMsgSchedulingState(); err != nil { + fs.mu.Unlock() + return err + } } else if !cfg.AllowMsgSchedules && fs.scheduling != nil { fs.scheduling = nil } // Limits checks and enforcement. - fs.enforceMsgLimit() - fs.enforceBytesLimit() + if err := fs.enforceMsgLimit(); err != nil { + fs.mu.Unlock() + return err + } + if err := fs.enforceBytesLimit(); err != nil { + fs.mu.Unlock() + return err + } // Do age timers. if fs.ageChk == nil && fs.cfg.MaxAge != 0 { @@ -716,7 +740,10 @@ func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error { } if fs.cfg.MaxMsgsPer > 0 && (old_cfg.MaxMsgsPer == 0 || fs.cfg.MaxMsgsPer < old_cfg.MaxMsgsPer) { - fs.enforceMsgPerSubjectLimit(true) + if err := fs.enforceMsgPerSubjectLimit(true); err != nil { + fs.mu.Unlock() + return err + } } if lmb := fs.lmb; lmb != nil { @@ -742,8 +769,12 @@ func (fs *fileStore) UpdateConfig(cfg *StreamConfig) error { close(lmb.qch) lmb.qch = nil } - lmb.flushPendingMsgsLocked() + _, err := lmb.flushPendingMsgsLocked() lmb.mu.Unlock() + if err != nil { + fs.mu.Unlock() + return err + } } // Set flush in place to AsyncFlush which by default is false. fs.fip = !fs.fcfg.AsyncFlush @@ -1161,7 +1192,9 @@ func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { } // Make sure encryption loaded if needed. - fs.loadEncryptionForMsgBlock(mb) + if err = fs.loadEncryptionForMsgBlock(mb); err != nil { + return nil, err + } // Grab last checksum from main block file. 
var lchk [8]byte @@ -1175,12 +1208,14 @@ func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { } // We can recycle it now. recycleMsgBlockBuf(buf) - } else { - file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) + } else if _, err = file.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize); err != nil { + return nil, err } } - file.Close() + if err = file.Close(); err != nil { + return nil, err + } // Read our index file. Use this as source of truth if possible. // This not applicable in >= 2.10 servers. Here for upgrade paths from < 2.10. @@ -1189,7 +1224,9 @@ func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { // Note this only checks that the message blk file is not newer then this file, or is empty and we expect empty. if (mb.rbytes == 0 && mb.msgs == 0) || bytes.Equal(lchk[:], mb.lchk[:]) { if mb.msgs > 0 && !mb.noTrack && fs.psim != nil { - fs.populateGlobalPerSubjectInfo(mb) + if err = fs.populateGlobalPerSubjectInfo(mb); err != nil { + return nil, err + } // Try to dump any state we needed on recovery. mb.tryForceExpireCacheLocked() } @@ -1199,8 +1236,10 @@ func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { } // If we get data loss rebuilding the message block state record that with the fs itself. - ld, tombs, _ := mb.rebuildState() - if ld != nil { + ld, tombs, err := mb.rebuildState() + if err != nil { + return nil, err + } else if ld != nil { fs.addLostData(ld) } // Collect all tombstones. @@ -1209,12 +1248,16 @@ func (fs *fileStore) recoverMsgBlock(index uint32) (*msgBlock, error) { } if mb.msgs > 0 && !mb.noTrack && fs.psim != nil { - fs.populateGlobalPerSubjectInfo(mb) + if err = fs.populateGlobalPerSubjectInfo(mb); err != nil { + return nil, err + } // Try to dump any state we needed on recovery. mb.tryForceExpireCacheLocked() } - mb.closeFDs() + if err = mb.closeFDs(); err != nil { + return nil, err + } fs.addMsgBlock(mb) return mb, nil @@ -1458,6 +1501,10 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { defer recycleMsgBlockBuf(buf) if err != nil || len(buf) == 0 { + // Only allow continuing to mark lost data if the file itself doesn't exist, or was empty. + if err != nil && err != errNoBlkData { + return nil, nil, err + } var ld *LostStreamData // No data to rebuild from here. if mb.msgs > 0 { @@ -1474,7 +1521,7 @@ func (mb *msgBlock) rebuildStateLocked() (*LostStreamData, []uint64, error) { mb.dmap.Empty() atomic.StoreUint64(&mb.first.seq, atomic.LoadUint64(&mb.last.seq)+1) } - return ld, nil, err + return ld, nil, nil } // Check if we need to decrypt. @@ -1508,15 +1555,37 @@ func (mb *msgBlock) rebuildStateFromBufLocked(buf []byte, allowTruncate bool) (* mb.dmap.Insert(seq) } - var le = binary.LittleEndian + // For tombstones that we find and collect. + var ( + tombstones []uint64 + maxTombstoneSeq uint64 + maxTombstoneTs int64 + ) - truncate := func(index uint32) { - // There are cases where we're not allowed to truncate, like for an encrypted or compressed - // block since the index will be the decrypted and decompressed index. - if !allowTruncate { - return + defer func() { + // For empty msg blocks make sure we recover last seq correctly based off of first. + // Or if we seem to have no messages but had a tombstone, which we use to remember + // sequences and timestamps now, use that to properly setup the first and last. 
+ if mb.msgs == 0 { + fseq := atomic.LoadUint64(&mb.first.seq) + if fseq > 0 { + atomic.StoreUint64(&mb.last.seq, fseq-1) + } else if fseq == 0 && maxTombstoneSeq > 0 { + atomic.StoreUint64(&mb.first.seq, maxTombstoneSeq+1) + mb.first.ts = 0 + if mb.last.seq == 0 { + atomic.StoreUint64(&mb.last.seq, maxTombstoneSeq) + mb.last.ts = maxTombstoneTs + } + } } + }() + + var le = binary.LittleEndian + // There are cases where we're not allowed to truncate, like for an encrypted or compressed + // block since the index will be the decrypted and decompressed index. + truncate := func(index uint32) error { var fd *os.File if mb.mfd != nil { fd = mb.mfd @@ -1529,17 +1598,24 @@ func (mb *msgBlock) rebuildStateFromBufLocked(buf []byte, allowTruncate bool) (* } } if fd == nil { - return + return nil + } + if err := fd.Truncate(int64(index)); err != nil { + return err } - if err := fd.Truncate(int64(index)); err == nil { - // Update our checksum. - if index >= 8 { - var lchk [8]byte - fd.ReadAt(lchk[:], int64(index-8)) - copy(mb.lchk[0:], lchk[:]) + + // Update our checksum. + if index >= 8 { + var lchk [8]byte + if _, err = fd.ReadAt(lchk[:], int64(index-8)); err != nil { + return err } - fd.Sync() + copy(mb.lchk[0:], lchk[:]) } + if err = fd.Sync(); err != nil { + return err + } + return nil } gatherLost := func(lb uint32) *LostStreamData { @@ -1551,13 +1627,6 @@ func (mb *msgBlock) rebuildStateFromBufLocked(buf []byte, allowTruncate bool) (* return &ld } - // For tombstones that we find and collect. - var ( - tombstones []uint64 - maxTombstoneSeq uint64 - maxTombstoneTs int64 - ) - // To detect gaps from compaction, and to ensure the sequence keeps moving up. var last uint64 var hb [highwayhash.Size64]byte @@ -1581,8 +1650,13 @@ func (mb *msgBlock) rebuildStateFromBufLocked(buf []byte, allowTruncate bool) (* for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { if index+msgHdrSize > lbuf { - truncate(index) - return gatherLost(lbuf - index), tombstones, nil + err = errBadMsg{mb.mfn, fmt.Sprintf("message overrun (index %d lbuf %d)", index, lbuf)} + if allowTruncate { + if err = truncate(index); err != nil { + return nil, nil, err + } + } + return gatherLost(lbuf - index), tombstones, err } hdr := buf[index : index+msgHdrSize] @@ -1598,8 +1672,13 @@ func (mb *msgBlock) rebuildStateFromBufLocked(buf []byte, allowTruncate bool) (* dlen := int(rl) - msgHdrSize // Do some quick sanity checks here. if dlen < 0 || shlen > (dlen-recordHashSize) || dlen > int(rl) || index+rl > lbuf || rl > rlBadThresh { - truncate(index) - return gatherLost(lbuf - index), tombstones, errBadMsg{mb.mfn, fmt.Sprintf("sanity check failed (dlen %d slen %d rl %d index %d lbuf %d)", dlen, slen, rl, index, lbuf)} + err = errBadMsg{mb.mfn, fmt.Sprintf("sanity check failed (dlen %d slen %d rl %d index %d lbuf %d)", dlen, slen, rl, index, lbuf)} + if allowTruncate { + if err = truncate(index); err != nil { + return nil, nil, err + } + } + return gatherLost(lbuf - index), tombstones, err } // Check for checksum failures before additional processing. 
@@ -1616,8 +1695,13 @@ func (mb *msgBlock) rebuildStateFromBufLocked(buf []byte, allowTruncate bool) (* } checksum := hh.Sum(hb[:0]) if !bytes.Equal(checksum, data[len(data)-recordHashSize:]) { - truncate(index) - return gatherLost(lbuf - index), tombstones, errBadMsg{mb.mfn, "invalid checksum"} + err = errBadMsg{mb.mfn, "invalid checksum"} + if allowTruncate { + if err = truncate(index); err != nil { + return nil, nil, err + } + } + return gatherLost(lbuf - index), tombstones, err } copy(mb.lchk[0:], checksum) } @@ -1712,6 +1796,26 @@ func (fs *fileStore) warn(format string, args ...any) { fs.srv.Warnf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...) } +// For doing rate-limited warn logging. +// Lock should be held. +func (fs *fileStore) rateLimitWarn(format string, args ...any) { + // No-op if no server configured. + if fs.srv == nil { + return + } + fs.srv.RateLimitWarnf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...) +} + +// For doing error logging. +// Lock should be held. +func (fs *fileStore) error(format string, args ...any) { + // No-op if no server configured. + if fs.srv == nil { + return + } + fs.srv.Errorf(fmt.Sprintf("Filestore [%s] %s", fs.cfg.Name, format), args...) +} + // For doing debug logging. // Lock should be held. func (fs *fileStore) debug(format string, args ...any) { @@ -1756,9 +1860,9 @@ func (fs *fileStore) recoverFullState() (rerr error) { // Check for any left over purged messages. <-dios - pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) - if _, err := os.Stat(pdir); err == nil { - os.RemoveAll(pdir) + if err := fs.recoverPartialPurge(); err != nil { + dios <- struct{}{} + return err } // Grab our stream state file and load it in. fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) @@ -1774,7 +1878,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { const minLen = 32 if len(buf) < minLen { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state too short (%d bytes)", len(buf)) return errCorruptState } @@ -1786,7 +1890,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { fs.hh.Write(buf) var hb [highwayhash.Size64]byte if !bytes.Equal(h, fs.hh.Sum(hb[:0])) { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state checksum did not match") return errCorruptState } @@ -1805,7 +1909,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { version := buf[1] if buf[0] != fullStateMagic || version < fullStateMinVersion || version > fullStateVersion { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state magic and version mismatch") return errCorruptState } @@ -1860,7 +1964,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { for i := 0; i < numSubjects; i++ { if lsubj := int(readU64()); lsubj > 0 { if bi+lsubj > len(buf) { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state bad subject len (%d)", lsubj) return errCorruptState } @@ -1871,13 +1975,13 @@ func (fs *fileStore) recoverFullState() (rerr error) { // We had a bug that could cause memory corruption in the PSIM that could have gotten stored to disk. // Only would affect subjects, so do quick check. 
if !isValidSubject(bytesToString(subj), true) { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state corrupt subject detected") return errCorruptState } bi += lsubj psi := psi{total: readU64(), fblk: uint32(readU64())} - if psi.total > 1 { + if psi.total > 1 || version >= 4 { psi.lblk = uint32(readU64()) } else { psi.lblk = psi.fblk @@ -1905,7 +2009,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { schedules = readU64() } if bi < 0 { - os.Remove(fn) + _ = os.Remove(fn) return errCorruptState } mb := fs.initMsgBlock(index) @@ -1918,7 +2022,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { if numDeleted > 0 { dmap, n, err := avl.Decode(buf[bi:]) if err != nil { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state error decoding avl dmap: %v", err) return errCorruptState } @@ -1957,7 +2061,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { // Check if we had any errors. if bi < 0 { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state has no checksum present") return errCorruptState } @@ -1970,7 +2074,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { var matched bool mb := fs.lmb if mb == nil || mb.index != blkIndex { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state block does not exist or index mismatch") return errCorruptState } @@ -2032,7 +2136,7 @@ func (fs *fileStore) recoverFullState() (rerr error) { // We check first and last seq and number of msgs and bytes. If there is a difference, // return and error so we rebuild from the message block state on disk. if !trackingStatesEqual(&fs.state, &mstate) { - os.Remove(fn) + _ = os.Remove(fn) fs.warn("Stream state encountered internal inconsistency on recover") return errCorruptState } @@ -2059,7 +2163,7 @@ func (fs *fileStore) recoverTTLState() error { ttlseq, err = fs.ttls.Decode(buf) if err != nil { fs.warn("Error decoding TTL state: %s", err) - os.Remove(fn) + _ = os.Remove(fn) } } @@ -2140,7 +2244,7 @@ func (fs *fileStore) recoverMsgSchedulingState() error { schedSeq, err = fs.scheduling.decode(buf) if err != nil { fs.warn("Error decoding message scheduling state: %s", err) - os.Remove(fn) + _ = os.Remove(fn) } } @@ -2193,8 +2297,9 @@ func (fs *fileStore) recoverMsgSchedulingState() error { if len(msg.hdr) == 0 { continue } - if schedule, ok := getMessageSchedule(sm.hdr); ok && !schedule.IsZero() { - fs.scheduling.init(seq, sm.subj, schedule.UnixNano()) + if schedule, apiErr := nextMessageSchedule(sm.hdr, sm.ts); apiErr == nil && !schedule.IsZero() { + // Copy the subject, as it's stored in the scheduling maps and the backing cache could be reused in the meantime. + fs.scheduling.init(seq, copyString(sm.subj), schedule.UnixNano()) } } } @@ -2218,7 +2323,7 @@ func (mb *msgBlock) lastChecksum() []byte { return lchk[:] } // Encrypted? - if err := mb.checkAndLoadEncryption(); err != nil { + if err = mb.checkAndLoadEncryption(); err != nil { return nil } if mb.bek != nil { @@ -2227,9 +2332,11 @@ func (mb *msgBlock) lastChecksum() []byte { return nil } copy(lchk[0:], buf[len(buf)-checksumSize:]) + } else { + return nil } - } else { - f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize) + } else if _, err = f.ReadAt(lchk[:], int64(mb.rbytes)-checksumSize); err != nil { + return nil } return lchk[:] } @@ -2268,9 +2375,9 @@ func (fs *fileStore) recoverMsgs() error { // Check for any left over purged messages. 
<-dios - pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) - if _, err := os.Stat(pdir); err == nil { - os.RemoveAll(pdir) + if err := fs.recoverPartialPurge(); err != nil { + dios <- struct{}{} + return err } mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) f, err := os.Open(mdir) @@ -2307,7 +2414,10 @@ func (fs *fileStore) recoverMsgs() error { // out from underneath of us this is possible. mb.mu.Lock() if atomic.LoadUint64(&mb.first.seq) == 0 { - mb.dirtyCloseWithRemove(true) + if err := mb.dirtyCloseWithRemove(true); err != nil { + mb.mu.Unlock() + return err + } fs.removeMsgBlockFromList(mb) mb.mu.Unlock() continue @@ -2354,8 +2464,8 @@ func (fs *fileStore) recoverMsgs() error { if len(fs.blks) > 0 { fs.lmb = fs.blks[len(fs.blks)-1] - } else { - _, err = fs.newMsgBlockForWrite() + } else if _, err = fs.newMsgBlockForWrite(); err != nil { + return err } // Check if we encountered any lost data. @@ -2369,15 +2479,14 @@ func (fs *fileStore) recoverMsgs() error { for _, mb := range emptyBlks { // Need the mb lock here. mb.mu.Lock() - fs.removeMsgBlock(mb) + err = fs.forceRemoveMsgBlock(mb) mb.mu.Unlock() + if err != nil { + return err + } } } - if err != nil { - return err - } - // Check for keyfiles orphans. if kms, err := filepath.Glob(filepath.Join(mdir, keyScanAll)); err == nil && len(kms) > 0 { valid := make(map[uint32]bool) @@ -2431,7 +2540,9 @@ func (fs *fileStore) expireMsgsOnRecover() error { last.ts = mb.last.ts } // Make sure we do subject cleanup as well. - mb.ensurePerSubjectInfoLoaded() + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { + return err + } mb.fss.IterOrdered(func(bsubj []byte, ss *SimpleState) bool { subj := bytesToString(bsubj) for i := uint64(0); i < ss.Msgs; i++ { @@ -2439,8 +2550,7 @@ func (fs *fileStore) expireMsgsOnRecover() error { } return true }) - err := mb.dirtyCloseWithRemove(true) - if isPermissionError(err) { + if err := mb.dirtyCloseWithRemove(true); err != nil { return err } deleted++ @@ -2460,7 +2570,7 @@ func (fs *fileStore) expireMsgsOnRecover() error { bytes += mb.bytes err := deleteEmptyBlock(mb) mb.mu.Unlock() - if isPermissionError(err) { + if err != nil { return err } continue @@ -2470,7 +2580,7 @@ func (fs *fileStore) expireMsgsOnRecover() error { // This will load fss as well. if err := mb.loadMsgsWithLock(); err != nil { mb.mu.Unlock() - break + return err } var smv StoreMsg @@ -2524,7 +2634,11 @@ func (fs *fileStore) expireMsgsOnRecover() error { } // Update fss // Make sure we have fss loaded. - mb.removeSeqPerSubject(sm.subj, seq) + if _, err = mb.removeSeqPerSubject(sm.subj, seq); err != nil { + mb.finishedWithCache() + mb.mu.Unlock() + return err + } fs.removePerSubject(sm.subj) } // Make sure we have a proper next first sequence. @@ -2532,11 +2646,15 @@ func (fs *fileStore) expireMsgsOnRecover() error { mb.selectNextFirst() } // Check if empty after processing, could happen if tail of messages are all deleted. + var err error if mb.msgs == 0 { - deleteEmptyBlock(mb) + err = deleteEmptyBlock(mb) } mb.finishedWithCache() mb.mu.Unlock() + if err != nil { + return err + } break } @@ -2572,13 +2690,20 @@ func (fs *fileStore) expireMsgsOnRecover() error { fs.state.Bytes = 0 } // Make sure to we properly set the fs first sequence and timestamp. - fs.selectNextFirst() + if err := fs.selectNextFirst(); err != nil { + return err + } // Check if we have no messages and blocks left. 
if fs.lmb == nil && last.seq != 0 { - if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { - fs.writeTombstone(last.seq, last.ts) + if lmb, err := fs.newMsgBlockForWrite(); err != nil || lmb == nil { + if err == nil { + err = errors.New("lmb missing") + } + return err + } else if err = fs.writeTombstone(last.seq, last.ts); err != nil { + return err } // Clear any global subject state. fs.psim, fs.tsl = fs.psim.Empty(), 0 @@ -2710,7 +2835,9 @@ func (mb *msgBlock) firstMatchingMulti(sl *gsl.SimpleSublist, start uint64, sm * } if ss.firstNeedsUpdate || ss.lastNeedsUpdate { // mb is already loaded into the cache so should be fast-ish. - mb.recalculateForSubj(bytesToString(subj), ss) + if ierr = mb.recalculateForSubj(bytesToString(subj), ss); ierr != nil { + return + } } first := max(start, ss.First) if first > ss.Last || first >= hseq { @@ -2883,17 +3010,28 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor // If we have a wildcard match against all tracked subjects we know about. fseq = lseq + 1 if bfilter := stringToBytes(filter); wc { + var ierr error mb.fss.Match(bfilter, func(bsubj []byte, ss *SimpleState) { + if ierr != nil { + return + } if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - mb.recalculateForSubj(bytesToString(bsubj), ss) + if ierr = mb.recalculateForSubj(bytesToString(bsubj), ss); ierr != nil { + return + } } if start <= ss.Last { fseq = min(fseq, max(start, ss.First)) } }) + if ierr != nil { + return nil, false, ierr + } } else if ss, _ := mb.fss.Find(bfilter); ss != nil { if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - mb.recalculateForSubj(filter, ss) + if err := mb.recalculateForSubj(filter, ss); err != nil { + return nil, false, err + } } if start <= ss.Last { fseq = min(fseq, max(start, ss.First)) @@ -2987,10 +3125,16 @@ func (mb *msgBlock) prevMatchingMulti(sl *gsl.SimpleSublist, start uint64, sm *S if uint64(mb.fss.Size()) < start-lseq { // If there are no subject matches then this is effectively no-op. hseq := uint64(0) + var ierr error stree.IntersectGSL(mb.fss, sl, func(subj []byte, ss *SimpleState) { + if ierr != nil { + return + } if ss.firstNeedsUpdate || ss.lastNeedsUpdate { // mb is already loaded into the cache so should be fast-ish. - mb.recalculateForSubj(bytesToString(subj), ss) + if ierr = mb.recalculateForSubj(bytesToString(subj), ss); ierr != nil { + return + } } first := min(start, ss.Last) // Skip if cutoff is before this subject's first, or if we already @@ -3034,6 +3178,9 @@ func (mb *msgBlock) prevMatchingMulti(sl *gsl.SimpleSublist, start uint64, sm *S mb.llseq = llseq } }) + if ierr != nil { + return nil, false, ierr + } if hseq > 0 && sm != nil { return sm, didLoad && start == lseq, nil } @@ -3064,7 +3211,7 @@ func (mb *msgBlock) prevMatchingMulti(sl *gsl.SimpleSublist, start uint64, sm *S } // This will traverse a message block and generate the filtered pending. -func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64) { +func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, first, last uint64, err error) { mb.mu.Lock() defer mb.mu.Unlock() return mb.filteredPendingLocked(subj, wc, seq) @@ -3072,14 +3219,14 @@ func (mb *msgBlock) filteredPending(subj string, wc bool, seq uint64) (total, fi // This will traverse a message block and generate the filtered pending. // Lock should be held. 
-func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64) { +func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (total, first, last uint64, err error) { isAll := filter == _EMPTY_ || filter == fwcs // First check if we can optimize this part. // This means we want all and the starting sequence was before this block. if isAll { if fseq := atomic.LoadUint64(&mb.first.seq); sseq <= fseq { - return mb.msgs, fseq, atomic.LoadUint64(&mb.last.seq) + return mb.msgs, fseq, atomic.LoadUint64(&mb.last.seq), nil } } @@ -3098,7 +3245,9 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) ( } // Make sure we have fss loaded. - mb.ensurePerSubjectInfoLoaded() + if err = mb.ensurePerSubjectInfoLoaded(); err != nil { + return 0, 0, 0, err + } var havePartial bool @@ -3106,7 +3255,9 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) ( if !wc { if ss, ok := mb.fss.Find(stringToBytes(filter)); ok && ss != nil { if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - mb.recalculateForSubj(filter, ss) + if err = mb.recalculateForSubj(filter, ss); err != nil { + return 0, 0, 0, err + } } if sseq <= ss.First { update(ss) @@ -3117,12 +3268,17 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) ( } } else { mb.fss.Match(stringToBytes(filter), func(bsubj []byte, ss *SimpleState) { + if err != nil { + return + } if havePartial { // If we already found a partial then don't do anything else. return } if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - mb.recalculateForSubj(bytesToString(bsubj), ss) + if err = mb.recalculateForSubj(bytesToString(bsubj), ss); err != nil { + return + } } if sseq <= ss.First { update(ss) @@ -3131,11 +3287,14 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) ( havePartial = true } }) + if err != nil { + return 0, 0, 0, err + } } // If we did not encounter any partials we can return here. if !havePartial { - return total, first, last + return total, first, last, nil } // If we are here we need to scan the msgs. @@ -3145,7 +3304,9 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) ( // If we load the cache for a linear scan we want to expire that cache upon exit. var shouldExpire bool if mb.cacheNotLoaded() { - mb.loadMsgsWithLock() + if err = mb.loadMsgsWithLock(); err != nil { + return 0, 0, 0, err + } shouldExpire = true } defer mb.finishedWithCache() @@ -3189,11 +3350,11 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) ( mb.tryForceExpireCacheLocked() } - return total, first, last + return total, first, last, nil } // FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence. -func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState { +func (fs *fileStore) FilteredState(sseq uint64, subj string) (SimpleState, error) { fs.mu.RLock() defer fs.mu.RUnlock() @@ -3210,14 +3371,16 @@ func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState { // Make sure we track sequences ss.First = fs.state.FirstSeq ss.Last = fs.state.LastSeq - return ss + return ss, nil } // If we want all msgs that match we can shortcircuit. // TODO(dlc) - This can be extended for all cases but would // need to be careful on total msgs calculations etc. 
if sseq == fs.state.FirstSeq { - fs.numFilteredPending(subj, &ss) + if err := fs.numFilteredPending(subj, &ss); err != nil { + return ss, err + } } else { wc := subjectHasWildcard(subj) // Tracking subject state. @@ -3227,7 +3390,10 @@ func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState { if sseq > atomic.LoadUint64(&mb.last.seq) { continue } - t, f, l := mb.filteredPending(subj, wc, sseq) + t, f, l, err := mb.filteredPending(subj, wc, sseq) + if err != nil { + return ss, err + } ss.Msgs += t if ss.First == 0 || (f > 0 && f < ss.First) { ss.First = f @@ -3238,7 +3404,7 @@ func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState { } } - return ss + return ss, nil } // This is used to see if we can selectively jump start blocks based on filter subject and a starting block index. @@ -3310,20 +3476,20 @@ func (fs *fileStore) selectSkipFirstBlock(bi int, start, stop uint32) (int, erro // Optimized way for getting all num pending matching a filter subject. // Lock should be held. -func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) { - fs.numFilteredPendingWithLast(filter, true, ss) +func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) error { + return fs.numFilteredPendingWithLast(filter, true, ss) } // Optimized way for getting all num pending matching a filter subject and first sequence only. // Lock should be held. -func (fs *fileStore) numFilteredPendingNoLast(filter string, ss *SimpleState) { - fs.numFilteredPendingWithLast(filter, false, ss) +func (fs *fileStore) numFilteredPendingNoLast(filter string, ss *SimpleState) error { + return fs.numFilteredPendingWithLast(filter, false, ss) } // Optimized way for getting all num pending matching a filter subject. // Optionally look up last sequence. Sometimes do not need last and this avoids cost. // Read lock should be held. -func (fs *fileStore) numFilteredPendingWithLast(filter string, last bool, ss *SimpleState) { +func (fs *fileStore) numFilteredPendingWithLast(filter string, last bool, ss *SimpleState) error { isAll := filter == _EMPTY_ || filter == fwcs // If isAll we do not need to do anything special to calculate the first and last and total. @@ -3331,7 +3497,7 @@ func (fs *fileStore) numFilteredPendingWithLast(filter string, last bool, ss *Si ss.First = fs.state.FirstSeq ss.Last = fs.state.LastSeq ss.Msgs = fs.state.Msgs - return + return nil } // Always reset. ss.First, ss.Last, ss.Msgs = 0, 0, 0 @@ -3358,13 +3524,16 @@ func (fs *fileStore) numFilteredPendingWithLast(filter string, last bool, ss *Si // Did not find anything. if stop == 0 { - return + return nil } // Do start mb := fs.bim[start] if mb != nil { - _, f, _ := mb.filteredPending(filter, wc, 0) + _, f, _, err := mb.filteredPending(filter, wc, 0) + if err != nil { + return err + } ss.First = f } @@ -3379,7 +3548,9 @@ func (fs *fileStore) numFilteredPendingWithLast(filter string, last bool, ss *Si if mb == nil { continue } - if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 { + if _, f, _, err := mb.filteredPending(filter, wc, 0); err != nil { + return err + } else if f > 0 { ss.First = f break } @@ -3408,10 +3579,14 @@ func (fs *fileStore) numFilteredPendingWithLast(filter string, last bool, ss *Si // Now gather last sequence if asked to do so. 
if last { if mb = fs.bim[stop]; mb != nil { - _, _, l := mb.filteredPending(filter, wc, 0) + _, _, l, err := mb.filteredPending(filter, wc, 0) + if err != nil { + return err + } ss.Last = l } } + return nil } // SubjectsState returns a map of SimpleState for all matching subjects. @@ -3466,10 +3641,16 @@ func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState { } // Mark fss activity. mb.lsts = ats.AccessTime() + var ierr error mb.fss.Match(stringToBytes(subject), func(bsubj []byte, ss *SimpleState) { + if ierr != nil { + return + } subj := string(bsubj) if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - mb.recalculateForSubj(subj, ss) + if ierr = mb.recalculateForSubj(subj, ss); ierr != nil { + return + } } oss := fss[subj] if oss.First == 0 { // New @@ -3487,7 +3668,9 @@ func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState { mb.finishedWithCache() } mb.mu.Unlock() - + if ierr != nil { + return nil + } if mb == stop { break } @@ -3532,13 +3715,16 @@ func (fs *fileStore) allLastSeqsLocked() ([]uint64, error) { shouldExpire = true } + var ierr error mb.fss.IterFast(func(bsubj []byte, ss *SimpleState) bool { // Check if already been processed and accounted. if _, ok := subs[string(bsubj)]; !ok { // Check if we need to recalculate. We only care about the last sequence. if ss.lastNeedsUpdate { // mb is already loaded into the cache so should be fast-ish. - mb.recalculateForSubj(bytesToString(bsubj), ss) + if ierr = mb.recalculateForSubj(bytesToString(bsubj), ss); ierr != nil { + return false + } } seqs = append(seqs, ss.Last) subs[string(bsubj)] = struct{}{} @@ -3551,6 +3737,9 @@ func (fs *fileStore) allLastSeqsLocked() ([]uint64, error) { } mb.finishedWithCache() mb.mu.Unlock() + if ierr != nil { + return nil, ierr + } } slices.Sort(seqs) @@ -3646,11 +3835,14 @@ func (fs *fileStore) MultiLastSeqs(filters []string, maxSeq uint64, maxAllowed i } // We can start properly looking here. mb.mu.Lock() - mb.ensurePerSubjectInfoLoaded() + var ierr error + if ierr = mb.ensurePerSubjectInfoLoaded(); ierr != nil { + mb.mu.Unlock() + return nil, ierr + } // Iterate the fss and check against our subs. We will delete from subs as we add. // Once len(subs) == 0 we are done. - var ierr error mb.fss.IterFast(func(bsubj []byte, ss *SimpleState) bool { // Already been processed and accounted for was not matched in the first place. if subs[string(bsubj)] == nil { @@ -3659,7 +3851,9 @@ func (fs *fileStore) MultiLastSeqs(filters []string, maxSeq uint64, maxAllowed i // Check if we need to recalculate. We only care about the last sequence. if ss.lastNeedsUpdate { // mb is already loaded into the cache so should be fast-ish. - mb.recalculateForSubj(bytesToString(bsubj), ss) + if ierr = mb.recalculateForSubj(bytesToString(bsubj), ss); ierr != nil { + return false + } } // If we are equal or below just add to seqs slice. if ss.Last <= maxSeq { @@ -3886,15 +4080,21 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) mb.lsts = ats.AccessTime() var t uint64 + var ierr error var havePartial bool mb.fss.Match(stringToBytes(filter), func(bsubj []byte, ss *SimpleState) { + if ierr != nil { + return + } if havePartial { // If we already found a partial then don't do anything else. 
return } subj := bytesToString(bsubj) if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - mb.recalculateForSubj(subj, ss) + if ierr = mb.recalculateForSubj(subj, ss); ierr != nil { + return + } } if sseq <= ss.First { t += ss.Msgs @@ -3903,6 +4103,10 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool) havePartial = true } }) + if ierr != nil { + mb.mu.Unlock() + return 0, 0, ierr + } // See if we need to scan msgs here. if havePartial { @@ -4215,16 +4419,22 @@ func (fs *fileStore) NumPendingMulti(sseq uint64, sl *gsl.SimpleSublist, lastPer mb.lsts = ats.AccessTime() var t uint64 + var ierr error var havePartial bool var updateLLTS bool stree.IntersectGSL[SimpleState](mb.fss, sl, func(bsubj []byte, ss *SimpleState) { + if ierr != nil { + return + } subj := bytesToString(bsubj) if havePartial { // If we already found a partial then don't do anything else. return } if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - mb.recalculateForSubj(subj, ss) + if ierr = mb.recalculateForSubj(subj, ss); ierr != nil { + return + } } if sseq <= ss.First { t += ss.Msgs @@ -4233,6 +4443,10 @@ func (fs *fileStore) NumPendingMulti(sseq uint64, sl *gsl.SimpleSublist, lastPer havePartial = true } }) + if ierr != nil { + mb.mu.Unlock() + return 0, 0, ierr + } // See if we need to scan msgs here. if havePartial { @@ -4500,8 +4714,15 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { close(lmb.qch) lmb.qch = nil } + // If we had a write error before, don't allow continuing into a new block. + if err := lmb.werr; err != nil { + return nil, err + } // Flush any pending messages. - lmb.flushPendingMsgsLocked() + if _, err := lmb.flushPendingMsgsLocked(); err != nil { + lmb.mu.Unlock() + return nil, err + } // Determine if we can reclaim any resources here. lmb.closeFDsLockedNoCheck() if lmb.cache != nil { @@ -4516,7 +4737,8 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { go func() { lmb.mu.Lock() defer lmb.mu.Unlock() - lmb.recompressOnDiskIfNeeded() + // Might error, but we can't handle it here anyway. + _ = lmb.recompressOnDiskIfNeeded() }() } } @@ -4553,7 +4775,7 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) { if isPermissionError(err) { return nil, err } - mb.dirtyCloseWithRemove(true) + _ = mb.dirtyCloseWithRemove(true) return nil, fmt.Errorf("Error creating msg block file: %v", err) } mb.mfd = mfd @@ -4601,7 +4823,7 @@ func (fs *fileStore) genEncryptionKeysForBlock(mb *msgBlock) error { // Stores a raw message with expected sequence number and timestamp. // Lock should be held. -func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, ttl int64) (err error) { +func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, ttl int64, discardNewCheck bool) (err error) { if fs.isClosed() { return ErrStoreClosed } @@ -4618,11 +4840,15 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, t var fseq uint64 // Check if we are discarding new messages when we reach the limit. - if fs.cfg.Discard == DiscardNew { + // If we are clustered, we do the enforcement above and should not disqualify + // the message here since it could cause replicas to drift. + if discardNewCheck && fs.cfg.Discard == DiscardNew { var asl bool if psmax && psmc >= mmp { // If we are instructed to discard new per subject, this is an error. - if fs.cfg.DiscardNewPer { + // However, allow rollup messages through since they will purge old + // messages for the subject after storing, restoring the limit. 
+ if fs.cfg.DiscardNewPer && len(sliceHeader(JSMsgRollup, hdr)) == 0 { return ErrMaxMsgsPerSubject } if fseq, err = fs.firstSeqForSubj(subj); err != nil { @@ -4630,16 +4856,12 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, t } asl = true } - // If we are discard new and limits policy and clustered, we do the enforcement - // above and should not disqualify the message here since it could cause replicas to drift. - if fs.cfg.Retention == LimitsPolicy || fs.cfg.Replicas == 1 { - if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl { - return ErrMaxMsgs - } - if fs.cfg.MaxBytes > 0 && fs.state.Bytes+fileStoreMsgSize(subj, hdr, msg) >= uint64(fs.cfg.MaxBytes) { - if !asl || fs.sizeForSeq(fseq) <= int(fileStoreMsgSize(subj, hdr, msg)) { - return ErrMaxBytes - } + if fs.cfg.MaxMsgs > 0 && fs.state.Msgs >= uint64(fs.cfg.MaxMsgs) && !asl { + return ErrMaxMsgs + } + if fs.cfg.MaxBytes > 0 && fs.state.Bytes+fileStoreMsgSize(subj, hdr, msg) > uint64(fs.cfg.MaxBytes) { + if !asl || fs.sizeForSeq(fseq) < int(fileStoreMsgSize(subj, hdr, msg)) { + return ErrMaxBytes } } } @@ -4652,6 +4874,17 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, t seq = fs.state.LastSeq + 1 } + // Return previous write errors immediately. + if fs.werr != nil { + return fs.werr + } + // Persist any returned errors to be used in the future. + defer func() { + if err != nil { + fs.setWriteErr(err) + } + }() + // Write msg record. // Add expiry bit to sequence if needed. This is so that if we need to // rebuild, we know which messages to look at more quickly. @@ -4691,18 +4924,25 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, t if psmax && psmc >= mmp { // We may have done this above. if fseq == 0 { - fseq, _ = fs.firstSeqForSubj(subj) + fseq, err = fs.firstSeqForSubj(subj) + if err != nil { + return err + } } - if ok, _ := fs.removeMsgViaLimits(fseq); ok { + if ok, err := fs.removeMsgViaLimits(fseq); err != nil { + return err + } else if ok { // Make sure we are below the limit. if psmc--; psmc >= mmp { bsubj := stringToBytes(subj) for info, ok := fs.psim.Find(bsubj); ok && info.total > mmp; info, ok = fs.psim.Find(bsubj) { - if seq, _ := fs.firstSeqForSubj(subj); seq > 0 { - if ok, _ := fs.removeMsgViaLimits(seq); !ok { - break - } - } else { + if seq, err := fs.firstSeqForSubj(subj); err != nil { + return err + } else if seq == 0 { + break + } else if ok, err = fs.removeMsgViaLimits(seq); err != nil { + return err + } else if !ok { break } } @@ -4710,7 +4950,9 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, t } else if mb := fs.selectMsgBlock(fseq); mb != nil { // If we are here we could not remove fseq from above, so rebuild. var ld *LostStreamData - if ld, _, _ = mb.rebuildState(); ld != nil { + if ld, _, err = mb.rebuildState(); err != nil { + return err + } else if ld != nil { fs.rebuildStateLocked(ld) } } @@ -4719,8 +4961,12 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, t // Limits checks and enforcement. // If they do any deletions they will update the // byte count on their own, so no need to compensate. - fs.enforceMsgLimit() - fs.enforceBytesLimit() + if err = fs.enforceMsgLimit(); err != nil { + return err + } + if err = fs.enforceBytesLimit(); err != nil { + return err + } // Per-message TTL. 
if ttl > 0 { @@ -4743,21 +4989,83 @@ func (fs *fileStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, t // Message scheduling. if fs.scheduling != nil { - if schedule, ok := getMessageSchedule(hdr); ok && !schedule.IsZero() { + if schedule, apiErr := nextMessageSchedule(hdr, ts); apiErr == nil && !schedule.IsZero() { fs.scheduling.add(seq, subj, schedule.UnixNano()) fs.lmb.schedules++ - } else { + } else if getMessageScheduler(hdr) == _EMPTY_ { fs.scheduling.removeSubject(subj) } + + // Check for a repeating schedule and update such that it triggers again. + if scheduleNext := bytesToString(sliceHeader(JSScheduleNext, hdr)); scheduleNext != _EMPTY_ && scheduleNext != JSScheduleNextPurge { + scheduler := getMessageScheduler(hdr) + if next, err := time.Parse(time.RFC3339Nano, scheduleNext); err == nil && scheduler != _EMPTY_ { + fs.scheduling.update(scheduler, next.UnixNano()) + } + } } return nil } -// StoreRawMsg stores a raw message with expected sequence number and timestamp. -func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts, ttl int64) error { - fs.mu.Lock() - err := fs.storeRawMsg(subj, hdr, msg, seq, ts, ttl) +// isReadErr reports whether err originated from reading or interpreting +// existing on-disk data rather than from a failed write. These surface through +// cache/load paths and should not permanently disable writes. +func isReadErr(err error) bool { + var badMsg errBadMsg + return errors.Is(err, errNoCache) || + errors.Is(err, errDeletedMsg) || + errors.Is(err, errPartialCache) || + errors.Is(err, errCorruptState) || + errors.Is(err, errPriorState) || + errors.Is(err, io.ErrUnexpectedEOF) || + errors.Is(err, errMsgBlkTooBig) || + errors.As(err, &badMsg) +} + +// Lock should be held. +func (fs *fileStore) setWriteErr(err error) { + if fs.werr != nil { + return + } + // Ignore non-write errors. + if err == ErrStoreClosed { + return + } + // If this is a not found report but do not disable. + if os.IsNotExist(err) { + fs.warn("Resource not found: %v", err) + return + } + // Read/decode errors surfaced from existing on-disk data are not write failures. + // Log and continue instead of disabling writes. + if isReadErr(err) { + fs.rateLimitWarn("Ignoring non-write error: %v", err) + assert.Unreachable("Filestore encountered read error", map[string]any{ + "name": fs.cfg.Name, + "err": err, + "stack": string(debug.Stack()), + }) + return + } + fs.error("Critical write error: %v", err) + fs.werr = err + assert.Unreachable("Filestore encountered write error", map[string]any{ + "name": fs.cfg.Name, + "err": err, + "stack": string(debug.Stack()), + }) +} + +// StoreRawMsg stores a raw message with expected sequence number and timestamp. +func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts, ttl int64, discardNewCheck bool) error { + fs.mu.Lock() + // Always return previous write errors. + if err := fs.werr; err != nil { + fs.mu.Unlock() + return err + } + err := fs.storeRawMsg(subj, hdr, msg, seq, ts, ttl, discardNewCheck) cb := fs.scb // Check if first message timestamp requires expiry // sooner than initial replica expiry timer set to MaxAge when initializing. @@ -4777,8 +5085,14 @@ func (fs *fileStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts, t // Store stores a message. We hold the main filestore lock for any write operation. func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte, ttl int64) (uint64, int64, error) { fs.mu.Lock() + // Always return previous write errors. 
+ if err := fs.werr; err != nil { + fs.mu.Unlock() + return 0, 0, err + } seq, ts := fs.state.LastSeq+1, time.Now().UnixNano() - err := fs.storeRawMsg(subj, hdr, msg, seq, ts, ttl) + // This is called for a R1 with no expected sequence number, so perform DiscardNew checks on the store-level. + err := fs.storeRawMsg(subj, hdr, msg, seq, ts, ttl, true) cb := fs.scb fs.mu.Unlock() @@ -4796,13 +5110,17 @@ func (fs *fileStore) StoreMsg(subj string, hdr, msg []byte, ttl int64) (uint64, // we will place an empty record marking the sequence as used. The // sequence will be marked erased. // fs lock should be held. -func (mb *msgBlock) skipMsg(seq uint64, now int64) { +func (mb *msgBlock) skipMsg(seq uint64, now int64) error { if mb == nil { - return + return nil } var needsRecord bool mb.mu.Lock() + if err := mb.werr; err != nil { + mb.mu.Unlock() + return err + } // If we are empty can just do meta. if mb.msgs == 0 { atomic.StoreUint64(&mb.last.seq, seq) @@ -4815,12 +5133,16 @@ func (mb *msgBlock) skipMsg(seq uint64, now int64) { mb.dmap.Insert(seq) } if needsRecord { - mb.writeMsgRecordLocked(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, now, true, true) + if err := mb.writeMsgRecordLocked(emptyRecordLen, seq|ebit, _EMPTY_, nil, nil, now, true, true); err != nil { + mb.mu.Unlock() + return err + } } mb.mu.Unlock() if !needsRecord { mb.kickFlusher() } + return nil } // SkipMsg will use the next sequence number but not store anything. @@ -4831,6 +5153,11 @@ func (fs *fileStore) SkipMsg(seq uint64) (uint64, error) { fs.mu.Lock() defer fs.mu.Unlock() + // Always return previous write errors. + if err := fs.werr; err != nil { + return 0, err + } + // Check sequence matches our last sequence. if seq != fs.state.LastSeq+1 { if seq > 0 { @@ -4846,7 +5173,10 @@ func (fs *fileStore) SkipMsg(seq uint64) (uint64, error) { } // Write skip msg. - mb.skipMsg(seq, now) + if err = mb.skipMsg(seq, now); err != nil { + fs.setWriteErr(err) + return 0, err + } // Update fs state. fs.state.LastSeq, fs.state.LastTime = seq, time.Unix(0, now).UTC() @@ -4862,11 +5192,16 @@ func (fs *fileStore) SkipMsg(seq uint64) (uint64, error) { return seq, nil } -// Skip multiple msgs. We will determine if we can fit into current lmb or we need to create a new block. +// SkipMsgs skips multiple msgs. We will determine if we can fit into current lmb or we need to create a new block. func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error { fs.mu.Lock() defer fs.mu.Unlock() + // Always return previous write errors. + if err := fs.werr; err != nil { + return err + } + // Check sequence matches our last sequence. if seq != fs.state.LastSeq+1 { if seq > 0 { @@ -4899,6 +5234,10 @@ func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error { lseq := seq + num - 1 mb.mu.Lock() + if err := mb.werr; err != nil { + mb.mu.Unlock() + return err + } // If we are empty update meta directly. if mb.msgs == 0 { atomic.StoreUint64(&mb.last.seq, lseq) @@ -4911,8 +5250,12 @@ func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error { } } // Write out our placeholder. - mb.writeMsgRecordLocked(emptyRecordLen, lseq|ebit, _EMPTY_, nil, nil, now, true, true) + err := mb.writeMsgRecordLocked(emptyRecordLen, lseq|ebit, _EMPTY_, nil, nil, now, true, true) mb.mu.Unlock() + if err != nil { + fs.setWriteErr(err) + return err + } // Now update FS accounting. // Update fs state. @@ -4928,20 +5271,24 @@ func (fs *fileStore) SkipMsgs(seq uint64, num uint64) error { } // FlushAllPending flushes all data that was still pending to be written. 
-func (fs *fileStore) FlushAllPending() { +func (fs *fileStore) FlushAllPending() error { fs.mu.Lock() defer fs.mu.Unlock() - fs.checkAndFlushLastBlock() + // Return previous write errors immediately. + if fs.werr != nil { + return fs.werr + } + return fs.checkAndFlushLastBlock() } // Lock should be held. -func (fs *fileStore) rebuildFirst() { +func (fs *fileStore) rebuildFirst() error { if len(fs.blks) == 0 { - return + return nil } fmb := fs.blks[0] if fmb == nil { - return + return nil } ld, _, _ := fmb.rebuildState() @@ -4950,11 +5297,17 @@ func (fs *fileStore) rebuildFirst() { fmb.mu.RUnlock() if isEmpty { fmb.mu.Lock() - fs.removeMsgBlock(fmb) + err := fs.forceRemoveMsgBlock(fmb) fmb.mu.Unlock() + if err != nil { + return err + } + } + if err := fs.selectNextFirst(); err != nil { + return err } - fs.selectNextFirst() fs.rebuildStateLocked(ld) + return nil } // Optimized helper function to return first sequence. @@ -4998,8 +5351,9 @@ func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) { bsubj := stringToBytes(subj) if ss, ok := mb.fss.Find(bsubj); ok && ss != nil { + var err error if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - mb.recalculateForSubj(subj, ss) + err = mb.recalculateForSubj(subj, ss) } mb.mu.Unlock() // Re-acquire fs lock @@ -5010,6 +5364,9 @@ func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) { info.fblk = i } } + if err != nil { + return 0, err + } return ss.First, nil } // If we did not find it and we loaded this msgBlock try to expire as long as not the last. @@ -5028,12 +5385,12 @@ func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) { // Will check the msg limit and drop firstSeq msg if needed. // Lock should be held. -func (fs *fileStore) enforceMsgLimit() { +func (fs *fileStore) enforceMsgLimit() error { if fs.cfg.Discard != DiscardOld { - return + return nil } if fs.cfg.MaxMsgs <= 0 || fs.state.Msgs <= uint64(fs.cfg.MaxMsgs) { - return + return nil } for nmsgs := fs.state.Msgs; nmsgs > uint64(fs.cfg.MaxMsgs); nmsgs = fs.state.Msgs { // If the first block can be removed fully, purge it entirely without needing to walk sequences. @@ -5043,25 +5400,29 @@ func (fs *fileStore) enforceMsgLimit() { msgs := fmb.msgs fmb.mu.RUnlock() if nmsgs-msgs > uint64(fs.cfg.MaxMsgs) { - fs.purgeMsgBlock(fmb) + if err := fs.purgeMsgBlock(fmb); err != nil { + return err + } continue } } - if removed, err := fs.deleteFirstMsg(); err != nil || !removed { - fs.rebuildFirst() - return + if removed, err := fs.deleteFirstMsg(); err != nil { + return err + } else if !removed { + return fs.rebuildFirst() } } + return nil } // Will check the bytes limit and drop msgs if needed. // Lock should be held. -func (fs *fileStore) enforceBytesLimit() { +func (fs *fileStore) enforceBytesLimit() error { if fs.cfg.Discard != DiscardOld { - return + return nil } if fs.cfg.MaxBytes <= 0 || fs.state.Bytes <= uint64(fs.cfg.MaxBytes) { - return + return nil } for bs := fs.state.Bytes; bs > uint64(fs.cfg.MaxBytes); bs = fs.state.Bytes { // If the first block can be removed fully, purge it entirely without needing to walk sequences. 
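// Illustrative sketch, not part of this patch or the nats-server code: the
// "sticky write error" pattern the hunks above adopt. The first critical write
// failure is latched (werr) and every later write path returns it immediately
// instead of issuing more I/O. Names below (store, werr, setWriteErr) only
// mirror the patch; this is a simplified standalone model.
package main

import (
	"errors"
	"fmt"
	"sync"
)

type store struct {
	mu   sync.Mutex
	werr error // first critical write error, latched for all future writes
}

// setWriteErr records the first write error and ignores later ones.
func (s *store) setWriteErr(err error) {
	if s.werr == nil && err != nil {
		s.werr = err
	}
}

// write fails fast if a previous write already errored, and latches new errors.
func (s *store) write(fail bool) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.werr != nil {
		return s.werr
	}
	var err error
	if fail {
		err = errors.New("write failed: disk full")
	}
	defer func() { s.setWriteErr(err) }()
	return err
}

func main() {
	var s store
	fmt.Println(s.write(false)) // <nil>
	fmt.Println(s.write(true))  // write failed: disk full
	fmt.Println(s.write(false)) // write failed: disk full (latched)
}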
@@ -5071,22 +5432,26 @@ func (fs *fileStore) enforceBytesLimit() { bytes := fmb.bytes fmb.mu.RUnlock() if bs-bytes > uint64(fs.cfg.MaxBytes) { - fs.purgeMsgBlock(fmb) + if err := fs.purgeMsgBlock(fmb); err != nil { + return err + } continue } } - if removed, err := fs.deleteFirstMsg(); err != nil || !removed { - fs.rebuildFirst() - return + if removed, err := fs.deleteFirstMsg(); err != nil { + return err + } else if !removed { + return fs.rebuildFirst() } } + return nil } // Will make sure we have limits honored for max msgs per subject on recovery or config update. // We will make sure to go through all msg blocks etc. but in practice this // will most likely only be the last one, so can take a more conservative approach. // Lock should be held. -func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { +func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) error { start := time.Now() defer func() { if took := time.Since(start); took > time.Minute { @@ -5131,10 +5496,14 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { fs.psim, fs.tsl = fs.psim.Empty(), 0 for _, mb := range fs.blks { ld, _, err := mb.rebuildState() - if err != nil && ld != nil { + if err != nil { + return err + } else if ld != nil { fs.addLostData(ld) } - fs.populateGlobalPerSubjectInfo(mb) + if err = fs.populateGlobalPerSubjectInfo(mb); err != nil { + return err + } } // Rebuild fs state too. fs.rebuildStateLocked(nil) @@ -5157,7 +5526,7 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { // If nothing to do then stop. if fblk == math.MaxUint32 { - return + return nil } // Collect all the msgBlks we alter. @@ -5172,7 +5541,10 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { continue } mb.mu.Lock() - mb.ensurePerSubjectInfoLoaded() + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { + mb.mu.Unlock() + return err + } // It isn't safe to intersect mb.fss directly, because removeMsgViaLimits modifies it // during the iteration, which can cause us to miss keys. We won't copy the entire // SimpleState structs though but rather just take pointers for speed. @@ -5182,15 +5554,19 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { return true }) mb.mu.Unlock() + var ierr error stree.LazyIntersect(needAttention, fss, func(subj []byte, total *uint64, ssptr **SimpleState) { - if ssptr == nil || total == nil { + if ssptr == nil || total == nil || ierr != nil { return } ss := *ssptr if ss.firstNeedsUpdate || ss.lastNeedsUpdate { mb.mu.Lock() - mb.recalculateForSubj(bytesToString(subj), ss) + ierr = mb.recalculateForSubj(bytesToString(subj), ss) mb.mu.Unlock() + if ierr != nil { + return + } } for first := ss.First; *total > maxMsgsPer && first <= ss.Last; { m, _, err := mb.firstMatching(bytesToString(subj), false, first, &sm) @@ -5204,6 +5580,9 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { } } }) + if ierr != nil { + return ierr + } } // Expire the cache if we can. @@ -5214,6 +5593,7 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) { } mb.mu.Unlock() } + return nil } // Lock should be held. 
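// Illustrative sketch, not part of this patch: the closure-captured error
// (ierr) technique used above with stree.LazyIntersect, for propagating a
// failure out of a callback-style iterator whose callback cannot return an
// error. forEach below is a hypothetical stand-in for such an iterator.
package main

import (
	"errors"
	"fmt"
)

// forEach visits each item; its callback has no error return.
func forEach(items []int, fn func(int)) {
	for _, it := range items {
		fn(it)
	}
}

func process(items []int) error {
	var ierr error
	forEach(items, func(it int) {
		// Once a failure is recorded, skip all remaining work.
		if ierr != nil {
			return
		}
		if it < 0 {
			ierr = errors.New("negative item")
		}
	})
	return ierr
}

func main() {
	fmt.Println(process([]int{1, 2, 3}))  // <nil>
	fmt.Println(process([]int{1, -2, 3})) // negative item
}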
@@ -5248,9 +5628,7 @@ func (fs *fileStore) removePerSubject(subj string) uint64 { bsubj := stringToBytes(subj) if info, ok := fs.psim.Find(bsubj); ok { info.total-- - if info.total == 1 { - info.fblk = info.lblk - } else if info.total == 0 { + if info.total == 0 { if _, ok = fs.psim.Delete(bsubj); ok { fs.tsl -= len(subj) return 0 @@ -5278,11 +5656,15 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( } fsLock() + defer fsUnlock() if fs.isClosed() { - fsUnlock() return false, ErrStoreClosed } + // Always return previous write errors. + if err := fs.werr; err != nil { + return false, err + } // If in encrypted mode negate secure rewrite here. if secure && fs.prf != nil { secure = false @@ -5294,19 +5676,30 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( if seq <= fs.state.LastSeq { err = ErrStoreMsgNotFound } - fsUnlock() return false, err } + return fs.removeMsgFromBlock(mb, seq, secure, viaLimits) +} + +// Remove a message from the given block, optionally rewriting the mb file. +// fs lock should be held. +func (fs *fileStore) removeMsgFromBlock(mb *msgBlock, seq uint64, secure, viaLimits bool) (removed bool, rerr error) { mb.mu.Lock() // See if we are closed or the sequence number is still relevant or if we know its deleted. if mb.closed || seq < atomic.LoadUint64(&mb.first.seq) || mb.dmap.Exists(seq) { mb.mu.Unlock() - fsUnlock() return false, nil } + // Persist any write errors. + defer func() { + if rerr != nil { + fs.setWriteErr(rerr) + } + }() + fifo := seq == atomic.LoadUint64(&mb.first.seq) isLastBlock := mb == fs.lmb isEmpty := mb.msgs == 1 // ... about to be zero though. @@ -5318,7 +5711,6 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( if mb.cacheNotLoaded() { if err := mb.loadMsgsWithLock(); err != nil { mb.mu.Unlock() - fsUnlock() return false, err } didLoad = true @@ -5341,9 +5733,8 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( if err != nil { finishedWithCache() mb.mu.Unlock() - fsUnlock() // Mimic err behavior from above check to dmap. No error returned if already removed. - if err == errDeletedMsg { + if err == ErrStoreMsgNotFound || err == errDeletedMsg { err = nil } return false, err @@ -5365,13 +5756,15 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( mb.mu.Unlock() // Only safe way to checkLastBlock is to unlock here... lmb, err := fs.checkLastBlock(emptyRecordLen) if err != nil { + mb.mu.Lock() finishedWithCache() - fsUnlock() + mb.mu.Unlock() return false, err } if err := lmb.writeTombstone(seq, ts); err != nil { + mb.mu.Lock() finishedWithCache() - fsUnlock() + mb.mu.Unlock() return false, err } mb.mu.Lock() // We'll need the lock back to carry on safely. @@ -5390,7 +5783,6 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( if err != nil { finishedWithCache() mb.mu.Unlock() - fsUnlock() return false, err } // Need to copy the subject, as eraseMsg will overwrite the cache and we won't @@ -5399,7 +5791,6 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( if err := mb.eraseMsg(seq, int(ri), int(msz), isLastBlock); err != nil { finishedWithCache() mb.mu.Unlock() - fsUnlock() return false, err } } @@ -5431,10 +5822,18 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) ( fs.dirty++ // If we are tracking subjects here make sure we update that accounting. 
- mb.ensurePerSubjectInfoLoaded()
+ if err = mb.ensurePerSubjectInfoLoaded(); err != nil {
+ finishedWithCache()
+ mb.mu.Unlock()
+ return false, err
+ }
 // If we are tracking multiple subjects here make sure we update that accounting.
- mb.removeSeqPerSubject(subj, seq)
+ if _, err = mb.removeSeqPerSubject(subj, seq); err != nil {
+ finishedWithCache()
+ mb.mu.Unlock()
+ return false, err
+ }
 fs.removePerSubject(subj)
 if fs.ttls != nil && ttl > 0 {
 expires := time.Duration(ts) + (time.Second * time.Duration(ttl))
@@ -5462,7 +5861,11 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (
 // Note that we do not have to store empty records for the deleted, so don't use to calculate.
 // TODO(dlc) - This should not be inline, should kick the sync routine.
 if !isLastBlock && mb.shouldCompactInline() {
- mb.compact()
+ if err = mb.compact(); err != nil {
+ finishedWithCache()
+ mb.mu.Unlock()
+ return false, err
+ }
 }
 }
@@ -5478,7 +5881,11 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (
 var firstSeqNeedsUpdate bool
 if isEmpty {
 // This writes tombstone iff mb == lmb, so no need to do above.
- fs.removeMsgBlock(mb)
+ if err = fs.removeMsgBlock(mb); err != nil {
+ finishedWithCache()
+ mb.mu.Unlock()
+ return false, err
+ }
 firstSeqNeedsUpdate = seq == fs.state.FirstSeq
 }
 finishedWithCache()
@@ -5488,7 +5895,9 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (
 // then we need to jump message blocks. We will also write the index so
 // we don't lose track of the first sequence.
 if firstSeqNeedsUpdate {
- fs.selectNextFirst()
+ if err = fs.selectNextFirst(); err != nil {
+ return false, err
+ }
 }

 if cb := fs.scb; cb != nil {
@@ -5498,17 +5907,58 @@ func (fs *fileStore) removeMsg(seq uint64, secure, viaLimits, needFSLock bool) (
 delta := int64(msz)
 cb(-1, -delta, seq, subj)
- if !needFSLock {
- fs.mu.Lock()
- }
- } else if needFSLock {
- // We acquired it so release it.
- fs.mu.Unlock()
+ fs.mu.Lock()
 }

 return true, nil
}

+// Remove all messages in the range [first, last].
+// Lock should be held.
+func (fs *fileStore) removeMsgsInRange(first, last uint64, viaLimits bool) error {
+ last = min(last, fs.state.LastSeq)
+ if first > last {
+ return nil
+ }
+
+ firstBlock := sort.Search(len(fs.blks), func(i int) bool {
+ return atomic.LoadUint64(&fs.blks[i].last.seq) >= first
+ })
+ if firstBlock >= len(fs.blks) {
+ return nil
+ }
+
+ for i := firstBlock; i < len(fs.blks); {
+ mb := fs.blks[i]
+ mbFirstSeq := atomic.LoadUint64(&mb.first.seq)
+ mbLastSeq := atomic.LoadUint64(&mb.last.seq)
+ if mbFirstSeq > last {
+ break
+ }
+ if mbFirstSeq >= first && mbLastSeq <= last && mb.numPriorTombs() == 0 {
+ // If this block stores no tombstones for previous blocks,
+ // and its sequences are within the range to be removed,
+ // we can get rid of the block entirely. To do that we use
+ // purgeMsgBlock, which also removes the block from fs.blks.
+ // After purgeMsgBlock, i will be the index of the following
+ // msgBlock, if any. Therefore, continue without incrementing i.
+ if err := fs.purgeMsgBlock(mb); err != nil {
+ return err
+ }
+ } else {
+ from := max(first, mbFirstSeq)
+ to := min(last, mbLastSeq)
+ for seq := from; seq <= to; seq++ {
+ if _, err := fs.removeMsgFromBlock(mb, seq, false, viaLimits); err != nil {
+ return err
+ }
+ }
+ i++
+ }
+ }
+ return nil
+}
+
 // Tests whether we should try to compact this block while inline removing msgs.
// We will want rbytes to be over the minimum and have a 2x potential savings. // If we compacted before but rbytes didn't improve much, guard against constantly compacting. @@ -5527,8 +5977,8 @@ func (mb *msgBlock) shouldCompactSync() bool { // This will compact and rewrite this block. This version will not process any tombstone cleanup. // Write lock needs to be held. -func (mb *msgBlock) compact() { - mb.compactWithFloor(0) +func (mb *msgBlock) compact() error { + return mb.compactWithFloor(0, nil) } // This will compact and rewrite this block. This should only be called when we know we want to rewrite this block. @@ -5536,7 +5986,7 @@ func (mb *msgBlock) compact() { // writing new messages. We will silently bail on any issues with the underlying block and let someone else detect. // if fseq > 0 we will attempt to cleanup stale tombstones. // Write lock needs to be held. -func (mb *msgBlock) compactWithFloor(floor uint64) error { +func (mb *msgBlock) compactWithFloor(floor uint64, fsDmap *avl.SequenceSet) error { wasLoaded := mb.cache != nil && mb.cacheAlreadyLoaded() if !wasLoaded { if err := mb.loadMsgsWithLock(); err != nil { @@ -5590,7 +6040,9 @@ func (mb *msgBlock) compactWithFloor(floor uint64) error { // If this entry is for a lower seq than ours then keep around. // We also check that it is greater than our floor. Floor is zero on normal // calls to compact. - if seq < fseq && seq >= floor { + // If the global delete map is set, check if a tombstone is still + // referencing a message in another block. If not, it can be removed. + if seq < fseq && seq >= floor && (fsDmap == nil || fsDmap.Exists(seq)) { nbuf = append(nbuf, buf[index:index+rl]...) } } else { @@ -5645,11 +6097,11 @@ func (mb *msgBlock) compactWithFloor(floor uint64) error { err := os.WriteFile(mfn, nbuf, defaultFilePerms) dios <- struct{}{} if err != nil { - os.Remove(mfn) + _ = os.Remove(mfn) return err } if err := os.Rename(mfn, mb.mfn); err != nil { - os.Remove(mfn) + _ = os.Remove(mfn) return err } @@ -5827,7 +6279,8 @@ func (mb *msgBlock) flushLoop(fch, qch chan struct{}) { ts *= 2 } - mb.flushPendingMsgs() + // Ignore error here, the error is persisted as mb.werr and will be bubbled up later. + _ = mb.flushPendingMsgs() } // Check if we are no longer the last message block. If we are @@ -5979,11 +6432,17 @@ func (mb *msgBlock) truncate(tseq uint64, ts int64) (nmsgs, nbytes uint64, err e return 0, 0, err } } else if mb.mfd != nil { - mb.mfd.Truncate(eof) - mb.mfd.Sync() + if err = mb.mfd.Truncate(eof); err != nil { + return 0, 0, err + } + if err = mb.mfd.Sync(); err != nil { + return 0, 0, err + } // Update our checksum. var lchk [8]byte - mb.mfd.ReadAt(lchk[:], eof-8) + if _, err = mb.mfd.ReadAt(lchk[:], eof-8); err != nil { + return 0, 0, err + } copy(mb.lchk[0:], lchk[:]) } else { return 0, 0, fmt.Errorf("failed to truncate msg block %d, file not open", mb.index) @@ -5997,7 +6456,9 @@ func (mb *msgBlock) truncate(tseq uint64, ts int64) (nmsgs, nbytes uint64, err e mb.clearCacheAndOffset() // Redo per subject info for this block. - mb.resetPerSubjectInfo() + if err = mb.resetPerSubjectInfo(); err != nil { + return purged, bytes, err + } // Load msgs again. return purged, bytes, mb.loadMsgsWithLock() @@ -6049,7 +6510,7 @@ func (mb *msgBlock) selectNextFirst() { // Select the next FirstSeq // Also cleans up empty blocks at the start only containing tombstones. // Lock should be held. 
-func (fs *fileStore) selectNextFirst() { +func (fs *fileStore) selectNextFirst() error { if len(fs.blks) > 0 { for len(fs.blks) > 1 { mb := fs.blks[0] @@ -6059,8 +6520,11 @@ func (fs *fileStore) selectNextFirst() { mb.mu.Unlock() break } - fs.forceRemoveMsgBlock(mb) + err := fs.forceRemoveMsgBlock(mb) mb.mu.Unlock() + if err != nil { + return err + } } mb := fs.blks[0] mb.mu.RLock() @@ -6078,6 +6542,7 @@ func (fs *fileStore) selectNextFirst() { } // Mark first as moved. Plays into tombstone cleanup for syncBlocks. fs.firstMoved = true + return nil } // Lock should be held. @@ -6086,7 +6551,7 @@ func (mb *msgBlock) resetCacheExpireTimer(td time.Duration) { td = mb.cexp + 100*time.Millisecond } if mb.ctmr == nil { - mb.ctmr = time.AfterFunc(td, mb.expireCache) + mb.ctmr = time.AfterFunc(td, mb.tryExpireCache) } else { mb.ctmr.Reset(td) } @@ -6134,10 +6599,10 @@ func (mb *msgBlock) clearCache() { } // Called to possibly expire a message block cache. -func (mb *msgBlock) expireCache() { +func (mb *msgBlock) tryExpireCache() { mb.mu.Lock() defer mb.mu.Unlock() - mb.expireCacheLocked() + mb.tryExpireCacheLocked() } func (mb *msgBlock) tryForceExpireCache() { @@ -6148,10 +6613,10 @@ func (mb *msgBlock) tryForceExpireCache() { // We will attempt to force expire this by temporarily clearing the last load time. func (mb *msgBlock) tryForceExpireCacheLocked() { - llts, lwts := mb.llts, mb.lwts - mb.llts, mb.lwts = 0, 0 - mb.expireCacheLocked() - mb.llts, mb.lwts = llts, lwts + llts := mb.llts + mb.llts = 0 + mb.tryExpireCacheLocked() + mb.llts = llts } // This is for expiration of the write cache, which will be partial with fip. @@ -6163,7 +6628,7 @@ func (mb *msgBlock) tryExpireWriteCache() []byte { } lwts, buf, llts, nra := mb.lwts, mb.cache.buf, mb.llts, mb.cache.nra mb.lwts, mb.cache.nra = 0, true - mb.expireCacheLocked() + mb.tryExpireCacheLocked() mb.lwts = lwts if mb.cache != nil { mb.cache.nra = nra @@ -6178,7 +6643,7 @@ func (mb *msgBlock) tryExpireWriteCache() []byte { } // Lock should be held. -func (mb *msgBlock) expireCacheLocked() { +func (mb *msgBlock) tryExpireCacheLocked() { var strengthened bool if mb.cache == nil { mb.cache = mb.ecache.Value() @@ -6516,10 +6981,16 @@ func (fs *fileStore) runMsgScheduling() { } fs.scheduling.running = true - scheduledMsgs := fs.scheduling.getScheduledMessages(func(seq uint64, smv *StoreMsg) *StoreMsg { - sm, _ := fs.msgForSeqLocked(seq, smv, false) - return sm - }) + scheduledMsgs := fs.scheduling.getScheduledMessages( + func(seq uint64, smv *StoreMsg) *StoreMsg { + sm, _ := fs.msgForSeqLocked(seq, smv, false) + return sm + }, + func(subj string, smv *StoreMsg) *StoreMsg { + sm, _ := fs.loadLastLocked(subj, smv) + return sm + }, + ) if len(scheduledMsgs) > 0 { fs.mu.Unlock() for _, msg := range scheduledMsgs { @@ -6533,44 +7004,77 @@ func (fs *fileStore) runMsgScheduling() { } // Lock should be held. -func (fs *fileStore) checkAndFlushLastBlock() { +func (fs *fileStore) checkAndFlushLastBlock() error { lmb := fs.lmb if lmb == nil { - return + return nil } - if lmb.pendingWriteSize() > 0 { - // Since fs lock is held need to pull this apart in case we need to rebuild state. 
- lmb.mu.Lock() - ld, _ := lmb.flushPendingMsgsLocked() + lmb.mu.Lock() + if err := lmb.werr; err != nil { lmb.mu.Unlock() - if ld != nil { - fs.rebuildStateLocked(ld) - } + return err + } + + if lmb.pendingWriteSizeLocked() == 0 { + lmb.mu.Unlock() + return nil + } + ld, err := lmb.flushPendingMsgsLocked() + lmb.mu.Unlock() + if err != nil { + return err + } + // Since fs lock is held need to unlock the mb in case we need to rebuild state. + if ld != nil { + fs.rebuildStateLocked(ld) } + return nil } // This will check all the checksums on messages and report back any sequence numbers with errors. -func (fs *fileStore) checkMsgs() *LostStreamData { +func (fs *fileStore) checkMsgs() (*LostStreamData, error) { fs.mu.Lock() defer fs.mu.Unlock() - fs.checkAndFlushLastBlock() + var firstErr error + storeErr := func(err error) error { + fs.warn("checkMsgs: %v", err) + if firstErr == nil { + firstErr = err + } + return err + } + + if err := fs.checkAndFlushLastBlock(); err != nil { + return nil, storeErr(fmt.Errorf("flush of last block failed: %w", err)) + } // Clear any global subject state. fs.psim, fs.tsl = fs.psim.Empty(), 0 for _, mb := range fs.blks { // Make sure encryption loaded if needed for the block. - fs.loadEncryptionForMsgBlock(mb) + if err := fs.loadEncryptionForMsgBlock(mb); err != nil { + _ = storeErr(fmt.Errorf("loading encryption for block %d failed: %w", mb.index, err)) + continue + } // FIXME(dlc) - check tombstones here too? - if ld, _, err := mb.rebuildState(); err != nil && ld != nil { + ld, _, err := mb.rebuildState() + if err != nil { + _ = storeErr(fmt.Errorf("rebuildState for block %d failed: %w", mb.index, err)) + continue + } + if ld != nil { // Rebuild fs state too. fs.rebuildStateLocked(ld) } - fs.populateGlobalPerSubjectInfo(mb) + if err = fs.populateGlobalPerSubjectInfo(mb); err != nil { + _ = storeErr(fmt.Errorf("populating per-subject info for block %d failed: %w", mb.index, err)) + continue + } } - return fs.ld + return fs.ld, firstErr } // Lock should be held. @@ -6621,7 +7125,36 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte // Will write the message record to the underlying message block. // filestore lock will be held. // mb lock should be held. -func (mb *msgBlock) writeMsgRecordLocked(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush, kick bool) error { +func (mb *msgBlock) writeMsgRecordLocked(rl, seq uint64, subj string, mhdr, msg []byte, ts int64, flush, kick bool) (rerr error) { + // Return previous write errors immediately. + if mb.werr != nil { + return mb.werr + } + // Persist any returned errors to be used in the future. + defer func() { + if rerr != nil && mb.werr == nil { + // Read/decode errors surfaced from existing on-disk data are not write failures. + // Log and continue instead of disabling writes. + if isReadErr(rerr) { + mb.fs.rateLimitWarn("Ignoring non-write error: %v", rerr) + assert.Unreachable("Filestore msg block encountered read error", map[string]any{ + "name": mb.fs.cfg.Name, + "mb.index": mb.index, + "err": rerr, + "stack": string(debug.Stack()), + }) + return + } + mb.werr = rerr + assert.Unreachable("Filestore msg block encountered write error", map[string]any{ + "name": mb.fs.cfg.Name, + "mb.index": mb.index, + "err": rerr, + "stack": string(debug.Stack()), + }) + } + }() + // Enable for writing if our mfd is not open. 
if mb.mfd == nil { if err := mb.enableForWriting(flush && kick); err != nil { @@ -6754,9 +7287,12 @@ func (mb *msgBlock) writeMsgRecordLocked(rl, seq uint64, subj string, mhdr, msg } fch, werr := mb.fch, mb.werr + if werr != nil { + return werr + } // If we should be flushing, or had a write error, do so here. - if (flush && mb.fs.fip) || werr != nil { + if flush && mb.fs.fip { ld, err := mb.flushPendingMsgsLocked() if ld != nil { // We have the mb lock here, this needs the mb locks so do in its own go routine. @@ -6811,7 +7347,7 @@ func (mb *msgBlock) closeFDsLocked() error { func (mb *msgBlock) closeFDsLockedNoCheck() { if mb.mfd != nil { - mb.mfd.Close() + _ = mb.mfd.Close() mb.mfd = nil } } @@ -7004,8 +7540,8 @@ func (mb *msgBlock) atomicOverwriteFile(buf []byte, allowCompress bool) error { } errorCleanup := func(err error) error { - tmpFD.Close() - os.Remove(tmpFN) + _ = tmpFD.Close() + _ = os.Remove(tmpFN) return err } @@ -7123,12 +7659,25 @@ func (fs *fileStore) syncBlocks() { return } fs.mu.Lock() + if err := fs.werr; err != nil { + fs.mu.Unlock() + return + } blks := append([]*msgBlock(nil), fs.blks...) lmb, firstMoved, firstSeq := fs.lmb, fs.firstMoved, fs.state.FirstSeq // Clear first moved. fs.firstMoved = false fs.mu.Unlock() + storeFsWerr := func(err error) { + fs.mu.Lock() + defer fs.mu.Unlock() + fs.setWriteErr(err) + } + + var fsDmapLoaded bool + var fsDmap avl.SequenceSet + var markDirty bool for _, mb := range blks { // Do actual sync. Hold lock for consistency. @@ -7137,9 +7686,19 @@ func (fs *fileStore) syncBlocks() { mb.mu.Unlock() continue } + // Bubble up an individual block error into the broader filestore. + if err := mb.werr; err != nil { + mb.mu.Unlock() + storeFsWerr(err) + continue + } // See if we can close FDs due to being idle. if mb.mfd != nil && mb.sinceLastWriteActivity() > closeFDsIdle && mb.pendingWriteSizeLocked() == 0 { - mb.dirtyCloseWithRemove(false) + if err := mb.dirtyCloseWithRemove(false); err != nil { + mb.mu.Unlock() + storeFsWerr(err) + continue + } } // If our first has moved and we are set to noCompact (which is from tombstones), // clear so that we might cleanup tombstones. @@ -7155,31 +7714,60 @@ func (fs *fileStore) syncBlocks() { } // Flush anything that may be pending. - mb.flushPendingMsgsLocked() + if _, err := mb.flushPendingMsgsLocked(); err != nil { + mb.mu.Unlock() + storeFsWerr(err) + continue + } // Check if we need to sync. We will not hold lock during actual sync. needSync := mb.needSync + + // Reset. Because we let go of the lock, we could write new data to this mb which might or + // might not be synced later if we would've reset after letting go of the lock. + mb.needSync = false mb.mu.Unlock() // Check if we should compact here. // Need to hold fs lock in case we reference psim when loading in the mb and we may remove this block if truly empty. if needsCompact { + // Load a delete map containing only interior deletes. + // This is used when compacting to know if tombstones are still relevant, + // and if not they can be compacted. + if !fsDmapLoaded { + fsDmapLoaded = true + fsDmap = fs.deleteMap() + } fs.mu.RLock() mb.mu.Lock() - mb.compactWithFloor(firstSeq) + // If the block has already been removed in the meantime, we can simply skip. + if _, ok := fs.bim[mb.index]; !ok { + mb.mu.Unlock() + fs.mu.RUnlock() + continue + } + err := mb.compactWithFloor(firstSeq, &fsDmap) // If this compact removed all raw bytes due to tombstone cleanup, schedule to remove. 
shouldRemove := mb.rbytes == 0 mb.mu.Unlock() fs.mu.RUnlock() + if err != nil { + storeFsWerr(err) + continue + } // Check if we should remove. This will not be common, so we will re-take fs write lock here vs changing // it above which we would prefer to be a readlock such that other lookups can occur while compacting this block. if shouldRemove { fs.mu.Lock() mb.mu.Lock() - fs.removeMsgBlock(mb) + err = fs.removeMsgBlock(mb) mb.mu.Unlock() fs.mu.Unlock() needSync = false + if err != nil { + storeFsWerr(err) + continue + } } } @@ -7187,25 +7775,39 @@ func (fs *fileStore) syncBlocks() { if needSync { mb.mu.Lock() var fd *os.File + var err error var didOpen bool if mb.mfd != nil { fd = mb.mfd } else { <-dios - fd, _ = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms) + fd, err = os.OpenFile(mb.mfn, os.O_RDWR, defaultFilePerms) dios <- struct{}{} didOpen = true + if err != nil && !os.IsNotExist(err) { + mb.mu.Unlock() + storeFsWerr(err) + continue + } } // If we have an fd. if fd != nil { - canClear := fd.Sync() == nil + if err = fd.Sync(); err != nil { + // Close fd if we opened it, but ignore its error since sync takes precedence. + if didOpen { + _ = fd.Close() + } + mb.mu.Unlock() + storeFsWerr(err) + continue + } // If we opened the file close the fd. if didOpen { - fd.Close() - } - // Only clear sync flag on success. - if canClear { - mb.needSync = false + if err = fd.Close(); err != nil { + mb.mu.Unlock() + storeFsWerr(err) + continue + } } } mb.mu.Unlock() @@ -7216,6 +7818,7 @@ func (fs *fileStore) syncBlocks() { return } fs.mu.Lock() + defer fs.mu.Unlock() fs.setSyncTimer() if markDirty { fs.dirty++ @@ -7224,15 +7827,28 @@ func (fs *fileStore) syncBlocks() { // Sync state file if we are not running with sync always. if !fs.fcfg.SyncAlways { fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) + var fd *os.File + var err error <-dios - fd, _ := os.OpenFile(fn, os.O_RDWR, defaultFilePerms) + fd, err = os.OpenFile(fn, os.O_RDWR, defaultFilePerms) dios <- struct{}{} + if err != nil && !os.IsNotExist(err) { + fs.setWriteErr(err) + return + } if fd != nil { - fd.Sync() - fd.Close() + if err = fd.Sync(); err != nil { + // Close fd, but ignore its error since sync takes precedence. + _ = fd.Close() + fs.setWriteErr(err) + return + } + if err = fd.Close(); err != nil { + fs.setWriteErr(err) + return + } } } - fs.mu.Unlock() } // Select the message block where this message should be found. @@ -7535,16 +8151,13 @@ func (mb *msgBlock) writeAt(buf []byte, woff int64) (int, error) { // flushPendingMsgsLocked writes out any messages for this message block. // Lock should be held. func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { - // Signals us that we need to rebuild filestore state. - var fsLostData *LostStreamData - var weakenCache bool if mb.cache == nil { mb.cache = mb.ecache.Value() weakenCache = mb.cache != nil } - if mb.cache == nil || mb.mfd == nil { - return nil, errNoCache + if mb.cache == nil || mb.mfd == nil || mb.werr != nil { + return nil, mb.werr } buf, err := mb.bytesPending() @@ -7585,9 +8198,16 @@ func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { for lbb := lob; lbb > 0; lbb = len(buf) { n, err := mb.writeAt(buf, wp) if err != nil { - mb.dirtyCloseWithRemove(false) + // Ignore the errors here, we'll try reloading just to figure out and return the lost data if we can. 
+ _ = mb.dirtyCloseWithRemove(false) ld, _, _ := mb.rebuildStateLocked() mb.werr = err + assert.Unreachable("Filestore msg block encountered flush error", map[string]any{ + "name": mb.fs.cfg.Name, + "mb.index": mb.index, + "err": err, + "stack": string(debug.Stack()), + }) return ld, err } // Update our write offset. @@ -7595,12 +8215,9 @@ func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { buf = buf[n:] } - // Clear any error. - mb.werr = nil - // Cache may be gone. if mb.cache == nil || mb.mfd == nil { - return fsLostData, mb.werr + return nil, mb.werr } // Update write pointer. @@ -7608,7 +8225,16 @@ func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { // Check if we are in sync always mode. if mb.syncAlways { - mb.mfd.Sync() + if err = mb.mfd.Sync(); err != nil { + mb.werr = err + assert.Unreachable("Filestore msg block encountered sync error", map[string]any{ + "name": mb.fs.cfg.Name, + "mb.index": mb.index, + "err": err, + "stack": string(debug.Stack()), + }) + return nil, err + } } else { mb.needSync = true } @@ -7618,7 +8244,7 @@ func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { // not releasing the lock during I/O operation. Therefore this will always // return zero. if mb.pendingWriteSizeLocked() > 0 { - return fsLostData, mb.werr + return nil, mb.werr } // Check last access time. If we think the block still has read interest @@ -7629,13 +8255,13 @@ func (mb *msgBlock) flushPendingMsgsLocked() (*LostStreamData, error) { mb.ecache.Weaken() } mb.resetCacheExpireTimer(0) - return fsLostData, mb.werr + return nil, mb.werr } // If not, we'll just drop the cache altogether & recycle the buffer. mb.cache.nra = false - mb.expireCacheLocked() - return fsLostData, mb.werr + mb.tryExpireCacheLocked() + return nil, mb.werr } // Lock should be held. @@ -7785,7 +8411,7 @@ checkCache: if err != nil { mb.fs.warn("loadBlock error: %v", err) if err == errNoBlkData { - if ld, _, err := mb.rebuildStateLocked(); err != nil && ld != nil { + if ld, _, _ := mb.rebuildStateLocked(); ld != nil { // Rebuild fs state too. go mb.fs.rebuildState(ld) } @@ -8028,7 +8654,7 @@ func (mb *msgBlock) cacheLookupEx(seq uint64, sm *StoreMsg, doCopy bool) (*Store } if seq != fsm.seq { // See TestFileStoreInvalidIndexesRebuilt. - mb.tryForceExpireCacheLocked() + mb.clearCacheAndOffset() return nil, fmt.Errorf("sequence numbers for cache load did not match, %d vs %d", seq, fsm.seq) } @@ -8262,8 +8888,12 @@ func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err err fs.mu.RLock() defer fs.mu.RUnlock() + return fs.loadLastLocked(subj, sm) +} - if fs.lmb == nil { +// Lock should be held. +func (fs *fileStore) loadLastLocked(subj string, sm *StoreMsg) (lsm *StoreMsg, err error) { + if fs.isClosed() || fs.lmb == nil { return nil, ErrStoreClosed } @@ -8305,7 +8935,7 @@ func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err err continue } mb.mu.Lock() - if err := mb.ensurePerSubjectInfoLoaded(); err != nil { + if err = mb.ensurePerSubjectInfoLoaded(); err != nil { mb.mu.Unlock() return nil, err } @@ -8319,13 +8949,22 @@ func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err err // Check if we need to recalculate. We only care about the last sequence. if ss.lastNeedsUpdate { // mb is already loaded into the cache so should be fast-ish. 
- mb.recalculateForSubj(subj, ss) + if err = mb.recalculateForSubj(subj, ss); err != nil { + if err != nil { + mb.mu.Unlock() + return nil, err + } + } } l = ss.Last } } if l == 0 { - _, _, l = mb.filteredPendingLocked(subj, wc, atomic.LoadUint64(&mb.first.seq)) + _, _, l, err = mb.filteredPendingLocked(subj, wc, atomic.LoadUint64(&mb.first.seq)) + if err != nil { + mb.mu.Unlock() + return nil, err + } } var didLoad bool if l > 0 { @@ -8368,9 +9007,12 @@ func (fs *fileStore) LoadNextMsgMulti(sl *gsl.SimpleSublist, start uint64, smp * if fs.isClosed() { return nil, 0, ErrStoreClosed } - if sl == nil { + if sl == nil || sl.MatchesFullWildcard() { return fs.LoadNextMsg(_EMPTY_, false, start, smp) } + if filter, ok := sl.MatchesSingleFilter(); ok { + return fs.LoadNextMsg(filter, subjectHasWildcard(filter), start, smp) + } fs.mu.RLock() defer fs.mu.RUnlock() @@ -8465,7 +9107,9 @@ func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *Store // let's check the psim to see if we can skip ahead. if start <= fs.state.FirstSeq { var ss SimpleState - fs.numFilteredPendingNoLast(filter, &ss) + if err := fs.numFilteredPendingNoLast(filter, &ss); err != nil { + return nil, 0, err + } // Nothing available. if ss.Msgs == 0 { return nil, fs.state.LastSeq, ErrStoreEOF @@ -8517,58 +9161,186 @@ func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *Store return nil, fs.state.LastSeq, ErrStoreEOF } -// Will load the next non-deleted msg starting at the start sequence and walking backwards. -func (fs *fileStore) LoadPrevMsg(start uint64, smp *StoreMsg) (sm *StoreMsg, err error) { +// Find the previous matching message. +// fs lock should be held. +func (mb *msgBlock) prevMatching(filter string, wc bool, start uint64, sm *StoreMsg) (*StoreMsg, bool, error) { + mb.mu.Lock() + var updateLLTS bool + defer func() { + if updateLLTS { + mb.llts = ats.AccessTime() + } + mb.finishedWithCache() + mb.mu.Unlock() + }() + + end, isAll := start, filter == _EMPTY_ || filter == fwcs + + var didLoad bool + if mb.fssNotLoaded() { + if err := mb.loadMsgsWithLock(); err != nil { + return nil, false, err + } + didLoad = true + } + mb.lsts = ats.AccessTime() + + if filter == _EMPTY_ { + filter = fwcs + wc = true + } + + if !isAll && mb.fss.Size() == 1 { + if !wc { + _, isAll = mb.fss.Find(stringToBytes(filter)) + } else { + mb.fss.Match(stringToBytes(filter), func(subject []byte, _ *SimpleState) { + isAll = true + }) + } + if !isAll { + return nil, didLoad, ErrStoreMsgNotFound + } + } + + lseq := atomic.LoadUint64(&mb.first.seq) + end = min(end, atomic.LoadUint64(&mb.last.seq)) + + var isMatch func(subj string) bool + if wc { + _tsa, _fsa := [32]string{}, [32]string{} + tsa, fsa := _tsa[:0], tokenizeSubjectIntoSlice(_fsa[:0], filter) + isMatch = func(subj string) bool { + tsa = tokenizeSubjectIntoSlice(tsa[:0], subj) + return isSubsetMatchTokenized(tsa, fsa) + } + } + + subjs := mb.fs.cfg.Subjects + doLinearScan := isAll || (wc && len(subjs) == 1 && subjs[0] == filter) + if !doLinearScan && wc && mb.cacheAlreadyLoaded() { + doLinearScan = mb.fss.Size()*4 > int(end-lseq) + } + + if !doLinearScan { + var found bool + var first, last uint64 + if bfilter := stringToBytes(filter); wc { + var ierr error + mb.fss.Match(bfilter, func(bsubj []byte, ss *SimpleState) { + if ierr != nil { + return + } + if ss.firstNeedsUpdate || ss.lastNeedsUpdate { + if ierr = mb.recalculateForSubj(bytesToString(bsubj), ss); ierr != nil { + return + } + } + if end < ss.First { + return + } + if !found { + found = true + 
first, last = ss.First, min(end, ss.Last) + return + } + first = min(first, ss.First) + last = max(last, min(end, ss.Last)) + }) + if ierr != nil { + return nil, false, ierr + } + } else if ss, _ := mb.fss.Find(bfilter); ss != nil { + if ss.firstNeedsUpdate || ss.lastNeedsUpdate { + if err := mb.recalculateForSubj(filter, ss); err != nil { + return nil, false, err + } + } + if end >= ss.First { + found = true + first, last = ss.First, min(end, ss.Last) + } + } + if !found || first > last { + return nil, didLoad, ErrStoreMsgNotFound + } + lseq, end = max(lseq, first), last + } + + if mb.cacheNotLoaded() { + if err := mb.loadMsgsWithLock(); err != nil { + return nil, false, err + } + didLoad = true + } + + if sm == nil { + sm = new(StoreMsg) + } + + for seq := end; seq >= lseq; seq-- { + if mb.dmap.Exists(seq) { + updateLLTS = true + continue + } + llseq := mb.llseq + fsm, err := mb.cacheLookup(seq, sm) + if err != nil { + if err == errPartialCache || err == errNoCache { + return nil, false, err + } + continue + } + updateLLTS = false + expireOk := seq == lseq && mb.llseq != llseq && mb.llseq == seq + if isAll { + return fsm, expireOk, nil + } + if wc && isMatch(sm.subj) { + return fsm, expireOk, nil + } else if !wc && fsm.subj == filter { + return fsm, expireOk, nil + } + mb.llseq = llseq + } + + return nil, didLoad, ErrStoreMsgNotFound +} + +// Will load the previous message matching the filter subject, starting at the start sequence and walking backwards. +func (fs *fileStore) LoadPrevMsg(filter string, wc bool, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) { if fs.isClosed() { - return nil, ErrStoreClosed + return nil, 0, ErrStoreClosed } fs.mu.RLock() defer fs.mu.RUnlock() if fs.state.Msgs == 0 || start < fs.state.FirstSeq { - return nil, ErrStoreEOF + return nil, fs.state.FirstSeq, ErrStoreEOF } if start > fs.state.LastSeq { start = fs.state.LastSeq } - if smp == nil { - smp = new(StoreMsg) - } if bi, _ := fs.selectMsgBlockWithIndex(start); bi >= 0 { for i := bi; i >= 0; i-- { mb := fs.blks[i] - mb.mu.Lock() - // Need messages loaded from here on out. - if mb.cacheNotLoaded() { - if err := mb.loadMsgsWithLock(); err != nil { - mb.mu.Unlock() - return nil, err - } - } - - lseq, fseq := atomic.LoadUint64(&mb.last.seq), atomic.LoadUint64(&mb.first.seq) - if start > lseq { - start = lseq - } - for seq := start; seq >= fseq; seq-- { - if mb.dmap.Exists(seq) { - continue - } - if sm, err := mb.cacheLookup(seq, smp); err == nil { - mb.finishedWithCache() - mb.mu.Unlock() - return sm, nil + if sm, expireOk, err := mb.prevMatching(filter, wc, start, smp); err == nil { + if expireOk { + mb.tryForceExpireCache() } + return sm, sm.seq, nil + } else if err != ErrStoreMsgNotFound { + return nil, 0, err + } else if expireOk { + mb.tryForceExpireCache() } - mb.finishedWithCache() - mb.mu.Unlock() } } - return nil, ErrStoreEOF + return nil, fs.state.FirstSeq, ErrStoreEOF } // LoadPrevMsgMulti will find the previous message matching any entry in the sublist. 
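// Illustrative sketch, not part of this patch: the kind of tokenized subject
// wildcard matching prevMatching and LoadPrevMsg rely on (isSubsetMatchTokenized
// in the server). This is a simplified re-implementation of NATS-style subject
// semantics for illustration only, not the server's code.
package main

import (
	"fmt"
	"strings"
)

// matches reports whether subject matches filter, where "*" matches exactly one
// token and a trailing ">" matches one or more remaining tokens.
func matches(subject, filter string) bool {
	st := strings.Split(subject, ".")
	ft := strings.Split(filter, ".")
	for i, f := range ft {
		if f == ">" {
			return i < len(st)
		}
		if i >= len(st) {
			return false
		}
		if f != "*" && f != st[i] {
			return false
		}
	}
	return len(st) == len(ft)
}

func main() {
	fmt.Println(matches("orders.eu.created", "orders.*.created")) // true
	fmt.Println(matches("orders.eu.created", "orders.>"))         // true
	fmt.Println(matches("orders", "orders.>"))                    // false
}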
@@ -8577,9 +9349,11 @@ func (fs *fileStore) LoadPrevMsgMulti(sl *gsl.SimpleSublist, start uint64, smp * return nil, 0, ErrStoreClosed } - if sl == nil { - sm, err = fs.LoadPrevMsg(start, smp) - return + if sl == nil || sl.MatchesFullWildcard() { + return fs.LoadPrevMsg(_EMPTY_, false, start, smp) + } + if filter, ok := sl.MatchesSingleFilter(); ok { + return fs.LoadPrevMsg(filter, subjectHasWildcard(filter), start, smp) } fs.mu.RLock() defer fs.mu.RUnlock() @@ -8953,6 +9727,15 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint } } + // Persist any write errors. + defer func() { + if err != nil { + fs.mu.Lock() + fs.setWriteErr(err) + fs.mu.Unlock() + } + }() + // Make sure to not leave subject if empty and we reach this spot. if subject == _EMPTY_ { subject = fwcs @@ -8965,9 +9748,9 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint // If we have a "keep" designation need to get full filtered state so we know how many to purge. var maxp uint64 if keep > 0 { - ss := fs.FilteredState(1, subject) - if keep >= ss.Msgs { - return 0, nil + ss, err := fs.FilteredState(1, subject) + if err != nil || keep >= ss.Msgs { + return 0, err } maxp = ss.Msgs - keep } @@ -8976,17 +9759,65 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint var tombs []msgId var lowSeq uint64 + if fs.isClosed() { + return purged, ErrStoreClosed + } fs.mu.Lock() + // Always return previous write errors. + if err := fs.werr; err != nil { + fs.mu.Unlock() + return purged, err + } + if len(fs.blks) == 0 || fs.lmb == nil { + fs.mu.Unlock() + return purged, nil + } + + var start, stop uint32 + + // If literal subject check for presence. + if wc { + start = fs.lmb.index + fs.psim.Match(stringToBytes(subject), func(_ []byte, psi *psi) { + // Keep track of start and stop indexes for this subject. + if psi.fblk < start { + start = psi.fblk + } + if psi.lblk > stop { + stop = psi.lblk + } + }) + // None matched. + if stop == 0 { + fs.mu.Unlock() + return purged, nil + } + } else if info, ok := fs.psim.Find(stringToBytes(subject)); ok { + start, stop = info.fblk, info.lblk + } else { + fs.mu.Unlock() + return purged, nil + } + // We may remove blocks as we purge, so don't range directly on fs.blks // otherwise we may jump over some (see https://github.com/nats-io/nats-server/issues/3528) for i := 0; i < len(fs.blks); i++ { mb := fs.blks[i] + // Skip if not within our range for the purge subject. + if mb.index < start || mb.index > stop { + continue + } mb.mu.Lock() // If we do not have our fss, try to expire the cache if we have no items in this block. shouldExpire := mb.fssNotLoaded() - t, f, l := mb.filteredPendingLocked(subject, wc, atomic.LoadUint64(&mb.first.seq)) + t, f, l, err := mb.filteredPendingLocked(subject, wc, atomic.LoadUint64(&mb.first.seq)) + if err != nil { + mb.mu.Unlock() + fs.mu.Unlock() + return purged, err + } if t == 0 { // Expire if we were responsible for loading. if shouldExpire { @@ -9035,7 +9866,12 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint bytes += rl } // PSIM and FSS updates. - nr := mb.removeSeqPerSubject(sm.subj, seq) + nr, err := mb.removeSeqPerSubject(sm.subj, seq) + if err != nil { + mb.mu.Unlock() + fs.mu.Unlock() + return purged, err + } nrg = fs.removePerSubject(sm.subj) // Track tombstones we need to write. 
@@ -9050,7 +9886,11 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint if mb.isEmpty() { // Since we are removing this block don't need to write tombstones. tombs = tombs[:te] - fs.removeMsgBlock(mb) + if err = fs.removeMsgBlock(mb); err != nil { + mb.mu.Unlock() + fs.mu.Unlock() + return 0, err + } i-- // keep flag set, if set previously firstSeqNeedsUpdate = firstSeqNeedsUpdate || seq == fs.state.FirstSeq @@ -9097,7 +9937,10 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint } } if firstSeqNeedsUpdate { - fs.selectNextFirst() + if err = fs.selectNextFirst(); err != nil { + fs.mu.Unlock() + return purged, err + } } // Update the last purgeEx call time. @@ -9107,14 +9950,17 @@ func (fs *fileStore) PurgeEx(subject string, sequence, keep uint64) (purged uint // When writing multiple tombstones we will flush at the end. if len(tombs) > 0 { for _, tomb := range tombs { - if err := fs.writeTombstoneNoFlush(tomb.seq, tomb.ts); err != nil { + if err = fs.writeTombstoneNoFlush(tomb.seq, tomb.ts); err != nil { fs.mu.Unlock() return purged, err } } // Flush any pending. If we change blocks the newMsgBlockForWrite() will flush any pending for us. if lmb := fs.lmb; lmb != nil { - lmb.flushPendingMsgs() + if err = lmb.flushPendingMsgs(); err != nil { + fs.mu.Unlock() + return purged, err + } } } @@ -9140,14 +9986,29 @@ func (fs *fileStore) Purge() (uint64, error) { return fs.purge(0) } -func (fs *fileStore) purge(fseq uint64) (uint64, error) { +func (fs *fileStore) purge(fseq uint64) (purged uint64, rerr error) { if fs.isClosed() { return 0, ErrStoreClosed } + // Persist any write errors. + defer func() { + if rerr != nil { + fs.mu.Lock() + fs.setWriteErr(rerr) + fs.mu.Unlock() + } + }() + fs.mu.Lock() - purged := fs.state.Msgs + // Always return previous write errors. + if err := fs.werr; err != nil { + fs.mu.Unlock() + return 0, err + } + + purged = fs.state.Msgs rbytes := int64(fs.state.Bytes) fs.state.FirstSeq = fs.state.LastSeq + 1 @@ -9160,54 +10021,18 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { mb.dirtyClose() } - fs.blks = nil - fs.lmb = nil - fs.bim = make(map[uint32]*msgBlock) - // Clear any per subject tracking. - fs.psim, fs.tsl = fs.psim.Empty(), 0 - fs.sdm.empty() - // Mark dirty. - fs.dirty++ - - // Move the msgs directory out of the way, will delete out of band. - // FIXME(dlc) - These can error and we need to change api above to propagate? - mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) - pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) - // If purge directory still exists then we need to wait - // in place and remove since rename would fail. - if _, err := os.Stat(pdir); err == nil { - <-dios - os.RemoveAll(pdir) - dios <- struct{}{} + // Check if we need to set the first seq to a new number. + if fseq > fs.state.FirstSeq { + fs.state.FirstSeq = fseq + fs.state.LastSeq = fseq - 1 } - <-dios - os.Rename(mdir, pdir) - dios <- struct{}{} - - go func() { - <-dios - os.RemoveAll(pdir) - dios <- struct{}{} - }() - - // Create new one. - <-dios - os.MkdirAll(mdir, defaultDirPerms) - dios <- struct{}{} - // Make sure we have a lmb to write to. if _, err := fs.newMsgBlockForWrite(); err != nil { fs.mu.Unlock() return purged, err } - // Check if we need to set the first seq to a new number. 
- if fseq > fs.state.FirstSeq { - fs.state.FirstSeq = fseq - fs.state.LastSeq = fseq - 1 - } - lmb := fs.lmb atomic.StoreUint64(&lmb.first.seq, fs.state.FirstSeq) atomic.StoreUint64(&lmb.last.seq, fs.state.LastSeq) @@ -9216,7 +10041,107 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { if lseq := atomic.LoadUint64(&lmb.last.seq); lseq > 0 { // Leave a tombstone so we can remember our starting sequence in case // full state becomes corrupted. - fs.writeTombstone(lseq, lmb.last.ts) + if err := fs.writeTombstone(lseq, lmb.last.ts); err != nil { + fs.mu.Unlock() + return purged, err + } + } + // Close FDs since we'll move the file. We re-enable the FD after the purge is complete. + if err := lmb.flushPendingMsgs(); err != nil { + fs.mu.Unlock() + return purged, err + } + if err := lmb.closeFDs(); err != nil { + fs.mu.Unlock() + return purged, err + } + + fs.blks = nil + fs.lmb = nil + fs.bim = make(map[uint32]*msgBlock) + // Clear any per subject tracking. + fs.psim, fs.tsl = fs.psim.Empty(), 0 + fs.sdm.empty() + // Mark dirty. + fs.dirty++ + fs.addMsgBlock(lmb) + + // Move the msgs directory out of the way, will delete out of band. + mdir := filepath.Join(fs.fcfg.StoreDir, msgDir) + ndir := filepath.Join(fs.fcfg.StoreDir, newMsgDir) + pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) + <-dios + // If purge directory still exists then we need to wait + // in place and remove since rename would fail. + if _, err := os.Stat(ndir); err == nil { + if err = os.RemoveAll(ndir); err != nil { + dios <- struct{}{} + fs.mu.Unlock() + return purged, err + } + } else if !os.IsNotExist(err) { + dios <- struct{}{} + fs.mu.Unlock() + return purged, err + } + if _, err := os.Stat(pdir); err == nil { + if err = os.RemoveAll(pdir); err != nil { + dios <- struct{}{} + fs.mu.Unlock() + return purged, err + } + } else if !os.IsNotExist(err) { + dios <- struct{}{} + fs.mu.Unlock() + return purged, err + } + + // Create directory to move the new tombstone to. + if err := os.MkdirAll(ndir, defaultDirPerms); err != nil { + dios <- struct{}{} + fs.mu.Unlock() + return purged, err + } + // Move out the block containing the tombstone. Also move the key file if encrypted. + // The block file itself MUST be moved last to ensure we can assume the prior renames + // were successful during recovery. + for _, mbf := range []string{fmt.Sprintf(keyScan, lmb.index), fmt.Sprintf(blkScan, lmb.index)} { + b := filepath.Join(mdir, mbf) + a := filepath.Join(ndir, mbf) + if err := os.Rename(b, a); err != nil && !os.IsNotExist(err) { + dios <- struct{}{} + fs.mu.Unlock() + return purged, err + } + } + // Purge all remaining messages. + if err := os.Rename(mdir, pdir); err != nil { + dios <- struct{}{} + fs.mu.Unlock() + return purged, err + } + // Rename the directory back to be left only with the tombstone. + if err := os.Rename(ndir, mdir); err != nil { + dios <- struct{}{} + fs.mu.Unlock() + return purged, err + } + dios <- struct{}{} + + // Remove the purged messages directory asynchronously. + go func() { + <-dios + _ = os.RemoveAll(pdir) + dios <- struct{}{} + }() + + // Re-enable writing for the lmb. + lmb.mu.Lock() + err := lmb.enableForWriting(fs.fip) + lmb.mu.Unlock() + if err != nil { + fs.mu.Unlock() + return purged, err } cb := fs.scb @@ -9234,6 +10159,51 @@ func (fs *fileStore) purge(fseq uint64) (uint64, error) { return purged, nil } +// Lock and dios should be held. 
+func (fs *fileStore) recoverPartialPurge() error {
+ mdir := filepath.Join(fs.fcfg.StoreDir, msgDir)
+ ndir := filepath.Join(fs.fcfg.StoreDir, newMsgDir)
+ if entries, err := os.ReadDir(ndir); err != nil && !os.IsNotExist(err) {
+ return err
+ } else if err == nil {
+ hasBlk := slices.ContainsFunc(entries, func(e os.DirEntry) bool {
+ return strings.HasSuffix(e.Name(), blkSuffix)
+ })
+ if hasBlk {
+ // We have a tombstone, so we can purge the old messages.
+ if err = os.RemoveAll(mdir); err != nil {
+ return err
+ }
+ if err = os.Rename(ndir, mdir); err != nil {
+ return err
+ }
+ } else {
+ // No .blk means the purge did not complete, so clear
+ // any progress made by the partial purge.
+ for _, entry := range entries {
+ var index uint32
+ if n, err := fmt.Sscanf(entry.Name(), keyScan, &index); err != nil || n != 1 {
+ continue
+ }
+ // Found a key file, remove the corresponding .blk, if any.
+ // Recovery may otherwise wrongly conclude that the .blk
+ // is plaintext, and consider it corrupt when trying to open it.
+ err := os.Remove(filepath.Join(mdir, fmt.Sprintf(blkScan, index)))
+ if err != nil && !os.IsNotExist(err) {
+ return err
+ }
+ }
+ _ = os.RemoveAll(ndir)
+ return nil
+ }
+ }
+ pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir)
+ if _, err := os.Stat(pdir); err == nil {
+ _ = os.RemoveAll(pdir)
+ }
+ return nil
+}
+
 // Compact will remove all messages from this store up to
 // but not including the seq parameter.
 // Will return the number of purged messages.
@@ -9241,14 +10211,20 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) {
 return fs.compact(seq)
}

-func (fs *fileStore) compact(seq uint64) (uint64, error) {
+func (fs *fileStore) compact(seq uint64) (purged uint64, rerr error) {
+ if fs.isClosed() {
+ return 0, ErrStoreClosed
+ }
 if seq == 0 {
 return fs.purge(seq)
 }

- var purged, bytes uint64
-
 fs.mu.Lock()
+ // Always return previous write errors.
+ if err := fs.werr; err != nil {
+ fs.mu.Unlock()
+ return 0, err
+ }
 // Same as purge all.
 if lseq := fs.state.LastSeq; seq > lseq {
 fs.mu.Unlock()
@@ -9266,6 +10242,17 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) {
 return 0, nil
 }

+ // Persist any write errors.
+ defer func() {
+ if rerr != nil {
+ fs.mu.Lock()
+ fs.setWriteErr(rerr)
+ fs.mu.Unlock()
+ }
+ }()
+
+ var bytes uint64
+
 // All msgblocks up to this one can be thrown away.
 var deleted int
 for _, mb := range fs.blks {
@@ -9276,7 +10263,11 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) {
 purged += mb.msgs
 bytes += mb.bytes
 // Make sure we do subject cleanup as well.
- mb.ensurePerSubjectInfoLoaded()
+ if err := mb.ensurePerSubjectInfoLoaded(); err != nil {
+ mb.mu.Unlock()
+ fs.mu.Unlock()
+ return 0, err
+ }
 mb.fss.IterOrdered(func(bsubj []byte, ss *SimpleState) bool {
 subj := bytesToString(bsubj)
 for i := uint64(0); i < ss.Msgs; i++ {
@@ -9285,8 +10276,12 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) {
 return true
 })
 // Now close.
- mb.dirtyCloseWithRemove(true)
+ err := mb.dirtyCloseWithRemove(true)
 mb.mu.Unlock()
+ if err != nil {
+ fs.mu.Unlock()
+ return purged, err
+ }
 deleted++
 }
@@ -9304,7 +10299,9 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) {
 // Make sure we have the messages loaded.
 if smb.cacheNotLoaded() {
 if err = smb.loadMsgsWithLock(); err != nil {
- goto SKIP
+ smb.mu.Unlock()
+ fs.mu.Unlock()
+ return purged, err
 }
 defer func() {
 // The lock is released once we get here, so need to re-acquire.
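// Illustrative sketch, not part of this patch: the write-then-rename pattern
// that the purge and recoverPartialPurge paths above depend on for crash
// safety (rename is atomic on POSIX filesystems), shown here for a single file
// rather than whole message directories. Names are hypothetical.
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func writeFileAtomic(path string, data []byte) error {
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, data, 0o644); err != nil {
		return err
	}
	// Rename only after the full contents are on disk, so readers either see
	// the old file or the complete new one, never a partial write.
	if err := os.Rename(tmp, path); err != nil {
		_ = os.Remove(tmp)
		return err
	}
	return nil
}

func main() {
	dir, err := os.MkdirTemp("", "demo")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)
	p := filepath.Join(dir, "state.json")
	fmt.Println(writeFileAtomic(p, []byte(`{"ok":true}`))) // <nil>
}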
@@ -9332,7 +10329,11 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) { purged++ } // Update fss - smb.removeSeqPerSubject(sm.subj, mseq) + if _, err := smb.removeSeqPerSubject(sm.subj, mseq); err != nil { + smb.mu.Unlock() + fs.mu.Unlock() + return purged, err + } fs.removePerSubject(sm.subj) tombs = append(tombs, msgId{sm.seq, sm.ts}) } @@ -9342,7 +10343,11 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) { if isEmpty := smb.msgs == 0; isEmpty { // Only remove if not the last block. if smb != fs.lmb { - smb.dirtyCloseWithRemove(true) + if err = smb.dirtyCloseWithRemove(true); err != nil { + smb.mu.Unlock() + fs.mu.Unlock() + return purged, err + } deleted++ } else { // Make sure to sync changes. @@ -9372,7 +10377,11 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) { if smb.rbytes > compactMinimum && smb.bytes*2 < smb.rbytes { var moff uint32 moff, _, _, err = smb.slotInfo(int(atomic.LoadUint64(&smb.first.seq) - smb.cache.fseq)) - if err != nil || moff >= uint32(len(smb.cache.buf)) { + if err != nil { + smb.mu.Unlock() + fs.mu.Unlock() + return purged, err + } else if moff >= uint32(len(smb.cache.buf)) { goto SKIP } buf := smb.cache.buf[moff:] @@ -9385,7 +10394,9 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) { // Recreate to reset counter. bek, err := genBlockEncryptionKey(smb.fs.fcfg.Cipher, smb.seed, smb.nonce) if err != nil { - goto SKIP + smb.mu.Unlock() + fs.mu.Unlock() + return purged, err } // For future writes make sure to set smb.bek to keep counter correct. smb.bek = bek @@ -9394,21 +10405,27 @@ func (fs *fileStore) compact(seq uint64) (uint64, error) { // Recompress if necessary (smb.cmp contains the algorithm used when // the block was loaded from disk, or defaults to NoCompression if not) if nbuf, err = smb.cmp.Compress(nbuf); err != nil { - goto SKIP + smb.mu.Unlock() + fs.mu.Unlock() + return purged, err } // We will write to a new file and mv/rename it in case of failure. mfn := filepath.Join(smb.fs.fcfg.StoreDir, msgDir, fmt.Sprintf(newScan, smb.index)) <-dios - err := os.WriteFile(mfn, nbuf, defaultFilePerms) + err = os.WriteFile(mfn, nbuf, defaultFilePerms) dios <- struct{}{} if err != nil { - os.Remove(mfn) - goto SKIP + _ = os.Remove(mfn) + smb.mu.Unlock() + fs.mu.Unlock() + return purged, err } - if err := os.Rename(mfn, smb.mfn); err != nil { - os.Remove(mfn) - goto SKIP + if err = os.Rename(mfn, smb.mfn); err != nil { + _ = os.Remove(mfn) + smb.mu.Unlock() + fs.mu.Unlock() + return purged, err } // Make sure to remove fss state. @@ -9427,14 +10444,17 @@ SKIP: // When writing multiple tombstones we will flush at the end. if len(tombs) > 0 { for _, tomb := range tombs { - if err := fs.writeTombstoneNoFlush(tomb.seq, tomb.ts); err != nil { + if err = fs.writeTombstoneNoFlush(tomb.seq, tomb.ts); err != nil { fs.mu.Unlock() return purged, err } } // Flush any pending. If we change blocks the newMsgBlockForWrite() will flush any pending for us. if lmb := fs.lmb; lmb != nil { - lmb.flushPendingMsgs() + if err = lmb.flushPendingMsgs(); err != nil { + fs.mu.Unlock() + return purged, err + } } } @@ -9503,8 +10523,12 @@ func (fs *fileStore) reset() error { mb.mu.Lock() purged += mb.msgs bytes += mb.bytes - mb.dirtyCloseWithRemove(true) + err := mb.dirtyCloseWithRemove(true) mb.mu.Unlock() + if err != nil { + fs.mu.Unlock() + return err + } } // Reset @@ -9579,8 +10603,64 @@ func (mb *msgBlock) tombsLocked() []msgId { return tombs } +// fs lock should be held. 
+func (mb *msgBlock) numPriorTombs() int { + mb.mu.Lock() + defer mb.mu.Unlock() + return mb.numPriorTombsLocked() +} + +// Return number of tombstones for messages prior to this msgBlock. +// Both locks should be held. +// Write lock should be held for block. +func (mb *msgBlock) numPriorTombsLocked() int { + if mb.cacheNotLoaded() { + if err := mb.loadMsgsWithLock(); err != nil { + return 0 + } + } + defer mb.finishedWithCache() + + var fseq uint64 + var tombs int + var le = binary.LittleEndian + buf := mb.cache.buf + + for index, lbuf := uint32(0), uint32(len(buf)); index < lbuf; { + if index+msgHdrSize > lbuf { + return tombs + } + hdr := buf[index : index+msgHdrSize] + rl, seq := le.Uint32(hdr[0:]), le.Uint64(hdr[4:]) + // Clear any headers bit that could be set. + rl &^= hbit + // Check for tombstones. + if seq&tbit != 0 { + seq = seq &^ tbit + // Tombstones below the global first seq are irrelevant. + // And we only count tombstones below this block's first seq. + if seq >= mb.fs.state.FirstSeq && (fseq == 0 || seq < fseq) { + tombs++ + } + index += rl + continue + } + if seq == 0 || seq&ebit != 0 { + index += rl + continue + } + // Advance to next record. + index += rl + if fseq == 0 { + fseq = seq + } + } + + return tombs +} + // Truncate will truncate a stream store up to seq. Sequence needs to be valid. -func (fs *fileStore) Truncate(seq uint64) error { +func (fs *fileStore) Truncate(seq uint64) (rerr error) { if fs.isClosed() { return ErrStoreClosed } @@ -9591,15 +10671,34 @@ func (fs *fileStore) Truncate(seq uint64) error { } fs.mu.Lock() + // Always return previous write errors. + if err := fs.werr; err != nil { + fs.mu.Unlock() + return err + } + + // Persist any write errors. + defer func() { + if rerr != nil { + fs.mu.Lock() + fs.setWriteErr(rerr) + fs.mu.Unlock() + } + }() // Any existing state file will no longer be applicable. We will force write a new one // at the end, after we release the lock. os.Remove(filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile)) + var err error var lsm *StoreMsg smb := fs.selectMsgBlock(seq) if smb != nil { - lsm, _, _ = smb.fetchMsgNoCopy(seq, nil) + lsm, _, err = smb.fetchMsgNoCopy(seq, nil) + if err != nil && err != ErrStoreMsgNotFound && err != errDeletedMsg { + fs.mu.Unlock() + return err + } } // Reset last so new block doesn't contain truncated sequences/timestamps. @@ -9632,7 +10731,10 @@ func (fs *fileStore) Truncate(seq uint64) error { // If the selected block is not found or the message was deleted, we'll need to write a tombstone // at the truncated sequence so we don't roll backward on our last sequence and timestamp. 
if lsm == nil || removeSmb { - fs.writeTombstone(seq, lastTime) + if err = fs.writeTombstone(seq, lastTime); err != nil { + fs.mu.Unlock() + return err + } } var purged, bytes uint64 @@ -9660,13 +10762,20 @@ func (fs *fileStore) Truncate(seq uint64) error { mb.mu.Unlock() for _, tomb := range tombs { if tomb.seq < seq { - fs.writeTombstone(tomb.seq, tomb.ts) + if err = fs.writeTombstone(tomb.seq, tomb.ts); err != nil { + fs.mu.Unlock() + return err + } } } mb.mu.Lock() } - fs.forceRemoveMsgBlock(mb) + err = fs.forceRemoveMsgBlock(mb) mb.mu.Unlock() + if err != nil { + fs.mu.Unlock() + return err + } } hasWrittenTombstones := len(tmb.tombs()) > 0 @@ -9682,13 +10791,20 @@ func (fs *fileStore) Truncate(seq uint64) error { smb.mu.Unlock() for _, tomb := range tombs { if tomb.seq < seq { - fs.writeTombstone(tomb.seq, tomb.ts) + if err = fs.writeTombstone(tomb.seq, tomb.ts); err != nil { + fs.mu.Unlock() + return err + } } } smb.mu.Lock() } - fs.forceRemoveMsgBlock(smb) + err = fs.forceRemoveMsgBlock(smb) smb.mu.Unlock() + if err != nil { + fs.mu.Unlock() + return err + } goto SKIP } @@ -9729,8 +10845,12 @@ SKIP: if !hasWrittenTombstones { fs.lmb = smb tmb.mu.Lock() - fs.forceRemoveMsgBlock(tmb) + err = fs.forceRemoveMsgBlock(tmb) tmb.mu.Unlock() + if err != nil { + fs.mu.Unlock() + return err + } } // Reset last. @@ -9747,7 +10867,10 @@ SKIP: fs.state.Bytes -= bytes // Reset our subject lookup info. - fs.resetGlobalPerSubjectInfo() + if err = fs.resetGlobalPerSubjectInfo(); err != nil { + fs.mu.Unlock() + return err + } fs.dirty++ @@ -9807,38 +10930,59 @@ func (fs *fileStore) removeMsgBlockFromList(mb *msgBlock) { // Removes the msgBlock // Both locks should be held. -func (fs *fileStore) removeMsgBlock(mb *msgBlock) { +func (fs *fileStore) removeMsgBlock(mb *msgBlock) error { // Check for us being last message block lseq, lts := atomic.LoadUint64(&mb.last.seq), mb.last.ts if mb == fs.lmb { // Creating a new message write block requires that the lmb lock is not held. mb.mu.Unlock() // Write the tombstone to remember since this was last block. - if lmb, _ := fs.newMsgBlockForWrite(); lmb != nil { - fs.writeTombstone(lseq, lts) + if lmb, err := fs.newMsgBlockForWrite(); err != nil || lmb == nil { + if err != nil { + err = errors.New("lmb missing") + } + // Re-acquire mb lock + mb.mu.Lock() + return err + } else if err = fs.writeTombstone(lseq, lts); err != nil { + // Re-acquire mb lock + mb.mu.Lock() + return err } mb.mu.Lock() } else if lseq == fs.state.LastSeq { // Need to write a tombstone for the last sequence if we're removing the block containing it. - fs.writeTombstone(lseq, lts) + if err := fs.writeTombstone(lseq, lts); err != nil { + return err + } } // Only delete message block after (potentially) writing a tombstone. - fs.forceRemoveMsgBlock(mb) + // But only if it doesn't contain any tombstones for prior blocks. + if mb.numPriorTombsLocked() > 0 { + return nil + } + return fs.forceRemoveMsgBlock(mb) } // Removes the msgBlock, without writing tombstones to ensure the last sequence is preserved. // Both locks should be held. -func (fs *fileStore) forceRemoveMsgBlock(mb *msgBlock) { - mb.dirtyCloseWithRemove(true) +func (fs *fileStore) forceRemoveMsgBlock(mb *msgBlock) error { + if err := mb.dirtyCloseWithRemove(true); err != nil { + return err + } fs.removeMsgBlockFromList(mb) + return nil } // Purges and removes the msgBlock from the store. // Lock should be held. 
-func (fs *fileStore) purgeMsgBlock(mb *msgBlock) { +func (fs *fileStore) purgeMsgBlock(mb *msgBlock) error { mb.mu.Lock() // Adjust per-subject tracking if present. - if err := mb.ensurePerSubjectInfoLoaded(); err == nil && mb.fss != nil { + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { + mb.mu.Unlock() + return err + } else if mb.fss != nil { mb.fss.IterFast(func(bsubj []byte, ss *SimpleState) bool { subj := bytesToString(bsubj) for range ss.Msgs { @@ -9849,21 +10993,25 @@ func (fs *fileStore) purgeMsgBlock(mb *msgBlock) { } // Clean up scheduled message metadata if we know this block contained any. if fs.scheduling != nil && mb.schedules > 0 { - cacheLoaded := !mb.cacheNotLoaded() - if !cacheLoaded { - cacheLoaded = mb.loadMsgsWithLock() == nil + if mb.cacheNotLoaded() { + if err := mb.loadMsgsWithLock(); err != nil { + mb.mu.Unlock() + return err + } } - if cacheLoaded { - var smv StoreMsg - fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) - for seq := fseq; seq <= lseq; seq++ { - sm, err := mb.cacheLookupNoCopy(seq, &smv) - if err != nil || sm == nil { - continue - } - if schedule, ok := getMessageSchedule(sm.hdr); ok && !schedule.IsZero() { - fs.scheduling.remove(seq) - } + var smv StoreMsg + fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq) + for seq := fseq; seq <= lseq; seq++ { + sm, err := mb.cacheLookupNoCopy(seq, &smv) + if err != nil && err != errDeletedMsg { + mb.mu.Unlock() + return err + } + if sm == nil { + continue + } + if schedule, apiErr := getMessageSchedule(sm.hdr); apiErr == nil && !schedule.IsZero() { + fs.scheduling.remove(seq) } } } @@ -9877,11 +11025,16 @@ func (fs *fileStore) purgeMsgBlock(mb *msgBlock) { } fs.state.Msgs -= msgs fs.state.Bytes -= bytes - fs.removeMsgBlock(mb) + if err := fs.removeMsgBlock(mb); err != nil { + mb.mu.Unlock() + return err + } mb.tryForceExpireCacheLocked() mb.finishedWithCache() mb.mu.Unlock() - fs.selectNextFirst() + if err := fs.selectNextFirst(); err != nil { + return err + } if cb := fs.scb; cb != nil { // If we have a callback registered, we need to release lock regardless since consumers will recalculate pending. @@ -9890,6 +11043,7 @@ func (fs *fileStore) purgeMsgBlock(mb *msgBlock) { cb(-int64(msgs), -int64(bytes), 0, _EMPTY_) fs.mu.Lock() } + return nil } // Called by purge to simply get rid of the cache and close our fds. @@ -9917,23 +11071,23 @@ func (mb *msgBlock) dirtyCloseWithRemove(remove bool) error { close(mb.qch) mb.qch = nil } - if mb.mfd != nil { - mb.mfd.Close() + if fd := mb.mfd; fd != nil { mb.mfd = nil + if err := fd.Close(); err != nil && !os.IsNotExist(err) { + return err + } } if remove { // Clear any tracking by subject if we are removing. mb.fss = nil if mb.mfn != _EMPTY_ { - err := os.Remove(mb.mfn) - if isPermissionError(err) { + if err := os.Remove(mb.mfn); err != nil && !os.IsNotExist(err) { return err } mb.mfn = _EMPTY_ } if mb.kfn != _EMPTY_ { - err := os.Remove(mb.kfn) - if isPermissionError(err) { + if err := os.Remove(mb.kfn); err != nil && !os.IsNotExist(err) { return err } } @@ -9943,21 +11097,23 @@ func (mb *msgBlock) dirtyCloseWithRemove(remove bool) error { // Remove a seq from the fss and select new first. // Lock should be held. 
-func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) uint64 { - mb.ensurePerSubjectInfoLoaded() +func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) (uint64, error) { + if err := mb.ensurePerSubjectInfoLoaded(); err != nil { + return 0, err + } if mb.fss == nil { - return 0 + return 0, nil } bsubj := stringToBytes(subj) ss, ok := mb.fss.Find(bsubj) if !ok || ss == nil { - return 0 + return 0, nil } mb.fs.sdm.removeSeqAndSubject(seq, subj) if ss.Msgs == 1 { mb.fss.Delete(bsubj) - return 0 + return 0, nil } ss.Msgs-- @@ -9967,12 +11123,12 @@ func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) uint64 { if !ss.lastNeedsUpdate && seq != ss.Last { ss.First = ss.Last ss.firstNeedsUpdate = false - return 1 + return 1, nil } if !ss.firstNeedsUpdate && seq != ss.First { ss.Last = ss.First ss.lastNeedsUpdate = false - return 1 + return 1, nil } } @@ -9980,16 +11136,16 @@ func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) uint64 { ss.firstNeedsUpdate = seq == ss.First || ss.firstNeedsUpdate ss.lastNeedsUpdate = seq == ss.Last || ss.lastNeedsUpdate - return ss.Msgs + return ss.Msgs, nil } // Will recalculate the first and/or last sequence for this subject in this block. // Will avoid slower path message lookups and scan the cache directly instead. -func (mb *msgBlock) recalculateForSubj(subj string, ss *SimpleState) { +func (mb *msgBlock) recalculateForSubj(subj string, ss *SimpleState) error { // Need to make sure messages are loaded. if mb.cacheNotLoaded() { if err := mb.loadMsgsWithLock(); err != nil { - return + return err } defer mb.finishedWithCache() } @@ -10002,7 +11158,7 @@ func (mb *msgBlock) recalculateForSubj(subj string, ss *SimpleState) { ss.First = ss.Last ss.firstNeedsUpdate = false ss.lastNeedsUpdate = false - return + return nil } endSlot := int(ss.Last - mb.cache.fseq) @@ -10010,7 +11166,7 @@ func (mb *msgBlock) recalculateForSubj(subj string, ss *SimpleState) { endSlot = 0 } if endSlot >= len(mb.cache.idx) || startSlot > endSlot { - return + return nil } var le = binary.LittleEndian @@ -10033,7 +11189,7 @@ func (mb *msgBlock) recalculateForSubj(subj string, ss *SimpleState) { ss.First = ss.Last // Only need to reset ss.lastNeedsUpdate, ss.firstNeedsUpdate is already reset above. ss.lastNeedsUpdate = false - return + return nil } buf := mb.cache.buf[li:] hdr := buf[:msgHdrSize] @@ -10047,7 +11203,7 @@ func (mb *msgBlock) recalculateForSubj(subj string, ss *SimpleState) { if ss.Msgs == 1 { ss.Last = seq ss.lastNeedsUpdate = false - return + return nil } // Skip the start slot ahead, if we need to recalculate last we can stop early. startSlot = slot @@ -10072,7 +11228,7 @@ func (mb *msgBlock) recalculateForSubj(subj string, ss *SimpleState) { li := int(bi) if li >= len(mb.cache.buf) { // Can't overwrite ss.Last, just skip. - return + return nil } buf := mb.cache.buf[li:] hdr := buf[:msgHdrSize] @@ -10091,22 +11247,26 @@ func (mb *msgBlock) recalculateForSubj(subj string, ss *SimpleState) { ss.First = seq ss.firstNeedsUpdate = false } - return + return nil } } } + return nil } // Lock should be held. -func (fs *fileStore) resetGlobalPerSubjectInfo() { +func (fs *fileStore) resetGlobalPerSubjectInfo() error { // Clear any global subject state. fs.psim, fs.tsl = fs.psim.Empty(), 0 if fs.noTrackSubjects() { - return + return nil } for _, mb := range fs.blks { - fs.populateGlobalPerSubjectInfo(mb) + if err := fs.populateGlobalPerSubjectInfo(mb); err != nil { + return err + } } + return nil } // Lock should be held. 
@@ -10154,6 +11314,9 @@ func (mb *msgBlock) generatePerSubjectInfo() error { if err == errNoCache { return nil } + // Clear partially built fss so callers don't operate on incomplete state. + mb.fss = nil + mb.clearCacheAndOffset() return err } if sm != nil && len(sm.subj) > 0 { @@ -10196,12 +11359,12 @@ func (mb *msgBlock) ensurePerSubjectInfoLoaded() error { // Called on recovery to populate the global psim state. // Lock should be held. -func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { +func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) error { mb.mu.Lock() defer mb.mu.Unlock() if err := mb.ensurePerSubjectInfoLoaded(); err != nil { - return + return err } // Now populate psim. @@ -10219,6 +11382,7 @@ func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) { } return true }) + return nil } // Calls os.RemoveAll on the given `dir` directory, but if an error occurs, @@ -10309,10 +11473,8 @@ func (fs *fileStore) Delete(inline bool) error { } pdir := filepath.Join(fs.fcfg.StoreDir, purgeDir) - // If purge directory still exists then we need to wait - // in place and remove since rename would fail. if _, err := os.Stat(pdir); err == nil { - os.RemoveAll(pdir) + _ = os.RemoveAll(pdir) } // Quickly close all blocks and simulate a purge w/o overhead an new write block. @@ -10383,7 +11545,7 @@ func (fs *fileStore) cancelSyncTimer() { const ( fullStateMagic = uint8(11) fullStateMinVersion = uint8(1) // What is the minimum version we know how to parse? - fullStateVersion = uint8(3) // What is the current version written out to index.db? + fullStateVersion = uint8(4) // What is the current version written out to index.db? ) // This go routine periodically writes out our full stream state index. @@ -10427,17 +11589,12 @@ func timestampNormalized(t time.Time) int64 { // writeFullState will proceed to write the full meta state iff not complex and time consuming. // Since this is for quick recovery it is optional and should not block/stall normal operations. func (fs *fileStore) writeFullState() error { - return fs._writeFullState(false, true) + return fs._writeFullState(false) } // forceWriteFullState will proceed to write the full meta state. func (fs *fileStore) forceWriteFullState() error { - return fs._writeFullState(true, true) -} - -// forceWriteFullStateLocked will proceed to write the full meta state. This should only be called by stop() -func (fs *fileStore) forceWriteFullStateLocked() error { - return fs._writeFullState(true, false) + return fs._writeFullState(true) } // This will write the full binary state for the stream. @@ -10447,29 +11604,46 @@ func (fs *fileStore) forceWriteFullStateLocked() error { // 2. PSIM - Per Subject Index Map - Tracks first and last blocks with subjects present. // 3. MBs - Index, Bytes, First and Last Sequence and Timestamps, and the deleted map (avl.seqset). // 4. Last block index and hash of record inclusive to this stream state. -func (fs *fileStore) _writeFullState(force, needLock bool) error { +func (fs *fileStore) _writeFullState(force bool) error { if fs.isClosed() { return nil } - fsLock := func() { - if needLock { - fs.mu.Lock() - } - } - fsUnlock := func() { - if needLock { - fs.mu.Unlock() - } + // If we aren't forcing an update then only queue this up if we aren't already + // running. This means we can keep waiting on shutdown if needed but not build up + // lots of waiting goroutines in a bad timer case. 
+ if fs.wfsrun.Add(1) > 1 && !force { + fs.wfsrun.Add(-1) + return nil } + defer fs.wfsrun.Add(-1) + + // Only allow one _writeFullState to take place at a time, otherwise we can + // have multiple goroutines trying to write the same file after we've released + // the store lock. + fs.wfsmu.Lock() + defer fs.wfsmu.Unlock() start := time.Now() - fsLock() + fs.mu.RLock() if fs.dirty == 0 { - fsUnlock() + fs.mu.RUnlock() return nil } + // Configure encryption if needed. + if fs.prf != nil { + // Re-acquire temporarily as write lock to set up AEK. + fs.mu.RUnlock() + fs.mu.Lock() + err := fs.setupAEK() + fs.mu.Unlock() + if err != nil { + return err + } + fs.mu.RLock() + } + // For calculating size and checking time costs for non forced calls. numSubjects := fs.numSubjects() @@ -10485,13 +11659,13 @@ func (fs *fileStore) _writeFullState(force, needLock bool) error { numDeleted = int((fs.state.LastSeq - fs.state.FirstSeq + 1) - fs.state.Msgs) } if numSubjects > numThreshold || numDeleted > numThreshold { - fsUnlock() + fs.mu.RUnlock() return errStateTooBig } } // We track this through subsequent runs to get an avg per blk used for subsequent runs. - avgDmapLen := fs.adml + avgDmapLen := fs.wfsadml // If first time through could be 0 if avgDmapLen == 0 && ((fs.state.LastSeq-fs.state.FirstSeq+1)-fs.state.Msgs) > 0 { avgDmapLen = 1024 @@ -10533,9 +11707,7 @@ func (fs *fileStore) _writeFullState(force, needLock bool) error { buf = append(buf, subj...) buf = binary.AppendUvarint(buf, psi.total) buf = binary.AppendUvarint(buf, uint64(psi.fblk)) - if psi.total > 1 { - buf = binary.AppendUvarint(buf, uint64(psi.lblk)) - } + buf = binary.AppendUvarint(buf, uint64(psi.lblk)) }) } @@ -10568,7 +11740,7 @@ func (fs *fileStore) _writeFullState(force, needLock bool) error { buf = binary.AppendUvarint(buf, mb.ttls) // Field is new in version 2 buf = binary.AppendUvarint(buf, mb.schedules) // Field is new in version 3 if numDeleted > 0 { - dmap, _ := mb.dmap.Encode(scratch[:0]) + dmap := mb.dmap.Encode(scratch[:0]) dmapTotalLen += len(dmap) buf = append(buf, dmap...) } @@ -10583,7 +11755,7 @@ func (fs *fileStore) _writeFullState(force, needLock bool) error { mb.mu.RUnlock() } if dmapTotalLen > 0 { - fs.adml = dmapTotalLen / len(fs.blks) + fs.wfsadml = dmapTotalLen / len(fs.blks) } // Place block index and hash onto the end. @@ -10592,16 +11764,12 @@ func (fs *fileStore) _writeFullState(force, needLock bool) error { // Encrypt if needed. if fs.prf != nil { - if err := fs.setupAEK(); err != nil { - fsUnlock() - return err - } nonce := make([]byte, fs.aek.NonceSize(), fs.aek.NonceSize()+len(buf)+fs.aek.Overhead()) if n, err := rand.Read(nonce); err != nil { - fsUnlock() + fs.mu.RUnlock() return err } else if n != len(nonce) { - fsUnlock() + fs.mu.RUnlock() return fmt.Errorf("not enough nonce bytes read (%d != %d)", n, len(nonce)) } buf = fs.aek.Seal(nonce, nonce, buf, nil) @@ -10609,26 +11777,25 @@ func (fs *fileStore) _writeFullState(force, needLock bool) error { fn := filepath.Join(fs.fcfg.StoreDir, msgDir, streamStreamStateFile) - fs.hh.Reset() - fs.hh.Write(buf) - buf = fs.hh.Sum(buf) + // Need to have our own hasher here, as under a read lock we can't mutate the + // fs.hh safely. + key := sha256.Sum256([]byte(fs.cfg.Name)) + hh, _ := highwayhash.NewDigest64(key[:]) + hh.Write(buf) + buf = hh.Sum(buf) // Snapshot prior dirty count. priorDirty := fs.dirty statesEqual := trackingStatesEqual(&fs.state, &mstate) // Release lock. - fsUnlock() + fs.mu.RUnlock() // Check consistency here. 
if !statesEqual { fs.warn("Stream state encountered internal inconsistency on write") // Rebuild our fs state from the mb state. - if needLock { - fs.rebuildState(nil) - } else { - fs.rebuildStateLocked(nil) - } + fs.rebuildState(nil) return errCorruptState } @@ -10647,20 +11814,18 @@ func (fs *fileStore) _writeFullState(force, needLock bool) error { err := os.WriteFile(fn, buf, defaultFilePerms) // if file system is not writable isPermissionError is set to true dios <- struct{}{} - if isPermissionError(err) { + if err != nil { return err } // Update dirty if successful. - if err == nil { - fsLock() - fs.dirty -= priorDirty - fsUnlock() - } + fs.mu.Lock() + fs.dirty -= priorDirty + fs.mu.Unlock() // Attempt to write both files, an error in one should not prevent the other from being written. - ttlErr := fs.writeTTLState(needLock) - schedErr := fs.writeMsgSchedulingState(needLock) + ttlErr := fs.writeTTLState() + schedErr := fs.writeMsgSchedulingState() if ttlErr != nil { return ttlErr } else if schedErr != nil { @@ -10669,42 +11834,30 @@ func (fs *fileStore) _writeFullState(force, needLock bool) error { return nil } -func (fs *fileStore) writeTTLState(needLock bool) error { - if needLock { - fs.mu.RLock() - } +func (fs *fileStore) writeTTLState() error { + fs.mu.RLock() if fs.ttls == nil { - if needLock { - fs.mu.RUnlock() - } + fs.mu.RUnlock() return nil } fn := filepath.Join(fs.fcfg.StoreDir, msgDir, ttlStreamStateFile) // Must be lseq+1 to identify up to which sequence the TTLs are valid. buf := fs.ttls.Encode(fs.state.LastSeq + 1) - if needLock { - fs.mu.RUnlock() - } + fs.mu.RUnlock() return fs.writeFileWithOptionalSync(fn, buf, defaultFilePerms) } -func (fs *fileStore) writeMsgSchedulingState(needLock bool) error { - if needLock { - fs.mu.RLock() - } +func (fs *fileStore) writeMsgSchedulingState() error { + fs.mu.RLock() if fs.scheduling == nil { - if needLock { - fs.mu.RUnlock() - } + fs.mu.RUnlock() return nil } fn := filepath.Join(fs.fcfg.StoreDir, msgDir, msgSchedulingStreamStateFile) // Must be lseq+1 to identify up to which sequence the schedules are valid. buf := fs.scheduling.encode(fs.state.LastSeq + 1) - if needLock { - fs.mu.RUnlock() - } + fs.mu.RUnlock() return fs.writeFileWithOptionalSync(fn, buf, defaultFilePerms) } @@ -10752,7 +11905,9 @@ func (fs *fileStore) stop(delete, writeState bool) error { if writeState { // Write full state if needed. If not dirty this is a no-op. - fs.forceWriteFullStateLocked() + fs.mu.Unlock() + fs.forceWriteFullState() + fs.mu.Lock() } // Mark as closed. Last message block needs to be cleared after @@ -10992,8 +12147,18 @@ func (fs *fileStore) Snapshot(deadline time.Duration, checkMsgs, includeConsumer fs.mu.Unlock() if checkMsgs { - ld := fs.checkMsgs() + ld, err := fs.checkMsgs() + clearSips := func() { + fs.mu.Lock() + fs.sips-- + fs.mu.Unlock() + } + if err != nil { + clearSips() + return nil, fmt.Errorf("snapshot check failed: %w", err) + } if ld != nil && len(ld.Msgs) > 0 { + clearSips() return nil, fmt.Errorf("snapshot check detected %d bad messages", len(ld.Msgs)) } } @@ -11082,10 +12247,7 @@ func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) { i += binary.PutUvarint(scratch[i:], num) b = append(b, scratch[0:i]...) case *avl.SequenceSet: - buf, err := db.Encode(scratch[:0]) - if err != nil { - return nil, err - } + buf := db.Encode(scratch[:0]) b = append(b, buf...) 
default: return nil, errors.New("no impl") @@ -11096,64 +12258,148 @@ func (fs *fileStore) EncodedStreamState(failed uint64) ([]byte, error) { return b, nil } -// We used to be more sophisticated to save memory, but speed is more important. +// deleteBlocks returns DeleteBlocks representing interior deletes +// and gaps between blocks. // All blocks should be at least read locked. func (fs *fileStore) deleteBlocks() DeleteBlocks { var dbs DeleteBlocks var prevLast uint64 + var prevRange *DeleteRange + var msgsSinceGap bool for _, mb := range fs.blks { // Detect if we have a gap between these blocks. fseq := atomic.LoadUint64(&mb.first.seq) if prevLast > 0 && prevLast+1 != fseq { - dbs = append(dbs, &DeleteRange{First: prevLast + 1, Num: fseq - prevLast - 1}) + gapSize := fseq - prevLast - 1 + // The previous DeleteRange can be extended + // to include this gap, if there are no + // blocks containing messages between the + // two gaps. + if prevRange != nil && !msgsSinceGap { + prevRange.Num += gapSize + } else { + prevRange = &DeleteRange{ + First: prevLast + 1, + Num: gapSize, + } + msgsSinceGap = false + dbs = append(dbs, prevRange) + } } if mb.dmap.Size() > 0 { dbs = append(dbs, &mb.dmap) + prevRange = nil } prevLast = atomic.LoadUint64(&mb.last.seq) + msgsSinceGap = msgsSinceGap || mb.msgs > 0 } return dbs } +// deleteMap returns all interior deletes for each block based on the mb.dmap. +// Specifically, this will not contain any deletes for blocks that have been removed. +// This is useful to know whether a tombstone is still relevant and marked as deleted by an active block. +// No locks should be held. +func (fs *fileStore) deleteMap() (dmap avl.SequenceSet) { + fs.mu.RLock() + defer fs.mu.RUnlock() + + fs.readLockAllMsgBlocks() + defer fs.readUnlockAllMsgBlocks() + + for _, mb := range fs.blks { + if mb.dmap.Size() > 0 { + mb.dmap.Range(func(seq uint64) bool { + dmap.Insert(seq) + return true + }) + } + } + return dmap +} + // SyncDeleted will make sure this stream has same deleted state as dbs. // This will only process deleted state within our current state. -func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) { +func (fs *fileStore) SyncDeleted(dbs DeleteBlocks) error { + if fs.isClosed() { + return ErrStoreClosed + } + if len(dbs) == 0 { - return + return nil } fs.mu.Lock() defer fs.mu.Unlock() - lseq := fs.state.LastSeq - var needsCheck DeleteBlocks + // Always return previous write errors. + if err := fs.werr; err != nil { + return err + } + lseq := fs.state.LastSeq fs.readLockAllMsgBlocks() mdbs := fs.deleteBlocks() - for i, db := range dbs { - first, last, num := db.State() - // If the block is same as what we have we can skip. - if i < len(mdbs) { - eFirst, eLast, eNum := mdbs[i].State() - if first == eFirst && last == eLast && num == eNum { - continue - } - } else if first > lseq { - // Skip blocks not applicable to our current state. + fs.readUnlockAllMsgBlocks() + + for _, db := range dbs { + first, last, _ := db.State() + if first > lseq { + break + } + + var prune bool + if prune, mdbs = pruneDeleteBlock(db, mdbs); prune { continue } - // Need to insert these. - needsCheck = append(needsCheck, db) + + var err error + if _, ok := db.(*DeleteRange); ok { + err = fs.removeMsgsInRange(first, last, true) + } else { + db.Range(func(dseq uint64) bool { + _, err = fs.removeMsg(dseq, false, true, false) + // Can continue safely if the message doesn't exist. 
+ if err == ErrStoreEOF || err == ErrStoreMsgNotFound { + err = nil + } + return err == nil + }) + } + if err != nil { + return err + } } - fs.readUnlockAllMsgBlocks() + return nil +} - for _, db := range needsCheck { - db.Range(func(dseq uint64) bool { - fs.removeMsg(dseq, false, true, false) - return true - }) +// pruneDeleteBlock tries to find a delete block in the ordered blocks slice +// that matches db. It skips blocks that are already behind db and returns +// whether the next candidate matches exactly, along with the remaining +// suffix to use for the next comparison. +func pruneDeleteBlock(db DeleteBlock, blocks DeleteBlocks) (bool, DeleteBlocks) { + if len(blocks) == 0 { + return false, blocks + } + + aFirst, aLast, aNum := db.State() + bFirst, bLast, bNum := blocks[0].State() + + // Drop blocks that end before db starts. + for bLast < aFirst { + blocks = blocks[1:] + if len(blocks) == 0 { + return false, blocks + } + bFirst, bLast, bNum = blocks[0].State() + } + + if aFirst == bFirst && aLast == bLast && aNum == bNum { + return true, blocks[1:] } + + return false, blocks } //////////////////////////////////////////////////////////////////////////////// @@ -11194,7 +12440,9 @@ func (fs *fileStore) ConsumerStore(name string, created time.Time, cfg *Consumer if cfg.MemoryStorage { // Create directly here. o := &consumerMemStore{ms: fs, cfg: *cfg} - fs.AddConsumer(o) + if err := fs.AddConsumer(o); err != nil { + return nil, err + } return o, nil } @@ -11260,7 +12508,7 @@ func (fs *fileStore) ConsumerStore(name string, created time.Time, cfg *Consumer if _, err := os.Stat(meta); err != nil && os.IsNotExist(err) { didCreate = true if err := o.writeConsumerMeta(); err != nil { - os.RemoveAll(odir) + _ = os.RemoveAll(odir) return nil, err } } @@ -11272,7 +12520,7 @@ func (fs *fileStore) ConsumerStore(name string, created time.Time, cfg *Consumer if _, err := os.Stat(keyFile); err != nil && os.IsNotExist(err) { if err := o.writeConsumerMeta(); err != nil { if didCreate { - os.RemoveAll(odir) + _ = os.RemoveAll(odir) } return nil, err } @@ -11286,7 +12534,7 @@ func (fs *fileStore) ConsumerStore(name string, created time.Time, cfg *Consumer err = fs.writeFileWithOptionalSync(o.ifn, state, defaultFilePerms) if err != nil { if didCreate { - os.RemoveAll(odir) + _ = os.RemoveAll(odir) } return nil, err } @@ -11298,7 +12546,6 @@ func (fs *fileStore) ConsumerStore(name string, created time.Time, cfg *Consumer // Create channels to control our flush go routine. o.fch = make(chan struct{}, 1) o.qch = make(chan struct{}) - go o.flushLoop(o.fch, o.qch) // Make sure to load in our state from disk if needed. if err = o.loadState(); err != nil { @@ -11306,8 +12553,11 @@ func (fs *fileStore) ConsumerStore(name string, created time.Time, cfg *Consumer } // Assign to filestore. - fs.AddConsumer(o) + if err = fs.AddConsumer(o); err != nil { + return nil, err + } + go o.flushLoop(o.fch, o.qch) return o, nil } @@ -11461,11 +12711,9 @@ func (o *consumerFileStore) flushLoop(fch, qch chan struct{}) { func (o *consumerFileStore) SetStarting(sseq uint64) error { o.mu.Lock() o.state.Delivered.Stream = sseq - buf, err := o.encodeState() + o.state.AckFloor.Stream = sseq + buf := encodeConsumerState(&o.state) o.mu.Unlock() - if err != nil { - return err - } return o.writeState(buf) } @@ -11485,6 +12733,14 @@ func (o *consumerFileStore) UpdateStarting(sseq uint64) { o.kickFlusher() } +// Reset all values in the store, and reset the starting sequence. 
+func (o *consumerFileStore) Reset(sseq uint64) error { + o.mu.Lock() + o.state = ConsumerState{} + o.mu.Unlock() + return o.SetStarting(sseq) +} + // HasState returns if this store has a recorded state. func (o *consumerFileStore) HasState() bool { o.mu.Lock() @@ -11586,8 +12842,8 @@ func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error { return ErrStoreMsgNotFound } - // Check for AckAll here. - if o.cfg.AckPolicy == AckAll { + // Check for AckAll here (or AckFlowControl which functions like AckAll). + if o.cfg.AckPolicy == AckAll || o.cfg.AckPolicy == AckFlowControl { sgap := sseq - o.state.AckFloor.Stream o.state.AckFloor.Consumer = dseq o.state.AckFloor.Stream = sseq @@ -11721,6 +12977,50 @@ func (o *consumerFileStore) Update(state *ConsumerState) error { return nil } +// ForceUpdate updates the consumer state without the backwards check. +// This is used during recovery when we need to reset the consumer to an earlier sequence. +func (o *consumerFileStore) ForceUpdate(state *ConsumerState) error { + // Sanity checks. + if state.AckFloor.Consumer > state.Delivered.Consumer { + return fmt.Errorf("bad ack floor for consumer") + } + if state.AckFloor.Stream > state.Delivered.Stream { + return fmt.Errorf("bad ack floor for stream") + } + + // Copy to our state. + var pending map[uint64]*Pending + var redelivered map[uint64]uint64 + if len(state.Pending) > 0 { + pending = make(map[uint64]*Pending, len(state.Pending)) + for seq, p := range state.Pending { + pending[seq] = &Pending{p.Sequence, p.Timestamp} + if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream { + return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq) + } + } + } + if len(state.Redelivered) > 0 { + redelivered = make(map[uint64]uint64, len(state.Redelivered)) + for seq, dc := range state.Redelivered { + redelivered[seq] = dc + } + } + + // Replace our state. + o.mu.Lock() + o.state.Delivered = state.Delivered + o.state.AckFloor = state.AckFloor + o.state.Pending = pending + o.state.Redelivered = redelivered + buf, err := o.encodeState() + o.mu.Unlock() + if err != nil { + return err + } + return o.writeState(buf) +} + // Will encrypt the state with our asset key. Will be a no-op if encryption not enabled. // Lock should be held. func (o *consumerFileStore) encryptState(buf []byte) ([]byte, error) { @@ -12150,7 +13450,9 @@ func (o *consumerFileStore) Stop() error { ifn, fs := o.ifn, o.fs o.mu.Unlock() - fs.RemoveConsumer(o) + if err = fs.RemoveConsumer(o); err != nil { + return err + } if len(buf) > 0 { o.waitOnFlusher() @@ -12234,63 +13536,6 @@ func (fs *fileStore) RemoveConsumer(o ConsumerStore) error { return nil } -//////////////////////////////////////////////////////////////////////////////// -// Templates -//////////////////////////////////////////////////////////////////////////////// - -// Deprecated: stream templates are deprecated and will be removed in a future version. -type templateFileStore struct { - dir string - hh *highwayhash.Digest64 -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -func newTemplateFileStore(storeDir string) *templateFileStore { - tdir := filepath.Join(storeDir, tmplsDir) - key := sha256.Sum256([]byte("templates")) - hh, err := highwayhash.NewDigest64(key[:]) - if err != nil { - return nil - } - return &templateFileStore{dir: tdir, hh: hh} -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. 
-func (ts *templateFileStore) Store(t *streamTemplate) error { - dir := filepath.Join(ts.dir, t.Name) - if err := os.MkdirAll(dir, defaultDirPerms); err != nil { - return fmt.Errorf("could not create templates storage directory for %q- %v", t.Name, err) - } - meta := filepath.Join(dir, JetStreamMetaFile) - if _, err := os.Stat(meta); (err != nil && !os.IsNotExist(err)) || err == nil { - return err - } - t.mu.Lock() - b, err := json.Marshal(t) - t.mu.Unlock() - if err != nil { - return err - } - if err := os.WriteFile(meta, b, defaultFilePerms); err != nil { - return err - } - // FIXME(dlc) - Do checksum - ts.hh.Reset() - ts.hh.Write(b) - var hb [highwayhash.Size64]byte - checksum := hex.EncodeToString(ts.hh.Sum(hb[:0])) - sum := filepath.Join(dir, JetStreamMetaFileSum) - if err := os.WriteFile(sum, []byte(checksum), defaultFilePerms); err != nil { - return err - } - return nil -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (ts *templateFileStore) Delete(t *streamTemplate) error { - return os.RemoveAll(filepath.Join(ts.dir, t.Name)) -} - //////////////////////////////////////////////////////////////////////////////// // Compression //////////////////////////////////////////////////////////////////////////////// @@ -12406,6 +13651,10 @@ func writeFileWithSync(name string, data []byte, perm fs.FileMode) error { return writeAtomically(name, data, perm, true) } +// Windows does not support fsyncing directory metadata, it results in a panic, so +// we need to skip doing this there. +const canFsyncDirectories = runtime.GOOS != "windows" + func writeAtomically(name string, data []byte, perm fs.FileMode, sync bool) error { tmp := name + ".tmp" flags := os.O_CREATE | os.O_WRONLY | os.O_TRUNC @@ -12421,6 +13670,7 @@ func writeAtomically(name string, data []byte, perm fs.FileMode, sync bool) erro return err } if _, err := f.Write(data); err != nil { + // Close fd, but ignore its error since write takes precedence. _ = f.Close() _ = os.Remove(tmp) return err @@ -12433,12 +13683,20 @@ func writeAtomically(name string, data []byte, perm fs.FileMode, sync bool) erro _ = os.Remove(tmp) return err } - if sync { + if sync && canFsyncDirectories { // To ensure that the file rename was persisted on all filesystems, // also try to flush the directory metadata. - if d, err := os.Open(filepath.Dir(name)); err == nil { - _ = d.Sync() + var d *os.File + if d, err = os.Open(filepath.Dir(name)); err != nil { + return err + } + if err = d.Sync(); err != nil { + // Close fd, but ignore its error since sync takes precedence. _ = d.Close() + return err + } + if err = d.Close(); err != nil { + return err } } return nil diff --git a/vendor/github.com/nats-io/nats-server/v2/server/gateway.go b/vendor/github.com/nats-io/nats-server/v2/server/gateway.go index f4982cc2ce..52cb65c19d 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/gateway.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/gateway.go @@ -1156,9 +1156,7 @@ func (c *client) processGatewayInfo(info *Info) { // defensive code above that if we did not register this connection // because we already have an outbound for this name, then // close this connection (and make sure it does not try to reconnect) - c.mu.Lock() - c.flags.set(noReconnect) - c.mu.Unlock() + c.setNoReconnect() c.closeConnection(WrongGateway) return } @@ -1981,7 +1979,7 @@ func (c *client) processGatewayRUnsub(arg []byte) error { return nil } else { // Plain sub, assume optimistic sends, create entry. 
- e = &outsie{ni: make(map[string]struct{}), sl: NewSublistWithCache()} + e = &outsie{ni: make(map[string]struct{}), sl: NewSublistForServer(c.srv)} newe = true } // This is when a sub or queue sub is supposed to be in @@ -2090,7 +2088,7 @@ func (c *client) processGatewayRSub(arg []byte) error { } else if queue == nil { return nil } else { - e = &outsie{ni: make(map[string]struct{}), sl: NewSublistWithCache()} + e = &outsie{ni: make(map[string]struct{}), sl: NewSublistForServer(c.srv)} newe = true useSl = true } @@ -2952,6 +2950,16 @@ func getSubjectFromGWRoutedReply(reply []byte, isOldPrefix bool) []byte { return reply[gwSubjectOffset:] } +// Returns the subject embedded in the given routed +// reply subject and whether the prefix was stripped. +// If the subject is not routed, returns it unchanged. +func getGWRoutedSubjectOrSelf(subject []byte) ([]byte, bool) { + if isGWPrefix, oldPrefix := isGWRoutedSubjectAndIsOldPrefix(subject); isGWPrefix { + return getSubjectFromGWRoutedReply(subject, oldPrefix), true + } + return subject, false +} + // This should be invoked only from processInboundGatewayMsg() or // processInboundRoutedMsg() and is checking if the subject // (c.pa.subject) has the _GR_ prefix. If so, this is processed @@ -3201,7 +3209,7 @@ func (c *client) gatewayAllSubsReceiveStart(info *Info) { e.mode = Transitioning e.Unlock() } else { - e := &outsie{sl: NewSublistWithCache()} + e := &outsie{sl: NewSublistForServer(c.srv)} e.mode = Transitioning c.mu.Lock() c.gw.outsim.Store(account, e) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/gsl/gsl.go b/vendor/github.com/nats-io/nats-server/v2/server/gsl/gsl.go index c16f6bac62..235f144c52 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/gsl/gsl.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/gsl/gsl.go @@ -170,6 +170,66 @@ func (s *GenericSublist[T]) NumInterest(subject string) (np int) { return } +// MatchesFullWildcard returns true if there is top-level ">" interest. +func (s *GenericSublist[T]) MatchesFullWildcard() bool { + if s == nil { + return false + } + s.RLock() + defer s.RUnlock() + return s.root.fwc != nil +} + +// MatchesSingleFilter returns the filter when the sublist contains exactly one unique subject. 
+func (s *GenericSublist[T]) MatchesSingleFilter() (string, bool) { + if s == nil { + return _EMPTY_, false + } + s.RLock() + defer s.RUnlock() + return singleFilter(s.root, _EMPTY_) +} + +func singleFilter[T comparable](l *level[T], filter string) (string, bool) { + if l == nil { + return filter, filter != _EMPTY_ + } + if len(l.nodes) > 1 { + return _EMPTY_, false + } + var next *node[T] + branches := 0 + if l.pwc != nil { + next = l.pwc + branches++ + } + if l.fwc != nil { + next = l.fwc + branches++ + } + for _, n := range l.nodes { + next = n + branches++ + } + if branches != 1 { + return _EMPTY_, false + } + for _, subj := range next.subs { + filter = subj + break + } + if next.next == nil { + return filter, filter != _EMPTY_ + } + if filter != _EMPTY_ { + if next.next.numNodes() > 0 { + return _EMPTY_, false + } + return filter, true + } + return singleFilter(next.next, filter) +} + func (s *GenericSublist[T]) match(subject string, cb func(T), doLock bool) { tsa := [32]string{} tokens := tsa[:0] diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go index 3ec6942b90..010b170b8d 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream.go @@ -25,13 +25,13 @@ import ( "os" "path/filepath" "runtime/debug" - "strconv" "strings" "sync" "sync/atomic" "time" "github.com/minio/highwayhash" + "github.com/nats-io/nats-server/v2/server/gsl" "github.com/nats-io/nats-server/v2/server/sysmem" "github.com/nats-io/nats-server/v2/server/tpm" "github.com/nats-io/nkeys" @@ -103,22 +103,26 @@ type JetStreamAPIStats struct { // This is for internal accounting for JetStream for this server. type jetStream struct { // These are here first because of atomics on 32bit systems. - apiInflight int64 - apiTotal int64 - apiErrors int64 - memReserved int64 - storeReserved int64 - memUsed int64 - storeUsed int64 - queueLimit int64 - clustered int32 - mu sync.RWMutex - srv *Server - config JetStreamConfig - cluster *jetStreamCluster - accounts map[string]*jsAccount - apiSubs *Sublist - started time.Time + apiInflight int64 + apiTotal int64 + apiErrors int64 + memMax int64 + memReserved int64 // Requires JS lock to be held. + memUsed int64 + storeMax int64 + storeReserved int64 // Requires JS lock to be held. + storeUsed int64 + queueLimit int64 + infoQueueLimit int64 + clustered int32 + mu sync.RWMutex + srv *Server + config JetStreamConfig + cluster *jetStreamCluster + accounts map[string]*jsAccount + apiSubs *Sublist + infoSubs *gsl.SimpleSublist // Subjects for info-specific queue. + started time.Time // System level request to purge a stream move accountPurge *subscription @@ -150,14 +154,12 @@ type jsaStorage struct { // an internal sub for a stream, so we will direct link to the stream // and walk backwards as needed vs multiple hash lookups and locks, etc. type jsAccount struct { - mu sync.RWMutex - js *jetStream - account *Account - storeDir string - inflight sync.Map - streams map[string]*stream - templates map[string]*streamTemplate // Deprecated: stream templates are deprecated and will be removed in a future version. - store TemplateStore // Deprecated: stream templates are deprecated and will be removed in a future version. 
+ mu sync.RWMutex + js *jetStream + account *Account + storeDir string + inflight sync.Map + streams map[string]*stream // From server sendq *ipQueue[*pubMsg] @@ -415,15 +417,19 @@ func (s *Server) initJetStreamEncryption() (err error) { // enableJetStream will start up the JetStream subsystem. func (s *Server) enableJetStream(cfg JetStreamConfig) error { - js := &jetStream{srv: s, config: cfg, accounts: make(map[string]*jsAccount), apiSubs: NewSublistNoCache()} + js := &jetStream{srv: s, config: cfg, accounts: make(map[string]*jsAccount), apiSubs: NewSublistNoCache(), infoSubs: gsl.NewSimpleSublist()} s.gcbMu.Lock() if s.gcbOutMax = s.getOpts().JetStreamMaxCatchup; s.gcbOutMax == 0 { s.gcbOutMax = defaultMaxTotalCatchupOutBytes } s.gcbMu.Unlock() + atomic.StoreInt64(&js.memMax, cfg.MaxMemory) + atomic.StoreInt64(&js.storeMax, cfg.MaxStore) + // TODO: Not currently reloadable. atomic.StoreInt64(&js.queueLimit, s.getOpts().JetStreamRequestQueueLimit) + atomic.StoreInt64(&js.infoQueueLimit, s.getOpts().JetStreamInfoQueueLimit) s.js.Store(js) @@ -1058,8 +1064,10 @@ func (s *Server) shutdownJetStream() { func (s *Server) JetStreamConfig() *JetStreamConfig { var c *JetStreamConfig if js := s.getJetStream(); js != nil { + js.mu.RLock() copy := js.config c = &(copy) + js.mu.RUnlock() } return c } @@ -1219,54 +1227,6 @@ func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits, tq c s.Debugf("Recovering JetStream state for account %q", a.Name) } - // Check templates first since messsage sets will need proper ownership. - // FIXME(dlc) - Make this consistent. - tdir := filepath.Join(jsa.storeDir, tmplsDir) - if stat, err := os.Stat(tdir); err == nil && stat.IsDir() { - key := sha256.Sum256([]byte("templates")) - hh, err := highwayhash.NewDigest64(key[:]) - if err != nil { - return err - } - fis, _ := os.ReadDir(tdir) - for _, fi := range fis { - metafile := filepath.Join(tdir, fi.Name(), JetStreamMetaFile) - metasum := filepath.Join(tdir, fi.Name(), JetStreamMetaFileSum) - buf, err := os.ReadFile(metafile) - if err != nil { - s.Warnf(" Error reading StreamTemplate metafile %q: %v", metasum, err) - continue - } - if _, err := os.Stat(metasum); os.IsNotExist(err) { - s.Warnf(" Missing StreamTemplate checksum for %q", metasum) - continue - } - sum, err := os.ReadFile(metasum) - if err != nil { - s.Warnf(" Error reading StreamTemplate checksum %q: %v", metasum, err) - continue - } - hh.Reset() - hh.Write(buf) - var hb [highwayhash.Size64]byte - checksum := hex.EncodeToString(hh.Sum(hb[:0])) - if checksum != string(sum) { - s.Warnf(" StreamTemplate checksums do not match %q vs %q", sum, checksum) - continue - } - var cfg StreamTemplateConfig - if err := json.Unmarshal(buf, &cfg); err != nil { - s.Warnf(" Error unmarshalling StreamTemplate metafile: %v", err) - continue - } - cfg.Config.Name = _EMPTY_ - if _, err := a.addStreamTemplate(&cfg); err != nil { - s.Warnf(" Error recreating StreamTemplate %q: %v", cfg.Name, err) - continue - } - } - } - // Remember if we should be encrypted and what cipher we think we should use. 
encrypted := s.getOpts().JetStreamKey != _EMPTY_ sc := s.getOpts().JetStreamCipher @@ -1510,15 +1470,6 @@ func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits, tq c return nil } - if cfg.Template != _EMPTY_ { - jsa.mu.Lock() - err := jsa.addStreamNameToTemplate(cfg.Template, cfg.Name) - jsa.mu.Unlock() - if err != nil { - s.Warnf(" Error adding stream %q to template %q: %v", cfg.Name, cfg.Template, err) - } - } - // We had a bug that set a default de dupe window on mirror, despite that being not a valid config fixCfgMirrorWithDedupWindow(&cfg.StreamConfig) @@ -1587,6 +1538,7 @@ func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits, tq c batchId string batchSeq uint64 commit bool + commitEob bool batchStoreDir string store StreamStore state StreamState @@ -1604,19 +1556,30 @@ func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits, tq c } // We've observed a partial batch write. Write the remainder of the batch. batchSeq++ - _, batchStoreDir = getBatchStoreDir(mset, batchId) + _, batchStoreDir = getBatchStoreDir(jsa.storeDir, cfg.Name, batchId) if _, err = os.Stat(batchStoreDir); err != nil { s.Errorf(" Failed restoring partial batch write for stream '%s > %s' at sequence %d: %v", mset.accName(), mset.name(), batchSeq, err) goto SKIP } - store, err = newBatchStore(mset, batchId) + store, err = newBatchStore(mset, batchId, cfg.Replicas, cfg.Storage, jsa.storeDir, cfg.Name) if err != nil { s.Errorf(" Failed restoring partial batch write for stream '%s > %s' at sequence %d: %v", mset.accName(), mset.name(), batchSeq, err) goto SKIP } store.FastState(&state) + sm, err = store.LoadMsg(state.LastSeq, &smv) + if err != nil || sm == nil { + s.Errorf(" Failed restoring partial batch write for stream '%s > %s' at sequence %d: last msg not found %d", + mset.accName(), mset.name(), batchSeq, state.LastSeq) + goto SKIP + } + commitEob = bytes.Equal(sliceHeader(JSBatchCommit, sm.hdr), []byte("eob")) + // If the commit ends with an "End Of Batch" message, we don't store this. + if commitEob { + state.LastSeq-- + } s.Noticef(" Restoring partial batch write for stream '%s > %s' (seq %d to %d)", mset.accName(), mset.name(), batchSeq, state.LastSeq) // Loop through items that weren't persisted yet. @@ -1627,7 +1590,12 @@ func (a *Account) EnableJetStream(limits map[string]JetStreamAccountLimits, tq c mset.accName(), mset.name(), seq, err) break } - mset.processJetStreamMsg(sm.subj, _EMPTY_, sm.hdr, sm.msg, 0, 0, nil, false, true) + hdr := sm.hdr + // If committed by EOB, the last message must get the normal commit header. + if commitEob && seq == state.LastSeq { + hdr = genHeader(hdr, JSBatchCommit, "1") + } + mset.processJetStreamMsg(sm.subj, _EMPTY_, hdr, sm.msg, 0, 0, nil, false, true) } store.Delete(true) SKIP: @@ -2342,14 +2310,14 @@ func (jsa *jsAccount) sendClusterUsageUpdate() { func (js *jetStream) wouldExceedLimits(storeType StorageType, sz int) bool { var ( total *int64 - max int64 + max *int64 ) if storeType == MemoryStorage { - total, max = &js.memUsed, js.config.MaxMemory + total, max = &js.memUsed, &js.memMax } else { - total, max = &js.storeUsed, js.config.MaxStore + total, max = &js.storeUsed, &js.storeMax } - return (atomic.LoadInt64(total) + int64(sz)) > max + return (atomic.LoadInt64(total) + int64(sz)) > atomic.LoadInt64(max) } func (js *jetStream) limitsExceeded(storeType StorageType) bool { @@ -2519,7 +2487,6 @@ func (jsa *jsAccount) acc() *Account { // Delete the JetStream resources. 
func (jsa *jsAccount) delete() { var streams []*stream - var ts []string jsa.mu.Lock() // The update timer and subs need to be protected by usageMu lock @@ -2538,20 +2505,11 @@ func (jsa *jsAccount) delete() { for _, ms := range jsa.streams { streams = append(streams, ms) } - acc := jsa.account - for _, t := range jsa.templates { - ts = append(ts, t.Name) - } - jsa.templates = nil jsa.mu.Unlock() for _, mset := range streams { mset.stop(false, false) } - - for _, t := range ts { - acc.deleteStreamTemplate(t) - } } // Lookup the jetstream account for a given account. @@ -2763,325 +2721,6 @@ func (a *Account) checkForJetStream() (*Server, *jsAccount, error) { return s, jsa, nil } -// StreamTemplateConfig allows a configuration to auto-create streams based on this template when a message -// is received that matches. Each new stream will use the config as the template config to create them. -// Deprecated: stream templates are deprecated and will be removed in a future version. -type StreamTemplateConfig struct { - Name string `json:"name"` - Config *StreamConfig `json:"config"` - MaxStreams uint32 `json:"max_streams"` -} - -// StreamTemplateInfo -// Deprecated: stream templates are deprecated and will be removed in a future version. -type StreamTemplateInfo struct { - Config *StreamTemplateConfig `json:"config"` - Streams []string `json:"streams"` -} - -// streamTemplate -// Deprecated: stream templates are deprecated and will be removed in a future version. -type streamTemplate struct { - mu sync.Mutex - tc *client - jsa *jsAccount - *StreamTemplateConfig - streams []string -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (t *StreamTemplateConfig) deepCopy() *StreamTemplateConfig { - copy := *t - cfg := *t.Config - copy.Config = &cfg - return © -} - -// addStreamTemplate will add a stream template to this account that allows auto-creation of streams. -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (a *Account) addStreamTemplate(tc *StreamTemplateConfig) (*streamTemplate, error) { - s, jsa, err := a.checkForJetStream() - if err != nil { - return nil, err - } - if tc.Config.Name != "" { - return nil, fmt.Errorf("template config name should be empty") - } - if len(tc.Name) > JSMaxNameLen { - return nil, fmt.Errorf("template name is too long, maximum allowed is %d", JSMaxNameLen) - } - - // FIXME(dlc) - Hacky - tcopy := tc.deepCopy() - tcopy.Config.Name = "_" - cfg, apiErr := s.checkStreamCfg(tcopy.Config, a, false) - if apiErr != nil { - return nil, apiErr - } - tcopy.Config = &cfg - t := &streamTemplate{ - StreamTemplateConfig: tcopy, - tc: s.createInternalJetStreamClient(), - jsa: jsa, - } - t.tc.registerWithAccount(a) - - jsa.mu.Lock() - if jsa.templates == nil { - jsa.templates = make(map[string]*streamTemplate) - // Create the appropriate store - if cfg.Storage == FileStorage { - jsa.store = newTemplateFileStore(jsa.storeDir) - } else { - jsa.store = newTemplateMemStore() - } - } else if _, ok := jsa.templates[tcopy.Name]; ok { - jsa.mu.Unlock() - return nil, fmt.Errorf("template with name %q already exists", tcopy.Name) - } - jsa.templates[tcopy.Name] = t - jsa.mu.Unlock() - - // FIXME(dlc) - we can not overlap subjects between templates. Need to have test. - - // Setup the internal subscriptions to trap the messages. 
- if err := t.createTemplateSubscriptions(); err != nil { - return nil, err - } - if err := jsa.store.Store(t); err != nil { - t.delete() - return nil, err - } - return t, nil -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (t *streamTemplate) createTemplateSubscriptions() error { - if t == nil { - return fmt.Errorf("no template") - } - if t.tc == nil { - return fmt.Errorf("template not enabled") - } - c := t.tc - if !c.srv.EventsEnabled() { - return ErrNoSysAccount - } - sid := 1 - for _, subject := range t.Config.Subjects { - // Now create the subscription - if _, err := c.processSub([]byte(subject), nil, []byte(strconv.Itoa(sid)), t.processInboundTemplateMsg, false); err != nil { - c.acc.deleteStreamTemplate(t.Name) - return err - } - sid++ - } - return nil -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (t *streamTemplate) processInboundTemplateMsg(_ *subscription, pc *client, acc *Account, subject, reply string, msg []byte) { - if t == nil || t.jsa == nil { - return - } - jsa := t.jsa - cn := canonicalName(subject) - - jsa.mu.Lock() - // If we already are registered then we can just return here. - if _, ok := jsa.streams[cn]; ok { - jsa.mu.Unlock() - return - } - jsa.mu.Unlock() - - // Check if we are at the maximum and grab some variables. - t.mu.Lock() - c := t.tc - cfg := *t.Config - cfg.Template = t.Name - atLimit := len(t.streams) >= int(t.MaxStreams) - if !atLimit { - t.streams = append(t.streams, cn) - } - t.mu.Unlock() - - if atLimit { - c.RateLimitWarnf("JetStream could not create stream for account %q on subject %q, at limit", acc.Name, subject) - return - } - - // We need to create the stream here. - // Change the config from the template and only use literal subject. - cfg.Name = cn - cfg.Subjects = []string{subject} - mset, err := acc.addStream(&cfg) - if err != nil { - acc.validateStreams(t) - c.RateLimitWarnf("JetStream could not create stream for account %q on subject %q: %v", acc.Name, subject, err) - return - } - - // Process this message directly by invoking mset. - mset.processInboundJetStreamMsg(nil, pc, acc, subject, reply, msg) -} - -// lookupStreamTemplate looks up the names stream template. -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (a *Account) lookupStreamTemplate(name string) (*streamTemplate, error) { - _, jsa, err := a.checkForJetStream() - if err != nil { - return nil, err - } - jsa.mu.Lock() - defer jsa.mu.Unlock() - if jsa.templates == nil { - return nil, fmt.Errorf("template not found") - } - t, ok := jsa.templates[name] - if !ok { - return nil, fmt.Errorf("template not found") - } - return t, nil -} - -// This function will check all named streams and make sure they are valid. -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (a *Account) validateStreams(t *streamTemplate) { - t.mu.Lock() - var vstreams []string - for _, sname := range t.streams { - if _, err := a.lookupStream(sname); err == nil { - vstreams = append(vstreams, sname) - } - } - t.streams = vstreams - t.mu.Unlock() -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. 
-func (t *streamTemplate) delete() error { - if t == nil { - return fmt.Errorf("nil stream template") - } - - t.mu.Lock() - jsa := t.jsa - c := t.tc - t.tc = nil - defer func() { - if c != nil { - c.closeConnection(ClientClosed) - } - }() - t.mu.Unlock() - - if jsa == nil { - return NewJSNotEnabledForAccountError() - } - - jsa.mu.Lock() - if jsa.templates == nil { - jsa.mu.Unlock() - return fmt.Errorf("template not found") - } - if _, ok := jsa.templates[t.Name]; !ok { - jsa.mu.Unlock() - return fmt.Errorf("template not found") - } - delete(jsa.templates, t.Name) - acc := jsa.account - jsa.mu.Unlock() - - // Remove streams associated with this template. - var streams []*stream - t.mu.Lock() - for _, name := range t.streams { - if mset, err := acc.lookupStream(name); err == nil { - streams = append(streams, mset) - } - } - t.mu.Unlock() - - if jsa.store != nil { - if err := jsa.store.Delete(t); err != nil { - return fmt.Errorf("error deleting template from store: %v", err) - } - } - - var lastErr error - for _, mset := range streams { - if err := mset.delete(); err != nil { - lastErr = err - } - } - return lastErr -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (a *Account) deleteStreamTemplate(name string) error { - t, err := a.lookupStreamTemplate(name) - if err != nil { - return NewJSStreamTemplateNotFoundError() - } - return t.delete() -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (a *Account) templates() []*streamTemplate { - var ts []*streamTemplate - _, jsa, err := a.checkForJetStream() - if err != nil { - return nil - } - - jsa.mu.Lock() - for _, t := range jsa.templates { - // FIXME(dlc) - Copy? - ts = append(ts, t) - } - jsa.mu.Unlock() - - return ts -} - -// Will add a stream to a template, this is for recovery. -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (jsa *jsAccount) addStreamNameToTemplate(tname, mname string) error { - if jsa.templates == nil { - return fmt.Errorf("template not found") - } - t, ok := jsa.templates[tname] - if !ok { - return fmt.Errorf("template not found") - } - // We found template. - t.mu.Lock() - t.streams = append(t.streams, mname) - t.mu.Unlock() - return nil -} - -// This will check if a template owns this stream. -// jsAccount lock should be held -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (jsa *jsAccount) checkTemplateOwnership(tname, sname string) bool { - if jsa.templates == nil { - return false - } - t, ok := jsa.templates[tname] - if !ok { - return false - } - // We found template, make sure we are in streams. - for _, streamName := range t.streams { - if sname == streamName { - return true - } - } - return false -} - type Number interface { int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64 | float32 | float64 } @@ -3107,10 +2746,11 @@ func isValidName(name string) bool { return !strings.ContainsAny(name, " \t\r\n\f.*>") } -// CanonicalName will replace all token separators '.' with '_'. -// This can be used when naming streams or consumers with multi-token subjects. -func canonicalName(name string) string { - return strings.ReplaceAll(name, ".", "_") +func isValidAssetName(name string) bool { + if name == _EMPTY_ { + return false + } + return !strings.ContainsAny(name, " \t\r\n\f.*>\\/") } // To throttle the out of resources errors. 
diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go index e8600e14f8..53525a8bca 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_api.go @@ -20,6 +20,7 @@ import ( "errors" "fmt" "io" + "maps" "os" "path/filepath" "runtime" @@ -46,29 +47,6 @@ const ( // Will return JSON response. JSApiAccountInfo = "$JS.API.INFO" - // JSApiTemplateCreate is the endpoint to create new stream templates. - // Will return JSON response. - // Deprecated: stream templates are deprecated and will be removed in a future version. - JSApiTemplateCreate = "$JS.API.STREAM.TEMPLATE.CREATE.*" - JSApiTemplateCreateT = "$JS.API.STREAM.TEMPLATE.CREATE.%s" - - // JSApiTemplates is the endpoint to list all stream template names for this account. - // Will return JSON response. - // Deprecated: stream templates are deprecated and will be removed in a future version. - JSApiTemplates = "$JS.API.STREAM.TEMPLATE.NAMES" - - // JSApiTemplateInfo is for obtaining general information about a named stream template. - // Will return JSON response. - // Deprecated: stream templates are deprecated and will be removed in a future version. - JSApiTemplateInfo = "$JS.API.STREAM.TEMPLATE.INFO.*" - JSApiTemplateInfoT = "$JS.API.STREAM.TEMPLATE.INFO.%s" - - // JSApiTemplateDelete is the endpoint to delete stream templates. - // Will return JSON response. - // Deprecated: stream templates are deprecated and will be removed in a future version. - JSApiTemplateDelete = "$JS.API.STREAM.TEMPLATE.DELETE.*" - JSApiTemplateDeleteT = "$JS.API.STREAM.TEMPLATE.DELETE.%s" - // JSApiStreamCreate is the endpoint to create new streams. // Will return JSON response. JSApiStreamCreate = "$JS.API.STREAM.CREATE.*" @@ -177,6 +155,9 @@ const ( // JSApiRequestNextT is the prefix for the request next message(s) for a consumer in worker/pull mode. JSApiRequestNextT = "$JS.API.CONSUMER.MSG.NEXT.%s.%s" + // JSApiConsumerResetT is the prefix for resetting a given consumer to a new starting sequence. + JSApiConsumerResetT = "$JS.API.CONSUMER.RESET.%s.%s" + // JSApiConsumerUnpinT is the prefix for unpinning subscription for a given consumer. JSApiConsumerUnpin = "$JS.API.CONSUMER.UNPIN.*.*" JSApiConsumerUnpinT = "$JS.API.CONSUMER.UNPIN.%s.%s" @@ -237,13 +218,15 @@ const ( // jsAckT is the template for the ack message stream coming back from a consumer // when they ACK/NAK, etc a message. jsAckT = "$JS.ACK.%s.%s" + jsAckTv2 = "$JS.ACK.%s.%s.%s.%s" jsAckPre = "$JS.ACK." jsAckPreLen = len(jsAckPre) // jsFlowControl is for flow control subjects. jsFlowControlPre = "$JS.FC." // jsFlowControl is for FC responses. - jsFlowControl = "$JS.FC.%s.%s.*" + jsFlowControl = "$JS.FC.%s.%s.*" + jsFlowControlV2 = "$JS.FC.%s.%s.%s.%s.*" // JSAdvisoryPrefix is a prefix for all JetStream advisories. JSAdvisoryPrefix = "$JS.EVENT.ADVISORY" @@ -787,50 +770,19 @@ type JSApiConsumerGetNextRequest struct { PriorityGroup } -// JSApiStreamTemplateCreateResponse for creating templates. -// Deprecated: stream templates are deprecated and will be removed in a future version. -type JSApiStreamTemplateCreateResponse struct { - ApiResponse - *StreamTemplateInfo -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. 
-const JSApiStreamTemplateCreateResponseType = "io.nats.jetstream.api.v1.stream_template_create_response" - -// Deprecated: stream templates are deprecated and will be removed in a future version. -type JSApiStreamTemplateDeleteResponse struct { - ApiResponse - Success bool `json:"success,omitempty"` -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -const JSApiStreamTemplateDeleteResponseType = "io.nats.jetstream.api.v1.stream_template_delete_response" - -// JSApiStreamTemplateInfoResponse for information about stream templates. -// Deprecated: stream templates are deprecated and will be removed in a future version. -type JSApiStreamTemplateInfoResponse struct { - ApiResponse - *StreamTemplateInfo -} - -// Deprecated: stream templates are deprecated and will be removed in a future version. -const JSApiStreamTemplateInfoResponseType = "io.nats.jetstream.api.v1.stream_template_info_response" - -// Deprecated: stream templates are deprecated and will be removed in a future version. -type JSApiStreamTemplatesRequest struct { - ApiPagedRequest +// JSApiConsumerResetRequest is for resetting a consumer to a specific sequence. +type JSApiConsumerResetRequest struct { + Seq uint64 `json:"seq,omitempty"` } -// JSApiStreamTemplateNamesResponse list of templates -// Deprecated: stream templates are deprecated and will be removed in a future version. -type JSApiStreamTemplateNamesResponse struct { +// JSApiConsumerResetResponse is a superset of JSApiConsumerCreateResponse, but including an explicit ResetSeq. +type JSApiConsumerResetResponse struct { ApiResponse - ApiPaged - Templates []string `json:"streams"` + *ConsumerInfo + ResetSeq uint64 `json:"reset_seq"` } -// Deprecated: stream templates are deprecated and will be removed in a future version. -const JSApiStreamTemplateNamesResponseType = "io.nats.jetstream.api.v1.stream_template_names_response" +const JSApiConsumerResetResponseType = "io.nats.jetstream.api.v1.consumer_reset_response" // Structure that holds state for a JetStream API request that is processed // in a separate long-lived go routine. This is to avoid blocking connections. @@ -911,11 +863,19 @@ func (js *jetStream) apiDispatch(sub *subscription, c *client, acc *Account, sub // Copy the state. Note the JSAPI only uses the hdr index to piece apart the // header from the msg body. No other references are needed. // Check pending and warn if getting backed up. 
- pending, _ := s.jsAPIRoutedReqs.push(&jsAPIRoutedReq{jsub, sub, acc, subject, reply, copyBytes(rmsg), c.pa}) - limit := atomic.LoadInt64(&js.queueLimit) + var queue *ipQueue[*jsAPIRoutedReq] + var limit int64 + if js.infoSubs.HasInterest(subject) { + queue = s.jsAPIRoutedInfoReqs + limit = atomic.LoadInt64(&js.infoQueueLimit) + } else { + queue = s.jsAPIRoutedReqs + limit = atomic.LoadInt64(&js.queueLimit) + } + pending, _ := queue.push(&jsAPIRoutedReq{jsub, sub, acc, subject, reply, copyBytes(rmsg), c.pa}) if pending >= int(limit) { - s.rateLimitFormatWarnf("JetStream API queue limit reached, dropping %d requests", pending) - drained := int64(s.jsAPIRoutedReqs.drain()) + s.rateLimitFormatWarnf("%s limit reached, dropping %d requests", queue.name, pending) + drained := int64(queue.drain()) atomic.AddInt64(&js.apiInflight, -drained) s.publishAdvisory(nil, JSAdvisoryAPILimitReached, JSAPILimitReachedAdvisory{ @@ -935,29 +895,45 @@ func (s *Server) processJSAPIRoutedRequests() { defer s.grWG.Done() s.mu.RLock() - queue := s.jsAPIRoutedReqs + queue, infoqueue := s.jsAPIRoutedReqs, s.jsAPIRoutedInfoReqs client := &client{srv: s, kind: JETSTREAM} s.mu.RUnlock() js := s.getJetStream() + processFromQueue := func(ipq *ipQueue[*jsAPIRoutedReq]) { + // Only pop one item at a time here, otherwise if the system is recovering + // from queue buildup, then one worker will pull off all the tasks and the + // others will be starved of work. + if r, ok := ipq.popOne(); ok && r != nil { + client.pa = r.pa + start := time.Now() + r.jsub.icb(r.sub, client, r.acc, r.subject, r.reply, r.msg) + if dur := time.Since(start); dur >= readLoopReportThreshold { + s.Warnf("Internal subscription on %q took too long: %v", r.subject, dur) + } + atomic.AddInt64(&js.apiInflight, -1) + } + } + for { + // First select case is prioritizing queue, we will only fall through + // to the second select case that considers infoqueue if queue is empty. + // This effectively means infos are deprioritized. select { case <-queue.ch: - // Only pop one item at a time here, otherwise if the system is recovering - // from queue buildup, then one worker will pull off all the tasks and the - // others will be starved of work. - for r, ok := queue.popOne(); ok && r != nil; r, ok = queue.popOne() { - client.pa = r.pa - start := time.Now() - r.jsub.icb(r.sub, client, r.acc, r.subject, r.reply, r.msg) - if dur := time.Since(start); dur >= readLoopReportThreshold { - s.Warnf("Internal subscription on %q took too long: %v", r.subject, dur) - } - atomic.AddInt64(&js.apiInflight, -1) - } + processFromQueue(queue) case <-s.quitCh: return + default: + select { + case <-infoqueue.ch: + processFromQueue(infoqueue) + case <-queue.ch: + processFromQueue(queue) + case <-s.quitCh: + return + } } } } @@ -976,7 +952,8 @@ func (s *Server) setJetStreamExportSubs() error { if mp > maxProcs { mp = maxProcs } - s.jsAPIRoutedReqs = newIPQueue[*jsAPIRoutedReq](s, "Routed JS API Requests") + s.jsAPIRoutedReqs = newIPQueue[*jsAPIRoutedReq](s, "JetStream API queue") + s.jsAPIRoutedInfoReqs = newIPQueue[*jsAPIRoutedReq](s, "JetStream API info queue") for i := 0; i < mp; i++ { s.startGoRoutine(s.processJSAPIRoutedRequests) } @@ -992,20 +969,13 @@ func (s *Server) setJetStreamExportSubs() error { } // API handles themselves. + // infopairs are deprioritized compared to pairs in processJSAPIRoutedRequests. 
pairs := []struct { subject string handler msgHandler }{ - {JSApiAccountInfo, s.jsAccountInfoRequest}, - {JSApiTemplateCreate, s.jsTemplateCreateRequest}, - {JSApiTemplates, s.jsTemplateNamesRequest}, - {JSApiTemplateInfo, s.jsTemplateInfoRequest}, - {JSApiTemplateDelete, s.jsTemplateDeleteRequest}, {JSApiStreamCreate, s.jsStreamCreateRequest}, {JSApiStreamUpdate, s.jsStreamUpdateRequest}, - {JSApiStreams, s.jsStreamNamesRequest}, - {JSApiStreamList, s.jsStreamListRequest}, - {JSApiStreamInfo, s.jsStreamInfoRequest}, {JSApiStreamDelete, s.jsStreamDeleteRequest}, {JSApiStreamPurge, s.jsStreamPurgeRequest}, {JSApiStreamSnapshot, s.jsStreamSnapshotRequest}, @@ -1018,23 +988,40 @@ func (s *Server) setJetStreamExportSubs() error { {JSApiConsumerCreateEx, s.jsConsumerCreateRequest}, {JSApiConsumerCreate, s.jsConsumerCreateRequest}, {JSApiDurableCreate, s.jsConsumerCreateRequest}, - {JSApiConsumers, s.jsConsumerNamesRequest}, - {JSApiConsumerList, s.jsConsumerListRequest}, - {JSApiConsumerInfo, s.jsConsumerInfoRequest}, {JSApiConsumerDelete, s.jsConsumerDeleteRequest}, {JSApiConsumerPause, s.jsConsumerPauseRequest}, {JSApiConsumerUnpin, s.jsConsumerUnpinRequest}, } + infopairs := []struct { + subject string + handler msgHandler + }{ + {JSApiAccountInfo, s.jsAccountInfoRequest}, + {JSApiStreams, s.jsStreamNamesRequest}, + {JSApiStreamList, s.jsStreamListRequest}, + {JSApiStreamInfo, s.jsStreamInfoRequest}, + {JSApiConsumers, s.jsConsumerNamesRequest}, + {JSApiConsumerList, s.jsConsumerListRequest}, + {JSApiConsumerInfo, s.jsConsumerInfoRequest}, + } js.mu.Lock() defer js.mu.Unlock() - for _, p := range pairs { + // As well as populating js.apiSubs for the dispatch function to use, we + // will also populate js.infoSubs, so that the dispatch function can + // decide quickly whether or not the request is an info request or not. + for _, p := range append(infopairs, pairs...) { sub := &subscription{subject: []byte(p.subject), icb: p.handler} if err := js.apiSubs.Insert(sub); err != nil { return err } } + for _, p := range infopairs { + if err := js.infoSubs.Insert(p.subject, struct{}{}); err != nil { + return err + } + } return nil } @@ -1239,7 +1226,7 @@ func (s *Server) unmarshalRequest(c *client, acc *Account, subject string, msg [ c.RateLimitWarnf("Invalid JetStream request '%s > %s': %s", acc, subject, err) - if s.JetStreamConfig().Strict { + if js := s.getJetStream(); js != nil && js.config.Strict { return err } @@ -1345,10 +1332,6 @@ func (s *Server) jsAccountInfoRequest(sub *subscription, c *client, _ *Account, } // Helpers for token extraction. -func templateNameFromSubject(subject string) string { - return tokenAt(subject, 6) -} - func streamNameFromSubject(subject string) string { return tokenAt(subject, 5) } @@ -1357,223 +1340,6 @@ func consumerNameFromSubject(subject string) string { return tokenAt(subject, 6) } -// Request to create a new template. -// Deprecated: stream templates are deprecated and will be removed in a future version. 
-func (s *Server) jsTemplateCreateRequest(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - if c == nil { - return - } - ci, acc, hdr, msg, err := s.getRequestInfo(c, rmsg) - if err != nil { - s.Warnf(badAPIRequestT, msg) - return - } - - var resp = JSApiStreamTemplateCreateResponse{ApiResponse: ApiResponse{Type: JSApiStreamTemplateCreateResponseType}} - if errorOnRequiredApiLevel(hdr) { - resp.Error = NewJSRequiredApiLevelError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if !acc.JetStreamEnabled() { - resp.Error = NewJSNotEnabledForAccountError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - - // Not supported for now. - if s.JetStreamIsClustered() { - resp.Error = NewJSClusterUnSupportFeatureError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - - var cfg StreamTemplateConfig - if err := s.unmarshalRequest(c, acc, subject, msg, &cfg); err != nil { - resp.Error = NewJSInvalidJSONError(err) - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - templateName := templateNameFromSubject(subject) - if templateName != cfg.Name { - resp.Error = NewJSTemplateNameNotMatchSubjectError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - - t, err := acc.addStreamTemplate(&cfg) - if err != nil { - resp.Error = NewJSStreamTemplateCreateError(err, Unless(err)) - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - t.mu.Lock() - tcfg := t.StreamTemplateConfig.deepCopy() - streams := t.streams - if streams == nil { - streams = []string{} - } - t.mu.Unlock() - resp.StreamTemplateInfo = &StreamTemplateInfo{Config: tcfg, Streams: streams} - s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) -} - -// Request for the list of all template names. -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (s *Server) jsTemplateNamesRequest(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - if c == nil { - return - } - ci, acc, hdr, msg, err := s.getRequestInfo(c, rmsg) - if err != nil { - s.Warnf(badAPIRequestT, msg) - return - } - - var resp = JSApiStreamTemplateNamesResponse{ApiResponse: ApiResponse{Type: JSApiStreamTemplateNamesResponseType}} - if errorOnRequiredApiLevel(hdr) { - resp.Error = NewJSRequiredApiLevelError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if !acc.JetStreamEnabled() { - resp.Error = NewJSNotEnabledForAccountError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - - // Not supported for now. 
- if s.JetStreamIsClustered() { - resp.Error = NewJSClusterUnSupportFeatureError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - - var offset int - if isJSONObjectOrArray(msg) { - var req JSApiStreamTemplatesRequest - if err := s.unmarshalRequest(c, acc, subject, msg, &req); err != nil { - resp.Error = NewJSInvalidJSONError(err) - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - offset = req.Offset - } - - ts := acc.templates() - slices.SortFunc(ts, func(i, j *streamTemplate) int { - return cmp.Compare(i.StreamTemplateConfig.Name, j.StreamTemplateConfig.Name) - }) - - tcnt := len(ts) - if offset > tcnt { - offset = tcnt - } - - for _, t := range ts[offset:] { - t.mu.Lock() - name := t.Name - t.mu.Unlock() - resp.Templates = append(resp.Templates, name) - if len(resp.Templates) >= JSApiNamesLimit { - break - } - } - resp.Total = tcnt - resp.Limit = JSApiNamesLimit - resp.Offset = offset - if resp.Templates == nil { - resp.Templates = []string{} - } - s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) -} - -// Request for information about a stream template. -// Deprecated: stream templates are deprecated and will be removed in a future version. -func (s *Server) jsTemplateInfoRequest(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - if c == nil { - return - } - ci, acc, hdr, msg, err := s.getRequestInfo(c, rmsg) - if err != nil { - s.Warnf(badAPIRequestT, msg) - return - } - - var resp = JSApiStreamTemplateInfoResponse{ApiResponse: ApiResponse{Type: JSApiStreamTemplateInfoResponseType}} - if errorOnRequiredApiLevel(hdr) { - resp.Error = NewJSRequiredApiLevelError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if !acc.JetStreamEnabled() { - resp.Error = NewJSNotEnabledForAccountError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if !isEmptyRequest(msg) { - resp.Error = NewJSNotEmptyRequestError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - name := templateNameFromSubject(subject) - t, err := acc.lookupStreamTemplate(name) - if err != nil { - resp.Error = NewJSStreamTemplateNotFoundError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - t.mu.Lock() - cfg := t.StreamTemplateConfig.deepCopy() - streams := t.streams - if streams == nil { - streams = []string{} - } - t.mu.Unlock() - - resp.StreamTemplateInfo = &StreamTemplateInfo{Config: cfg, Streams: streams} - s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) -} - -// Request to delete a stream template. -// Deprecated: stream templates are deprecated and will be removed in a future version. 
-func (s *Server) jsTemplateDeleteRequest(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - if c == nil { - return - } - ci, acc, hdr, msg, err := s.getRequestInfo(c, rmsg) - if err != nil { - s.Warnf(badAPIRequestT, msg) - return - } - - var resp = JSApiStreamTemplateDeleteResponse{ApiResponse: ApiResponse{Type: JSApiStreamTemplateDeleteResponseType}} - if errorOnRequiredApiLevel(hdr) { - resp.Error = NewJSRequiredApiLevelError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if !acc.JetStreamEnabled() { - resp.Error = NewJSNotEnabledForAccountError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - if !isEmptyRequest(msg) { - resp.Error = NewJSNotEmptyRequestError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - name := templateNameFromSubject(subject) - err = acc.deleteStreamTemplate(name) - if err != nil { - resp.Error = NewJSStreamTemplateDeleteError(err, Unless(err)) - s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) - return - } - resp.Success = true - s.sendAPIResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(resp)) -} - func (s *Server) jsonResponse(v any) string { b, err := json.Marshal(v) if err != nil { @@ -2109,7 +1875,7 @@ func (s *Server) jsStreamInfoRequest(sub *subscription, c *client, a *Account, s if cc != nil { // Check to make sure the stream is assigned. js.mu.RLock() - isLeader, sa := cc.isLeader(), js.streamAssignment(acc.Name, streamName) + isLeader, sa := cc.isLeader(), js.streamAssignmentOrInflight(acc.Name, streamName) var offline bool if sa != nil { clusterWideConsCount = len(sa.consumers) @@ -2338,7 +2104,7 @@ func (s *Server) jsStreamLeaderStepDownRequest(sub *subscription, c *client, _ * } js.mu.RLock() - isLeader, sa := cc.isLeader(), js.streamAssignment(acc.Name, name) + isLeader, sa := cc.isLeader(), js.streamAssignmentOrInflight(acc.Name, name) js.mu.RUnlock() if isLeader && sa == nil { @@ -2455,7 +2221,7 @@ func (s *Server) jsConsumerLeaderStepDownRequest(sub *subscription, c *client, _ consumer := tokenAt(subject, 7) js.mu.RLock() - isLeader, sa := cc.isLeader(), js.streamAssignment(acc.Name, stream) + isLeader, sa := cc.isLeader(), js.streamAssignmentOrInflight(acc.Name, stream) js.mu.RUnlock() if isLeader && sa == nil { @@ -3456,7 +3222,7 @@ func (s *Server) jsMsgDeleteRequest(sub *subscription, c *client, _ *Account, su } js.mu.RLock() - isLeader, sa := cc.isLeader(), js.streamAssignment(acc.Name, stream) + isLeader, sa := cc.isLeader(), js.streamAssignmentOrInflight(acc.Name, stream) js.mu.RUnlock() if isLeader && sa == nil { @@ -3581,7 +3347,7 @@ func (s *Server) jsMsgGetRequest(sub *subscription, c *client, _ *Account, subje } js.mu.RLock() - isLeader, sa := cc.isLeader(), js.streamAssignment(acc.Name, stream) + isLeader, sa := cc.isLeader(), js.streamAssignmentOrInflight(acc.Name, stream) js.mu.RUnlock() if isLeader && sa == nil { @@ -3876,7 +3642,7 @@ func (s *Server) jsStreamPurgeRequest(sub *subscription, c *client, _ *Account, } js.mu.RLock() - isLeader, sa := cc.isLeader(), js.streamAssignment(acc.Name, stream) + isLeader, sa := cc.isLeader(), js.streamAssignmentOrInflight(acc.Name, stream) js.mu.RUnlock() if isLeader && sa == nil { @@ -4048,6 +3814,13 @@ func (s *Server) jsStreamRestoreRequest(sub *subscription, c *client, _ *Account return } + // Check for path like separators in the name. 
+ if strings.ContainsAny(stream, `\/`) { + resp.Error = NewJSStreamNameContainsPathSeparatorsError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + return + } + if s.JetStreamIsClustered() { s.jsClusteredStreamRestoreRequest(ci, acc, &req, subject, reply, rmsg) return @@ -4597,11 +4370,49 @@ func (s *Server) jsConsumerCreateRequest(sub *subscription, c *client, a *Accoun isClustered := s.JetStreamIsClustered() // Determine if we should proceed here when we are in clustered mode. + direct := req.Config.Direct if isClustered { - if req.Config.Direct { - // Check to see if we have this stream and are the stream leader. - if !acc.JetStreamIsStreamLeader(streamNameFromSubject(subject)) { - return + if direct { + // If it's just a direct consumer, check for stream leader. + if !req.Config.Sourcing { + // Check to see if we have this stream and are the stream leader. + if !acc.JetStreamIsStreamLeader(streamNameFromSubject(subject)) { + return + } + } else { + // Otherwise, we either need this to be answered by the stream or meta leader. + var cc *jetStreamCluster + js, cc = s.getJetStreamCluster() + if js == nil || cc == nil { + return + } + js.mu.RLock() + sa := js.streamAssignmentOrInflight(acc.Name, streamNameFromSubject(subject)) + if sa == nil { + js.mu.RUnlock() + return + } + // If the stream is WQ or Interest, we need the meta leader to answer. + if sa.Config.Retention != LimitsPolicy { + direct = false + } + js.mu.RUnlock() + if direct { + // Check to see if we have this stream and are the stream leader. + if !acc.JetStreamIsStreamLeader(streamNameFromSubject(subject)) { + return + } + } else { + if js.isLeaderless() { + resp.Error = NewJSClusterNotAvailError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(msg), s.jsonResponse(&resp)) + return + } + // Make sure we are meta leader. + if !s.JetStreamIsLeader() { + return + } + } } } else { var cc *jetStreamCluster @@ -4645,6 +4456,7 @@ func (s *Server) jsConsumerCreateRequest(sub *subscription, c *client, a *Accoun // Legacy ephemeral. rt = ccLegacyEphemeral streamName = streamNameFromSubject(subject) + consumerName = req.Config.Name } else { // New style and durable legacy. if tokenAt(subject, 4) == "DURABLE" { @@ -4736,7 +4548,7 @@ func (s *Server) jsConsumerCreateRequest(sub *subscription, c *client, a *Accoun return } - if isClustered && !req.Config.Direct { + if isClustered && !direct { s.jsClusteredConsumerRequest(ci, acc, subject, reply, rmsg, req.Stream, &req.Config, req.Action, req.Pedantic) return } @@ -4760,6 +4572,23 @@ func (s *Server) jsConsumerCreateRequest(sub *subscription, c *client, a *Accoun return } + // If the consumer is a direct sourcing consumer, we need to "upgrade" + // it to be durable without AckNone if not a Limits-based stream. + if req.Config.Direct && req.Config.Sourcing && req.Config.Name != _EMPTY_ { + if !isClustered && stream.isInterestRetention() { + req.Config.Direct = false + req.Config.Durable = req.Config.Name + req.Config.AckPolicy = AckFlowControl + req.Config.AckWait = 0 + req.Config.MaxDeliver = 0 + req.Config.InactiveThreshold = 0 + } else { + // Otherwise, need to append a randomized suffix since the source uses a stable name. 
+ req.Config.Name = fmt.Sprintf("%s-%s", req.Config.Name, createConsumerName()) + consumerName = req.Config.Name + } + } + if o := stream.lookupConsumer(consumerName); o != nil { if o.offlineReason != _EMPTY_ { resp.Error = NewJSConsumerOfflineReasonError(errors.New(o.offlineReason)) @@ -4770,6 +4599,12 @@ func (s *Server) jsConsumerCreateRequest(sub *subscription, c *client, a *Accoun // it back to whatever the current configured value is. o.mu.RLock() req.Config.PauseUntil = o.cfg.PauseUntil + // If a durable sourcing consumer is used, we need to reset the deliver policy. + if req.Config.Sourcing && req.Config.Durable != _EMPTY_ { + req.Config.DeliverPolicy = o.cfg.DeliverPolicy + req.Config.OptStartSeq = o.cfg.OptStartSeq + req.Config.OptStartTime = o.cfg.OptStartTime + } o.mu.RUnlock() } @@ -5079,7 +4914,7 @@ func (s *Server) jsConsumerInfoRequest(sub *subscription, c *client, _ *Account, groupCreated := meta.Created() js.mu.RLock() - isLeader, sa, ca := cc.isLeader(), js.streamAssignment(acc.Name, streamName), js.consumerAssignment(acc.Name, streamName, consumerName) + isLeader, sa, ca := cc.isLeader(), js.streamAssignmentOrInflight(acc.Name, streamName), js.consumerAssignmentOrInflight(acc.Name, streamName, consumerName) var rg *raftGroup var offline, isMember bool if ca != nil { @@ -5404,8 +5239,11 @@ func (s *Server) jsConsumerPauseRequest(sub *subscription, c *client, _ *Account return } - nca := *ca + nca := ca.clone() + // We're only holding the read lock and release below, + // we need a copy to prevent concurrent reads/writes. ncfg := *ca.Config + ncfg.Metadata = maps.Clone(ncfg.Metadata) nca.Config = &ncfg meta := cc.meta js.mu.RUnlock() @@ -5420,7 +5258,7 @@ func (s *Server) jsConsumerPauseRequest(sub *subscription, c *client, _ *Account // Only PauseUntil is updated above, so reuse config for both. setStaticConsumerMetadata(nca.Config) - eca := encodeAddConsumerAssignment(&nca) + eca := encodeAddConsumerAssignment(nca) meta.Propose(eca) resp.PauseUntil = pauseUTC @@ -5453,7 +5291,13 @@ func (s *Server) jsConsumerPauseRequest(sub *subscription, c *client, _ *Account return } + // We're only holding the read lock and release below, + // we need a copy to prevent concurrent reads/writes. + obs.mu.RLock() ncfg := obs.cfg + ncfg.Metadata = maps.Clone(ncfg.Metadata) + obs.mu.RUnlock() + pauseUTC := req.PauseUntil.UTC() if !pauseUTC.IsZero() { ncfg.PauseUntil = &pauseUTC diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_batching.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_batching.go index a8793e5462..4b04241fad 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_batching.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_batching.go @@ -21,6 +21,7 @@ import ( "math/big" "path/filepath" "slices" + "strconv" "strings" "sync" "sync/atomic" @@ -29,60 +30,105 @@ import ( var ( // Tracks the total inflight batches, across all streams and accounts that enable batching. - globalInflightBatches atomic.Int32 + globalInflightAtomicBatches atomic.Int64 + globalInflightFastBatches atomic.Int64 ) type batching struct { - mu sync.Mutex - group map[string]*batchGroup + mu sync.Mutex + atomic map[string]*atomicBatch + fast map[string]*fastBatch } -type batchGroup struct { - lseq uint64 - store StreamStore - timer *time.Timer +type atomicBatch struct { + timer *time.Timer // Inactivity timer for the batch. + lseq uint64 // The highest sequence for this batch. 
+ store StreamStore // Where the batch is staged before committing. } +type fastBatch struct { + timer *time.Timer // Inactivity timer for the batch. + lseq uint64 // The highest sequence for this batch. + sseq uint64 // Last persisted stream sequence. + pseq uint64 // Last persisted batch sequence (is always lower or equal to lseq). + fseq uint64 // Sequence of when we last sent a flow message (is always lower or equal to pseq). + pending uint32 // Number of pending messages in the batch waiting to be persisted. + ackMessages uint16 // Ack will be sent every N messages. + maxAckMessages uint16 // Maximum ackMessages value the client allows. + reply string // The last reply subject seen when persisting a message. + gapOk bool // Whether a gap is okay, if not, the batch would be rejected. + commit bool // If the batch is committed. +} + +// newAtomicBatch creates an atomic batch publish object. // Lock should be held. -func (batches *batching) newBatchGroup(mset *stream, batchId string) (*batchGroup, error) { - store, err := newBatchStore(mset, batchId) +func (batches *batching) newAtomicBatch(mset *stream, batchId string, replicas int, storage StorageType, storeDir, streamName string) (*atomicBatch, error) { + store, err := newBatchStore(mset, batchId, replicas, storage, storeDir, streamName) if err != nil { return nil, err } - b := &batchGroup{store: store} + b := &atomicBatch{store: store} + b.setupCleanupTimer(mset, batchId, batches) + return b, nil +} +// setupCleanupTimer sets up a timer to clean up the batch after a timeout. +func (b *atomicBatch) setupCleanupTimer(mset *stream, batchId string, batches *batching) { // Create a timer to clean up after timeout. - timeout := streamMaxBatchTimeout - if maxBatchTimeout := mset.srv.getOpts().JetStreamLimits.MaxBatchTimeout; maxBatchTimeout > 0 { - timeout = maxBatchTimeout - } + timeout := getCleanupTimeout(mset) b.timer = time.AfterFunc(timeout, func() { b.cleanup(batchId, batches) mset.sendStreamBatchAbandonedAdvisory(batchId, BatchTimeout) }) - return b, nil } -func getBatchStoreDir(mset *stream, batchId string) (string, string) { - mset.mu.RLock() - jsa, name := mset.jsa, mset.cfg.Name - mset.mu.RUnlock() +// resetCleanupTimer resets the cleanup timer, allowing to extend the lifetime of the batch. +// Returns whether the timer was reset without it having expired before. +func (b *atomicBatch) resetCleanupTimer(mset *stream) bool { + timeout := getCleanupTimeout(mset) + return b.timer.Reset(timeout) +} - jsa.mu.RLock() - sd := jsa.storeDir - jsa.mu.RUnlock() +// cleanup deletes underlying resources associated with the batch and unregisters it from the stream's batches. +func (b *atomicBatch) cleanup(batchId string, batches *batching) { + batches.mu.Lock() + defer batches.mu.Unlock() + b.cleanupLocked(batchId, batches) +} - bname := getHash(batchId) - return bname, filepath.Join(sd, streamsDir, name, batchesDir, bname) +// Lock should be held. +func (b *atomicBatch) cleanupLocked(batchId string, batches *batching) { + if b.timer == nil { + return + } + globalInflightAtomicBatches.Add(-1) + b.timer.Stop() + b.store.Delete(true) + delete(batches.atomic, batchId) + // Reset so that another invocation doesn't double-account. + b.timer = nil } -func newBatchStore(mset *stream, batchId string) (StreamStore, error) { - mset.mu.RLock() - replicas, storage := mset.cfg.Replicas, mset.cfg.Storage - mset.mu.RUnlock() +// Lock should be held. 
+func (b *atomicBatch) stopLocked() { + if b.timer == nil { + return + } + globalInflightAtomicBatches.Add(-1) + b.timer.Stop() + b.store.Stop() + // Reset so that another invocation doesn't double-account. + b.timer = nil +} +func getBatchStoreDir(storeDir, streamName, batchId string) (string, string) { + bname := getHash(batchId) + return bname, filepath.Join(storeDir, streamsDir, streamName, batchesDir, bname) +} + +func newBatchStore(mset *stream, batchId string, replicas int, storage StorageType, storeDir, streamName string) (StreamStore, error) { if replicas == 1 && storage == FileStorage { - bname, storeDir := getBatchStoreDir(mset, batchId) + bname, storeDir := getBatchStoreDir(storeDir, streamName, batchId) fcfg := FileStoreConfig{AsyncFlush: true, BlockSize: defaultLargeBlockSize, StoreDir: storeDir} s := mset.srv prf := s.jsKeyGen(s.getOpts().JetStreamKey, mset.acc.Name) @@ -101,34 +147,264 @@ func newBatchStore(mset *stream, batchId string) (StreamStore, error) { // If the timer has already cleaned up the batch, we can't commit. // Otherwise, we ensure the timer does not clean up the batch in the meantime. // Lock should be held. -func (b *batchGroup) readyForCommit() bool { +func (b *atomicBatch) readyForCommit() *BatchAbandonReason { if !b.timer.Stop() { + return &BatchTimeout + } + if b.store.FlushAllPending() != nil { + return &BatchIncomplete + } + return nil +} + +// newFastBatch creates a fast batch publish object and registers it in batches.fast. +// Lock should be held. +func (batches *batching) newFastBatch(mset *stream, batchId string, gapOk bool, maxAckMessages uint16) *fastBatch { + b := &fastBatch{gapOk: gapOk, maxAckMessages: maxAckMessages} + if batches.fast == nil { + batches.fast = make(map[string]*fastBatch, 1) + } + batches.fast[batchId] = b + batches.fastBatchInit(b) + b.setupCleanupTimer(mset, batchId, batches) + return b +} + +// fastBatchInit (re)initializes the ackMessages field for a fast batch. +// The batch must already be registered in batches.fast. +// Lock should be held. +func (batches *batching) fastBatchInit(b *fastBatch) { + // If it's the only batch, just allow what the client wants, otherwise we'll + // need to coordinate and slowly ramp up this publisher. + // TODO(mvv): fast ingest's initial flow value improvements? + ackMessages := min(500, b.maxAckMessages) + if len(batches.fast) > 1 { + ackMessages = 1 + } + b.ackMessages = ackMessages +} + +// fastBatchReset resets the fast batch to an empty state and sends a flow control message. +// Lock should be held. +func (batches *batching) fastBatchReset(mset *stream, batchId string, b *fastBatch) { + // If the timer already stopped before we could commit, we clean it up. + if b.timer == nil || (!b.commit && !b.timer.Stop()) { + b.cleanupLocked(batchId, batches) + return + } + // Otherwise, reset the state. + batches.fastBatchInit(b) + b.timer.Reset(getCleanupTimeout(mset)) + b.commit = false + b.pending = 0 + b.fseq, b.lseq = b.pseq, b.pseq + b.sendFlowControl(b.fseq, mset, b.reply) +} + +// fastBatchRegisterSequences registers the highest stored batch and stream sequence and returns +// whether a PubAck should be sent if the batch has been committed. +// If this is called on a follower, it only registers the highest stream and persisted batch sequences. +// Lock should be held. 
+func (batches *batching) fastBatchRegisterSequences(mset *stream, reply string, streamSeq uint64, isLeader bool, batch *FastBatch) bool { + b, ok := batches.fast[batch.id] + if !ok || !isLeader { + // If this batch has committed, we can clean it up. + if batch.commit { + if b != nil { + b.cleanupLocked(batch.id, batches) + } + return false + } + // Otherwise, even as a follower, we record the latest state of this batch. + if b == nil || !b.resetCleanupTimer(mset) { + if b != nil { + // The timer couldn't be reset, this means the timer already runs and is likely + // waiting to acquire the lock. We reset the timer here so it doesn't clean up + // this batch that we're about to overwrite. + b.timer = nil + } else { + // If this is a new batch for us, even though we're a follower, we still need + // to account toward the global inflight limit. + globalInflightFastBatches.Add(1) + } + // We'll need a copy as we'll use it as a key and later for cleanup. + batchId := copyString(batch.id) + b = batches.newFastBatch(mset, batchId, batch.gapOk, batch.flow) + } + b.sseq = streamSeq + b.pseq, b.lseq = batch.seq, batch.seq + b.reply = reply return false } - b.store.FlushAllPending() + b.reply = reply + if b.pending > 0 { + b.pending-- + } + b.sseq = streamSeq + // Store last persisted batch sequence. + // If we have no remaining pending writes, we might have had duplicate messages + // and need to send additional flow control messages. + var skipped bool + if b.pending == 0 { + skipped = true + b.pseq = b.lseq + } else { + b.pseq = batch.seq + } + // If the PubAck needs to be sent now as a result of a commit. + if b.lseq == b.pseq && b.commit { + b.cleanupLocked(batch.id, batches) + // If we skipped ahead due to duplicate messages, send the PubAck with the highest sequence. + if skipped { + var buf [256]byte + pubAck := append(buf[:0], mset.pubAck...) + response := append(pubAck, strconv.FormatUint(b.sseq, 10)...) + response = append(response, fmt.Sprintf(",\"batch\":%q,\"count\":%d}", batch.id, b.lseq)...) + if len(reply) > 0 { + mset.outq.sendMsg(reply, response) + } + return false + } + return true + } + b.checkFlowControl(mset, reply, batches) + return false +} + +// checkFlowControl checks whether a flow control message should be sent. +// If so, it updates the flow values to speed up or slow down the publisher if needed. +// Returns whether a flow control message was sent. +// Lock should be held. +func (b *fastBatch) checkFlowControl(mset *stream, reply string, batches *batching) bool { + am := uint64(b.ackMessages) + if b.pseq < b.fseq+am { + return false + } + // Instead of sending multiple flow control messages, skip ahead to only send the last. + steps := (b.pseq - b.fseq) / am + b.fseq += steps * am + + // TODO(mvv): fast ingest's dynamic flow value improvements? + // This is currently just a simple value to have a working version. Should take average + // message sizes into account and compare how much this client is contributing to the + // ingest IPQ total size and messages and have publishers share based on that. + maxAckMessages := uint16(500 / len(batches.fast)) + if maxAckMessages < 1 { + maxAckMessages = 1 + } + // Limit to the client's allowed maximum. + if maxAckMessages > b.maxAckMessages { + maxAckMessages = b.maxAckMessages + } + + if b.ackMessages < maxAckMessages { + // Ramp up. + b.ackMessages *= 2 + if b.ackMessages > maxAckMessages { + b.ackMessages = maxAckMessages + } + } else if b.ackMessages > maxAckMessages { + // Slow down. 
+ b.ackMessages /= 2 + if b.ackMessages <= maxAckMessages { + b.ackMessages = maxAckMessages + } + } + + // Finally, send the flow control message. + b.sendFlowControl(b.fseq, mset, reply) return true } +// sendFlowControl sends a fast batch flow control message for the current highest sequence. +// Lock should be held. +func (b *fastBatch) sendFlowControl(batchSeq uint64, mset *stream, reply string) { + if len(reply) == 0 { + return + } + response, _ := BatchFlowAck{Sequence: batchSeq, Messages: b.ackMessages}.MarshalJSON() + mset.outq.sendMsg(reply, response) +} + +// fastBatchCommit ends the batch and commits the data up to that point. If all messages +// have already been persisted, a PubAck is sent immediately. Otherwise, it will be sent +// after the last message has been persisted. +// Lock should be held. +func (batches *batching) fastBatchCommit(b *fastBatch, batchId string, mset *stream, reply string) bool { + // Either we commit now, or we clean up later, so stop the timer. + if b.timer == nil || (!b.commit && !b.timer.Stop()) { + // Shouldn't be possible for the timer to already be stopped if we haven't committed yet, + // since we pre-check being able to reset the timer. But guard against it anyhow. + return true + } + // Mark that this batch commits. + b.commit = true + // If the whole batch has been persisted, we can respond with the PubAck now. + if b.lseq == b.pseq { + b.cleanupLocked(batchId, batches) + var buf [256]byte + pubAck := append(buf[:0], mset.pubAck...) + response := append(pubAck, strconv.FormatUint(b.sseq, 10)...) + response = append(response, fmt.Sprintf(",\"batch\":%q,\"count\":%d}", batchId, b.lseq)...) + if len(reply) > 0 { + mset.outq.sendMsg(reply, response) + } + return true + } + // Otherwise, we need to wait and the PubAck will be sent when the last message is persisted. + return false +} + +// setupCleanupTimer sets up a timer to clean up the batch after a timeout. +func (b *fastBatch) setupCleanupTimer(mset *stream, batchId string, batches *batching) { + // Create a timer to clean up after timeout. + timeout := getCleanupTimeout(mset) + b.timer = time.AfterFunc(timeout, func() { + b.cleanup(batchId, batches) + }) +} + +// resetCleanupTimer resets the cleanup timer, allowing to extend the lifetime of the batch. +// Returns whether the timer was reset without it having expired before. +func (b *fastBatch) resetCleanupTimer(mset *stream) bool { + if b.commit { + return true + } + if b.timer == nil { + return false + } + timeout := getCleanupTimeout(mset) + return b.timer.Reset(timeout) +} + // cleanup deletes underlying resources associated with the batch and unregisters it from the stream's batches. -func (b *batchGroup) cleanup(batchId string, batches *batching) { +func (b *fastBatch) cleanup(batchId string, batches *batching) { batches.mu.Lock() defer batches.mu.Unlock() b.cleanupLocked(batchId, batches) } // Lock should be held. -func (b *batchGroup) cleanupLocked(batchId string, batches *batching) { - globalInflightBatches.Add(-1) +func (b *fastBatch) cleanupLocked(batchId string, batches *batching) { + // If the timer is nil, it means this batch has been replaced with a new one. + // This can happen on a follower depending on timing. + if b.timer == nil { + return + } + globalInflightFastBatches.Add(-1) b.timer.Stop() - b.store.Delete(true) - delete(batches.group, batchId) + delete(batches.fast, batchId) + // Reset so that another invocation doesn't double-account. + b.timer = nil } -// Lock should be held. 
-func (b *batchGroup) stopLocked() { - globalInflightBatches.Add(-1) - b.timer.Stop() - b.store.Stop() +// getCleanupTimeout returns the timeout for the batch, taking into account the server's limits. +func getCleanupTimeout(mset *stream) time.Duration { + timeout := streamMaxBatchTimeout + if maxBatchTimeout := mset.srv.getOpts().JetStreamLimits.MaxBatchTimeout; maxBatchTimeout > 0 { + timeout = maxBatchTimeout + } + return timeout } // batchStagedDiff stages all changes for consistency checks until commit. @@ -136,6 +412,7 @@ type batchStagedDiff struct { msgIds map[string]struct{} counter map[string]*msgCounterRunningTotal inflight map[string]*inflightSubjectRunningTotal + inflightTransform map[uint64]string expectedPerSubject map[string]*batchExpectedPerSubject } @@ -180,6 +457,16 @@ func (diff *batchStagedDiff) commit(mset *stream) { } } + // Track inflight subject transforms. + if len(diff.inflightTransform) > 0 { + if mset.inflightTransform == nil { + mset.inflightTransform = make(map[uint64]string, len(diff.inflightTransform)) + } + for clseq, subj := range diff.inflightTransform { + mset.inflightTransform[clseq] = subj + } + } + // Track sequence and subject. if len(diff.expectedPerSubject) > 0 { if mset.expectedPerSubjectSequence == nil { @@ -238,7 +525,7 @@ func (batch *batchApply) rejectBatchState(mset *stream) { // mset.mu lock must NOT be held or used. // mset.clMu lock must be held. func checkMsgHeadersPreClusteredProposal( - diff *batchStagedDiff, mset *stream, subject string, hdr []byte, msg []byte, sourced bool, name string, + diff *batchStagedDiff, mset *stream, subject, rsubject string, hdr []byte, msg []byte, sourced bool, name string, jsa *jsAccount, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules bool, discard DiscardPolicy, discardNewPer bool, maxMsgSize int, maxMsgs int64, maxMsgsPer int64, maxBytes int64, ) ([]byte, []byte, uint64, *ApiError, error) { @@ -515,8 +802,9 @@ func checkMsgHeadersPreClusteredProposal( } // Message scheduling. - if schedule, ok := getMessageSchedule(hdr); !ok { - apiErr := NewJSMessageSchedulesPatternInvalidError() + if sourced { + // noop, sourced messages were already validated by the origin stream. 
+ } else if schedule, apiErr := getMessageSchedule(hdr); apiErr != nil { if !allowMsgSchedules { apiErr = NewJSMessageSchedulesDisabledError() } @@ -528,22 +816,40 @@ func checkMsgHeadersPreClusteredProposal( } else if scheduleTtl, ok := getMessageScheduleTTL(hdr); !ok { apiErr := NewJSMessageSchedulesTTLInvalidError() return hdr, msg, 0, apiErr, apiErr + } else if scheduleRollup := getMessageScheduleRollup(hdr); scheduleRollup != _EMPTY_ && scheduleRollup != JSMsgRollupSubject { + apiErr := NewJSMessageSchedulesRollupInvalidError() + return hdr, msg, 0, apiErr, apiErr } else if scheduleTtl != _EMPTY_ && !allowTTL { return hdr, msg, 0, NewJSMessageTTLDisabledError(), errMsgTTLDisabled } else if scheduleTarget := getMessageScheduleTarget(hdr); scheduleTarget == _EMPTY_ || - !IsValidPublishSubject(scheduleTarget) || SubjectsCollide(scheduleTarget, subject) { + !IsValidPublishSubject(scheduleTarget) || scheduleTarget == subject { apiErr := NewJSMessageSchedulesTargetInvalidError() return hdr, msg, 0, apiErr, apiErr + } else if scheduleSource := getMessageScheduleSource(hdr); scheduleSource != _EMPTY_ && + (scheduleSource == scheduleTarget || scheduleSource == subject || !IsValidPublishSubject(scheduleSource)) { + apiErr := NewJSMessageSchedulesSourceInvalidError() + return hdr, msg, 0, apiErr, apiErr } else { mset.cfgMu.RLock() match := slices.ContainsFunc(mset.cfg.Subjects, func(subj string) bool { return SubjectsCollide(subj, scheduleTarget) }) - mset.cfgMu.RUnlock() if !match { + mset.cfgMu.RUnlock() apiErr := NewJSMessageSchedulesTargetInvalidError() return hdr, msg, 0, apiErr, apiErr } + if scheduleSource != _EMPTY_ { + match = slices.ContainsFunc(mset.cfg.Subjects, func(subj string) bool { + return SubjectsCollide(subj, scheduleSource) + }) + if !match { + mset.cfgMu.RUnlock() + apiErr := NewJSMessageSchedulesSourceInvalidError() + return hdr, msg, 0, apiErr, apiErr + } + } + mset.cfgMu.RUnlock() // Add a rollup sub header if it doesn't already exist. // Otherwise, it must exist already as a rollup on the subject. @@ -555,10 +861,32 @@ func checkMsgHeadersPreClusteredProposal( } } } + if scheduleNext := sliceHeader(JSScheduleNext, hdr); len(scheduleNext) > 0 && !sourced { + // Clients may only use Nats-Schedule-Next to purge a schedule. + if bytesToString(scheduleNext) != JSScheduleNextPurge { + apiErr := NewJSMessageSchedulesSchedulerInvalidError() + return hdr, msg, 0, apiErr, apiErr + } + // Nats-Scheduler must accompany the purge and: + // - it must NOT be empty. + // - it must NOT match the publish subject. + if scheduler := sliceHeader(JSScheduler, hdr); len(scheduler) == 0 || + bytesToString(scheduler) == subject || !IsValidPublishSubject(bytesToString(scheduler)) { + apiErr := NewJSMessageSchedulesSchedulerInvalidError() + return hdr, msg, 0, apiErr, apiErr + } else if !allowMsgSchedules { + apiErr := NewJSMessageSchedulesDisabledError() + return hdr, msg, 0, apiErr, apiErr + } + } else if !sourced && len(sliceHeader(JSScheduler, hdr)) > 0 { + // Clients may only use Nats-Scheduler alongside Nats-Schedule-Next. + apiErr := NewJSMessageSchedulesSchedulerInvalidError() + return hdr, msg, 0, apiErr, apiErr + } // Check for any rollups. if rollup := getRollup(hdr); rollup != _EMPTY_ { - if !allowRollup || denyPurge { + if (!allowRollup || denyPurge) && !sourced { err := errors.New("rollup not permitted") return hdr, msg, 0, NewJSStreamRollupFailedError(err), err } @@ -607,6 +935,19 @@ func checkMsgHeadersPreClusteredProposal( diff.inflight[subject] = i } + // Subject transform. 
+ if subject != rsubject { + // The 'subject' is a transformed subject used for consistency checks. + // But since we propose the original (raw) subject to our peers, we need + // to store the transformed subject separately for when we apply. + // TODO(mvv): since subject transforms are handled by each replica individually, this has a + // potential for desync given out-of-order stream subject transform updates. + if diff.inflightTransform == nil { + diff.inflightTransform = make(map[uint64]string, 1) + } + diff.inflightTransform[mset.clseq] = subject + } + // Check if we have discard new with max msgs or bytes. // We need to deny here otherwise we'd need to bump CLFS, and it could succeed on some // peers and not others depending on consumer ack state (if interest policy). @@ -639,7 +980,8 @@ func checkMsgHeadersPreClusteredProposal( } // Similarly, check DiscardNew per-subject threshold to not need to bump CLFS. - if discardNewPer && maxMsgsPer > 0 { + // Allow rollup messages through since they will purge after storing. + if discardNewPer && maxMsgsPer > 0 && len(sliceHeader(JSMsgRollup, hdr)) == 0 { // Get the current total for this subject. totalMsgsForSubject := mset.store.SubjectsTotals(subject)[subject] // Add inflight count in this batch and for this stream. @@ -656,3 +998,68 @@ func checkMsgHeadersPreClusteredProposal( return hdr, msg, 0, nil, nil } + +// recalculateClusteredSeq initializes or updates mset.clseq, for example after a leader change. +// This is reused for normal clustered publishing into a stream, and for atomic and fast batch publishing. +// mset.clMu lock must be held. +func recalculateClusteredSeq(mset *stream, needStreamLock bool) (lseq uint64) { + // Need to unlock and re-acquire the locks in the proper order. + mset.clMu.Unlock() + // Locking order is stream -> batchMu -> clMu + if needStreamLock { + mset.mu.RLock() + } + batch := mset.batchApply + var batchCount uint64 + if batch != nil { + batch.mu.Lock() + batchCount = batch.count + } + mset.clMu.Lock() + // Re-capture + lseq = mset.lseq + mset.clseq = lseq + mset.clfs + batchCount + // Keep hold of the mset.clMu, but unlock the others. + if batch != nil { + batch.mu.Unlock() + } + if needStreamLock { + mset.mu.RUnlock() + } + return lseq +} + +// commitSingleMsg commits and proposes a single message to the node. +// This is reused both for normal publishing into a stream, and for fast batch publishing. +// mset.clMu lock must be held. +func commitSingleMsg( + diff *batchStagedDiff, mset *stream, subject string, reply string, hdr []byte, msg []byte, name string, + jsa *jsAccount, mt *msgTrace, node RaftNode, replicas int, lseq uint64, +) error { + // Do proposal. + esm := encodeStreamMsgAllowCompress(subject, reply, hdr, msg, mset.clseq, time.Now().UnixNano(), false) + if err := node.Propose(esm); err != nil { + return err + } + + var mtKey uint64 + if mt != nil { + mtKey = mset.clseq + if mset.mt == nil { + mset.mt = make(map[uint64]*msgTrace) + } + mset.mt[mtKey] = mt + } + + diff.commit(mset) + mset.clseq++ + mset.trackReplicationTraffic(node, len(esm), replicas) + + // Check to see if we are being overrun. + // TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured. 
+ if mset.clseq-(lseq+mset.clfs) > streamLagWarnThreshold { + lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name) + mset.srv.RateLimitWarnf("%s", lerr.Error()) + } + return nil +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go index ea524abe7e..d406c7f1a2 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_cluster.go @@ -36,7 +36,6 @@ import ( "github.com/antithesishq/antithesis-sdk-go/assert" "github.com/klauspost/compress/s2" - "github.com/minio/highwayhash" "github.com/nats-io/nuid" ) @@ -54,6 +53,10 @@ type jetStreamCluster struct { // a response but they need to be same group, peers etc. and sync subjects. inflightStreams map[string]map[string]*inflightStreamInfo inflightConsumers map[string]map[string]map[string]*inflightConsumerInfo + // Tracks raft groups currently being started by createRaftGroup, so that + // concurrent callers for the same group can wait without holding js.mu + // across the disk I/O performed during startup. + creatingRaftGroups map[string]chan struct{} // Holds a map of a peer ID to the reply subject, to only respond after gaining // quorum on the peer-remove action. peerRemoveReply map[string]peerRemoveInfo @@ -145,6 +148,8 @@ const ( // Batch stream ops. batchMsgOp batchCommitMsgOp + // Consumer rest to specific starting sequence. + resetSeqOp ) // raftGroups are controlled by the metagroup controller. @@ -173,7 +178,7 @@ type streamAssignment struct { Restore *StreamState `json:"restore_state,omitempty"` // Internal consumers map[string]*consumerAssignment - responded bool + responded atomic.Bool // copied via clone() to satisfy go vet's noCopy check recovering bool reassigning bool // i.e. due to placement issues, lack of resources, etc. resetting bool // i.e. there was an error, and we're stopping and starting the stream @@ -181,6 +186,45 @@ type streamAssignment struct { unsupported *unsupportedStreamAssignment } +func (sa *streamAssignment) hasResponded() bool { + return sa.responded.Load() +} + +// markResponded sets the responded flag and returns the prior value. +func (sa *streamAssignment) markResponded() bool { + return sa.responded.Swap(true) +} + +func (sa *streamAssignment) clearResponded() { + sa.responded.Store(false) +} + +// clone returns a copy of sa. Field-explicit (rather than `*sa`) and +// pointer-returning so the embedded atomic.Bool isn't value-copied; +// responded is transferred via Load/Store. Concurrent callers may write +// responded via markResponded/clearResponded without holding js.mu. 
+func (sa *streamAssignment) clone() *streamAssignment { + csa := &streamAssignment{ + Client: sa.Client, + Created: sa.Created, + ConfigJSON: sa.ConfigJSON, + Config: sa.Config, + Group: sa.Group, + Sync: sa.Sync, + Subject: sa.Subject, + Reply: sa.Reply, + Restore: sa.Restore, + consumers: sa.consumers, + recovering: sa.recovering, + reassigning: sa.reassigning, + resetting: sa.resetting, + err: sa.err, + unsupported: sa.unsupported, + } + csa.responded.Store(sa.responded.Load()) + return csa +} + type unsupportedStreamAssignment struct { reason string info StreamInfo @@ -257,12 +301,49 @@ type consumerAssignment struct { Reply string `json:"reply,omitempty"` State *ConsumerState `json:"state,omitempty"` // Internal - responded bool + responded atomic.Bool // copied via clone() to satisfy go vet's noCopy check recovering bool err error unsupported *unsupportedConsumerAssignment } +func (ca *consumerAssignment) hasResponded() bool { + return ca.responded.Load() +} + +// markResponded sets the responded flag and returns the prior value. +func (ca *consumerAssignment) markResponded() bool { + return ca.responded.Swap(true) +} + +func (ca *consumerAssignment) clearResponded() { + ca.responded.Store(false) +} + +// clone returns a copy of ca. Field-explicit (rather than `*ca`) and +// pointer-returning so the embedded atomic.Bool isn't value-copied; +// responded is transferred via Load/Store. Concurrent callers may write +// responded via markResponded/clearResponded without holding js.mu. +func (ca *consumerAssignment) clone() *consumerAssignment { + cca := &consumerAssignment{ + Client: ca.Client, + Created: ca.Created, + Name: ca.Name, + Stream: ca.Stream, + ConfigJSON: ca.ConfigJSON, + Config: ca.Config, + Group: ca.Group, + Subject: ca.Subject, + Reply: ca.Reply, + State: ca.State, + recovering: ca.recovering, + err: ca.err, + unsupported: ca.unsupported, + } + cca.responded.Store(ca.responded.Load()) + return cca +} + type unsupportedConsumerAssignment struct { reason string info ConsumerInfo @@ -646,6 +727,11 @@ func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) error { mset.cfgMu.RLock() replicas := mset.cfg.Replicas mset.cfgMu.RUnlock() + var nrgWerr error + if node != nil { + nrgWerr = node.GetWriteErr() + } + streamWerr := mset.getWriteErr() switch { case replicas <= 1: return nil // No further checks for R=1 streams @@ -661,6 +747,12 @@ func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) error { s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName) return errors.New("cluster node skew detected") + case nrgWerr != nil: + return fmt.Errorf("node write error: %v", nrgWerr) + + case streamWerr != nil: + return fmt.Errorf("stream write error: %v", streamWerr) + case !mset.isMonitorRunning(): return errors.New("monitor goroutine not running") @@ -2241,7 +2333,7 @@ func (js *jetStream) collectStreamAndConsumerChanges(c RaftNodeCheckpoint, strea func (js *jetStream) setStreamAssignmentRecovering(sa *streamAssignment) { js.mu.Lock() defer js.mu.Unlock() - sa.responded = true + sa.markResponded() sa.recovering = true sa.Restore = nil if sa.Group != nil { @@ -2254,7 +2346,7 @@ func (js *jetStream) setStreamAssignmentRecovering(sa *streamAssignment) { func (js *jetStream) setConsumerAssignmentRecovering(ca *consumerAssignment) { js.mu.Lock() defer js.mu.Unlock() - ca.responded = true + ca.markResponded() ca.recovering = true if ca.Group != nil { ca.Group.Preferred = _EMPTY_ @@ -2265,19 +2357,19 @@ func (js *jetStream) 
setConsumerAssignmentRecovering(ca *consumerAssignment) { // Just copies over and changes out the group so it can be encoded. // Lock should be held. func (sa *streamAssignment) copyGroup() *streamAssignment { - csa, cg := *sa, *sa.Group + csa, cg := sa.clone(), *sa.Group csa.Group = &cg csa.Group.Peers = copyStrings(sa.Group.Peers) - return &csa + return csa } // Just copies over and changes out the group so it can be encoded. // Lock should be held. func (ca *consumerAssignment) copyGroup() *consumerAssignment { - cca, cg := *ca, *ca.Group + cca, cg := ca.clone(), *ca.Group cca.Group = &cg cca.Group.Peers = copyStrings(ca.Group.Peers) - return &cca + return cca } // Lock should be held. @@ -2659,9 +2751,11 @@ func (rg *raftGroup) setPreferred(s *Server) { // createRaftGroup is called to spin up this raft group if needed. func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, recovering bool, storage StorageType, labels pprofLabels) (RaftNode, error) { - // Must hold JS lock throughout, otherwise two parallel calls for the same raft group could result - // in duplicate instances for the same identifier, if the current Raft node is shutting down. - // We can release the lock temporarily while waiting for the Raft node to shut down. + // js.mu protects the lookup/registration of raft groups so that two parallel + // calls for the same identifier can't end up creating duplicate instances. + // It is released around blocking work (waiting for a previous instance to + // shut down, and the disk I/O inside startRaftNode); concurrent callers for + // the same rg.Name are gated through cc.creatingRaftGroups instead. js.mu.Lock() defer js.mu.Unlock() @@ -2676,8 +2770,21 @@ func (js *jetStream) createRaftGroup(accName string, rg *raftGroup, recovering b return nil, nil } - // Check if we already have this assigned. retry: + // If another goroutine is mid-creation for this raft group, wait for it + // to finish (without holding js.mu) and then retry the lookup. + if ch, ok := cc.creatingRaftGroups[rg.Name]; ok { + js.mu.Unlock() + <-ch + js.mu.Lock() + // js.cluster could have been swapped out (shutdown). + if js.cluster == nil || js.cluster.meta == nil { + return nil, NewJSClusterNotActiveError() + } + cc = js.cluster + goto retry + } + // Check if we already have this assigned. if node := s.lookupRaftNode(rg.Name); node != nil { if node.State() == Closed { // We're waiting for this node to finish shutting down before we replace it. @@ -2734,7 +2841,7 @@ retry: // Check here to see if we have a max HA Assets limit set. if maxHaAssets := s.getOpts().JetStreamLimits.MaxHAAssets; maxHaAssets > 0 { - if s.numRaftNodes() > maxHaAssets { + if s.numRaftNodes()+len(cc.creatingRaftGroups) > maxHaAssets { s.Warnf("Maximum HA Assets limit reached: %d", maxHaAssets) // Since the meta leader assigned this, send a statsz update to them to get them up to date. go s.sendStatszUpdate() @@ -2742,46 +2849,71 @@ retry: } } + // Register an in-flight sentinel so concurrent callers for the same group + // will wait for us. Then drop js.mu around all the blocking work below + // (file store creation, peer state read, snapshot replay, fsyncs) so we + // don't serialize every stream/consumer assignment behind one disk fsync. + if cc.creatingRaftGroups == nil { + cc.creatingRaftGroups = make(map[string]chan struct{}) + } + doneCh := make(chan struct{}) + cc.creatingRaftGroups[rg.Name] = doneCh + + // Snapshot rg fields; we drop js.mu below and rg is shared. 
+ rgName, rgScaleUp := rg.Name, rg.ScaleUp + rgPeers := copyStrings(rg.Peers) storeDir := filepath.Join(js.config.StoreDir, sysAcc.Name, defaultStoreDirName, rg.Name) - var store StreamStore - if storage == FileStorage { - // If the server is set to sync always, do the same for the Raft log. - js.srv.optsMu.RLock() - syncAlways := js.srv.opts.SyncAlways - syncInterval := js.srv.opts.SyncInterval - js.srv.optsMu.RUnlock() - fs, err := newFileStoreWithCreated( - FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncAlways: syncAlways, SyncInterval: syncInterval, srv: s}, - StreamConfig{Name: rg.Name, Storage: FileStorage, Metadata: labels}, - time.Now().UTC(), - s.jsKeyGen(s.getOpts().JetStreamKey, rg.Name), - s.jsKeyGen(s.getOpts().JetStreamOldKey, rg.Name), - ) - if err != nil { - s.Errorf("Error creating filestore WAL: %v", err) - return nil, err - } - store = fs - } else { - ms, err := newMemStore(&StreamConfig{Name: rg.Name, Storage: MemoryStorage}) - if err != nil { - s.Errorf("Error creating memstore WAL: %v", err) - return nil, err + js.mu.Unlock() + + n, err := func() (RaftNode, error) { + var store StreamStore + if storage == FileStorage { + opts := s.getOpts() + fs, err := newFileStoreWithCreated( + FileStoreConfig{StoreDir: storeDir, BlockSize: defaultMediumBlockSize, AsyncFlush: false, SyncAlways: opts.SyncAlways, SyncInterval: opts.SyncInterval, srv: s}, + StreamConfig{Name: rgName, Storage: FileStorage, Metadata: labels}, + time.Now().UTC(), + s.jsKeyGen(opts.JetStreamKey, rgName), + s.jsKeyGen(opts.JetStreamOldKey, rgName), + ) + if err != nil { + s.Errorf("Error creating filestore WAL: %v", err) + return nil, err + } + store = fs + } else { + ms, err := newMemStore(&StreamConfig{Name: rgName, Storage: MemoryStorage}) + if err != nil { + s.Errorf("Error creating memstore WAL: %v", err) + return nil, err + } + store = ms } - store = ms - } - cfg := &RaftConfig{Name: rg.Name, Store: storeDir, Log: store, Track: true, Recovering: recovering, ScaleUp: rg.ScaleUp} + cfg := &RaftConfig{Name: rgName, Store: storeDir, Log: store, Track: true, Recovering: recovering, ScaleUp: rgScaleUp} - if _, err := readPeerState(storeDir); err != nil { - s.bootstrapRaftNode(cfg, rg.Peers, true) - } + if _, err := readPeerState(storeDir); err != nil { + s.bootstrapRaftNode(cfg, rgPeers, true) + } + + n, err := s.startRaftNode(accName, cfg, labels) + if err != nil || n == nil { + s.Debugf("Error creating raft group: %v", err) + return nil, err + } + return n, nil + }() - n, err := s.startRaftNode(accName, cfg, labels) + js.mu.Lock() + delete(cc.creatingRaftGroups, rg.Name) + close(doneCh) if err != nil || n == nil { - s.Debugf("Error creating raft group: %v", err) return nil, err } + if js.cluster == nil || js.cluster.meta == nil { + // Cluster was torn down while we were creating; let n be reaped at shutdown. + return nil, NewJSClusterNotActiveError() + } // Need JS lock to be held for the assignment to avoid data-race reports rg.node = n // See if we are preferred and should start campaign immediately. @@ -2967,61 +3099,117 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps } accName := acc.GetName() - // Used to represent how we can detect a changed state quickly and without representing - // a complete and detailed state which could be costly in terms of memory, cpu and GC. - // This only entails how many messages, and the first and last sequence of the stream. 
- // This is all that is needed to detect a change, and we can get this from FilteredState() - // with an empty filter. - var lastState SimpleState - // Don't allow the upper layer to install snapshots until we have // fully recovered from disk. isRecovering := true - var failedSnapshots int + var ( + snapMu sync.Mutex + snapshotting bool + fallbackSnapshot bool + failedSnapshots int + ) + doSnapshot := func(force bool) { // Suppress during recovery. + if mset == nil || isRecovering || isRestore { + return + } + snapMu.Lock() + defer snapMu.Unlock() // If snapshots have failed, and we're not forced to, we'll wait for the timer since it'll now be forced. - if mset == nil || isRecovering || isRestore || (!force && failedSnapshots > 0) { + if !force && failedSnapshots > 0 { return } - - // Before we actually calculate the detailed state and encode it, let's check the - // simple state to detect any changes. - curState := mset.store.FilteredState(0, _EMPTY_) - - // If the state hasn't changed but the log has gone way over - // the compaction size then we will want to compact anyway. - // This shouldn't happen for streams like it can for pull - // consumers on idle streams but better to be safe than sorry! - ne, nb := n.Size() - if curState == lastState && ne < compactNumMin && nb < compactSizeMin { + // Suppress if an async snapshot is already in progress. + if snapshotting { return } - // Make sure all pending data is flushed before allowing snapshots. - mset.flushAllPending() - // If we had a significant number of failed snapshots, start relaxing Raft-layer checks // to force it through. We might have been catching up a peer for a long period, and this // protects our log size from growing indefinitely. forceSnapshot := failedSnapshots > 4 - if err := n.InstallSnapshot(mset.stateSnapshot(), forceSnapshot); err == nil { - lastState = curState - // If there was a failed snapshot before, we reduced the timer's interval. - // Reset it back to the original interval now. - if failedSnapshots > 0 { - t.Reset(compactInterval + rci) + c, err := n.CreateSnapshotCheckpoint(forceSnapshot) + if err != nil { + if err != errNoSnapAvailable && err != errNodeClosed { + s.RateLimitWarnf("Failed to install snapshot for '%s > %s' [%s]: %v", + mset.acc.Name, mset.name(), n.Group(), err) + // If this is the first failure, reduce the interval of the snapshot timer. + // This ensures we're not waiting too long for snapshotting to eventually become forced. + if failedSnapshots == 0 { + t.Reset(compactMinInterval) + } + failedSnapshots++ + } + return + } + + // Make sure all pending data is flushed before allowing snapshots. + if err := mset.flushAllPending(); err != nil { + // If the pending data couldn't be flushed, we have no safe way to continue. + s.Errorf("Failed to flush pending data for '%s > %s' [%s]: %v", accName, mset.name(), n.Group(), err) + assert.Unreachable("Stream snapshot flush failed", map[string]any{ + "account": accName, + "stream": mset.name(), + "group": n.Group(), + "err": err, + }) + c.Abort() + mset.setWriteErr(err) + n.Stop() + return + } + + snap := mset.stateSnapshot() + + handleInstallResult := func(err error) { + snapshotting = false + if err == nil { + // If there was a failed snapshot before, we reduced the timer's interval. + // Reset it back to the original interval now. 
+ if failedSnapshots > 0 { + t.Reset(compactInterval + rci) + } + failedSnapshots = 0 + fallbackSnapshot = false + } else { + c.Abort() + + if err == errNoSnapAvailable || err == errNodeClosed || err == errCatchupsRunning || err == errSnapAborted { + return + } + + s.RateLimitWarnf("Failed to install snapshot for '%s > %s' [%s]: %v, will fall back to blocking snapshot", + mset.acc.Name, mset.name(), n.Group(), err) + fallbackSnapshot = true + // If this is the first failure, reduce the interval of the snapshot timer. + // This ensures we're not waiting too long for snapshotting to eventually become forced. + if failedSnapshots == 0 { + t.Reset(compactMinInterval) + } + failedSnapshots++ } - failedSnapshots = 0 - } else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning { - s.RateLimitWarnf("Failed to install snapshot for '%s > %s' [%s]: %v", mset.acc.Name, mset.name(), n.Group(), err) - // If this is the first failure, reduce the interval of the snapshot timer. - // This ensures we're not waiting too long for snapshotting to eventually become forced. - if failedSnapshots == 0 { - t.Reset(compactMinInterval) + } + + snapshotting = true + if fallbackSnapshot { + _, err = c.InstallSnapshot(snap) + handleInstallResult(err) + } else { + started := s.startGoRoutine(func() { + defer s.grWG.Done() + + _, err := c.InstallSnapshot(snap) + + snapMu.Lock() + defer snapMu.Unlock() + handleInstallResult(err) + }) + if !started { + snapshotting = false + c.Abort() } - failedSnapshots++ } } @@ -3098,6 +3286,9 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps select { case <-s.quitCh: // Server shutting down, but we might receive this before qch, so try to snapshot. + snapMu.Lock() + fallbackSnapshot = true + snapMu.Unlock() doSnapshot(false) return case <-mqch: @@ -3105,6 +3296,9 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps // Don't snapshot if not shutting down, monitor goroutine could be going away // on a scale down or a remove for example. if s.isShuttingDown() { + snapMu.Lock() + fallbackSnapshot = true + snapMu.Unlock() doSnapshot(false) } return @@ -3173,7 +3367,7 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps aq.recycle(&ces) return } - s.Warnf("Error applying entries to '%s > %s': %v", accName, sa.Config.Name, err) + s.Errorf("Error applying entries to '%s > %s': %v", accName, sa.Config.Name, err) if isClusterResetErr(err) { if mset.isMirror() && mset.IsLeader() { mset.retryMirrorConsumer() @@ -3182,7 +3376,7 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps // If the error signals we timed out of a snapshot, we should try to replay the snapshot // instead of fully resetting the state. Resetting the clustered state may result in // race conditions and should only be used as a last effort attempt. - if errors.Is(err, errCatchupAbortedNoLeader) || err == errCatchupTooManyRetries { + if errors.Is(err, errCatchupAbortedNoLeader) || err == errCatchupTooManyRetries || err == errAlreadyLeader { if node := mset.raftNode(); node != nil && node.DrainAndReplaySnapshot() { break } @@ -3195,6 +3389,11 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps } else if isOutOfSpaceErr(err) { // If applicable this will tear all of this down, but don't assume so and return. s.handleOutOfSpace(mset) + } else { + // Encountered an unexpected error, can't continue. 
+ mset.setWriteErr(err) + aq.recycle(&ces) + return } } } @@ -3299,7 +3498,9 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps case <-t.C: // Start forcing snapshots if they failed previously. + snapMu.Lock() forceIfFailed := failedSnapshots > 0 + snapMu.Unlock() doSnapshot(forceIfFailed) case <-uch: @@ -3849,6 +4050,41 @@ func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isReco return 0, err } + case deleteRangeOp: + dr, err := decodeDeleteRange(buf[1:]) + if err != nil { + if node := mset.raftNode(); node != nil { + s := js.srv + s.Errorf("JetStream cluster could not decode delete range for '%s > %s' [%s]", + mset.account(), mset.name(), node.Group()) + } + panic(err.Error()) + } + if dr.Num == 0 { + continue + } + mset.mu.Lock() + first, num := dr.First, dr.Num + lseq := first + num - 1 + if mset.lseq >= lseq { + mset.mu.Unlock() + continue + } + // Trim any prefix already applied so we only skip the uncovered tail. + if mset.lseq >= first { + first = mset.lseq + 1 + num = lseq - mset.lseq + } + if err = mset.store.SkipMsgs(first, num); err != nil { + mset.mu.Unlock() + js.srv.RateLimitWarnf("JetStream cluster failed to apply delete range [%d..%d] for '%s > %s': %v", + first, lseq, mset.account().Name, mset.cfg.Name, err) + return 0, err + } + mset.clearAllPreAcksInRange(first, lseq) + mset.lseq = lseq + mset.mu.Unlock() + case deleteMsgOp: md, err := decodeMsgDelete(buf[1:]) if err != nil { @@ -3989,10 +4225,8 @@ func (js *jetStream) applyStreamEntries(mset *stream, ce *CommittedEntry, isReco } } - if isRecovering || !mset.IsLeader() { - if err := mset.processSnapshot(ss, ce.Index); err != nil { - return 0, err - } + if err := mset.processSnapshot(ss, ce.Index); err != nil { + return 0, err } } else if e.Type == EntryRemovePeer { js.mu.RLock() @@ -4098,7 +4332,7 @@ func (js *jetStream) applyStreamMsgOp(mset *stream, op entryOp, mbuf []byte, isR if needLock { mset.mu.RLock() } - mset.sendFlowControlReply(reply) + mset.sendFlowControlReply(reply, hdr) if needLock { mset.mu.RUnlock() } @@ -4140,11 +4374,14 @@ func (js *jetStream) applyStreamMsgOp(mset *stream, op entryOp, mbuf []byte, isR // Messages to be skipped have no subject or timestamp or msg or hdr. if subject == _EMPTY_ && ts == 0 && len(msg) == 0 && len(hdr) == 0 { // Skip and update our lseq. - last, _ := mset.store.SkipMsg(0) + last, err := mset.store.SkipMsg(0) + if err != nil { + return err + } if needLock { mset.mu.Lock() } - mset.setLastSeq(last) + mset.lseq = last mset.clearAllPreAcks(last) if needLock { mset.mu.Unlock() @@ -4162,19 +4399,31 @@ func (js *jetStream) applyStreamMsgOp(mset *stream, op entryOp, mbuf []byte, isR // Process the actual message here. err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt, sourced, needLock) + // Take into account subject transforms, if any. + // The untransformed subject is replicated, but the transformed subject is used for consistency checks below. + csubject := subject + if mset.inflightTransform != nil { + mset.clMu.Lock() + if subj, found := mset.inflightTransform[lseq]; found { + csubject = subj + delete(mset.inflightTransform, lseq) + } + mset.clMu.Unlock() + } + // If we have inflight make sure to clear after processing. // TODO(dlc) - technically check on inflight != nil could cause datarace. // But do not want to acquire lock since tracking this will be rare. 
if mset.inflight != nil { mset.clMu.Lock() - if i, found := mset.inflight[subject]; found { + if i, found := mset.inflight[csubject]; found { // Decrement from pending operations. Once it reaches zero, it can be deleted. if i.ops > 0 { var sz uint64 if mset.store.Type() == FileStorage { - sz = fileStoreMsgSizeRaw(len(subject), len(hdr), len(msg)) + sz = fileStoreMsgSizeRaw(len(csubject), len(hdr), len(msg)) } else { - sz = memStoreMsgSizeRaw(len(subject), len(hdr), len(msg)) + sz = memStoreMsgSizeRaw(len(csubject), len(hdr), len(msg)) } if i.bytes >= sz { i.bytes -= sz @@ -4184,7 +4433,7 @@ func (js *jetStream) applyStreamMsgOp(mset *stream, op entryOp, mbuf []byte, isR i.ops-- } if i.ops == 0 { - delete(mset.inflight, subject) + delete(mset.inflight, csubject) } } mset.clMu.Unlock() @@ -4193,13 +4442,13 @@ func (js *jetStream) applyStreamMsgOp(mset *stream, op entryOp, mbuf []byte, isR // Update running total for counter. if mset.clusteredCounterTotal != nil { mset.clMu.Lock() - if counter, found := mset.clusteredCounterTotal[subject]; found { + if counter, found := mset.clusteredCounterTotal[csubject]; found { // Decrement from pending operations. Once it reaches zero, it can be deleted. if counter.ops > 0 { counter.ops-- } if counter.ops == 0 { - delete(mset.clusteredCounterTotal, subject) + delete(mset.clusteredCounterTotal, csubject) } } mset.clMu.Unlock() @@ -4225,9 +4474,11 @@ func (js *jetStream) applyStreamMsgOp(mset *stream, op entryOp, mbuf []byte, isR // should be reset. This is possible if the other side has a stale snapshot and no longer // has those messages. So compact and retry to reset. if state.Msgs == 0 { - mset.store.Compact(lseq + 1) - // Retry - err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt, sourced, needLock) + _, err = mset.store.Compact(lseq + 1) + if err == nil { + // Retry + err = mset.processJetStreamMsg(subject, reply, hdr, msg, lseq, ts, mt, sourced, needLock) + } } // FIXME(dlc) - We could just run a catchup with a request defining the span between what we expected // and what we got. @@ -4239,6 +4490,11 @@ func (js *jetStream) applyStreamMsgOp(mset *stream, op entryOp, mbuf []byte, isR } s.Debugf("Apply stream entries for '%s > %s' got error processing message: %v", mset.accountLocked(needLock), mset.nameLocked(needLock), err) + + // There are some errors that we can't recover from. + if err != ErrMaxMsgs && err != ErrMaxBytes && err != ErrMaxMsgsPerSubject && err != ErrMsgTooLarge && err != ErrStoreClosed { + return err + } } return nil } @@ -4295,6 +4551,7 @@ func (js *jetStream) processStreamLeaderChange(mset *stream, isLeader bool) { mset.clMu.Lock() // Clear inflight if we have it. mset.inflight = nil + mset.inflightTransform = nil // Clear running counter totals. mset.clusteredCounterTotal = nil // Clear expected per subject state. 
@@ -4302,12 +4559,11 @@ func (js *jetStream) processStreamLeaderChange(mset *stream, isLeader bool) { mset.expectedPerSubjectInProcess = nil mset.clMu.Unlock() - js.mu.Lock() + js.mu.RLock() s, account, err := js.srv, sa.Client.serviceAccount(), sa.err client, subject, reply := sa.Client, sa.Subject, sa.Reply - hasResponded := sa.responded - sa.responded = true - js.mu.Unlock() + hasResponded := sa.markResponded() + js.mu.RUnlock() streamName := mset.name() @@ -4320,15 +4576,15 @@ func (js *jetStream) processStreamLeaderChange(mset *stream, isLeader bool) { if node := mset.raftNode(); node != nil && !node.Quorum() && time.Since(node.Created()) > 5*time.Second { s.sendStreamLostQuorumAdvisory(mset) } + } - // Clear clseq. If we become leader again, it will be fixed up - // automatically on the next mset.setLeader call. - mset.clMu.Lock() - if mset.clseq > 0 { - mset.clseq = 0 - } - mset.clMu.Unlock() + // Clear clseq on every leader transition. recalculateClusteredSeq + // repopulates it on the next proposal. + mset.clMu.Lock() + if mset.clseq > 0 { + mset.clseq = 0 } + mset.clMu.Unlock() // Tell stream to switch leader status. mset.setLeader(isLeader) @@ -4572,7 +4828,9 @@ func (js *jetStream) processStreamAssignment(sa *streamAssignment) { sa.Group.node = osa.Group.node } sa.consumers = osa.consumers - sa.responded = osa.responded + if osa.hasResponded() { + sa.markResponded() + } sa.err = osa.err } // Unsubscribe if it was previously unsupported. @@ -4588,7 +4846,7 @@ func (js *jetStream) processStreamAssignment(sa *streamAssignment) { // Update our state. accStreams[stream] = sa cc.streams[accName] = accStreams - hasResponded := sa.responded + hasResponded := sa.hasResponded() // If unsupported, we can't register any further. if sa.unsupported != nil { @@ -4706,7 +4964,7 @@ func (js *jetStream) processUpdateStreamAssignment(sa *streamAssignment) { // Make sure we respond if we are a member. if isMember { - sa.responded = false + sa.clearResponded() } else { // Make sure to clean up any old node in case this stream moves back here. if sa.Group != nil { @@ -4802,16 +5060,15 @@ func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAss return } - js.mu.Lock() + js.mu.RLock() s, rg := js.srv, sa.Group client, subject, reply := sa.Client, sa.Subject, sa.Reply - alreadyRunning, numReplicas := osa.Group.node != nil, len(rg.Peers) + alreadyRunning, oldNumReplicas, numReplicas := osa.Group.node != nil, len(osa.Group.Peers), len(rg.Peers) needsNode := rg.node == nil storage, cfg := sa.Config.Storage, sa.Config - hasResponded := sa.responded - sa.responded = true recovering := sa.recovering - js.mu.Unlock() + hasResponded := sa.markResponded() + js.mu.RUnlock() mset, err := acc.lookupStream(cfg.Name) if err == nil && mset != nil { @@ -4831,6 +5088,9 @@ func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAss if !alreadyRunning && numReplicas > 1 { if needsNode { + // Must run before startClusterSubs reads mset.sa.Sync. + mset.setStreamAssignment(sa) + // Since we are scaling up we want to make sure our sync subject // is registered before we start our raft node. mset.mu.Lock() @@ -4907,6 +5167,11 @@ func (js *jetStream) processClusterUpdateStream(acc *Account, osa, sa *streamAss isLeader := mset.IsLeader() + // If the stream is scaled down, there is a chance we weren't already the leader. + if isLeader && numReplicas == 1 && oldNumReplicas > 1 { + js.processStreamLeaderChange(mset, true) + } + // Check for missing syncSubject bug. 
if isLeader && osa != nil && osa.Sync == _EMPTY_ { if node := mset.raftNode(); node != nil { @@ -5069,7 +5334,7 @@ func (js *jetStream) processClusterCreateStream(acc *Account, sa *streamAssignme js.mu.Lock() sa.err = err - hasResponded := sa.responded + hasResponded := sa.hasResponded() // If out of space do nothing for now. if isOutOfSpaceErr(err) { @@ -5404,7 +5669,9 @@ func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) { if ca.Group != nil { ca.Group.node = oca.Group.node } - ca.responded = oca.responded + if oca.hasResponded() { + ca.markResponded() + } ca.err = oca.err // Unsubscribe if it was previously unsupported. @@ -5688,13 +5955,13 @@ func (js *jetStream) processClusterCreateConsumer(oca, ca *consumerAssignment, s // Check if we already had a consumer assignment and its still pending. cca, oca := ca, o.consumerAssignment() if oca != nil { - if !oca.responded { + if !oca.hasResponded() { // We can't override info for replying here otherwise leader once elected can not respond. // So copy over original client and the reply from the old ca. - cac := *ca + cac := ca.clone() cac.Client = oca.Client cac.Reply = oca.Reply - cca = &cac + cca = cac needsLocalResponse = true } // If we look like we are scaling up, let's send our current state to the group. @@ -5740,7 +6007,7 @@ func (js *jetStream) processClusterCreateConsumer(oca, ca *consumerAssignment, s js.mu.Lock() ca.err = err - hasResponded := ca.responded + hasResponded := ca.hasResponded() // If out of space do nothing for now. if isOutOfSpaceErr(err) { @@ -5803,10 +6070,15 @@ func (js *jetStream) processClusterCreateConsumer(oca, ca *consumerAssignment, s func() { defer s.grWG.Done() defer o.clearMonitorRunning() - o.setLeader(true) + err = o.setLeader(true) var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} - resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.info()) - s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + if err != nil { + resp.Error = NewJSConsumerCreateError(err, Unless(err)) + s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + } else { + resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.info()) + s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + } }, pprofLabels{ "type": "consumer", @@ -5828,12 +6100,10 @@ func (js *jetStream) processClusterCreateConsumer(oca, ca *consumerAssignment, s o.signalMonitorQuit() o.monitorWg.Wait() // Single replica consumer, process manually here. - js.mu.Lock() // Force response in case we think this is an update. - if !js.metaRecovering && isConfigUpdate { - ca.responded = false + if !js.isMetaRecovering() && isConfigUpdate { + ca.clearResponded() } - js.mu.Unlock() cca := o.consumerAssignment() // Perform the leader change in a goroutine, otherwise we could block meta operations. if o.shouldStartMonitor() { @@ -5881,12 +6151,26 @@ func (js *jetStream) processClusterCreateConsumer(oca, ca *consumerAssignment, s if o.IsLeader() || (!didCreate && needsLocalResponse) { // Process if existing as an update. Double check that this is not recovered. 
js.mu.RLock() - client, subject, reply, recovering := ca.Client, ca.Subject, ca.Reply, ca.recovering + client, subject, reply, recovering, sourcing := ca.Client, ca.Subject, ca.Reply, ca.recovering, ca.Config.Sourcing js.mu.RUnlock() if !recovering { - var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} - resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.info()) - s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + // If it's a sourcing consumer, we need to respond after the consumer has been reset instead. + if sourcing { + var resp = JSApiConsumerResetResponse{ApiResponse: ApiResponse{Type: JSApiConsumerResetResponseType}} + resetSeq, canRespond, err := o.resetStartingSeq(0, reply, true) + if err != nil { + resp.Error = NewJSConsumerInvalidResetError(err) + s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + } else if canRespond { + resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.info()) + resp.ResetSeq = resetSeq + s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + } + } else { + var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} + resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.info()) + s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + } } } } @@ -6151,8 +6435,6 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { key := make([]byte, 32) crand.Read(key) - // Hash of the last snapshot (fixed size in memory). - var lastSnap []byte var lastSnapTime time.Time // Don't allow the upper layer to install snapshots until we have @@ -6167,45 +6449,30 @@ func (js *jetStream) monitorConsumer(o *consumer, ca *consumerAssignment) { return } - // Check several things to see if we need a snapshot. - ne, nb := n.Size() - if !n.NeedSnapshot() { - // Check if we should compact etc. based on size of log. - if !force && ne < compactNumMin && nb < compactSizeMin { - return - } - } - if snap, err := o.store.EncodedState(); err == nil { - hash := highwayhash.Sum(snap, key) - // If the state hasn't changed but the log has gone way over - // the compaction size then we will want to compact anyway. - // This can happen for example when a pull consumer fetches a - // lot on an idle stream, log entries get distributed but the - // state never changes, therefore the log never gets compacted. - if !bytes.Equal(hash[:], lastSnap) || ne >= compactNumMin || nb >= compactSizeMin { - // If we had a significant number of failed snapshots, start relaxing Raft-layer checks - // to force it through. We might have been catching up a peer for a long period, and this - // protects our log size from growing indefinitely. - forceSnapshot := failedSnapshots > 4 - if err := n.InstallSnapshot(snap, forceSnapshot); err == nil { - lastSnap, lastSnapTime = hash[:], time.Now() - // If there was a failed snapshot before, we reduced the timer's interval. - // Reset it back to the original interval now. - if failedSnapshots > 0 { - t.Reset(compactInterval + rci) - } - failedSnapshots = 0 - } else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning { - s.RateLimitWarnf("Failed to install snapshot for '%s > %s > %s' [%s]: %v", o.acc.Name, ca.Stream, ca.Name, n.Group(), err) - // If this is the first failure, reduce the interval of the snapshot timer. - // This ensures we're not waiting too long for snapshotting to eventually become forced. 
- if failedSnapshots == 0 { - t.Reset(compactMinInterval) - } - failedSnapshots++ + // If we had a significant number of failed snapshots, start relaxing Raft-layer checks + // to force it through. We might have been catching up a peer for a long period, and this + // protects our log size from growing indefinitely. + forceSnapshot := failedSnapshots > 4 + if err := n.InstallSnapshot(snap, forceSnapshot); err == nil { + lastSnapTime = time.Now() + // If there was a failed snapshot before, we reduced the timer's interval. + // Reset it back to the original interval now. + if failedSnapshots > 0 { + t.Reset(compactInterval + rci) + } + failedSnapshots = 0 + } else if err != errNoSnapAvailable && err != errNodeClosed && err != errCatchupsRunning { + s.RateLimitWarnf("Failed to install snapshot for '%s > %s > %s' [%s]: %v", o.acc.Name, ca.Stream, ca.Name, n.Group(), err) + // If this is the first failure, reduce the interval of the snapshot timer. + // This ensures we're not waiting too long for snapshotting to eventually become forced. + if failedSnapshots == 0 { + t.Reset(compactMinInterval) } + failedSnapshots++ } + } else { + s.RateLimitWarnf("Failed to install snapshot for '%s > %s > %s' [%s]: %v", o.acc.Name, ca.Stream, ca.Name, n.Group(), err) } } @@ -6503,10 +6770,57 @@ func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLea if !o.isLeader() && sseq > o.sseq { o.sseq = sseq } + if o.dseq == 0 { + o.dseq = 1 + } if o.store != nil { o.store.UpdateStarting(sseq - 1) } o.mu.Unlock() + case resetSeqOp: + o.mu.Lock() + var le = binary.LittleEndian + sseq := le.Uint64(buf[1:9]) + reply := string(buf[9:]) + o.resetLocalStartingSeq(sseq) + if o.store != nil { + o.store.Reset(sseq - 1) + } + // Cleanup messages that lost interest. + if o.retention == InterestPolicy { + if mset := o.mset; mset != nil { + o.mu.Unlock() + ss := mset.state() + o.checkStateForInterestStream(&ss) + o.mu.Lock() + } + } + // Recalculate pending, and re-trigger message delivery. + if !o.isLeader() { + o.mu.Unlock() + } else { + o.streamNumPending() + o.signalNewMessages() + s, a := o.srv, o.acc + if reply == _EMPTY_ { + o.mu.Unlock() + } else if internal, ok := o.rsm[reply]; !ok { + o.mu.Unlock() + } else { + delete(o.rsm, reply) + o.mu.Unlock() + + // Check if the reset request needs to be answered on the system account. + // This will happen for replicated sourcing consumers that get reset as part of a create/update. + if internal { + a = nil + } + var resp = JSApiConsumerResetResponse{ApiResponse: ApiResponse{Type: JSApiConsumerResetResponseType}} + resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.info()) + resp.ResetSeq = sseq + s.sendInternalAccountMsg(a, reply, s.jsonResponse(&resp)) + } + } case addPendingRequest: o.mu.Lock() if !o.isLeader() { @@ -6540,7 +6854,7 @@ func (o *consumer) processReplicatedAck(dseq, sseq uint64) error { o.lat = time.Now() var sagap uint64 - if o.cfg.AckPolicy == AckAll { + if o.cfg.AckPolicy == AckAll || o.cfg.AckPolicy == AckFlowControl { // Always use the store state, as o.asflr is skipped ahead already. // Capture before updating store. 
state, err := o.store.BorrowState() @@ -6643,12 +6957,11 @@ func (js *jetStream) processConsumerLeaderChangeWithAssignment(o *consumer, ca * if ca == nil { return stepDownIfLeader() } - js.mu.Lock() + js.mu.RLock() s, account, err := js.srv, ca.Client.serviceAccount(), ca.err - client, subject, reply, streamName, consumerName := ca.Client, ca.Subject, ca.Reply, ca.Stream, ca.Name - hasResponded := ca.responded - ca.responded = true - js.mu.Unlock() + client, subject, reply, streamName, consumerName, sourcing := ca.Client, ca.Subject, ca.Reply, ca.Stream, ca.Name, ca.Config.Sourcing + hasResponded := ca.markResponded() + js.mu.RUnlock() acc, _ := s.LookupAccount(account) if acc == nil { @@ -6674,7 +6987,9 @@ func (js *jetStream) processConsumerLeaderChangeWithAssignment(o *consumer, ca * } // Tell consumer to switch leader status. - o.setLeader(isLeader) + if lerr := o.setLeader(isLeader); lerr != nil && err == nil { + err = lerr + } if !isLeader || hasResponded { if isLeader { @@ -6688,8 +7003,22 @@ func (js *jetStream) processConsumerLeaderChangeWithAssignment(o *consumer, ca * resp.Error = NewJSConsumerCreateError(err, Unless(err)) s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) } else { - resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.initialInfo()) - s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + // If it's a sourcing consumer, we need to respond after the consumer has been reset instead. + if sourcing { + var rresp = JSApiConsumerResetResponse{ApiResponse: ApiResponse{Type: JSApiConsumerResetResponseType}} + resetSeq, canRespond, err := o.resetStartingSeq(0, reply, true) + if err != nil { + rresp.Error = NewJSConsumerInvalidResetError(err) + s.sendAPIErrResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&rresp)) + } else if canRespond { + rresp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.info()) + rresp.ResetSeq = resetSeq + s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&rresp)) + } + } else { + resp.ConsumerInfo = setDynamicConsumerInfoMetadata(o.initialInfo()) + s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp)) + } o.sendCreateAdvisory() } @@ -6864,8 +7193,8 @@ func (js *jetStream) processStreamAssignmentResults(sub *subscription, c *client } else if result.Restore != nil { resp = s.jsonResponse(result.Restore) } - if !sa.responded || result.Update { - sa.responded = true + if !sa.hasResponded() || result.Update { + sa.markResponded() js.srv.sendAPIErrResponse(sa.Client, acc, sa.Subject, sa.Reply, _EMPTY_, resp) } // Remove this assignment if possible. @@ -6898,9 +7227,9 @@ func (js *jetStream) processConsumerAssignmentResults(sub *subscription, c *clie } if sa := js.streamAssignment(result.Account, result.Stream); sa != nil && sa.consumers != nil { - if ca := sa.consumers[result.Consumer]; ca != nil && !ca.responded { + if ca := sa.consumers[result.Consumer]; ca != nil && !ca.hasResponded() { js.srv.sendAPIErrResponse(ca.Client, acc, ca.Subject, ca.Reply, _EMPTY_, s.jsonResponse(result.Response)) - ca.responded = true + ca.markResponded() // Check if this failed. // TODO(dlc) - Could have mixed results, should track per peer. @@ -7941,10 +8270,11 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su return } // Single nodes are not recorded by the NRG layer so we can rename. 
- if len(peers) == 1 { + if len(peers) == 1 || osa.Config.Replicas == 1 { rg.Name = groupNameForStream(peers, rg.Storage) - } else if len(rg.Peers) == 1 { - // This is scale up from being a singelton, set preferred to that singelton. + } + if len(rg.Peers) == 1 { + // This is scale up from being a singleton, set preferred to that singleton. rg.Preferred = rg.Peers[0] } rg.ScaleUp = true @@ -7988,6 +8318,11 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su } } rg.Peers = selected + // Single nodes are not recorded by the NRG layer so we can rename. + // MUST do this, otherwise a scaleup afterward could potentially lead to inconsistencies. + if len(rg.Peers) == 1 { + rg.Name = groupNameForStream(rg.Peers, rg.Storage) + } } // Need to remap any consumers. @@ -8006,6 +8341,10 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su } // Assign new peers. cca.Group.Peers = rg.Peers + // Single nodes are not recorded by the NRG layer so we can rename. + if len(cca.Group.Peers) == 1 || numPeers == 1 { + cca.Group.Name = groupNameForConsumer(cca.Group.Peers, cca.Group.Storage) + } // If the replicas was not 0 make sure it matches here. if cca.Config.Replicas != 0 { cca.Config.Replicas = len(rg.Peers) @@ -8034,6 +8373,10 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su cca := ca.copyGroup() // Assign new peers. cca.Group.Peers = newPeers + // Single nodes are not recorded by the NRG layer so we can rename. + if len(cca.Group.Peers) == 1 || numPeers == 1 { + cca.Group.Name = groupNameForConsumer(cca.Group.Peers, cca.Group.Storage) + } // If the replicas was not 0 make sure it matches here. if cca.Config.Replicas != 0 { cca.Config.Replicas = len(newPeers) @@ -8132,7 +8475,11 @@ func (s *Server) jsClusteredStreamUpdateRequest(ci *ClientInfo, acc *Account, su rg.Preferred = _EMPTY_ } - sa := &streamAssignment{Group: rg, Sync: osa.Sync, Created: osa.Created, Config: newCfg, Subject: subject, Reply: reply, Client: ci} + syncSubject := osa.Sync + if syncSubject == _EMPTY_ { + syncSubject = syncSubjForStream() + } + sa := &streamAssignment{Group: rg, Sync: syncSubject, Created: osa.Created, Config: newCfg, Subject: subject, Reply: reply, Client: ci} if err := meta.Propose(encodeUpdateStreamAssignment(sa)); err != nil { return } @@ -8722,7 +9069,7 @@ func (s *Server) jsClusteredMsgDeleteRequest(ci *ClientInfo, acc *Account, mset } func encodeAddStreamAssignment(sa *streamAssignment) []byte { - csa := *sa + csa := sa.clone() csa.Client = csa.Client.forProposal() csa.ConfigJSON, _ = json.Marshal(sa.Config) var bb bytes.Buffer @@ -8732,7 +9079,7 @@ func encodeAddStreamAssignment(sa *streamAssignment) []byte { } func encodeUpdateStreamAssignment(sa *streamAssignment) []byte { - csa := *sa + csa := sa.clone() csa.Client = csa.Client.forProposal() csa.ConfigJSON, _ = json.Marshal(sa.Config) var bb bytes.Buffer @@ -8742,7 +9089,7 @@ func encodeUpdateStreamAssignment(sa *streamAssignment) []byte { } func encodeDeleteStreamAssignment(sa *streamAssignment) []byte { - csa := *sa + csa := sa.clone() csa.Client = csa.Client.forProposal() csa.ConfigJSON, _ = json.Marshal(sa.Config) var bb bytes.Buffer @@ -8848,6 +9195,17 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec return } + // If the consumer is a direct sourcing consumer, we need to "upgrade" it to be durable without AckNone. + // We only get here if the stream is not Limits-based. 
+ if cfg.Direct && cfg.Sourcing && cfg.Name != _EMPTY_ { + cfg.Direct = false + cfg.Durable = cfg.Name + cfg.AckPolicy = AckFlowControl + cfg.AckWait = 0 + cfg.MaxDeliver = 0 + cfg.InactiveThreshold = 0 + } + var resp = JSApiConsumerCreateResponse{ApiResponse: ApiResponse{Type: JSApiConsumerCreateResponseType}} streamCfg, ok := js.clusterStreamConfig(acc.Name, stream) @@ -8914,13 +9272,13 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec // we're likely updating an existing consumer, so don't count it. Otherwise // we will incorrectly return NewJSMaximumConsumersLimitError for an update. if oname == _EMPTY_ || js.consumerAssignmentOrInflight(acc.Name, stream, oname) == nil { - // Don't count DIRECTS. + // Don't count direct/sourcing consumers. total := 0 for ca := range js.consumerAssignmentsOrInflightSeq(acc.Name, stream) { if ca.unsupported != nil { continue } - if ca.Config != nil && !ca.Config.Direct { + if ca.Config != nil && !ca.Config.Direct && !ca.Config.Sourcing { total++ } } @@ -8967,6 +9325,13 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec // Provided config might miss metadata, copy from existing config. copyConsumerMetadata(cfg, ca.Config) + // If a durable sourcing consumer is used, we need to reset the deliver policy. + if cfg.Sourcing && cfg.Durable != _EMPTY_ { + cfg.DeliverPolicy = ca.Config.DeliverPolicy + cfg.OptStartSeq = ca.Config.OptStartSeq + cfg.OptStartTime = ca.Config.OptStartTime + } + if action == ActionCreate && !reflect.DeepEqual(cfg, ca.Config) { resp.Error = NewJSConsumerAlreadyExistsError() s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) @@ -9052,22 +9417,22 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec // Check if we are work queue policy. // We will do pre-checks here to avoid thrashing meta layer. - if sa.Config.Retention == WorkQueuePolicy && !cfg.Direct { - if cfg.AckPolicy != AckExplicit { + if sa.Config.Retention == WorkQueuePolicy && !cfg.Direct && !cfg.Sourcing { + if cfg.AckPolicy != AckExplicit && cfg.AckPolicy != AckFlowControl { resp.Error = NewJSConsumerWQRequiresExplicitAckError() s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) return } subjects := gatherSubjectFilters(cfg.FilterSubject, cfg.FilterSubjects) - if len(subjects) == 0 && len(sa.consumers) > 0 { - resp.Error = NewJSConsumerWQMultipleUnfilteredError() - s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) - return - } for oca := range js.consumerAssignmentsOrInflightSeq(acc.Name, stream) { - if oca.Name == oname { + if oca.Name == oname || oca.Config.Direct || oca.Config.Sourcing { continue } + if len(subjects) == 0 { + resp.Error = NewJSConsumerWQMultipleUnfilteredError() + s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp)) + return + } for _, psubj := range gatherSubjectFilters(oca.Config.FilterSubject, oca.Config.FilterSubjects) { for _, subj := range subjects { if SubjectsCollide(subj, psubj) { @@ -9150,6 +9515,10 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec } } } + // Single nodes are not recorded by the NRG layer so we can rename. 
+ if rBefore == 1 { + nca.Group.Name = groupNameForConsumer(newPeerSet, nca.Group.Storage) + } nca.Group.Peers = newPeerSet nca.Group.Preferred = curLeader nca.Group.ScaleUp = true @@ -9168,6 +9537,11 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec // scale down by removing peers from the end newPeerSet = newPeerSet[len(newPeerSet)-rAfter:] nca.Group.Peers = newPeerSet + // Single nodes are not recorded by the NRG layer so we can rename. + // MUST do this, otherwise a scaleup afterward could potentially lead to inconsistencies. + if len(nca.Group.Peers) == 1 { + nca.Group.Name = groupNameForConsumer(nca.Group.Peers, nca.Group.Storage) + } } // Update config and client info on copy of existing. @@ -9185,7 +9559,7 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec } func encodeAddConsumerAssignment(ca *consumerAssignment) []byte { - cca := *ca + cca := ca.clone() cca.Client = cca.Client.forProposal() cca.ConfigJSON, _ = json.Marshal(ca.Config) var bb bytes.Buffer @@ -9195,7 +9569,7 @@ func encodeAddConsumerAssignment(ca *consumerAssignment) []byte { } func encodeDeleteConsumerAssignment(ca *consumerAssignment) []byte { - cca := *ca + cca := ca.clone() cca.Client = cca.Client.forProposal() cca.ConfigJSON, _ = json.Marshal(ca.Config) var bb bytes.Buffer @@ -9236,7 +9610,7 @@ func decodeConsumerAssignmentConfig(ca *consumerAssignment) error { } func encodeAddConsumerAssignmentCompressed(ca *consumerAssignment) []byte { - cca := *ca + cca := ca.clone() cca.Client = cca.Client.forProposal() cca.ConfigJSON, _ = json.Marshal(ca.Config) var bb bytes.Buffer @@ -9527,6 +9901,16 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ s, js, jsa, st, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node maxMsgSize, lseq := int(mset.cfg.MaxMsgSize), mset.lseq isLeader, isSealed, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules := mset.isLeader(), mset.cfg.Sealed, mset.cfg.AllowRollup, mset.cfg.DenyPurge, mset.cfg.AllowMsgTTL, mset.cfg.AllowMsgCounter, mset.cfg.AllowMsgSchedules + + // Apply the input subject transform if any + csubject := subject + if mset.itr != nil { + ts, err := mset.itr.Match(csubject) + if err == nil { + // no filtering: if the subject doesn't map the source of the transform, don't change it + csubject = ts + } + } mset.mu.RUnlock() // This should not happen but possible now that we allow scale up, and scale down where this could trigger. @@ -9577,7 +9961,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ } // Check here pre-emptively if we have exceeded our account limits. - if exceeded, err := jsa.wouldExceedLimits(st, tierName, r, subject, hdr, msg); exceeded { + if exceeded, err := jsa.wouldExceedLimits(st, tierName, r, csubject, hdr, msg); exceeded { if err == nil { err = NewJSAccountResourcesExceededError() } @@ -9611,25 +9995,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ // Check if we need to set initial value here mset.clMu.Lock() if mset.clseq == 0 || mset.clseq < lseq+mset.clfs { - // Need to unlock and re-acquire the locks in the proper order. 
- mset.clMu.Unlock() - // Locking order is stream -> batchMu -> clMu - mset.mu.RLock() - batch := mset.batchApply - var batchCount uint64 - if batch != nil { - batch.mu.Lock() - batchCount = batch.count - } - mset.clMu.Lock() - // Re-capture - lseq = mset.lseq - mset.clseq = lseq + mset.clfs + batchCount - // Keep hold of the mset.clMu, but unlock the others. - if batch != nil { - batch.mu.Unlock() - } - mset.mu.RUnlock() + lseq = recalculateClusteredSeq(mset, true) } var ( @@ -9638,7 +10004,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ err error ) diff := &batchStagedDiff{} - if hdr, msg, dseq, apiErr, err = checkMsgHeadersPreClusteredProposal(diff, mset, subject, hdr, msg, sourced, name, jsa, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules, discard, discardNewPer, maxMsgSize, maxMsgs, maxMsgsPer, maxBytes); err != nil { + if hdr, msg, dseq, apiErr, err = checkMsgHeadersPreClusteredProposal(diff, mset, csubject, subject, hdr, msg, sourced, name, jsa, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules, discard, discardNewPer, maxMsgSize, maxMsgs, maxMsgsPer, maxBytes); err != nil { mset.clMu.Unlock() if err == errMsgIdDuplicate && dseq > 0 { var buf [256]byte @@ -9652,52 +10018,13 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [ var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} resp.Error = apiErr response, _ = json.Marshal(resp) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) + outq.sendMsg(reply, response) } return err } - diff.commit(mset) - esm := encodeStreamMsgAllowCompress(subject, reply, hdr, msg, mset.clseq, time.Now().UnixNano(), sourced) - var mtKey uint64 - if mt != nil { - mtKey = mset.clseq - if mset.mt == nil { - mset.mt = make(map[uint64]*msgTrace) - } - mset.mt[mtKey] = mt - } - - // Do proposal. - _ = node.Propose(esm) - // The proposal can fail, but we always account for trying. - mset.clseq++ - mset.trackReplicationTraffic(node, len(esm), r) - - // Check to see if we are being overrun. - // TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured. - if mset.clseq-(lseq+mset.clfs) > streamLagWarnThreshold { - lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name) - s.RateLimitWarnf("%s", lerr.Error()) - } + err = commitSingleMsg(diff, mset, subject, reply, hdr, msg, name, jsa, mt, node, r, lseq) mset.clMu.Unlock() - - if err != nil { - if mt != nil { - mset.getAndDeleteMsgTrace(mtKey) - } - if canRespond { - var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: mset.cfg.Name}} - resp.Error = &ApiError{Code: 503, Description: err.Error()} - response, _ = json.Marshal(resp) - // If we errored out respond here. - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) - } - if isOutOfSpaceErr(err) { - s.handleOutOfSpace(mset) - } - } - return err } @@ -9741,14 +10068,17 @@ func (mset *stream) calculateSyncRequest(state *StreamState, snap *StreamReplica // processSnapshotDeletes will update our current store based on the snapshot // but only processing deletes and new FirstSeq / purges. -func (mset *stream) processSnapshotDeletes(snap *StreamReplicatedState) { +func (mset *stream) processSnapshotDeletes(snap *StreamReplicatedState) error { mset.mu.Lock() var state StreamState mset.store.FastState(&state) // Always adjust if FirstSeq has moved beyond our state. 
var didReset bool if snap.FirstSeq > state.FirstSeq { - mset.store.Compact(snap.FirstSeq) + if _, err := mset.store.Compact(snap.FirstSeq); err != nil { + mset.mu.Unlock() + return err + } mset.store.FastState(&state) mset.lseq = state.LastSeq mset.clearAllPreAcksBelowFloor(state.FirstSeq) @@ -9763,8 +10093,9 @@ func (mset *stream) processSnapshotDeletes(snap *StreamReplicatedState) { } if len(snap.Deleted) > 0 { - mset.store.SyncDeleted(snap.Deleted) + return mset.store.SyncDeleted(snap.Deleted) } + return nil } func (mset *stream) setCatchupPeer(peer string, lag uint64) { @@ -9874,7 +10205,9 @@ var ( // Process a stream snapshot. func (mset *stream) processSnapshot(snap *StreamReplicatedState, index uint64) (e error) { // Update any deletes, etc. - mset.processSnapshotDeletes(snap) + if err := mset.processSnapshotDeletes(snap); err != nil { + return err + } mset.setCLFS(snap.Failed) mset.mu.Lock() @@ -9910,7 +10243,14 @@ func (mset *stream) processSnapshot(snap *StreamReplicatedState, index uint64) ( // Pause the apply channel for our raft group while we catch up. if err := n.PauseApply(); err != nil { - return err + // The only reason PauseApply can fail is due to errAlreadyLeader. + // We step down to get someone else to become the leader that can catch us up. + // Ignore the error since we could have already stepped down before us doing so here. + _ = n.StepDown() + // Now try pausing again and continue to catchup. + if err = n.PauseApply(); err != nil { + return err + } } // Set our catchup state. @@ -10089,9 +10429,13 @@ RETRY: if lseq >= snap.LastSeq { // We MUST ensure all data is flushed up to this point, if the store hadn't already. // Because the snapshot needs to represent what has been persisted. - mset.flushAllPending() - s.Noticef("Catchup for stream '%s > %s' complete (took %v)", mset.account(), mset.name(), time.Since(start).Round(time.Millisecond)) - return nil + err = mset.flushAllPending() + if err == nil { + s.Noticef("Catchup for stream '%s > %s' complete (took %v)", mset.account(), mset.name(), time.Since(start).Round(time.Millisecond)) + } else { + s.Noticef("Catchup for stream '%s > %s' errored: %v (took %v)", mset.account(), mset.name(), err, time.Since(start).Round(time.Millisecond)) + } + return err } // Make sure we do not spin and make things worse. @@ -10187,17 +10531,13 @@ func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { // Handle the delete range. // Make sure the sequences match up properly. mset.mu.Lock() - if len(mset.preAcks) > 0 { - for seq := dr.First; seq < dr.First+dr.Num; seq++ { - mset.clearAllPreAcks(seq) - } - } + lseq := dr.First + dr.Num - 1 if err = mset.store.SkipMsgs(dr.First, dr.Num); err != nil { mset.mu.Unlock() return 0, errCatchupWrongSeqForSkip } - mset.lseq = dr.First + dr.Num - 1 - lseq := mset.lseq + mset.clearAllPreAcksInRange(dr.First, lseq) + mset.lseq = lseq mset.mu.Unlock() return lseq, nil } @@ -10241,14 +10581,14 @@ func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { if _, err = mset.store.SkipMsg(seq); err != nil { return 0, errCatchupWrongSeqForSkip } - } else if err := mset.store.StoreRawMsg(subj, hdr, msg, seq, ts, ttl); err != nil { + } else if err := mset.store.StoreRawMsg(subj, hdr, msg, seq, ts, ttl, false); err != nil { return 0, err } mset.mu.Lock() defer mset.mu.Unlock() // Update our lseq. - mset.setLastSeq(seq) + mset.lseq = seq // Check for MsgId and if we have one here make sure to update our internal map. 
if len(hdr) > 0 { @@ -10263,8 +10603,8 @@ func (mset *stream) processCatchupMsg(msg []byte) (uint64, error) { } // flushAllPending will flush any pending writes as a result of installing a snapshot or performing catchup. -func (mset *stream) flushAllPending() { - mset.store.FlushAllPending() +func (mset *stream) flushAllPending() error { + return mset.store.FlushAllPending() } func (mset *stream) handleClusterSyncRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { @@ -10297,24 +10637,29 @@ func (js *jetStream) clusterInfo(rg *raftGroup) *ClusterInfo { return nil } js.mu.RLock() - defer js.mu.RUnlock() - s := js.srv if rg == nil || rg.node == nil { + js.mu.RUnlock() return &ClusterInfo{ Name: s.cachedClusterName(), Leader: s.Name(), } } + // Capture what we need and let go of the lock to ensure that + // contention on Raft locks can't happen while holding JS lock. n := rg.node + rgName := rg.Name + rgPeers := slices.Clone(rg.Peers) + js.mu.RUnlock() + ci := &ClusterInfo{ Name: s.cachedClusterName(), Leader: s.serverNameForNode(n.GroupLeader()), LeaderSince: n.LeaderSince(), SystemAcc: n.IsSystemAccount(), TrafficAcc: n.GetTrafficAccountName(), - RaftGroup: rg.Name, + RaftGroup: rgName, } now := time.Now() @@ -10326,7 +10671,7 @@ func (js *jetStream) clusterInfo(rg *raftGroup) *ClusterInfo { } for _, rp := range peers { - if rp.ID != id && rg.isMember(rp.ID) { + if rp.ID != id && slices.Contains(rgPeers, rp.ID) { var lastSeen time.Duration if now.After(rp.Last) && !rp.Last.IsZero() { lastSeen = now.Sub(rp.Last) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_errors_generated.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_errors_generated.go index 8baf4211c3..37431d8983 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_errors_generated.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_errors_generated.go @@ -29,12 +29,30 @@ const ( // JSAtomicPublishTooLargeBatchErrF atomic publish batch is too large: {size} JSAtomicPublishTooLargeBatchErrF ErrorIdentifier = 10199 + // JSAtomicPublishTooManyInflight atomic publish too many inflight + JSAtomicPublishTooManyInflight ErrorIdentifier = 10210 + // JSAtomicPublishUnsupportedHeaderBatchErr atomic publish unsupported header used: {header} JSAtomicPublishUnsupportedHeaderBatchErr ErrorIdentifier = 10177 // JSBadRequestErr bad request JSBadRequestErr ErrorIdentifier = 10003 + // JSBatchPublishDisabledErr batch publish is disabled + JSBatchPublishDisabledErr ErrorIdentifier = 10205 + + // JSBatchPublishInvalidBatchIDErr batch publish ID is invalid + JSBatchPublishInvalidBatchIDErr ErrorIdentifier = 10207 + + // JSBatchPublishInvalidPatternErr batch publish pattern is invalid + JSBatchPublishInvalidPatternErr ErrorIdentifier = 10206 + + // JSBatchPublishTooManyInflight batch publish too many inflight + JSBatchPublishTooManyInflight ErrorIdentifier = 10211 + + // JSBatchPublishUnknownBatchIDErr batch publish ID unknown + JSBatchPublishUnknownBatchIDErr ErrorIdentifier = 10208 + // JSClusterIncompleteErr incomplete results JSClusterIncompleteErr ErrorIdentifier = 10004 @@ -71,6 +89,21 @@ const ( // JSClusterUnSupportFeatureErr not currently supported in clustered mode JSClusterUnSupportFeatureErr ErrorIdentifier = 10036 + // JSConsumerAckFCRequiresFCErr flow control ack policy requires flow control + JSConsumerAckFCRequiresFCErr ErrorIdentifier = 10219 + + // JSConsumerAckFCRequiresMaxAckPendingErr flow control ack policy requires max ack pending 
+ JSConsumerAckFCRequiresMaxAckPendingErr ErrorIdentifier = 10220 + + // JSConsumerAckFCRequiresNoAckWaitErr flow control ack policy requires unset ack wait + JSConsumerAckFCRequiresNoAckWaitErr ErrorIdentifier = 10221 + + // JSConsumerAckFCRequiresNoMaxDeliverErr flow control ack policy requires unset max deliver + JSConsumerAckFCRequiresNoMaxDeliverErr ErrorIdentifier = 10222 + + // JSConsumerAckFCRequiresPushErr flow control ack policy requires a push based consumer + JSConsumerAckFCRequiresPushErr ErrorIdentifier = 10218 + // JSConsumerAckPolicyInvalidErr consumer ack policy invalid JSConsumerAckPolicyInvalidErr ErrorIdentifier = 10181 @@ -167,6 +200,9 @@ const ( // JSConsumerInvalidPriorityGroupErr Provided priority group does not exist for this consumer JSConsumerInvalidPriorityGroupErr ErrorIdentifier = 10160 + // JSConsumerInvalidResetErr invalid reset: {err} + JSConsumerInvalidResetErr ErrorIdentifier = 10204 + // JSConsumerInvalidSamplingErrF failed to parse consumer sampling configuration: {err} JSConsumerInvalidSamplingErrF ErrorIdentifier = 10095 @@ -317,21 +353,36 @@ const ( // JSMessageSchedulesRollupInvalidErr message schedules invalid rollup JSMessageSchedulesRollupInvalidErr ErrorIdentifier = 10192 + // JSMessageSchedulesSchedulerInvalidErr message schedules invalid scheduler + JSMessageSchedulesSchedulerInvalidErr ErrorIdentifier = 10212 + + // JSMessageSchedulesSourceInvalidErr message schedules source is invalid + JSMessageSchedulesSourceInvalidErr ErrorIdentifier = 10203 + // JSMessageSchedulesTTLInvalidErr message schedules invalid per-message TTL JSMessageSchedulesTTLInvalidErr ErrorIdentifier = 10191 // JSMessageSchedulesTargetInvalidErr message schedules target is invalid JSMessageSchedulesTargetInvalidErr ErrorIdentifier = 10190 + // JSMessageSchedulesTimeZoneInvalidErr message schedules time zone is invalid + JSMessageSchedulesTimeZoneInvalidErr ErrorIdentifier = 10223 + // JSMessageTTLDisabledErr per-message TTL is disabled JSMessageTTLDisabledErr ErrorIdentifier = 10166 // JSMessageTTLInvalidErr invalid per-message TTL JSMessageTTLInvalidErr ErrorIdentifier = 10165 + // JSMirrorConsumerRequiresAckFCErr stream mirror consumer requires flow control ack policy + JSMirrorConsumerRequiresAckFCErr ErrorIdentifier = 10214 + // JSMirrorConsumerSetupFailedErrF generic mirror consumer setup failure string ({err}) JSMirrorConsumerSetupFailedErrF ErrorIdentifier = 10029 + // JSMirrorDurableConsumerCfgInvalid stream mirror consumer config is invalid + JSMirrorDurableConsumerCfgInvalid ErrorIdentifier = 10213 + // JSMirrorInvalidStreamName mirrored stream name is invalid JSMirrorInvalidStreamName ErrorIdentifier = 10142 @@ -353,6 +404,9 @@ const ( // JSMirrorWithAtomicPublishErr stream mirrors can not also use atomic publishing JSMirrorWithAtomicPublishErr ErrorIdentifier = 10198 + // JSMirrorWithBatchPublishErr stream mirrors can not also use batch publishing + JSMirrorWithBatchPublishErr ErrorIdentifier = 10209 + // JSMirrorWithCountersErr stream mirrors can not also calculate counters JSMirrorWithCountersErr ErrorIdentifier = 10173 @@ -416,12 +470,21 @@ const ( // JSSnapshotDeliverSubjectInvalidErr deliver subject not valid JSSnapshotDeliverSubjectInvalidErr ErrorIdentifier = 10015 + // JSSourceConsumerRequiresAckFCErr stream source consumer requires flow control ack policy + JSSourceConsumerRequiresAckFCErr ErrorIdentifier = 10217 + // JSSourceConsumerSetupFailedErrF General source consumer setup failure string ({err}) JSSourceConsumerSetupFailedErrF ErrorIdentifier = 
10045 // JSSourceDuplicateDetected source stream, filter and transform (plus external if present) must form a unique combination (duplicate source configuration detected) JSSourceDuplicateDetected ErrorIdentifier = 10140 + // JSSourceDurableConsumerCfgInvalid stream source consumer config is invalid + JSSourceDurableConsumerCfgInvalid ErrorIdentifier = 10215 + + // JSSourceDurableConsumerDuplicateDetected duplicate stream source consumer detected + JSSourceDurableConsumerDuplicateDetected ErrorIdentifier = 10216 + // JSSourceInvalidStreamName sourced stream name is invalid JSSourceInvalidStreamName ErrorIdentifier = 10141 @@ -619,8 +682,14 @@ var ( JSAtomicPublishInvalidBatchIDErr: {Code: 400, ErrCode: 10179, Description: "atomic publish batch ID is invalid"}, JSAtomicPublishMissingSeqErr: {Code: 400, ErrCode: 10175, Description: "atomic publish sequence is missing"}, JSAtomicPublishTooLargeBatchErrF: {Code: 400, ErrCode: 10199, Description: "atomic publish batch is too large: {size}"}, + JSAtomicPublishTooManyInflight: {Code: 429, ErrCode: 10210, Description: "atomic publish too many inflight"}, JSAtomicPublishUnsupportedHeaderBatchErr: {Code: 400, ErrCode: 10177, Description: "atomic publish unsupported header used: {header}"}, JSBadRequestErr: {Code: 400, ErrCode: 10003, Description: "bad request"}, + JSBatchPublishDisabledErr: {Code: 400, ErrCode: 10205, Description: "batch publish is disabled"}, + JSBatchPublishInvalidBatchIDErr: {Code: 400, ErrCode: 10207, Description: "batch publish ID is invalid"}, + JSBatchPublishInvalidPatternErr: {Code: 400, ErrCode: 10206, Description: "batch publish pattern is invalid"}, + JSBatchPublishTooManyInflight: {Code: 429, ErrCode: 10211, Description: "batch publish too many inflight"}, + JSBatchPublishUnknownBatchIDErr: {Code: 400, ErrCode: 10208, Description: "batch publish ID unknown"}, JSClusterIncompleteErr: {Code: 503, ErrCode: 10004, Description: "incomplete results"}, JSClusterNoPeersErrF: {Code: 400, ErrCode: 10005, Description: "{err}"}, JSClusterNotActiveErr: {Code: 500, ErrCode: 10006, Description: "JetStream not in clustered mode"}, @@ -633,6 +702,11 @@ var ( JSClusterServerNotMemberErr: {Code: 400, ErrCode: 10044, Description: "server is not a member of the cluster"}, JSClusterTagsErr: {Code: 400, ErrCode: 10011, Description: "tags placement not supported for operation"}, JSClusterUnSupportFeatureErr: {Code: 503, ErrCode: 10036, Description: "not currently supported in clustered mode"}, + JSConsumerAckFCRequiresFCErr: {Code: 400, ErrCode: 10219, Description: "flow control ack policy requires flow control"}, + JSConsumerAckFCRequiresMaxAckPendingErr: {Code: 400, ErrCode: 10220, Description: "flow control ack policy requires max ack pending"}, + JSConsumerAckFCRequiresNoAckWaitErr: {Code: 400, ErrCode: 10221, Description: "flow control ack policy requires unset ack wait"}, + JSConsumerAckFCRequiresNoMaxDeliverErr: {Code: 400, ErrCode: 10222, Description: "flow control ack policy requires unset max deliver"}, + JSConsumerAckFCRequiresPushErr: {Code: 400, ErrCode: 10218, Description: "flow control ack policy requires a push based consumer"}, JSConsumerAckPolicyInvalidErr: {Code: 400, ErrCode: 10181, Description: "consumer ack policy invalid"}, JSConsumerAckWaitNegativeErr: {Code: 400, ErrCode: 10183, Description: "consumer ack wait needs to be positive"}, JSConsumerAlreadyExists: {Code: 400, ErrCode: 10148, Description: "consumer already exists"}, @@ -665,6 +739,7 @@ var ( JSConsumerInvalidGroupNameErr: {Code: 400, ErrCode: 10162, 
Description: "Valid priority group name must match A-Z, a-z, 0-9, -_/=)+ and may not exceed 16 characters"}, JSConsumerInvalidPolicyErrF: {Code: 400, ErrCode: 10094, Description: "{err}"}, JSConsumerInvalidPriorityGroupErr: {Code: 400, ErrCode: 10160, Description: "Provided priority group does not exist for this consumer"}, + JSConsumerInvalidResetErr: {Code: 400, ErrCode: 10204, Description: "invalid reset: {err}"}, JSConsumerInvalidSamplingErrF: {Code: 400, ErrCode: 10095, Description: "failed to parse consumer sampling configuration: {err}"}, JSConsumerMaxDeliverBackoffErr: {Code: 400, ErrCode: 10116, Description: "max deliver is required to be > length of backoff values"}, JSConsumerMaxPendingAckExcessErrF: {Code: 400, ErrCode: 10121, Description: "consumer max ack pending exceeds system limit of {limit}"}, @@ -715,11 +790,16 @@ var ( JSMessageSchedulesDisabledErr: {Code: 400, ErrCode: 10188, Description: "message schedules is disabled"}, JSMessageSchedulesPatternInvalidErr: {Code: 400, ErrCode: 10189, Description: "message schedules pattern is invalid"}, JSMessageSchedulesRollupInvalidErr: {Code: 400, ErrCode: 10192, Description: "message schedules invalid rollup"}, + JSMessageSchedulesSchedulerInvalidErr: {Code: 400, ErrCode: 10212, Description: "message schedules invalid scheduler"}, + JSMessageSchedulesSourceInvalidErr: {Code: 400, ErrCode: 10203, Description: "message schedules source is invalid"}, JSMessageSchedulesTTLInvalidErr: {Code: 400, ErrCode: 10191, Description: "message schedules invalid per-message TTL"}, JSMessageSchedulesTargetInvalidErr: {Code: 400, ErrCode: 10190, Description: "message schedules target is invalid"}, + JSMessageSchedulesTimeZoneInvalidErr: {Code: 400, ErrCode: 10223, Description: "message schedules time zone is invalid"}, JSMessageTTLDisabledErr: {Code: 400, ErrCode: 10166, Description: "per-message TTL is disabled"}, JSMessageTTLInvalidErr: {Code: 400, ErrCode: 10165, Description: "invalid per-message TTL"}, + JSMirrorConsumerRequiresAckFCErr: {Code: 400, ErrCode: 10214, Description: "stream mirror consumer requires flow control ack policy"}, JSMirrorConsumerSetupFailedErrF: {Code: 500, ErrCode: 10029, Description: "{err}"}, + JSMirrorDurableConsumerCfgInvalid: {Code: 400, ErrCode: 10213, Description: "stream mirror consumer config is invalid"}, JSMirrorInvalidStreamName: {Code: 400, ErrCode: 10142, Description: "mirrored stream name is invalid"}, JSMirrorInvalidSubjectFilter: {Code: 400, ErrCode: 10151, Description: "mirror transform source: {err}"}, JSMirrorInvalidTransformDestination: {Code: 400, ErrCode: 10154, Description: "mirror transform: {err}"}, @@ -727,6 +807,7 @@ var ( JSMirrorMultipleFiltersNotAllowed: {Code: 400, ErrCode: 10150, Description: "mirror with multiple subject transforms cannot also have a single subject filter"}, JSMirrorOverlappingSubjectFilters: {Code: 400, ErrCode: 10152, Description: "mirror subject filters can not overlap"}, JSMirrorWithAtomicPublishErr: {Code: 400, ErrCode: 10198, Description: "stream mirrors can not also use atomic publishing"}, + JSMirrorWithBatchPublishErr: {Code: 400, ErrCode: 10209, Description: "stream mirrors can not also use batch publishing"}, JSMirrorWithCountersErr: {Code: 400, ErrCode: 10173, Description: "stream mirrors can not also calculate counters"}, JSMirrorWithFirstSeqErr: {Code: 400, ErrCode: 10143, Description: "stream mirrors can not have first sequence configured"}, JSMirrorWithMsgSchedulesErr: {Code: 400, ErrCode: 10186, Description: "stream mirrors can not also schedule 
messages"}, @@ -748,8 +829,11 @@ var ( JSRestoreSubscribeFailedErrF: {Code: 500, ErrCode: 10042, Description: "JetStream unable to subscribe to restore snapshot {subject}: {err}"}, JSSequenceNotFoundErrF: {Code: 400, ErrCode: 10043, Description: "sequence {seq} not found"}, JSSnapshotDeliverSubjectInvalidErr: {Code: 400, ErrCode: 10015, Description: "deliver subject not valid"}, + JSSourceConsumerRequiresAckFCErr: {Code: 400, ErrCode: 10217, Description: "stream source consumer requires flow control ack policy"}, JSSourceConsumerSetupFailedErrF: {Code: 500, ErrCode: 10045, Description: "{err}"}, JSSourceDuplicateDetected: {Code: 400, ErrCode: 10140, Description: "duplicate source configuration detected"}, + JSSourceDurableConsumerCfgInvalid: {Code: 400, ErrCode: 10215, Description: "stream source consumer config is invalid"}, + JSSourceDurableConsumerDuplicateDetected: {Code: 400, ErrCode: 10216, Description: "duplicate stream source consumer detected"}, JSSourceInvalidStreamName: {Code: 400, ErrCode: 10141, Description: "sourced stream name is invalid"}, JSSourceInvalidSubjectFilter: {Code: 400, ErrCode: 10145, Description: "source transform source: {err}"}, JSSourceInvalidTransformDestination: {Code: 400, ErrCode: 10146, Description: "source transform: {err}"}, @@ -923,6 +1007,16 @@ func NewJSAtomicPublishTooLargeBatchError(size interface{}, opts ...ErrorOption) } } +// NewJSAtomicPublishTooManyInflightError creates a new JSAtomicPublishTooManyInflight error: "atomic publish too many inflight" +func NewJSAtomicPublishTooManyInflightError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSAtomicPublishTooManyInflight] +} + // NewJSAtomicPublishUnsupportedHeaderBatchError creates a new JSAtomicPublishUnsupportedHeaderBatchErr error: "atomic publish unsupported header used: {header}" func NewJSAtomicPublishUnsupportedHeaderBatchError(header interface{}, opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -949,6 +1043,56 @@ func NewJSBadRequestError(opts ...ErrorOption) *ApiError { return ApiErrors[JSBadRequestErr] } +// NewJSBatchPublishDisabledError creates a new JSBatchPublishDisabledErr error: "batch publish is disabled" +func NewJSBatchPublishDisabledError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSBatchPublishDisabledErr] +} + +// NewJSBatchPublishInvalidBatchIDError creates a new JSBatchPublishInvalidBatchIDErr error: "batch publish ID is invalid" +func NewJSBatchPublishInvalidBatchIDError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSBatchPublishInvalidBatchIDErr] +} + +// NewJSBatchPublishInvalidPatternError creates a new JSBatchPublishInvalidPatternErr error: "batch publish pattern is invalid" +func NewJSBatchPublishInvalidPatternError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSBatchPublishInvalidPatternErr] +} + +// NewJSBatchPublishTooManyInflightError creates a new JSBatchPublishTooManyInflight error: "batch publish too many inflight" +func NewJSBatchPublishTooManyInflightError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSBatchPublishTooManyInflight] +} + +// NewJSBatchPublishUnknownBatchIDError 
creates a new JSBatchPublishUnknownBatchIDErr error: "batch publish ID unknown" +func NewJSBatchPublishUnknownBatchIDError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSBatchPublishUnknownBatchIDErr] +} + // NewJSClusterIncompleteError creates a new JSClusterIncompleteErr error: "incomplete results" func NewJSClusterIncompleteError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -1075,6 +1219,56 @@ func NewJSClusterUnSupportFeatureError(opts ...ErrorOption) *ApiError { return ApiErrors[JSClusterUnSupportFeatureErr] } +// NewJSConsumerAckFCRequiresFCError creates a new JSConsumerAckFCRequiresFCErr error: "flow control ack policy requires flow control" +func NewJSConsumerAckFCRequiresFCError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSConsumerAckFCRequiresFCErr] +} + +// NewJSConsumerAckFCRequiresMaxAckPendingError creates a new JSConsumerAckFCRequiresMaxAckPendingErr error: "flow control ack policy requires max ack pending" +func NewJSConsumerAckFCRequiresMaxAckPendingError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSConsumerAckFCRequiresMaxAckPendingErr] +} + +// NewJSConsumerAckFCRequiresNoAckWaitError creates a new JSConsumerAckFCRequiresNoAckWaitErr error: "flow control ack policy requires unset ack wait" +func NewJSConsumerAckFCRequiresNoAckWaitError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSConsumerAckFCRequiresNoAckWaitErr] +} + +// NewJSConsumerAckFCRequiresNoMaxDeliverError creates a new JSConsumerAckFCRequiresNoMaxDeliverErr error: "flow control ack policy requires unset max deliver" +func NewJSConsumerAckFCRequiresNoMaxDeliverError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSConsumerAckFCRequiresNoMaxDeliverErr] +} + +// NewJSConsumerAckFCRequiresPushError creates a new JSConsumerAckFCRequiresPushErr error: "flow control ack policy requires a push based consumer" +func NewJSConsumerAckFCRequiresPushError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSConsumerAckFCRequiresPushErr] +} + // NewJSConsumerAckPolicyInvalidError creates a new JSConsumerAckPolicyInvalidErr error: "consumer ack policy invalid" func NewJSConsumerAckPolicyInvalidError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -1419,6 +1613,22 @@ func NewJSConsumerInvalidPriorityGroupError(opts ...ErrorOption) *ApiError { return ApiErrors[JSConsumerInvalidPriorityGroupErr] } +// NewJSConsumerInvalidResetError creates a new JSConsumerInvalidResetErr error: "invalid reset: {err}" +func NewJSConsumerInvalidResetError(err error, opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + e := ApiErrors[JSConsumerInvalidResetErr] + args := e.toReplacerArgs([]interface{}{"{err}", err}) + return &ApiError{ + Code: e.Code, + ErrCode: e.ErrCode, + Description: strings.NewReplacer(args...).Replace(e.Description), + } +} + // NewJSConsumerInvalidSamplingError creates a new JSConsumerInvalidSamplingErrF error: "failed to parse consumer sampling configuration: {err}" 
func NewJSConsumerInvalidSamplingError(err error, opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -1967,6 +2177,26 @@ func NewJSMessageSchedulesRollupInvalidError(opts ...ErrorOption) *ApiError { return ApiErrors[JSMessageSchedulesRollupInvalidErr] } +// NewJSMessageSchedulesSchedulerInvalidError creates a new JSMessageSchedulesSchedulerInvalidErr error: "message schedules invalid scheduler" +func NewJSMessageSchedulesSchedulerInvalidError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSMessageSchedulesSchedulerInvalidErr] +} + +// NewJSMessageSchedulesSourceInvalidError creates a new JSMessageSchedulesSourceInvalidErr error: "message schedules source is invalid" +func NewJSMessageSchedulesSourceInvalidError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSMessageSchedulesSourceInvalidErr] +} + // NewJSMessageSchedulesTTLInvalidError creates a new JSMessageSchedulesTTLInvalidErr error: "message schedules invalid per-message TTL" func NewJSMessageSchedulesTTLInvalidError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -1987,6 +2217,16 @@ func NewJSMessageSchedulesTargetInvalidError(opts ...ErrorOption) *ApiError { return ApiErrors[JSMessageSchedulesTargetInvalidErr] } +// NewJSMessageSchedulesTimeZoneInvalidError creates a new JSMessageSchedulesTimeZoneInvalidErr error: "message schedules time zone is invalid" +func NewJSMessageSchedulesTimeZoneInvalidError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSMessageSchedulesTimeZoneInvalidErr] +} + // NewJSMessageTTLDisabledError creates a new JSMessageTTLDisabledErr error: "per-message TTL is disabled" func NewJSMessageTTLDisabledError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -2007,6 +2247,16 @@ func NewJSMessageTTLInvalidError(opts ...ErrorOption) *ApiError { return ApiErrors[JSMessageTTLInvalidErr] } +// NewJSMirrorConsumerRequiresAckFCError creates a new JSMirrorConsumerRequiresAckFCErr error: "stream mirror consumer requires flow control ack policy" +func NewJSMirrorConsumerRequiresAckFCError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSMirrorConsumerRequiresAckFCErr] +} + // NewJSMirrorConsumerSetupFailedError creates a new JSMirrorConsumerSetupFailedErrF error: "{err}" func NewJSMirrorConsumerSetupFailedError(err error, opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -2023,6 +2273,16 @@ func NewJSMirrorConsumerSetupFailedError(err error, opts ...ErrorOption) *ApiErr } } +// NewJSMirrorDurableConsumerCfgInvalidError creates a new JSMirrorDurableConsumerCfgInvalid error: "stream mirror consumer config is invalid" +func NewJSMirrorDurableConsumerCfgInvalidError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSMirrorDurableConsumerCfgInvalid] +} + // NewJSMirrorInvalidStreamNameError creates a new JSMirrorInvalidStreamName error: "mirrored stream name is invalid" func NewJSMirrorInvalidStreamNameError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -2105,6 +2365,16 @@ func NewJSMirrorWithAtomicPublishError(opts ...ErrorOption) *ApiError { return ApiErrors[JSMirrorWithAtomicPublishErr] } +// 
NewJSMirrorWithBatchPublishError creates a new JSMirrorWithBatchPublishErr error: "stream mirrors can not also use batch publishing" +func NewJSMirrorWithBatchPublishError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSMirrorWithBatchPublishErr] +} + // NewJSMirrorWithCountersError creates a new JSMirrorWithCountersErr error: "stream mirrors can not also calculate counters" func NewJSMirrorWithCountersError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -2339,6 +2609,16 @@ func NewJSSnapshotDeliverSubjectInvalidError(opts ...ErrorOption) *ApiError { return ApiErrors[JSSnapshotDeliverSubjectInvalidErr] } +// NewJSSourceConsumerRequiresAckFCError creates a new JSSourceConsumerRequiresAckFCErr error: "stream source consumer requires flow control ack policy" +func NewJSSourceConsumerRequiresAckFCError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSSourceConsumerRequiresAckFCErr] +} + // NewJSSourceConsumerSetupFailedError creates a new JSSourceConsumerSetupFailedErrF error: "{err}" func NewJSSourceConsumerSetupFailedError(err error, opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) @@ -2365,6 +2645,26 @@ func NewJSSourceDuplicateDetectedError(opts ...ErrorOption) *ApiError { return ApiErrors[JSSourceDuplicateDetected] } +// NewJSSourceDurableConsumerCfgInvalidError creates a new JSSourceDurableConsumerCfgInvalid error: "stream source consumer config is invalid" +func NewJSSourceDurableConsumerCfgInvalidError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSSourceDurableConsumerCfgInvalid] +} + +// NewJSSourceDurableConsumerDuplicateDetectedError creates a new JSSourceDurableConsumerDuplicateDetected error: "duplicate stream source consumer detected" +func NewJSSourceDurableConsumerDuplicateDetectedError(opts ...ErrorOption) *ApiError { + eopts := parseOpts(opts) + if ae, ok := eopts.err.(*ApiError); ok { + return ae + } + + return ApiErrors[JSSourceDurableConsumerDuplicateDetected] +} + // NewJSSourceInvalidStreamNameError creates a new JSSourceInvalidStreamName error: "sourced stream name is invalid" func NewJSSourceInvalidStreamNameError(opts ...ErrorOption) *ApiError { eopts := parseOpts(opts) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_events.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_events.go index c7042f89db..f85618e061 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_events.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_events.go @@ -71,10 +71,9 @@ const ( // JSStreamActionAdvisory indicates that a stream was created, edited or deleted type JSStreamActionAdvisory struct { TypedEvent - Stream string `json:"stream"` - Action ActionAdvisoryType `json:"action"` - Template string `json:"template,omitempty"` // Deprecated: stream templates are deprecated and will be removed in a future version. 
- Domain string `json:"domain,omitempty"` + Stream string `json:"stream"` + Action ActionAdvisoryType `json:"action"` + Domain string `json:"domain,omitempty"` } const JSStreamActionAdvisoryType = "io.nats.jetstream.advisory.v1.stream_action" @@ -269,9 +268,10 @@ type JSStreamBatchAbandonedAdvisory struct { type BatchAbandonReason string var ( - BatchTimeout BatchAbandonReason = "timeout" - BatchLarge BatchAbandonReason = "large" - BatchIncomplete BatchAbandonReason = "incomplete" + BatchTimeout BatchAbandonReason = "timeout" + BatchLarge BatchAbandonReason = "large" + BatchIncomplete BatchAbandonReason = "incomplete" + BatchRequirementsNotMet BatchAbandonReason = "unsupported" ) // JSConsumerLeaderElectedAdvisoryType is sent when the system elects a leader for a consumer. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_versioning.go b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_versioning.go index 30e0005f4e..db2f1e97bd 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jetstream_versioning.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jetstream_versioning.go @@ -17,7 +17,7 @@ import "strconv" const ( // JSApiLevel is the maximum supported JetStream API level for this server. - JSApiLevel int = 3 + JSApiLevel int = 4 JSRequiredLevelMetadataKey = "_nats.req.level" JSServerVersionMetadataKey = "_nats.ver" @@ -82,6 +82,11 @@ func setStaticStreamMetadata(cfg *StreamConfig) { requires(2) } + // Fast batch publishing was added in v2.14 and requires API level 4. + if cfg.AllowBatchPublish { + requires(4) + } + cfg.Metadata[JSRequiredLevelMetadataKey] = strconv.Itoa(requiredApiLevel) } @@ -158,6 +163,11 @@ func setStaticConsumerMetadata(cfg *ConsumerConfig) { requires(1) } + // Added in 2.14 + if cfg.AckPolicy == AckFlowControl { + requires(4) + } + cfg.Metadata[JSRequiredLevelMetadataKey] = strconv.Itoa(requiredApiLevel) } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/jwt.go b/vendor/github.com/nats-io/nats-server/v2/server/jwt.go index 82d65d90d5..ee8655538f 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/jwt.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/jwt.go @@ -202,12 +202,15 @@ func validateSrc(claims *jwt.UserClaims, host string) bool { } func validateTimes(claims *jwt.UserClaims) (bool, time.Duration) { + return validateTimesAt(claims, time.Now()) +} + +func validateTimesAt(claims *jwt.UserClaims, now time.Time) (bool, time.Duration) { if claims == nil { return false, time.Duration(0) } else if len(claims.Times) == 0 { return true, time.Duration(0) } - now := time.Now() loc := time.Local if claims.Locale != "" { var err error @@ -216,10 +219,11 @@ func validateTimes(claims *jwt.UserClaims) (bool, time.Duration) { } now = now.In(loc) } + + var ok bool + var validFor time.Duration + for _, timeRange := range claims.Times { - y, m, d := now.Date() - m = m - 1 - d = d - 1 start, err := time.ParseInLocation("15:04:05", timeRange.Start, loc) if err != nil { return false, time.Duration(0) // parsing not expected to fail at this point @@ -228,17 +232,43 @@ func validateTimes(claims *jwt.UserClaims) (bool, time.Duration) { if err != nil { return false, time.Duration(0) // parsing not expected to fail at this point } - if start.After(end) { - start = start.AddDate(y, int(m), d) - d++ // the intent is to be the next day - } else { - start = start.AddDate(y, int(m), d) + + y, m, d := now.Date() + start = time.Date(y, m, d, start.Hour(), start.Minute(), start.Second(), 0, loc) + end = time.Date(y, m, d, 
end.Hour(), end.Minute(), end.Second(), 0, loc) + + inRange, expires := validateTimeRangeAt(start, end, now) + if inRange && (!ok || expires > validFor) { + ok = true + validFor = expires + } + } + return ok, validFor +} + +// Returns true if now is within `start` and `end`, and +// how much time is left until `end`. +// False if `now` is not within range. +func validateTimeRangeAt(start, end, now time.Time) (bool, time.Duration) { + // Now falls within range. + // For example 11:00-22:00 at 13:00 + if start.Before(now) && end.After(now) { + return true, end.Sub(now) + } + + // Range crosses midnight. + if start.After(end) { + // Now is after midnight. + // For example 22:00-06:00 at 05:00. + if end.After(now) { + return true, end.Sub(now) } - if start.Before(now) { - end = end.AddDate(y, int(m), d) - if end.After(now) { - return true, end.Sub(now) - } + + // Now is before midnight. + // For example 22:00-06:00 at 23:30. + end = end.AddDate(0, 0, 1) + if start.Before(now) && end.After(now) { + return true, end.Sub(now) } } return false, time.Duration(0) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go index bd3e26462f..be84f7b190 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/leafnode.go @@ -1,4 +1,4 @@ -// Copyright 2019-2025 The NATS Authors +// Copyright 2019-2026 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -27,7 +27,6 @@ import ( "net/url" "os" "path" - "reflect" "regexp" "runtime" "strconv" @@ -115,21 +114,24 @@ type leafNodeCfg struct { perms *Permissions connDelay time.Duration // Delay before a connect, could be used while detecting loop condition, etc.. jsMigrateTimer *time.Timer + quitCh chan struct{} + removed bool + connInProgress bool } // Check to see if this is a solicited leafnode. We do special processing for solicited. func (c *client) isSolicitedLeafNode() bool { - return c.kind == LEAF && c.leaf.remote != nil + return c.kind == LEAF && c.leaf != nil && c.leaf.remote != nil } // Returns true if this is a solicited leafnode and is not configured to be treated as a hub or a receiving // connection leafnode where the otherside has declared itself to be the hub. func (c *client) isSpokeLeafNode() bool { - return c.kind == LEAF && c.leaf.isSpoke + return c.kind == LEAF && c.leaf != nil && c.leaf.isSpoke } func (c *client) isHubLeafNode() bool { - return c.kind == LEAF && !c.leaf.isSpoke + return c.kind == LEAF && c.leaf != nil && !c.leaf.isSpoke } func (c *client) isIsolatedLeafNode() bool { @@ -137,7 +139,7 @@ func (c *client) isIsolatedLeafNode() bool { // group name here, which the hub and/or leaf could provide, so that we // can isolate away certain LNs but not others on an opt-in basis. For // now we will just isolate all LN interest until then. - return c.kind == LEAF && c.leaf.isolated + return c.kind == LEAF && c.leaf != nil && c.leaf.isolated } // This will spin up go routines to solicit the remote leaf node connections. 
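As an illustrative sketch (not taken from the vendored code), the reworked JWT time-range validation above reduces to a pure check over three instants: a plain same-day window such as 11:00-22:00, and a window that crosses midnight such as 22:00-06:00, where the end is pushed to the next day when "now" is still before midnight. The self-contained Go sketch below mirrors that behaviour; the function name withinRange and the example times are hypothetical.

package main

import (
	"fmt"
	"time"
)

// withinRange reports whether now falls inside the daily window [start, end)
// and, if so, how long remains until end. A start after end is treated as a
// window that crosses midnight (e.g. 22:00-06:00).
func withinRange(start, end, now time.Time) (bool, time.Duration) {
	// Plain range, e.g. 11:00-22:00 checked at 13:00.
	if start.Before(now) && end.After(now) {
		return true, end.Sub(now)
	}
	// Range crosses midnight.
	if start.After(end) {
		// Already past midnight, e.g. 22:00-06:00 checked at 05:00.
		if end.After(now) {
			return true, end.Sub(now)
		}
		// Still before midnight, e.g. 22:00-06:00 checked at 23:30.
		end = end.AddDate(0, 0, 1)
		if start.Before(now) && end.After(now) {
			return true, end.Sub(now)
		}
	}
	return false, 0
}

func main() {
	day := time.Date(2026, 5, 11, 0, 0, 0, 0, time.UTC)
	start, end := day.Add(22*time.Hour), day.Add(6*time.Hour)
	ok, left := withinRange(start, end, day.Add(23*time.Hour+30*time.Minute))
	fmt.Println(ok, left) // true 6h30m0s
}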
@@ -152,7 +154,10 @@ func (s *Server) solicitLeafNodeRemotes(remotes []*RemoteLeafOpts) { remote := newLeafNodeCfg(r) creds := remote.Credentials accName := remote.LocalAccount - s.leafRemoteCfgs = append(s.leafRemoteCfgs, remote) + if s.leafRemoteCfgs == nil { + s.leafRemoteCfgs = make(map[*leafNodeCfg]struct{}) + } + s.leafRemoteCfgs[remote] = struct{}{} // Print notice if if isSysAccRemote { if len(remote.DenyExports) > 0 { @@ -192,34 +197,30 @@ func (s *Server) solicitLeafNodeRemotes(remotes []*RemoteLeafOpts) { // configuration required for configuration reload. remote := addRemote(r, r.LocalAccount == sysAccName) if !r.Disabled { - s.startGoRoutine(func() { s.connectToRemoteLeafNode(remote, true) }) + s.connectToRemoteLeafNodeAsynchronously(remote, true) } } } -func (s *Server) remoteLeafNodeStillValid(remote *leafNodeCfg) bool { - if remote.Disabled { - return false - } - for _, ri := range s.getOpts().LeafNode.Remotes { - // FIXME(dlc) - What about auth changes? - if reflect.DeepEqual(ri.URLs, remote.URLs) { - return true - } - } - return false -} - // Ensure that leafnode is properly configured. func validateLeafNode(o *Options) error { if err := validateLeafNodeAuthOptions(o); err != nil { return err } - // Users can bind to any local account, if its empty we will assume the $G account. - for _, r := range o.LeafNode.Remotes { - if r.LocalAccount == _EMPTY_ { - r.LocalAccount = globalAccountName + if len(o.LeafNode.Remotes) > 0 { + names := make(map[string]struct{}) + // Check for duplicate remotes, also, users can bind to any local account, + // if its empty we will assume the $G account. + for _, r := range o.LeafNode.Remotes { + if r.LocalAccount == _EMPTY_ { + r.LocalAccount = globalAccountName + } + rn := r.name() + if _, dup := names[rn]; dup { + return fmt.Errorf("duplicate remote %s", r.safeName()) + } + names[rn] = struct{}{} } } @@ -428,42 +429,25 @@ func validateLeafNodeProxyOptions(remote *RemoteLeafOpts) ([]string, error) { return warnings, nil } -// Update remote LeafNode TLS configurations after a config reload. -func (s *Server) updateRemoteLeafNodesTLSConfig(opts *Options) { - max := len(opts.LeafNode.Remotes) - if max == 0 { - return - } - - s.mu.RLock() - defer s.mu.RUnlock() - - // Changes in the list of remote leaf nodes is not supported. - // However, make sure that we don't go over the arrays. - if len(s.leafRemoteCfgs) < max { - max = len(s.leafRemoteCfgs) - } - for i := 0; i < max; i++ { - ro := opts.LeafNode.Remotes[i] - cfg := s.leafRemoteCfgs[i] - if ro.TLSConfig != nil { - cfg.Lock() - cfg.TLSConfig = ro.TLSConfig.Clone() - cfg.TLSHandshakeFirst = ro.TLSHandshakeFirst - cfg.Unlock() - } - } -} - +// Wait for the configured reconnect interval before attempting to connect +// again to the remote leafnode. func (s *Server) reConnectToRemoteLeafNode(remote *leafNodeCfg) { + clearInProgress := true + defer func() { + s.grWG.Done() + if clearInProgress { + remote.setConnectInProgress(false) + } + }() delay := s.getOpts().LeafNode.ReconnectInterval select { case <-time.After(delay): + case <-remote.quitCh: + return case <-s.quitCh: - s.grWG.Done() return } - s.connectToRemoteLeafNode(remote, false) + clearInProgress = !connectToRemoteLeafNode(s, remote, false) } // Creates a leafNodeCfg object that wraps the RemoteLeafOpts. 
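As a side note (a minimal sketch, not part of the vendored code), the per-remote quitCh introduced above is a buffered channel of size one that is signalled without blocking and then raced against both the reconnect delay and the server-wide quit channel. The hypothetical names remoteCfg, notifyQuit and waitBeforeReconnect below only illustrate that cancellation pattern.

package main

import (
	"fmt"
	"time"
)

type remoteCfg struct {
	quitCh chan struct{}
}

// notifyQuit signals the channel without blocking if a signal is already pending.
func (c *remoteCfg) notifyQuit() {
	select {
	case c.quitCh <- struct{}{}:
	default:
	}
}

// waitBeforeReconnect waits for the reconnect delay unless the remote is
// removed (quitCh) or the server shuts down (serverQuit). It returns true
// only if the delay elapsed and a reconnect should be attempted.
func waitBeforeReconnect(c *remoteCfg, serverQuit <-chan struct{}, delay time.Duration) bool {
	select {
	case <-time.After(delay):
		return true
	case <-c.quitCh:
		return false
	case <-serverQuit:
		return false
	}
}

func main() {
	cfg := &remoteCfg{quitCh: make(chan struct{}, 1)}
	serverQuit := make(chan struct{})
	go cfg.notifyQuit() // simulate the remote being removed from the config
	fmt.Println(waitBeforeReconnect(cfg, serverQuit, time.Second)) // false
}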
@@ -471,6 +455,7 @@ func newLeafNodeCfg(remote *RemoteLeafOpts) *leafNodeCfg { cfg := &leafNodeCfg{ RemoteLeafOpts: remote, urls: make([]*url.URL, 0, len(remote.URLs)), + quitCh: make(chan struct{}, 1), } if len(remote.DenyExports) > 0 || len(remote.DenyImports) > 0 { perms := &Permissions{} @@ -506,6 +491,53 @@ func newLeafNodeCfg(remote *RemoteLeafOpts) *leafNodeCfg { return cfg } +// Notifies the quit channel without blocking. +// No lock is needed to invoke this function. +func (cfg *leafNodeCfg) notifyQuitChannel() { + select { + case cfg.quitCh <- struct{}{}: + default: + } +} + +// Sets the connect-in-progress status for this remote leaf configuration. +func (cfg *leafNodeCfg) setConnectInProgress(inProgress bool) { + cfg.Lock() + defer cfg.Unlock() + // In both cases we want to drain the "quit" channel. + select { + case <-cfg.quitCh: + default: + } + cfg.connInProgress = inProgress +} + +// Returns `true` if this remote is in the middle of a connect, `false` otherwise. +func (cfg *leafNodeCfg) isConnectInProgress() bool { + cfg.RLock() + defer cfg.RUnlock() + return cfg.connInProgress +} + +// Mark that this remote is being removed from the configuration. +func (cfg *leafNodeCfg) markAsRemoved() { + cfg.Lock() + defer cfg.Unlock() + // This function should be invoked only once, but protect. + if cfg.removed { + return + } + cfg.removed = true + cfg.notifyQuitChannel() +} + +// Returns false if it has been disabled or removed. +func (cfg *leafNodeCfg) stillValid() bool { + cfg.RLock() + defer cfg.RUnlock() + return !cfg.Disabled && !cfg.removed +} + // Will pick an URL from the list of available URLs. func (cfg *leafNodeCfg) pickNextURL() *url.URL { cfg.Lock() @@ -622,12 +654,26 @@ func establishHTTPProxyTunnel(proxyURL, targetHost string, timeout time.Duration return conn, nil } -func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg, firstConnect bool) { - defer s.grWG.Done() +// Connect to a remote leaf node asynchronously (that is, this function will do +// the connect in a go routine). +func (s *Server) connectToRemoteLeafNodeAsynchronously(remote *leafNodeCfg, firstConnect bool) { + remote.setConnectInProgress(true) + s.startGoRoutine(func() { + defer s.grWG.Done() + if !connectToRemoteLeafNode(s, remote, firstConnect) { + remote.setConnectInProgress(false) + } + }) +} + +// Connect to a remote leaf node. Should only be invoked from +// `s.connectToRemoteLeafNodeAsynchronously()` or `s.reConnectToRemoteLeafNode()`. +// Returns `true` if this function invoked `s.createLeafNode()`, false otherwise. +func connectToRemoteLeafNode(s *Server, remote *leafNodeCfg, firstConnect bool) bool { if remote == nil || len(remote.URLs) == 0 { s.Debugf("Empty remote leafnode definition, nothing to connect") - return + return false } opts := s.getOpts() @@ -651,8 +697,10 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg, firstConnect bool) if connDelay := remote.getConnectDelay(); connDelay > 0 { select { case <-time.After(connDelay): + case <-remote.quitCh: + return false case <-s.quitCh: - return + return false } remote.setConnectDelay(0) } @@ -676,7 +724,14 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg, firstConnect bool) attempts := 0 - for s.isRunning() && s.remoteLeafNodeStillValid(remote) { + // In case the migrate timer was created but not canceled, do it when + // this function exits. Note that the timer would not be created if + // `jetstreamMigrateDelay == 0`. 
+ if jetstreamMigrateDelay > 0 { + defer remote.cancelMigrateTimer() + } + + for s.isRunning() && remote.stillValid() { rURL := remote.pickNextURL() url, err := s.getRandomIP(resolver, rURL.Host, nil) if err == nil { @@ -729,8 +784,9 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg, firstConnect bool) remote.Unlock() select { case <-s.quitCh: - remote.cancelMigrateTimer() - return + return false + case <-remote.quitCh: + return false case <-time.After(delay): // Check if we should migrate any JetStream assets immediately while this remote is down. // This will be used if JetStreamClusterMigrateDelay was not set @@ -741,9 +797,11 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg, firstConnect bool) } } remote.cancelMigrateTimer() - if !s.remoteLeafNodeStillValid(remote) { + // We can check here, but really we will have to check again when the server + // is about to add to the `s.leafs` map later in the process. + if !remote.stillValid() { conn.Close() - return + return false } // We have a connection here to a remote server. @@ -753,8 +811,10 @@ func (s *Server) connectToRemoteLeafNode(remote *leafNodeCfg, firstConnect bool) // Clear any observer states if we had them. s.clearObserverState(remote) - return + return true } + + return false } func (cfg *leafNodeCfg) cancelMigrateTimer() { @@ -854,6 +914,8 @@ func (s *Server) isLeafConnectDisabled() bool { // their remote connections did not have a tls{} block). // We now save the host name regardless in case the remote returns an INFO indicating // that TLS is required. +// +// Lock held on entry. func (cfg *leafNodeCfg) saveTLSHostname(u *url.URL) { if cfg.tlsName == _EMPTY_ && net.ParseIP(u.Hostname()) == nil { cfg.tlsName = u.Hostname() @@ -862,6 +924,8 @@ func (cfg *leafNodeCfg) saveTLSHostname(u *url.URL) { // Save off the username/password for when we connect using a bare URL // that we get from the INFO protocol. +// +// Lock held on entry. func (cfg *leafNodeCfg) saveUserPassword(u *url.URL) { if cfg.username == _EMPTY_ && u.User != nil { cfg.username = u.User.Username() @@ -1459,6 +1523,15 @@ func (c *client) processLeafnodeInfo(info *Info) { // Check for compression, unless already done. if firstINFO && !c.flags.isSet(compressionNegotiated) { + // A solicited leafnode connection must first receive a leafnode INFO. + // Classify wrong-port connections before any leaf-specific negotiation. + if didSolicit && (info.CID == 0 || info.LeafNodeURLs == nil) { + c.mu.Unlock() + c.Errorf(ErrConnectedToWrongPort.Error()) + c.closeConnection(WrongPort) + return + } + // Prevent from getting back here. c.flags.set(compressionNegotiated) @@ -1536,15 +1609,6 @@ func (c *client) processLeafnodeInfo(info *Info) { // ** Not if "no advertise" is enabled. // *** Not if leafnode's "no advertise" is enabled. // - // As seen from above, a solicited LeafNode connection should receive - // from the remote server an INFO with CID and LeafNodeURLs. Anything - // else should be considered an attempt to connect to a wrong port. - if didSolicit && (info.CID == 0 || info.LeafNodeURLs == nil) { - c.mu.Unlock() - c.Errorf(ErrConnectedToWrongPort.Error()) - c.closeConnection(WrongPort) - return - } // Reject a cluster that contains spaces. if info.Cluster != _EMPTY_ && strings.Contains(info.Cluster, " ") { c.mu.Unlock() @@ -1552,8 +1616,12 @@ func (c *client) processLeafnodeInfo(info *Info) { c.closeConnection(ProtocolViolation) return } - // Capture a nonce here. 
- c.nonce = []byte(info.Nonce) + // For solicited outbound leaf connections, capture the remote's nonce. + // For inbound leaf connections, keep using the server-issued nonce that + // was sent in our initial INFO and must be signed in CONNECT. + if didSolicit { + c.nonce = []byte(info.Nonce) + } if info.TLSRequired && didSolicit { remote.TLS = true } @@ -1578,15 +1646,17 @@ func (c *client) processLeafnodeInfo(info *Info) { } // For both initial INFO and async INFO protocols, Possibly - // update our list of remote leafnode URLs we can connect to. - if didSolicit && (len(info.LeafNodeURLs) > 0 || len(info.WSConnectURLs) > 0) { + // update our list of remote leafnode URLs we can connect to, + // unless we are instructed not to. + if didSolicit && !remote.IgnoreDiscoveredServers && + (len(info.LeafNodeURLs) > 0 || len(info.WSConnectURLs) > 0) { // Consider the incoming array as the most up-to-date // representation of the remote cluster's list of URLs. c.updateLeafNodeURLs(info) } - // Check to see if we have permissions updates here. - if info.Import != nil || info.Export != nil { + // Only solicited leafnode connections trust permission updates from INFO. + if didSolicit && (info.Import != nil || info.Export != nil) { perms := &Permissions{ Publish: info.Export, Subscribe: info.Import, @@ -1623,6 +1693,12 @@ func (c *client) processLeafnodeInfo(info *Info) { // Check if we have the remote account information and if so make sure it's stored. if info.RemoteAccount != _EMPTY_ { + if c.acc == nil { + c.mu.Unlock() + c.sendErr("Authorization Violation") + c.closeConnection(ProtocolViolation) + return + } s.leafRemoteAccounts.Store(c.acc.Name, info.RemoteAccount) } c.mu.Unlock() @@ -1807,7 +1883,7 @@ func (s *Server) setLeafNodeInfoHostPortAndIP() error { // (this solves the stale connection situation). An error is returned to help the // remote detect the misconfiguration when the duplicate is the result of that // misconfiguration. -func (s *Server) addLeafNodeConnection(c *client, srvName, clusterName string, checkForDup bool) { +func (s *Server) addLeafNodeConnection(c *client, srvName, clusterName string, checkForDup bool) bool { var accName string c.mu.Lock() cid := c.cid @@ -1819,7 +1895,8 @@ func (s *Server) addLeafNodeConnection(c *client, srvName, clusterName string, c mySrvName := c.leaf.remoteServer remoteAccName := c.leaf.remoteAccName myClustName := c.leaf.remoteCluster - solicited := c.leaf.remote != nil + remote := c.leaf.remote + solicited := remote != nil c.mu.Unlock() var old *client @@ -1843,6 +1920,23 @@ func (s *Server) addLeafNodeConnection(c *client, srvName, clusterName string, c } } } + // Now that we are under the server lock and before adding it to the map, + // for a solicited leaf, we need to make sure that it has not been removed + // from the config or disabled. + if solicited { + // If no longer valid, do not add to the server map. The connection + // should have been marked so that it can't reconnect. When the caller + // calls closeConnection(), cleanup (including clearing the connect- + // in-progress flag) will occur at the appropriate time. + if !remote.stillValid() { + // Prevent reconnect in case it was not yet done. 
+ c.setNoReconnect() + s.mu.Unlock() + s.removeFromTempClients(cid) + return false + } + remote.setConnectInProgress(false) + } // Store new connection in the map s.leafs[cid] = c s.mu.Unlock() @@ -1891,7 +1985,7 @@ func (s *Server) addLeafNodeConnection(c *client, srvName, clusterName string, c } else if domain, ok := opts.JsAccDefaultDomain[accName]; ok && domain == _EMPTY_ { // for backwards compatibility with old setups that do not have a domain name set c.Debugf("Skipping deny %q for account %q due to default domain", jsAllAPI, accName) - return + return true } } @@ -1969,9 +2063,11 @@ func (s *Server) addLeafNodeConnection(c *client, srvName, clusterName string, c c.Debugf("Adding deny %q for outgoing messages to account %q", src, accName) } } + return true } func (s *Server) removeLeafNodeConnection(c *client) { + s.mu.Lock() c.mu.Lock() cid := c.cid if c.leaf != nil { @@ -1984,10 +2080,18 @@ func (s *Server) removeLeafNodeConnection(c *client) { // We need to set this to nil for GC to release the connection c.leaf.gwSub = nil } + if remote := c.leaf.remote; remote != nil { + // If "noReconnect" is true, then we won't attempt to reconnect, so + // we will clear the "connect-in-progress" flag. However, if we can + // reconnect, then we should set "connect-in-progress" to true while + // we are under the server/client lock. The go routine that performs + // the reconnect will be started later and there would be a gap with + // the wrong flag value otherwise. + remote.setConnectInProgress(!c.flags.isSet(noReconnect)) + } } proxyKey := c.proxyKey c.mu.Unlock() - s.mu.Lock() delete(s.leafs, cid) if proxyKey != _EMPTY_ { s.removeProxiedConn(proxyKey, cid) @@ -2154,6 +2258,13 @@ func (c *client) processLeafNodeConnect(s *Server, arg []byte, lang string) erro acc := c.acc c.mu.Unlock() + // If the account is not set (e.g. connection was closed due to auth + // timeout while still being processed), bail out to avoid a panic. + if acc == nil { + c.closeConnection(MissingAccount) + return ErrMissingAccount + } + // Register the cluster, even if empty, as long as we are acting as a hub. if !proto.Hub { acc.registerLeafNodeCluster(proto.Cluster) @@ -2999,6 +3110,11 @@ func (c *client) processLeafHeaderMsgArgs(arg []byte) error { if c.pa.hdr > c.pa.size { return fmt.Errorf("processLeafHeaderMsgArgs Header Size larger then TotalSize: '%s'", arg) } + maxPayload := atomic.LoadInt32(&c.mpay) + if maxPayload != jwt.NoLimit && int64(c.pa.size) > int64(maxPayload) { + c.maxPayloadViolation(c.pa.size, maxPayload) + return ErrMaxPayload + } // Common ones processed after check for arg length c.pa.subject = args[0] @@ -3068,6 +3184,11 @@ func (c *client) processLeafMsgArgs(arg []byte) error { if c.pa.size < 0 { return fmt.Errorf("processLeafMsgArgs Bad or Missing Size: '%s'", args) } + maxPayload := atomic.LoadInt32(&c.mpay) + if maxPayload != jwt.NoLimit && int64(c.pa.size) > int64(maxPayload) { + c.maxPayloadViolation(c.pa.size, maxPayload) + return ErrMaxPayload + } // Common ones processed after check for arg length c.pa.subject = args[0] @@ -3089,6 +3210,12 @@ func (c *client) processInboundLeafMsg(msg []byte) { return } + // Check that leaf messages respect the subject permissions. + if c.perms != nil && !c.leafMsgAllowed() { + c.leafPubPermViolation(c.pa.subject) + return + } + // Match the subscriptions. We will use our own L1 map if // it's still valid, avoiding contention on the shared sublist. 
var r *SublistResult @@ -3150,12 +3277,102 @@ func (c *client) processInboundLeafMsg(msg []byte) { } } +// Checks whether the inbound leaf message is allowed by the +// connection's permissions. On the hub side this enforces what +// the remote leaf may publish. On the spoke side this enforces +// import restrictions such as deny_imports. +func (c *client) leafMsgAllowed() bool { + wireSubject := c.pa.subject + if len(c.pa.mapped) > 0 { + // Mappings rewrite c.pa.subject to the internal + // destination. For leaf ACLs, need to check + // the original wire subject from the remote side. + wireSubject = c.pa.mapped + } + // Strip any gateway routing prefix for the permission check. + subjectToCheck, isGW := getGWRoutedSubjectOrSelf(wireSubject) + + // Service-import replies (_R_), JS ack subjects ($JS.ACK.) + // are internal routing subjects forwarded via LS+ without + // permission checks. + if isServiceReply(subjectToCheck) || isJSAckSubject(subjectToCheck) { + return true + } + + c.mu.Lock() + defer c.mu.Unlock() + + if c.isSpokeLeafNode() { + // Gateway routed replies are forwarded without + // permission checks. + if isGW || c.leafReceiveAllowed(subjectToCheck) { + return true + } + } else if c.leafSendAllowed(subjectToCheck) { + return true + } + // Check tracked reply permissions (allow_responses). + // Use the pre-strip subject since deliverMsg tracks + // replies under the original form, which includes + // the GW routing prefix for routed requests. + return c.responseAllowed(bytesToString(wireSubject)) +} + +// Returns true if the leaf side ACLs allow importing this subject, +// based on the permissions received over INFO and any local deny_imports. +// Lock must be held. +func (c *client) leafReceiveAllowed(subject []byte) bool { + return c.canSubscribe(bytesToString(subject)) +} + +// Returns true if the hub side ACLs allow the remote leaf to send +// this subject. +// Lock must be held. +func (c *client) leafSendAllowed(bsubject []byte) bool { + // Use the original export ACL captured for this accepted leaf. + // The live perms also contain additional JetStream denies used by + // the normal forwarding path, and applying them here would reject + // legitimate inbound JS API requests. + subject := bytesToString(bsubject) + perms := c.opts.Export + if perms == nil || (perms.Allow == nil && perms.Deny == nil) { + return true + } + + allowed := true + if perms.Allow != nil && !strings.HasPrefix(subject, mqttPrefix) { + allowed = false + for _, allowSubj := range perms.Allow { + if matchLiteral(subject, allowSubj) { + allowed = true + break + } + } + } + + if allowed && len(perms.Deny) > 0 { + for _, denySubj := range perms.Deny { + if matchLiteral(subject, denySubj) { + allowed = false + break + } + } + } + return allowed +} + // Handles a subscription permission violation. // See leafPermViolation() for details. func (c *client) leafSubPermViolation(subj []byte) { c.leafPermViolation(false, subj) } +// Handles a publish permission violation. +// See leafPermViolation() for details. +func (c *client) leafPubPermViolation(subj []byte) { + c.leafPermViolation(true, subj) +} + // Common function to process publish or subscribe leafnode permission violation. // Sends the permission violation error to the remote, logs it and closes the connection. // If this is from a server soliciting, the reconnection will be delayed. 
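As an illustrative sketch (not the vendored implementation), the export-permission check added above follows the usual allow-then-deny evaluation: with a non-empty allow list the subject must match at least one allow entry, and any matching deny entry then overrides the result. The standalone Go sketch below shows that evaluation with a simplified NATS-style token matcher; matchSubject and sendAllowed are hypothetical names.

package main

import (
	"fmt"
	"strings"
)

// matchSubject reports whether a literal subject matches a pattern that may
// contain the '*' (single token) and '>' (remaining tokens) wildcards.
func matchSubject(subject, pattern string) bool {
	st := strings.Split(subject, ".")
	pt := strings.Split(pattern, ".")
	for i, p := range pt {
		if p == ">" {
			return i < len(st)
		}
		if i >= len(st) || (p != "*" && p != st[i]) {
			return false
		}
	}
	return len(st) == len(pt)
}

// sendAllowed mimics an allow-then-deny evaluation: the subject must match
// some allow entry (when any are configured) and must not match any deny entry.
func sendAllowed(subject string, allow, deny []string) bool {
	ok := len(allow) == 0
	for _, a := range allow {
		if matchSubject(subject, a) {
			ok = true
			break
		}
	}
	if !ok {
		return false
	}
	for _, d := range deny {
		if matchSubject(subject, d) {
			return false
		}
	}
	return true
}

func main() {
	allow := []string{"orders.>"}
	deny := []string{"orders.internal.*"}
	fmt.Println(sendAllowed("orders.eu.created", allow, deny))     // true
	fmt.Println(sendAllowed("orders.internal.audit", allow, deny)) // false: deny overrides
}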
@@ -3433,6 +3650,12 @@ func (s *Server) leafNodeFinishConnectProcess(c *client) { return } remote := c.leaf.remote + if remote == nil || c.acc == nil { + c.mu.Unlock() + c.sendErr("Authorization Violation") + c.closeConnection(ProtocolViolation) + return + } // Check if we will need to send the system connect event. remote.RLock() sendSysConnectEvent := remote.Hub @@ -3457,19 +3680,22 @@ func (s *Server) leafNodeFinishConnectProcess(c *client) { c.closeConnection(ProtocolViolation) return } - s.addLeafNodeConnection(c, _EMPTY_, _EMPTY_, false) + if !s.addLeafNodeConnection(c, _EMPTY_, _EMPTY_, false) { + // Was not added, could be because the remote configuration has been removed. + c.closeConnection(ClientClosed) + return + } s.initLeafNodeSmapAndSendSubs(c) if sendSysConnectEvent { s.sendLeafNodeConnect(acc) } + s.accountConnectEvent(c) - // The above functions are not atomically under the client - // lock doing those operations. It is possible - since we - // have started the read/write loops - that the connection - // is closed before or in between. This would leave the - // closed LN connection possible registered with the account - // and/or the server's leafs map. So check if connection - // is closed, and if so, manually cleanup. + // The above functions are not running under the client lock, so it is + // possible that between the time we have started the read/write loops + // and now, that the connection was closed. This would leave the closed + // LN connection possibly registered with the account and/or the server's + // leafs map. So check if connection is closed, and if so, manually cleanup. c.mu.Lock() closed := c.isClosed() if !closed { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/log.go b/vendor/github.com/nats-io/nats-server/v2/server/log.go index 2cd294c457..d251864a48 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/log.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/log.go @@ -227,6 +227,14 @@ func (s *Server) rateLimitFormatWarnf(format string, v ...any) { s.Warnf("%s", statement) } +func (s *Server) RateLimitErrorf(format string, v ...any) { + statement := fmt.Sprintf(format, v...) + if _, loaded := s.rateLimitLogging.LoadOrStore(statement, time.Now()); loaded { + return + } + s.Errorf("%s", statement) +} + func (s *Server) RateLimitWarnf(format string, v ...any) { statement := fmt.Sprintf(format, v...) if _, loaded := s.rateLimitLogging.LoadOrStore(statement, time.Now()); loaded { diff --git a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go index 63b5a9df85..4e3f113b3b 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/memstore.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/memstore.go @@ -184,7 +184,7 @@ func (ms *memStore) recoverMsgSchedulingState() { if len(sm.hdr) == 0 { continue } - if schedule, ok := getMessageSchedule(sm.hdr); ok && !schedule.IsZero() { + if schedule, apiErr := nextMessageSchedule(sm.hdr, sm.ts); apiErr == nil && !schedule.IsZero() { ms.scheduling.init(seq, sm.subj, schedule.UnixNano()) } } @@ -192,7 +192,7 @@ func (ms *memStore) recoverMsgSchedulingState() { // Stores a raw message with expected sequence number and timestamp. // Lock should be held. 
-func (ms *memStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, ttl int64) error { +func (ms *memStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, ttl int64, discardNewCheck bool) error { if ms.msgs == nil { return ErrStoreClosed } @@ -208,31 +208,31 @@ func (ms *memStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, tt } // Check if we are discarding new messages when we reach the limit. - if ms.cfg.Discard == DiscardNew { - if asl && ms.cfg.DiscardNewPer { + // If we are clustered, we do the enforcement above and should not disqualify + // the message here since it could cause replicas to drift. + if discardNewCheck && ms.cfg.Discard == DiscardNew { + // Allow rollup messages through since they will purge old + // messages for the subject after storing, restoring the limit. + if asl && ms.cfg.DiscardNewPer && len(sliceHeader(JSMsgRollup, hdr)) == 0 { return ErrMaxMsgsPerSubject } - // If we are discard new and limits policy and clustered, we do the enforcement - // above and should not disqualify the message here since it could cause replicas to drift. - if ms.cfg.Retention == LimitsPolicy || ms.cfg.Replicas == 1 { - if ms.cfg.MaxMsgs > 0 && ms.state.Msgs >= uint64(ms.cfg.MaxMsgs) { - // If we are tracking max messages per subject and are at the limit we will replace, so this is ok. - if !asl { - return ErrMaxMsgs - } + if ms.cfg.MaxMsgs > 0 && ms.state.Msgs >= uint64(ms.cfg.MaxMsgs) { + // If we are tracking max messages per subject and are at the limit we will replace, so this is ok. + if !asl { + return ErrMaxMsgs } - if ms.cfg.MaxBytes > 0 && ms.state.Bytes+memStoreMsgSize(subj, hdr, msg) >= uint64(ms.cfg.MaxBytes) { - if !asl { - return ErrMaxBytes - } - // If we are here we are at a subject maximum, need to determine if dropping last message gives us enough room. - if ss.firstNeedsUpdate || ss.lastNeedsUpdate { - ms.recalculateForSubj(subj, ss) - } - sm, ok := ms.msgs[ss.First] - if !ok || memStoreMsgSize(sm.subj, sm.hdr, sm.msg) < memStoreMsgSize(subj, hdr, msg) { - return ErrMaxBytes - } + } + if ms.cfg.MaxBytes > 0 && ms.state.Bytes+memStoreMsgSize(subj, hdr, msg) > uint64(ms.cfg.MaxBytes) { + if !asl { + return ErrMaxBytes + } + // If we are here we are at a subject maximum, need to determine if dropping last message gives us enough room. + if ss.firstNeedsUpdate || ss.lastNeedsUpdate { + ms.recalculateForSubj(subj, ss) + } + sm, ok := ms.msgs[ss.First] + if !ok || memStoreMsgSize(sm.subj, sm.hdr, sm.msg) < memStoreMsgSize(subj, hdr, msg) { + return ErrMaxBytes } } } @@ -309,20 +309,28 @@ func (ms *memStore) storeRawMsg(subj string, hdr, msg []byte, seq uint64, ts, tt // Message scheduling. if ms.scheduling != nil { - if schedule, ok := getMessageSchedule(hdr); ok && !schedule.IsZero() { + if schedule, apiErr := nextMessageSchedule(hdr, ts); apiErr == nil && !schedule.IsZero() { ms.scheduling.add(seq, subj, schedule.UnixNano()) - } else { + } else if getMessageScheduler(hdr) == _EMPTY_ { ms.scheduling.removeSubject(subj) } + + // Check for a repeating schedule and update such that it triggers again. + if scheduleNext := bytesToString(sliceHeader(JSScheduleNext, hdr)); scheduleNext != _EMPTY_ && scheduleNext != JSScheduleNextPurge { + scheduler := getMessageScheduler(hdr) + if next, err := time.Parse(time.RFC3339Nano, scheduleNext); err == nil && scheduler != _EMPTY_ { + ms.scheduling.update(scheduler, next.UnixNano()) + } + } } return nil } // StoreRawMsg stores a raw message with expected sequence number and timestamp. 
-func (ms *memStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts, ttl int64) error { +func (ms *memStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts, ttl int64, discardNewCheck bool) error { ms.mu.Lock() - err := ms.storeRawMsg(subj, hdr, msg, seq, ts, ttl) + err := ms.storeRawMsg(subj, hdr, msg, seq, ts, ttl, discardNewCheck) cb := ms.scb // Check if first message timestamp requires expiry // sooner than initial replica expiry timer set to MaxAge when initializing. @@ -344,7 +352,8 @@ func (ms *memStore) StoreRawMsg(subj string, hdr, msg []byte, seq uint64, ts, tt func (ms *memStore) StoreMsg(subj string, hdr, msg []byte, ttl int64) (uint64, int64, error) { ms.mu.Lock() seq, ts := ms.state.LastSeq+1, time.Now().UnixNano() - err := ms.storeRawMsg(subj, hdr, msg, seq, ts, ttl) + // This is called for a R1 with no expected sequence number, so perform DiscardNew checks on the store-level. + err := ms.storeRawMsg(subj, hdr, msg, seq, ts, ttl, true) cb := ms.scb ms.mu.Unlock() @@ -414,8 +423,9 @@ func (ms *memStore) SkipMsgs(seq uint64, num uint64) error { } // FlushAllPending flushes all data that was still pending to be written. -func (ms *memStore) FlushAllPending() { +func (ms *memStore) FlushAllPending() error { // Noop, in-memory store doesn't use async applying. + return nil } // RegisterStorageUpdates registers a callback for updates to storage changes. @@ -521,13 +531,13 @@ loop: } // FilteredState will return the SimpleState associated with the filtered subject and a proposed starting sequence. -func (ms *memStore) FilteredState(sseq uint64, subj string) SimpleState { +func (ms *memStore) FilteredState(sseq uint64, subj string) (SimpleState, error) { // This needs to be a write lock, as filteredStateLocked can // mutate the per-subject state. ms.mu.Lock() defer ms.mu.Unlock() - return ms.filteredStateLocked(sseq, subj, false) + return ms.filteredStateLocked(sseq, subj, false), nil } func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubject bool) SimpleState { @@ -1391,10 +1401,16 @@ func (ms *memStore) runMsgScheduling() { } ms.scheduling.running = true - scheduledMsgs := ms.scheduling.getScheduledMessages(func(seq uint64, smv *StoreMsg) *StoreMsg { - sm, _ := ms.loadMsgLocked(seq, smv, false) - return sm - }) + scheduledMsgs := ms.scheduling.getScheduledMessages( + func(seq uint64, smv *StoreMsg) *StoreMsg { + sm, _ := ms.loadMsgLocked(seq, smv, false) + return sm + }, + func(subj string, smv *StoreMsg) *StoreMsg { + sm, _ := ms.loadLastLocked(subj, smv) + return sm + }, + ) if len(scheduledMsgs) > 0 { ms.mu.Unlock() for _, msg := range scheduledMsgs { @@ -1429,7 +1445,7 @@ func (ms *memStore) PurgeEx(subject string, sequence, keep uint64) (purged uint6 } eq := compareFn(subject) - if ss := ms.FilteredState(1, subject); ss.Msgs > 0 { + if ss, _ := ms.FilteredState(1, subject); ss.Msgs > 0 { if keep > 0 { if keep >= ss.Msgs { return 0, nil @@ -1712,13 +1728,17 @@ func (ms *memStore) loadMsgLocked(seq uint64, smp *StoreMsg, needMSLock bool) (* // LoadLastMsg will return the last message we have that matches a given subject. // The subject can be a wildcard. func (ms *memStore) LoadLastMsg(subject string, smp *StoreMsg) (*StoreMsg, error) { - var sm *StoreMsg - var ok bool - // This needs to be a write lock, as filteredStateLocked can // mutate the per-subject state. ms.mu.Lock() defer ms.mu.Unlock() + return ms.loadLastLocked(subject, smp) +} + +// Lock should be held. 
+func (ms *memStore) loadLastLocked(subject string, smp *StoreMsg) (*StoreMsg, error) { + var sm *StoreMsg + var ok bool if subject == _EMPTY_ || subject == fwcs { sm, ok = ms.msgs[ms.state.LastSeq] @@ -1907,31 +1927,41 @@ func (ms *memStore) loadNextMsgLocked(filter string, wc bool, start uint64, smp return nil, ms.state.LastSeq, ErrStoreEOF } -// Will load the next non-deleted msg starting at the start sequence and walking backwards. -func (ms *memStore) LoadPrevMsg(start uint64, smp *StoreMsg) (sm *StoreMsg, err error) { +// Will load the previous message matching the filter subject, starting at the start sequence and walking backwards. +func (ms *memStore) LoadPrevMsg(filter string, wc bool, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) { ms.mu.RLock() defer ms.mu.RUnlock() if ms.msgs == nil { - return nil, ErrStoreClosed + return nil, 0, ErrStoreClosed } if ms.state.Msgs == 0 || start < ms.state.FirstSeq { - return nil, ErrStoreEOF + return nil, ms.state.FirstSeq, ErrStoreEOF } if start > ms.state.LastSeq { start = ms.state.LastSeq } + if filter == _EMPTY_ { + filter = fwcs + wc = true + } + isAll := filter == fwcs + eq := subjectsEqual + if wc { + eq = matchLiteral + } + for seq := start; seq >= ms.state.FirstSeq; seq-- { - if sm, ok := ms.msgs[seq]; ok { + if sm, ok := ms.msgs[seq]; ok && (isAll || eq(sm.subj, filter)) { if smp == nil { smp = new(StoreMsg) } sm.copy(smp) - return smp, nil + return smp, seq, nil } } - return nil, ErrStoreEOF + return nil, ms.state.FirstSeq, ErrStoreEOF } // LoadPrevMsgMulti will find the previous message matching any entry in the sublist. @@ -1965,7 +1995,7 @@ func (ms *memStore) LoadPrevMsgMulti(sl *gsl.SimpleSublist, start uint64, smp *S return smp, nseq, nil } } - return nil, ms.state.LastSeq, ErrStoreEOF + return nil, ms.state.FirstSeq, ErrStoreEOF } // RemoveMsg will remove the message from this store. @@ -2329,10 +2359,7 @@ func (ms *memStore) EncodedStreamState(failed uint64) ([]byte, error) { b := buf[0:n] if numDeleted > 0 { - buf, err := ms.dmap.Encode(nil) - if err != nil { - return nil, err - } + buf := ms.dmap.Encode(nil) b = append(b, buf...) } @@ -2340,7 +2367,11 @@ func (ms *memStore) EncodedStreamState(failed uint64) ([]byte, error) { } // SyncDeleted will make sure this stream has same deleted state as dbs. -func (ms *memStore) SyncDeleted(dbs DeleteBlocks) { +func (ms *memStore) SyncDeleted(dbs DeleteBlocks) error { + if len(dbs) == 0 { + return nil + } + ms.mu.Lock() defer ms.mu.Unlock() @@ -2349,7 +2380,7 @@ func (ms *memStore) SyncDeleted(dbs DeleteBlocks) { if len(dbs) == 1 { min, max, num := ms.dmap.State() if pmin, pmax, pnum := dbs[0].State(); pmin == min && pmax == max && pnum == num { - return + return nil } } lseq := ms.state.LastSeq @@ -2363,6 +2394,7 @@ func (ms *memStore) SyncDeleted(dbs DeleteBlocks) { return true }) } + return nil } func (o *consumerMemStore) Update(state *ConsumerState) error { @@ -2410,10 +2442,51 @@ func (o *consumerMemStore) Update(state *ConsumerState) error { return nil } +func (o *consumerMemStore) ForceUpdate(state *ConsumerState) error { + // Sanity checks. + if state.AckFloor.Consumer > state.Delivered.Consumer { + return fmt.Errorf("bad ack floor for consumer") + } + if state.AckFloor.Stream > state.Delivered.Stream { + return fmt.Errorf("bad ack floor for stream") + } + + // Copy to our state. 
+ var pending map[uint64]*Pending + var redelivered map[uint64]uint64 + if len(state.Pending) > 0 { + pending = make(map[uint64]*Pending, len(state.Pending)) + for seq, p := range state.Pending { + pending[seq] = &Pending{p.Sequence, p.Timestamp} + if seq <= state.AckFloor.Stream || seq > state.Delivered.Stream { + return fmt.Errorf("bad pending entry, sequence [%d] out of range", seq) + } + } + } + if len(state.Redelivered) > 0 { + redelivered = make(map[uint64]uint64, len(state.Redelivered)) + for seq, dc := range state.Redelivered { + redelivered[seq] = dc + } + } + + // Replace our state. + o.mu.Lock() + defer o.mu.Unlock() + + o.state.Delivered = state.Delivered + o.state.AckFloor = state.AckFloor + o.state.Pending = pending + o.state.Redelivered = redelivered + + return nil +} + // SetStarting sets our starting stream sequence. func (o *consumerMemStore) SetStarting(sseq uint64) error { o.mu.Lock() o.state.Delivered.Stream = sseq + o.state.AckFloor.Stream = sseq o.mu.Unlock() return nil } @@ -2432,6 +2505,14 @@ func (o *consumerMemStore) UpdateStarting(sseq uint64) { } } +// Reset all values in the store, and reset the starting sequence. +func (o *consumerMemStore) Reset(sseq uint64) error { + o.mu.Lock() + o.state = ConsumerState{} + o.mu.Unlock() + return o.SetStarting(sseq) +} + // HasState returns if this store has a recorded state. func (o *consumerMemStore) HasState() bool { o.mu.Lock() @@ -2524,8 +2605,8 @@ func (o *consumerMemStore) UpdateAcks(dseq, sseq uint64) error { return ErrStoreMsgNotFound } - // Check for AckAll here. - if o.cfg.AckPolicy == AckAll { + // Check for AckAll here (or AckFlowControl which functions like AckAll). + if o.cfg.AckPolicy == AckAll || o.cfg.AckPolicy == AckFlowControl { sgap := sseq - o.state.AckFloor.Stream o.state.AckFloor.Consumer = dseq o.state.AckFloor.Stream = sseq @@ -2675,14 +2756,3 @@ func (o *consumerMemStore) copyRedelivered() map[uint64]uint64 { // Type returns the type of the underlying store. func (o *consumerMemStore) Type() StorageType { return MemoryStorage } - -// Templates -type templateMemStore struct{} - -func newTemplateMemStore() *templateMemStore { - return &templateMemStore{} -} - -// No-ops for memstore. -func (ts *templateMemStore) Store(t *streamTemplate) error { return nil } -func (ts *templateMemStore) Delete(t *streamTemplate) error { return nil } diff --git a/vendor/github.com/nats-io/nats-server/v2/server/monitor.go b/vendor/github.com/nats-io/nats-server/v2/server/monitor.go index c1ad73e91e..6d4460b238 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/monitor.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/monitor.go @@ -189,6 +189,17 @@ func newSubsList(client *client) []string { return subs } +func redactBearerJWT(userJWT string) string { + if userJWT == _EMPTY_ { + return _EMPTY_ + } + uc, err := jwt.DecodeUserClaims(userJWT) + if err == nil && uc != nil && uc.BearerToken { + return _EMPTY_ + } + return userJWT +} + // Connz returns a Connz struct containing information about connections. 
func (s *Server) Connz(opts *ConnzOptions) (*Connz, error) { var ( @@ -441,6 +452,7 @@ func (s *Server) Connz(opts *ConnzOptions) (*Connz, error) { ci.NameTag = client.acc.getNameTag() } client.mu.Unlock() + ci.JWT = redactBearerJWT(ci.JWT) pconns[i] = ci i++ } @@ -487,6 +499,7 @@ func (s *Server) Connz(opts *ConnzOptions) (*Connz, error) { cc.NameTag = acc.getNameTag() } } + cc.JWT = redactBearerJWT(cc.JWT) } pconns[i] = &cc.ConnInfo i++ @@ -1271,6 +1284,7 @@ type Varz struct { ConfigDigest string `json:"config_digest"` // ConfigDigest is a calculated hash of the current configuration Tags jwt.TagList `json:"tags,omitempty"` // Tags are the tags assigned to the server in configuration Metadata map[string]string `json:"metadata,omitempty"` // Metadata is the metadata assigned to the server in configuration + FeatureFlags map[string]bool `json:"feature_flags,omitempty"` // FeatureFlags is the feature flags enabled/disabled in configuration TrustedOperatorsJwt []string `json:"trusted_operators_jwt,omitempty"` // TrustedOperatorsJwt is the JWTs for all trusted operators TrustedOperatorsClaim []*jwt.OperatorClaims `json:"trusted_operators_claim,omitempty"` // TrustedOperatorsClaim is the decoded claims for each trusted operator SystemAccount string `json:"system_account,omitempty"` // SystemAccount is the name of the System account @@ -1570,8 +1584,13 @@ func (s *Server) updateJszVarz(js *jetStream, v *JetStreamVarz, doConfig bool) { v.Meta.Replicas = ci.Replicas } if ipq := s.jsAPIRoutedReqs; ipq != nil { - v.Meta.Pending = ipq.len() + v.Meta.PendingRequests = ipq.len() } + if ipq := s.jsAPIRoutedInfoReqs; ipq != nil { + v.Meta.PendingInfos = ipq.len() + } + v.Meta.Pending = v.Meta.PendingRequests + v.Meta.PendingInfos + v.Meta.Snapshot = s.metaClusterSnapshotStats(js, mg) } } } @@ -1788,6 +1807,7 @@ func (s *Server) updateVarzConfigReloadableFields(v *Varz) { v.ConfigDigest = opts.configDigest v.Tags = opts.Tags v.Metadata = opts.Metadata + v.FeatureFlags = opts.getMergedFeatureFlags() // Update route URLs if applicable if s.varzUpdateRouteURLs { v.Cluster.URLs = urlsToStrings(opts.Routes) @@ -3008,15 +3028,43 @@ type MetaSnapshotStats struct { LastDuration time.Duration `json:"last_duration,omitempty"` // LastDuration is how long the last meta snapshot took } +// metaClusterSnapshotStats returns snapshot statistics for the meta group. +func (s *Server) metaClusterSnapshotStats(js *jetStream, mg RaftNode) *MetaSnapshotStats { + entries, bytes := mg.Size() + snap := &MetaSnapshotStats{ + PendingEntries: entries, + PendingSize: bytes, + } + + js.mu.RLock() + cluster := js.cluster + js.mu.RUnlock() + + if cluster != nil { + timeNanos := atomic.LoadInt64(&cluster.lastMetaSnapTime) + durationNanos := atomic.LoadInt64(&cluster.lastMetaSnapDuration) + if timeNanos > 0 { + snap.LastTime = time.Unix(0, timeNanos).UTC() + } + if durationNanos > 0 { + snap.LastDuration = time.Duration(durationNanos) + } + } + + return snap +} + // MetaClusterInfo shows information about the meta group. 
type MetaClusterInfo struct { - Name string `json:"name,omitempty"` // Name is the name of the cluster - Leader string `json:"leader,omitempty"` // Leader is the server name of the cluster leader - Peer string `json:"peer,omitempty"` // Peer is unique ID of the leader - Replicas []*PeerInfo `json:"replicas,omitempty"` // Replicas is a list of known peers - Size int `json:"cluster_size"` // Size is the known size of the cluster - Pending int `json:"pending"` // Pending is how many RAFT messages are not yet processed - Snapshot *MetaSnapshotStats `json:"snapshot"` // Snapshot contains meta snapshot statistics + Name string `json:"name,omitempty"` // Name is the name of the cluster + Leader string `json:"leader,omitempty"` // Leader is the server name of the cluster leader + Peer string `json:"peer,omitempty"` // Peer is unique ID of the leader + Replicas []*PeerInfo `json:"replicas,omitempty"` // Replicas is a list of known peers + Size int `json:"cluster_size"` // Size is the known size of the cluster + Pending int `json:"pending"` // Pending is how many RAFT messages are not yet processed + PendingRequests int `json:"pending_requests"` // PendingRequests is how many CRUD operations are queued for processing + PendingInfos int `json:"pending_infos"` // PendingInfos is how many info operations are queued for processing + Snapshot *MetaSnapshotStats `json:"snapshot"` // Snapshot contains meta snapshot statistics } // JSInfo has detailed information on JetStream. @@ -3233,32 +3281,18 @@ func (s *Server) Jsz(opts *JSzOptions) (*JSInfo, error) { if mg := js.getMetaGroup(); mg != nil { if ci := s.raftNodeToClusterInfo(mg); ci != nil { - entries, bytes := mg.Size() jsi.Meta = &MetaClusterInfo{Name: ci.Name, Leader: ci.Leader, Peer: getHash(ci.Leader), Size: mg.ClusterSize()} if isLeader { jsi.Meta.Replicas = ci.Replicas } if ipq := s.jsAPIRoutedReqs; ipq != nil { - jsi.Meta.Pending = ipq.len() + jsi.Meta.PendingRequests = ipq.len() } - // Add meta snapshot stats - jsi.Meta.Snapshot = &MetaSnapshotStats{ - PendingEntries: entries, - PendingSize: bytes, - } - js.mu.RLock() - cluster := js.cluster - js.mu.RUnlock() - if cluster != nil { - timeNanos := atomic.LoadInt64(&cluster.lastMetaSnapTime) - durationNanos := atomic.LoadInt64(&cluster.lastMetaSnapDuration) - if timeNanos > 0 { - jsi.Meta.Snapshot.LastTime = time.Unix(0, timeNanos).UTC() - } - if durationNanos > 0 { - jsi.Meta.Snapshot.LastDuration = time.Duration(durationNanos) - } + if ipq := s.jsAPIRoutedInfoReqs; ipq != nil { + jsi.Meta.PendingInfos = ipq.len() } + jsi.Meta.Pending = jsi.Meta.PendingRequests + jsi.Meta.PendingInfos + jsi.Meta.Snapshot = s.metaClusterSnapshotStats(js, mg) } } @@ -3695,6 +3729,20 @@ func (s *Server) healthz(opts *HealthzOptions) *HealthStatus { }) continue } + if streamWerr := s.getWriteErr(); streamWerr != nil { + if !details { + health.Status = na + health.Error = fmt.Sprintf("JetStream stream '%s > %s' write error: %v", acc, stream, streamWerr) + return health + } + health.Errors = append(health.Errors, HealthzError{ + Type: HealthzErrorStream, + Account: acc.Name, + Stream: stream, + Error: fmt.Sprintf("JetStream stream '%s > %s' write error: %v", acc, stream, streamWerr), + }) + continue + } if streamFound { // if consumer option is passed, verify that the consumer exists on stream if opts.Consumer != _EMPTY_ { @@ -3771,49 +3819,37 @@ func (s *Server) healthz(opts *HealthzOptions) *HealthStatus { meta = cc.meta js.mu.RUnlock() - // If no meta leader. 
- if meta == nil || meta.GroupLeader() == _EMPTY_ { - if !details { - health.Status = na - health.Error = "JetStream has not established contact with a meta leader" + // Check meta layer health. + var metaNoLeader, metaClosed, metaUnhealthy bool + var metaWerr error + if meta != nil { + metaNoLeader = meta.GroupLeader() == _EMPTY_ + metaClosed = meta.State() == Closed + metaUnhealthy = !meta.Healthy() + metaWerr = meta.GetWriteErr() + } + metaRecovering := js.isMetaRecovering() + if meta == nil || metaNoLeader || metaClosed || metaUnhealthy || metaWerr != nil || metaRecovering { + var desc string + if metaWerr != nil { + desc = fmt.Sprintf("JetStream meta layer write error: %v", metaWerr) + } else if metaClosed { + desc = "JetStream meta layer is not running" + } else if meta != nil && metaRecovering { + desc = "JetStream is still recovering meta layer" + } else if meta == nil || metaNoLeader { + desc = "JetStream has not established contact with a meta leader" } else { - health.Errors = []HealthzError{ - { - Type: HealthzErrorJetStream, - Error: "JetStream has not established contact with a meta leader", - }, - } + desc = "JetStream is not current with the meta leader" } - return health - } - - // If we are not current with the meta leader. - if !meta.Healthy() { if !details { health.Status = na - health.Error = "JetStream is not current with the meta leader" - } else { - health.Errors = []HealthzError{ - { - Type: HealthzErrorJetStream, - Error: "JetStream is not current with the meta leader", - }, - } - } - return health - } - - // Are we still recovering meta layer? - if js.isMetaRecovering() { - if !details { - health.Status = na - health.Error = "JetStream is still recovering meta layer" - + health.Error = desc } else { health.Errors = []HealthzError{ { Type: HealthzErrorJetStream, - Error: "JetStream is still recovering meta layer", + Error: desc, }, } } @@ -4090,6 +4126,8 @@ type RaftzGroup struct { QuorumNeeded int `json:"quorum_needed"` Observer bool `json:"observer,omitempty"` Paused bool `json:"paused,omitempty"` + Overrun bool `json:"overrun,omitempty"` + OverrunCount uint64 `json:"overrun_count,omitempty"` Committed uint64 `json:"committed"` Applied uint64 `json:"applied"` CatchingUp bool `json:"catching_up,omitempty"` @@ -4198,6 +4236,8 @@ func (s *Server) Raftz(opts *RaftzOptions) *RaftzStatus { QuorumNeeded: n.qn, Observer: n.observer, Paused: n.paused, + Overrun: n.quorumPaused || n.isLeaderOverrun(), + OverrunCount: n.overrunCount, Committed: n.commit, Applied: n.applied, CatchingUp: n.catchup != nil, diff --git a/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go b/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go index 2ca0230788..7b0e5c2fdb 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/mqtt.go @@ -15,7 +15,6 @@ package server import ( "bytes" - "cmp" "crypto/tls" "encoding/binary" "encoding/json" @@ -33,6 +32,8 @@ import ( "unicode/utf8" "github.com/nats-io/jwt/v2" + "github.com/nats-io/nats-server/v2/server/gsl" + "github.com/nats-io/nats-server/v2/server/stree" "github.com/nats-io/nuid" ) @@ -258,14 +259,13 @@ type mqttSessionManager struct { type mqttAccountSessionManager struct { mu sync.RWMutex - sessions map[string]*mqttSession // key is MQTT client ID - sessByHash map[string]*mqttSession // key is MQTT client ID hash - sessLocked map[string]struct{} // key is MQTT client ID and indicate that a session can not be taken by a new client at this time - flappers 
map[string]time.Time // When connection connects with client ID already in use - flapTimer *time.Timer // Timer to perform some cleanup of the flappers map - sl *Sublist // sublist allowing to find retained messages for given subscription - retmsgs map[string]*mqttRetainedMsgRef // retained messages - rmsCache *sync.Map // map[subject]mqttRetainedMsg + sessions map[string]*mqttSession // key is MQTT client ID + sessByHash map[string]*mqttSession // key is MQTT client ID hash + sessLocked map[string]struct{} // key is MQTT client ID and indicate that a session can not be taken by a new client at this time + flappers map[string]time.Time // When connection connects with client ID already in use + flapTimer *time.Timer // Timer to perform some cleanup of the flappers map + retmsgs *stree.SubjectTree[mqttRetainedMsgRef] // retained message metadata + rmsCache *sync.Map // map[subject]mqttRetainedMsg jsa mqttJSA domainTk string // Domain (with trailing "."), or possibly empty. This is added to session subject. } @@ -366,7 +366,6 @@ type mqttRetainedMsg struct { type mqttRetainedMsgRef struct { sseq uint64 - sub *subscription } // mqttSub contains fields associated with a MQTT subscription, and is added to @@ -2022,10 +2021,14 @@ func (as *mqttAccountSessionManager) processRetainedMsg(_ *subscription, c *clie if err != nil { return } + if strings.IndexByte(rm.Subject, 0x7f) >= 0 { + c.Warnf("Skipping retained message for subject %q: unsupported character 0x7f", rm.Subject) + return + } // The as.jsa.id is immutable, so no need to have a rlock here. local := rm.Origin == as.jsa.id // Get the stream sequence for this message. - seq, _, _ := ackReplyInfo(reply) + seq, _, _, _, _ := ackReplyInfo(reply) if len(m) == 0 { // An empty payload means that we need to remove the retained message. rmSeq := as.removeRetainedMsg(rm.Subject, 0) @@ -2042,7 +2045,7 @@ func (as *mqttAccountSessionManager) processRetainedMsg(_ *subscription, c *clie // Add this retained message. The `rm.Msg` references some buffer that we // don't own. But addRetainedMsg() will take care of making a copy of // `rm.Msg` it `rm` ends-up being stored in the cache. - as.addRetainedMsg(rm.Subject, &mqttRetainedMsgRef{sseq: seq}, rm) + as.addRetainedMsg(rm.Subject, seq, rm) } } @@ -2310,17 +2313,16 @@ func (as *mqttAccountSessionManager) sendJSAPIrequests(s *Server, c *client, acc // If a message for this topic already existed, the existing record is updated // with the provided information. // Lock not held on entry. -func (as *mqttAccountSessionManager) addRetainedMsg(key string, rf *mqttRetainedMsgRef, rm *mqttRetainedMsg) { +func (as *mqttAccountSessionManager) addRetainedMsg(key string, sseq uint64, rm *mqttRetainedMsg) { as.mu.Lock() defer as.mu.Unlock() if as.retmsgs == nil { - as.retmsgs = make(map[string]*mqttRetainedMsgRef) - as.sl = NewSublistWithCache() + as.retmsgs = stree.NewSubjectTree[mqttRetainedMsgRef]() } else { // Check if we already had one retained message. If so, update the existing one. - if erf, exists := as.retmsgs[key]; exists { + if erf, exists := as.retmsgs.Find(stringToBytes(key)); exists { // Update the stream sequence with the new value. - erf.sseq = rf.sseq + erf.sseq = sseq // Update the in-memory retained message cache but only for messages // that are already in the cache, i.e. have been (recently) used. 
// If that is the case, we ask setCachedRetainedMsg() to make a copy @@ -2329,9 +2331,7 @@ func (as *mqttAccountSessionManager) addRetainedMsg(key string, rf *mqttRetained return } } - rf.sub = &subscription{subject: []byte(key)} - as.retmsgs[key] = rf - as.sl.Insert(rf.sub) + as.retmsgs.Insert([]byte(key), mqttRetainedMsgRef{sseq: sseq}) } // Remove the retained message stored with the `subject` key from the map/cache. @@ -2348,15 +2348,13 @@ func (as *mqttAccountSessionManager) addRetainedMsg(key string, rf *mqttRetained func (as *mqttAccountSessionManager) removeRetainedMsg(subject string, seq uint64) uint64 { as.mu.Lock() defer as.mu.Unlock() - rm, ok := as.retmsgs[subject] + rm, ok := as.retmsgs.Find(stringToBytes(subject)) if !ok || (seq > 0 && rm.sseq != seq) { return 0 } - seq = rm.sseq + rm, _ = as.retmsgs.Delete(stringToBytes(subject)) as.rmsCache.Delete(subject) - delete(as.retmsgs, subject) - as.sl.Remove(rm.sub) - return seq + return rm.sseq } // First check if this session's client ID is already in the "locked" map, @@ -2684,27 +2682,22 @@ func (as *mqttAccountSessionManager) processSubs(sess *mqttSession, c *client, // Account session manager lock held on entry. // Session lock held on entry. func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(rms map[string]*mqttRetainedMsg, sess *mqttSession, c *client, sub *subscription, trace bool) { - if len(as.retmsgs) == 0 || len(rms) == 0 { - return - } - result := as.sl.ReverseMatch(string(sub.subject)) - if len(result.psubs) == 0 { + if as.retmsgs.Size() == 0 || len(rms) == 0 { return } toTrace := []mqttPublish{} - for _, psub := range result.psubs { - - rm := rms[string(psub.subject)] + as.retmsgs.Match(sub.subject, func(subj []byte, _ *mqttRetainedMsgRef) { + rm := rms[string(subj)] if rm == nil { // This should not happen since we pre-load messages into rms before // calling serialize. - continue + return } var pi uint16 qos := min(mqttGetQoS(rm.Flags), sub.mqtt.qos) if c.mqtt.rejectQoS2Pub && qos == 2 { c.Warnf("Rejecting retained message with QoS2 for subscription %q, as configured", sub.subject) - continue + return } if qos > 0 { pi = sess.trackPublishRetained() @@ -2731,7 +2724,7 @@ func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(rms map[string] sz: len(rm.Msg), }) } - } + }) for _, pp := range toTrace { c.traceOutOp("PUBLISH", []byte(mqttPubTrace(&pp))) } @@ -2743,27 +2736,21 @@ func (as *mqttAccountSessionManager) serializeRetainedMsgsForSub(rms map[string] // Account session manager NOT lock held on entry. func (as *mqttAccountSessionManager) addRetainedSubjectsForSubject(list map[string]uint64, topSubject string) { as.mu.RLock() - if len(as.retmsgs) == 0 { - as.mu.RUnlock() + defer as.mu.RUnlock() + + if as.retmsgs.Size() == 0 { return } - result := as.sl.ReverseMatch(topSubject) - as.mu.RUnlock() - for _, sub := range result.psubs { - if _, ok := list[string(sub.subject)]; ok { - continue - } - var seq uint64 - as.mu.RLock() - if rm, ok := as.retmsgs[string(sub.subject)]; ok { - seq = rm.sseq + as.retmsgs.Match(stringToBytes(topSubject), func(subj []byte, ret *mqttRetainedMsgRef) { + subject := string(subj) + if _, ok := list[subject]; ok { + return } - as.mu.RUnlock() - if seq > 0 { - list[string(sub.subject)] = seq + if seq := ret.sseq; seq > 0 { + list[subject] = seq } - } + }) } type warner interface { @@ -3457,7 +3444,7 @@ func (sess *mqttSession) trackPublish(jsDur, jsAckSubject string) (uint16, bool) } // Get the stream sequence and duplicate flag from the ack reply subject. 
- sseq, _, dcount := ackReplyInfo(jsAckSubject) + sseq, _, dcount, _, _ := ackReplyInfo(jsAckSubject) if dcount > 1 { dup = true } @@ -3561,7 +3548,7 @@ func (sess *mqttSession) trackAsPubRel(pi uint16, jsAckSubject string) { return } - sseq, _, _ := ackReplyInfo(jsAckSubject) + sseq, _, _, _, _ := ackReplyInfo(jsAckSubject) if sess.cpending == nil { sess.cpending = make(map[string]map[uint64]uint16) @@ -4568,13 +4555,8 @@ func (s *Server) mqttCheckPubRetainedPerms() { } sm.mu.RUnlock() - type retainedMsg struct { - subj string - rmsg *mqttRetainedMsgRef - } - // For each session we will obtain a list of retained messages. - var _rms [128]retainedMsg + var _rms [128]uint64 rms := _rms[:0] for _, asm := range asms { // Get all of the retained messages. Then we will sort them so @@ -4582,19 +4564,20 @@ func (s *Server) mqttCheckPubRetainedPerms() { // store to not have to load out-of-order blocks so often. asm.mu.RLock() rms = rms[:0] // reuse slice - for subj, rf := range asm.retmsgs { - rms = append(rms, retainedMsg{ - subj: subj, - rmsg: rf, - }) - } + // Copy the sequence out of the tree. The tree entry itself can be + // updated concurrently by addRetainedMsg() after we release the lock, + // so keeping a pointer here would race with the later sort. + asm.retmsgs.IterOrdered(func(_ []byte, rm *mqttRetainedMsgRef) bool { + rms = append(rms, rm.sseq) + return true + }) jsaID := asm.jsa.id asm.mu.RUnlock() - slices.SortFunc(rms, func(i, j retainedMsg) int { return cmp.Compare(i.rmsg.sseq, j.rmsg.sseq) }) + slices.Sort(rms) - perms := map[string]*perm{} + perms := map[string]*mqttPerm{} for _, rf := range rms { - jsm, err := asm.jsa.loadMsg(mqttRetainedMsgsStreamName, rf.rmsg.sseq) + jsm, err := asm.jsa.loadMsg(mqttRetainedMsgsStreamName, rf) if err != nil || jsm == nil { continue } @@ -4617,7 +4600,7 @@ func (s *Server) mqttCheckPubRetainedPerms() { } // If there is permission and no longer allowed to publish in // the subject, remove the publish retained message from the map. - if p != nil && !pubAllowed(p, rf.subj) { + if p != nil && !pubAllowed(p, rm.Subject) { u = nil } } @@ -4636,7 +4619,12 @@ func (s *Server) mqttCheckPubRetainedPerms() { } // Helper to generate only pub permissions from a Permissions object -func generatePubPerms(perms *Permissions) *perm { +type mqttPerm struct { + allow *gsl.SimpleSublist + deny *gsl.SimpleSublist +} + +func generatePubPerms(perms *Permissions) *mqttPerm { // If given permissions is `nil`, then it means that permissions block // has been removed (so the user is now allowed to publish on everything) // or was never there in the first place. 
Returning `nil` will let the @@ -4644,39 +4632,38 @@ func generatePubPerms(perms *Permissions) *perm { if perms == nil { return nil } - var p *perm + var p *mqttPerm if perms.Publish.Allow != nil { - p = &perm{} - p.allow = NewSublistWithCache() + p = &mqttPerm{} + p.allow = gsl.NewSimpleSublist() for _, pubSubject := range perms.Publish.Allow { - sub := &subscription{subject: []byte(pubSubject)} - p.allow.Insert(sub) + _ = p.allow.Insert(pubSubject, struct{}{}) } } if len(perms.Publish.Deny) > 0 { if p == nil { - p = &perm{} + p = &mqttPerm{} } - p.deny = NewSublistWithCache() + p.deny = gsl.NewSimpleSublist() for _, pubSubject := range perms.Publish.Deny { - sub := &subscription{subject: []byte(pubSubject)} - p.deny.Insert(sub) + _ = p.deny.Insert(pubSubject, struct{}{}) } } return p } // Helper that checks if given `perms` allow to publish on the given `subject` -func pubAllowed(perms *perm, subject string) bool { +func pubAllowed(perms *mqttPerm, subject string) bool { + if perms == nil { + return true + } allowed := true if perms.allow != nil { - np, _ := perms.allow.NumInterest(subject) - allowed = np != 0 + allowed = perms.allow.HasInterest(subject) } // If we have a deny list and are currently allowed, check that as well. if allowed && perms.deny != nil { - np, _ := perms.deny.NumInterest(subject) - allowed = np == 0 + allowed = !perms.deny.HasInterest(subject) } return allowed } @@ -5729,6 +5716,12 @@ func mqttToNATSSubjectConversion(mt []byte, wcOk bool) ([]byte, error) { case ' ': // As of now, we cannot support ' ' in the MQTT topic/filter. return nil, errMQTTUnsupportedCharacters + case 0x7f: + // SubjectTree uses DEL as an internal pivot marker, so retained + // subjects containing it cannot be indexed safely, including + // legacy retained messages recovered from the retained-message + // stream. + return nil, errMQTTUnsupportedCharacters case btsep: if !cp { makeCopy(i) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/msgtrace.go b/vendor/github.com/nats-io/nats-server/v2/server/msgtrace.go index 1cbb6dcbce..c51be8ee88 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/msgtrace.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/msgtrace.go @@ -1,4 +1,4 @@ -// Copyright 2024-2025 The NATS Authors +// Copyright 2024-2026 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -26,6 +26,7 @@ import ( const ( MsgTraceDest = "Nats-Trace-Dest" + MsgTraceDestDisabled = "trace disabled" // This must be an invalid NATS subject MsgTraceHop = "Nats-Trace-Hop" MsgTraceOriginAccount = "Nats-Trace-Origin-Account" MsgTraceOnly = "Nats-Trace-Only" @@ -33,10 +34,19 @@ const ( // External trace header. Note that this header is normally in lower // case (https://www.w3.org/TR/trace-context/#header-name). Vendors // MUST expect the header in any case (upper, lower, mixed), and - // SHOULD send the header name in lowercase. + // SHOULD send the header name in lowercase. We used to change it + // to lower case, but no longer do that in 2.14. traceParentHdr = "traceparent" ) +var ( + traceDestHdrAsBytes = stringToBytes(MsgTraceDest) + traceDestDisabledAsBytes = stringToBytes(MsgTraceDestDisabled) + traceParentHdrAsBytes = stringToBytes(traceParentHdr) + crLFAsBytes = stringToBytes(CR_LF) + dashAsBytes = stringToBytes("-") +) + type MsgTraceType string // Type of message trace events in the MsgTraceEvents list. 
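Note on the msgtrace.go changes that follow: the sampling decision keys off the W3C traceparent header, and a message is only considered sampled when the low bit of the trace-flags field (the last dash-separated token) is set. A standalone sketch of that flag check, assuming a well-formed four-part value; the helper name is illustrative and not part of the server API.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// traceParentSampled reports whether a W3C traceparent value, e.g.
// "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01",
// has the sampled bit set in its trace-flags field.
func traceParentSampled(val string) bool {
	parts := strings.Split(val, "-")
	if len(parts) != 4 || len(parts[3]) != 2 {
		return false
	}
	flags, err := strconv.ParseUint(parts[3], 16, 8)
	if err != nil {
		return false
	}
	return flags&0x1 == 0x1
}

func main() {
	fmt.Println(traceParentSampled("00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01")) // true
	fmt.Println(traceParentSampled("00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-00")) // false
}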
@@ -352,7 +362,6 @@ func (c *client) initMsgTrace() *msgTrace { } return vv[0] } - ct := getCompressionType(getHdrVal(acceptEncodingHeader)) var ( dest string traceOnly bool @@ -454,9 +463,9 @@ func (c *client) initMsgTrace() *msgTrace { } // Check sampling, but only from origin server. if c.kind == CLIENT && !sample(sampling) { - // Need to desactivate the traceParentHdr so that if the message - // is routed, it does possibly trigger a trace there. - disableTraceHeaders(c, hdr) + // Need to disable tracing so that if the message is routed, it won't + // trigger a trace there. + c.msgBuf = c.setHeader(MsgTraceDest, MsgTraceDestDisabled, c.msgBuf) return nil } } @@ -465,7 +474,7 @@ func (c *client) initMsgTrace() *msgTrace { acc: acc, oan: oan, dest: dest, - ct: ct, + ct: getCompressionType(getHdrVal(acceptEncodingHeader)), hop: hop, event: &MsgTraceEvent{ Request: MsgTraceRequest{ @@ -503,9 +512,7 @@ func sample(sampling int) bool { // the headers have been lifted due to the presence of the external trace header // only. // Note that because of the traceParentHdr, the search is done in a case -// insensitive way, but if the header is found, it is rewritten in lower case -// as suggested by the spec, but also to make it easier to disable the header -// when needed. +// insensitive way. We used to rewrite it in lower case but no longer do since v2.14. func genHeaderMapIfTraceHeadersPresent(hdr []byte) (map[string][]string, bool) { var ( @@ -520,11 +527,6 @@ func genHeaderMapIfTraceHeadersPresent(hdr []byte) (map[string][]string, bool) { return nil, false } - traceDestHdrAsBytes := stringToBytes(MsgTraceDest) - traceParentHdrAsBytes := stringToBytes(traceParentHdr) - crLFAsBytes := stringToBytes(CR_LF) - dashAsBytes := stringToBytes("-") - keys := _keys[:0] vals := _vals[:0] @@ -537,46 +539,50 @@ func genHeaderMapIfTraceHeadersPresent(hdr []byte) (map[string][]string, bool) { keyStart := i key := hdr[keyStart : keyStart+del] i += del + 1 + for i < len(hdr) && (hdr[i] == ' ' || hdr[i] == '\t') { + i++ + } valStart := i nl := bytes.Index(hdr[valStart:], crLFAsBytes) if nl < 0 { break } - if len(key) > 0 { - val := bytes.Trim(hdr[valStart:valStart+nl], " \t") + valEnd := valStart + nl + for valEnd > valStart && (hdr[valEnd-1] == ' ' || hdr[valEnd-1] == '\t') { + valEnd-- + } + val := hdr[valStart:valEnd] + if len(key) > 0 && len(val) > 0 { vals = append(vals, val) + // We search for our special keys only if not already found. + // Check for the external trace header. - if bytes.EqualFold(key, traceParentHdrAsBytes) { - // Rewrite the header using lower case if needed. - if !bytes.Equal(key, traceParentHdrAsBytes) { - copy(hdr[keyStart:], traceParentHdrAsBytes) - } + // Search needs to be case insensitive. + if !traceParentHdrFound && bytes.EqualFold(key, traceParentHdrAsBytes) { // We will now check if the value has sampling or not. // TODO(ik): Not sure if this header can have multiple values // or not, and if so, what would be the rule to check for // sampling. What is done here is to check them all until we // found one with sampling. 
- if !traceParentHdrFound { - tk := bytes.Split(val, dashAsBytes) - if len(tk) == 4 && len([]byte(tk[3])) == 2 { - if hexVal, err := strconv.ParseInt(bytesToString(tk[3]), 16, 8); err == nil { - if hexVal&0x1 == 0x1 { - traceParentHdrFound = true - } + tk := bytes.Split(val, dashAsBytes) + if len(tk) == 4 && len([]byte(tk[3])) == 2 { + if hexVal, err := strconv.ParseInt(bytesToString(tk[3]), 16, 8); err == nil { + if hexVal&0x1 == 0x1 { + traceParentHdrFound = true } } } - // Add to the keys with the external trace header in lower case. - keys = append(keys, traceParentHdrAsBytes) - } else { - // Is the key the Nats-Trace-Dest header? - if bytes.EqualFold(key, traceDestHdrAsBytes) { - traceDestHdrFound = true + } else if !traceDestHdrFound && bytes.Equal(key, traceDestHdrAsBytes) { + // This is the Nats-Trace-Dest header, check the value to see + // if it indicates that the trace was disabled. + if bytes.Equal(val, traceDestDisabledAsBytes) { + return nil, false } - // Add to the keys and preserve the key's case - keys = append(keys, key) + traceDestHdrFound = true } + // Add to the keys and preserve the key's case + keys = append(keys, key) } i += nl + 2 } @@ -655,59 +661,6 @@ func (t *msgTrace) setHopHeader(c *client, msg []byte) []byte { return c.setHeader(MsgTraceHop, t.nhop, msg) } -// Will look for the MsgTraceSendTo and traceParentHdr headers and change the first -// character to an 'X' so that if this message is sent to a remote, the remote -// will not initialize tracing since it won't find the actual trace headers. -// The function returns the position of the headers so it can efficiently be -// re-enabled by calling enableTraceHeaders. -// Note that if `msg` can be either the header alone or the full message -// (header and payload). This function will use c.pa.hdr to limit the -// search to the header section alone. -func disableTraceHeaders(c *client, msg []byte) []int { - // Code largely copied from getHeader(), except that we don't need the value - if c.pa.hdr <= 0 { - return []int{-1, -1} - } - hdr := msg[:c.pa.hdr] - headers := [2]string{MsgTraceDest, traceParentHdr} - positions := [2]int{-1, -1} - for i := 0; i < 2; i++ { - key := stringToBytes(headers[i]) - pos := bytes.Index(hdr, key) - if pos < 0 { - continue - } - // Make sure this key does not have additional prefix. - if pos < 2 || hdr[pos-1] != '\n' || hdr[pos-2] != '\r' { - continue - } - index := pos + len(key) - if index >= len(hdr) { - continue - } - if hdr[index] != ':' { - continue - } - // Disable the trace by altering the first character of the header - hdr[pos] = 'X' - positions[i] = pos - } - // Return the positions of those characters so we can re-enable the headers. - return positions[:2] -} - -// Changes back the character at the given position `pos` in the `msg` -// byte slice to the first character of the MsgTraceSendTo header. 
-func enableTraceHeaders(msg []byte, positions []int) { - firstChar := [2]byte{MsgTraceDest[0], traceParentHdr[0]} - for i, pos := range positions { - if pos == -1 { - continue - } - msg[pos] = firstChar[i] - } -} - func (t *msgTrace) setIngressError(err string) { if i := t.event.Ingress(); i != nil { i.Error = err diff --git a/vendor/github.com/nats-io/nats-server/v2/server/opts.go b/vendor/github.com/nats-io/nats-server/v2/server/opts.go index 3828aa66d3..3ef3a60d4c 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/opts.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/opts.go @@ -27,6 +27,7 @@ import ( "os" "path" "path/filepath" + "reflect" "regexp" "runtime" "strconv" @@ -108,6 +109,34 @@ type CompressionOpts struct { RTTThresholds []time.Duration } +func (c1 *CompressionOpts) equals(c2 *CompressionOpts) bool { + if c1 == c2 { + return true + } + if (c1 == nil && c2 != nil) || (c1 != nil && c2 == nil) { + return false + } + if c1.Mode != c2.Mode { + return false + } + // For s2_auto, if one has an empty RTTThresholds, it is equivalent + // to the defaultCompressionS2AutoRTTThresholds array, so compare with that. + if c1.Mode == CompressionS2Auto { + rtts1 := c1.RTTThresholds + if len(rtts1) == 0 { + rtts1 = defaultCompressionS2AutoRTTThresholds + } + rtts2 := c2.RTTThresholds + if len(rtts2) == 0 { + rtts2 = defaultCompressionS2AutoRTTThresholds + } + if !reflect.DeepEqual(rtts1, rtts2) { + return false + } + } + return true +} + // GatewayOpts are options for gateways. // NOTE: This structure is no longer used for monitoring endpoints // and json tags are deprecated and may be removed in the future. @@ -283,6 +312,48 @@ type RemoteLeafOpts struct { // existing connection will be closed and not solicited again (until it is changed // to `false` again. Disabled bool `json:"-"` + + // If this is set to true, this remote will ignore any server leafnode URLs + // returned by the hub, allowing the user to fully manage the servers this + // remote can connect to. + IgnoreDiscoveredServers bool `json:"-"` +} + +// Returns a string representation of this `RemoteLeafOpts` object, containing +// the URLs (unredacted), the account (or "$G" if none is specified) and, if present, +// the credentials filename. +func (r *RemoteLeafOpts) name() string { + return generateRemoteLeafOptsName(r, false) +} + +// Same than RemoteLeafOpts.name() but uses redacted URLs. This is to be used for logging. 
+func (r *RemoteLeafOpts) safeName() string { + return generateRemoteLeafOptsName(r, true) +} + +func generateRemoteLeafOptsName(r *RemoteLeafOpts, redacted bool) string { + acc := r.LocalAccount + if acc == _EMPTY_ { + acc = globalAccountName + } + var optional string + // There could be Credentials or NKey, not both (would be caught as a misconfig) + if c := r.Credentials; c != _EMPTY_ { + optional = fmt.Sprintf(", credentials=%q", c) + } else if nk := r.Nkey; nk != _EMPTY_ { + if redacted { + optional = ", nkey=\"[REDACTED]\"" + } else { + optional = fmt.Sprintf(", nkey=%q", nk) + } + } + var urls []*url.URL + if redacted { + urls = redactURLList(r.URLs) + } else { + urls = r.URLs + } + return fmt.Sprintf("urls=%q, account=%q%s", urls, acc, optional) } // JSLimitOpts are active limits for the meta cluster @@ -387,6 +458,7 @@ type Options struct { JetStreamTpm JSTpmOpts JetStreamMaxCatchup int64 JetStreamRequestQueueLimit int64 + JetStreamInfoQueueLimit int64 JetStreamMetaCompact uint64 JetStreamMetaCompactSize uint64 JetStreamMetaCompactSync bool @@ -478,6 +550,9 @@ type Options struct { // Metadata describing the server. They will be included in 'Z' responses. Metadata map[string]string `json:"-"` + // FeatureFlags the server opts-in to (or opts-out of). They will be included in 'Z' responses. + FeatureFlags map[string]bool `json:"-"` + // OCSPConfig enables OCSP Stapling in the server. OCSPConfig *OCSPConfig tlsConfigOpts *TLSConfigOpts @@ -1748,6 +1823,29 @@ func (o *Options) processConfigFileLine(k string, v any, errors *[]error, warnin *errors = append(*errors, err) return } + case "feature_flags": + var err error + switch v := v.(type) { + case map[string]any: + for mk, mv := range v { + tk, mv = unwrapValue(mv, <) + b, ok := mv.(bool) + if !ok { + err = &configErr{tk, fmt.Sprintf("error parsing feature flag %q: expected bool, got %T", mk, mv)} + break + } + if o.FeatureFlags == nil { + o.FeatureFlags = make(map[string]bool) + } + o.FeatureFlags[mk] = b + } + default: + err = &configErr{tk, fmt.Sprintf("error parsing feature flags: unsupported type %T", v)} + } + if err != nil { + *errors = append(*errors, err) + return + } case "default_js_domain": vv, ok := v.(map[string]any) if !ok { @@ -2641,6 +2739,12 @@ func parseJetStream(v any, opts *Options, errors *[]error, warnings *[]error) er return &configErr{tk, fmt.Sprintf("Expected a parseable size for %q, got %v", mk, mv)} } opts.JetStreamRequestQueueLimit = lim + case "info_queue_limit": + lim, ok := mv.(int64) + if !ok { + return &configErr{tk, fmt.Sprintf("Expected a parseable size for %q, got %v", mk, mv)} + } + opts.JetStreamInfoQueueLimit = lim case "meta_compact": thres, ok := mv.(int64) if !ok || thres < 0 { @@ -2936,6 +3040,7 @@ func parseRemoteLeafNodes(v any, errors *[]error, warnings *[]error) ([]*RemoteL if !ok { return nil, &configErr{tk, fmt.Sprintf("Expected remotes field to be an array, got %T", v)} } + names := make(map[string]struct{}) remotes := make([]*RemoteLeafOpts, 0, len(ra)) for _, r := range ra { tk, r = unwrapValue(r, <) @@ -3105,6 +3210,8 @@ func parseRemoteLeafNodes(v any, errors *[]error, warnings *[]error) ([]*RemoteL } } } + case "ignore_discovered_servers": + remote.IgnoreDiscoveredServers = v.(bool) default: if !tk.IsUsedVariable() { err := &unknownConfigFieldErr{ @@ -3128,6 +3235,12 @@ func parseRemoteLeafNodes(v any, errors *[]error, warnings *[]error) ([]*RemoteL *warnings = append(*warnings, &configErr{proxyToken, warn}) } } + rn := remote.name() + if _, dup := names[rn]; dup { + *errors = 
append(*errors, &configErr{tk, fmt.Sprintf("duplicate remote %s", remote.safeName())}) + continue + } + names[rn] = struct{}{} remotes = append(remotes, remote) } return remotes, nil @@ -6006,6 +6119,9 @@ func setBaselineOptions(opts *Options) { if opts.JetStreamRequestQueueLimit <= 0 { opts.JetStreamRequestQueueLimit = JSDefaultRequestQueueLimit } + if opts.JetStreamInfoQueueLimit <= 0 { + opts.JetStreamInfoQueueLimit = opts.JetStreamRequestQueueLimit + } } func getDefaultAuthTimeout(tls *tls.Config, tlsTimeout float64) float64 { @@ -6439,14 +6555,56 @@ func expandPath(p string) (string, error) { // RedactArgs redacts sensitive arguments from the command line. // For example, turns '--pass=secret' into '--pass=[REDACTED]'. func RedactArgs(args []string) { - secret := regexp.MustCompile("^-{1,2}(user|pass|auth)(=.*)?$") + secretArg := regexp.MustCompile("^-{1,2}(user|pass|auth)(=.*)?$") + routeURLArg := regexp.MustCompile("^-{1,2}(routes)(=.*)?$") + singleURLArg := regexp.MustCompile("^-{1,2}(cluster|cluster_listen)(=.*)?$") for i, arg := range args { - if secret.MatchString(arg) { - if idx := strings.Index(arg, "="); idx != -1 { - args[i] = arg[:idx] + "=[REDACTED]" - } else if i+1 < len(args) { - args[i+1] = "[REDACTED]" - } + switch { + case secretArg.MatchString(arg): + redactArgValue(args, i, func(_ string) string { return "[REDACTED]" }) + case routeURLArg.MatchString(arg): + redactArgValue(args, i, redactURLListUser) + case singleURLArg.MatchString(arg): + redactArgValue(args, i, redactURLUser) } } } + +func redactArgValue(args []string, i int, redact func(string) string) { + if flag, value, ok := strings.Cut(args[i], "="); ok { + args[i] = flag + "=" + redact(value) + } else if i+1 < len(args) { + args[i+1] = redact(args[i+1]) + } +} + +func redactURLUser(raw string) string { + if !strings.Contains(raw, "@") { + return raw + } + parseValue := strings.TrimSpace(raw) + restoreRandom := false + if prefix, ok := strings.CutSuffix(parseValue, ":-1"); ok { + parseValue = prefix + ":0" + restoreRandom = true + } + u, err := url.Parse(parseValue) + if err != nil || u.User == nil { + return raw + } + // url.String escapes brackets in userinfo, so use + // a placeholder here and rewrite it afterward. + u.User = url.User("_REDACTED_") + if restoreRandom { + u.Host = strings.TrimSuffix(u.Host, ":0") + ":-1" + } + return strings.Replace(u.String(), "_REDACTED_@", "[REDACTED]@", 1) +} + +func redactURLListUser(raw string) string { + parts := strings.Split(raw, ",") + for i, part := range parts { + parts[i] = redactURLUser(part) + } + return strings.Join(parts, ",") +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/parser.go b/vendor/github.com/nats-io/nats-server/v2/server/parser.go index ef25e09313..b91e1541e6 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/parser.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/parser.go @@ -166,32 +166,40 @@ func (c *client) parse(buf []byte) error { goto authErr } var ok bool - // Check here for NoAuthUser. If this is set allow non CONNECT protos as our first. - // E.g. telnet proto demos. - if noAuthUser := s.getOpts().NoAuthUser; noAuthUser != _EMPTY_ { - s.mu.Lock() - user, exists := s.users[noAuthUser] - s.mu.Unlock() - if exists { - c.RegisterUser(user) - c.mu.Lock() - c.clearAuthTimer() - c.flags.set(connectReceived) - c.mu.Unlock() - authSet, ok = false, true + switch c.kind { + case CLIENT: + // Check here for NoAuthUser. If this is set allow non CONNECT protos as our first. + // E.g. telnet proto demos. 
+ opts := s.getOpts() + noAuthUser := opts.NoAuthUser + if c.ws != nil { + if noAuthUserWS := opts.Websocket.NoAuthUser; noAuthUserWS != _EMPTY_ { + noAuthUser = noAuthUserWS + } } + if noAuthUser != _EMPTY_ { + s.mu.Lock() + user, exists := s.users[noAuthUser] + s.mu.Unlock() + if exists { + c.RegisterUser(user) + c.mu.Lock() + c.clearAuthTimer() + c.flags.set(connectReceived) + c.mu.Unlock() + authSet, ok = false, true + } + } + case LEAF: + // Compressed inbound leaf-node negotiation may require INFO + // before CONNECT. Without compression, leaf connections must + // still start with CONNECT. + ok = (b == 'I' || b == 'i') && needsCompression(s.getOpts().LeafNode.Compression.Mode) } if !ok { goto authErr } } - // If the connection is a gateway connection, make sure that - // if this is an inbound, it starts with a CONNECT. - if c.kind == GATEWAY && !c.gw.outbound && !c.gw.connected { - // Use auth violation since no CONNECT was sent. - // It could be a parseErr too. - goto authErr - } } switch b { case 'P', 'p': @@ -1250,10 +1258,17 @@ func protoSnippet(start, max int, buf []byte) string { // If so, an error is sent to the client and the connection is closed. // The error ErrMaxControlLine is returned. func (c *client) overMaxControlLineLimit(arg []byte, mcl int32) error { + // Widen to int64 so mcl*16 cannot overflow for large configured values. + effective := int64(mcl) if c.kind != CLIENT { - return nil + // This is the upper bound on argBuf length for LEAF, ROUTER, and GATEWAY connections. + // These kinds need longer arg lines than CLIENT (which is capped at mcl=4096 by default) + // because cluster/leaf frames encode origin, account, reply, and queue groups. + // By default, this is 64 KB, which matches maxBufSize so a single oversized read + // is caught on the very next parse call. + effective *= 16 } - if len(arg) > int(mcl) { + if int64(len(arg)) > effective { err := NewErrorCtx(ErrMaxControlLine, "State %d, max_control_line %d, Buffer len %d (snip: %s...)", c.state, int(mcl), len(c.argBuf), protoSnippet(0, MAX_CONTROL_LINE_SNIPPET_SIZE, arg)) c.sendErr(err.Error()) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/raft.go b/vendor/github.com/nats-io/nats-server/v2/server/raft.go index a437b98504..5055297e71 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/raft.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/raft.go @@ -89,6 +89,7 @@ type RaftNode interface { RecreateInternalSubs() error IsSystemAccount() bool GetTrafficAccountName() string + GetWriteErr() error } // RaftNodeCheckpoint is used as an alternative to a direct InstallSnapshot. @@ -248,6 +249,9 @@ type raft struct { scaleUp bool // The node is part of a scale up, puts us in observer mode until the log contains data. deleted bool // If the node was deleted. snapshotting bool // Snapshot is in progress. + quorumPaused bool // Pause replication and quorum participation to prevent log growth during slow applies. + + overrunCount uint64 // Counter of how many times we were overrun, either as follower or as leader. } type proposedEntry struct { @@ -487,9 +491,12 @@ func (s *Server) initRaftNode(accName string, cfg *RaftConfig, labels pprofLabel n.papplied = 0 if _, ok := n.wal.(*memStore); ok { _ = os.RemoveAll(filepath.Join(n.sd, snapshotsDir)) - } else { - // See if we have any snapshots and if so load and process on startup. 
- n.setupLastSnapshot() + } else if err := n.setupLastSnapshot(); err != nil && err != errNoSnapAvailable { + // If we failed to recover from the snapshot, then we should surface + // the error upwards, otherwise we can complete recovery but have only + // a partial view of the world. + n.shutdown() + return nil, err } // We may have restored the peer state from the @@ -503,6 +510,7 @@ func (s *Server) initRaftNode(accName string, cfg *RaftConfig, labels pprofLabel // Make sure that the snapshots directory exists. if err := os.MkdirAll(filepath.Join(n.sd, snapshotsDir), defaultDirPerms); err != nil { + n.shutdown() return nil, fmt.Errorf("could not create snapshots directory - %v", err) } @@ -521,12 +529,6 @@ func (s *Server) initRaftNode(accName string, cfg *RaftConfig, labels pprofLabel if state.Msgs > 0 { n.debug("Replaying state of %d entries", state.Msgs) - if first, err := n.loadFirstEntry(); err == nil { - n.pterm, n.pindex = first.pterm, first.pindex - if first.commit > 0 && first.commit > n.commit { - n.commit = first.commit - } - } // This process will queue up entries on our applied queue but prior to the upper // state machine running. So we will monitor how much we have queued and if we @@ -537,6 +539,25 @@ func (s *Server) initRaftNode(accName string, cfg *RaftConfig, labels pprofLabel // yet. Replay them. for index, qsz := state.FirstSeq, 0; index <= state.LastSeq; index++ { ae, err := n.loadEntry(index) + // The first entry in our WAL initializes state but must align with our snapshot if we had one. + // Importantly, check this first, as we might need to truncate the WAL further than the index. + if index == state.FirstSeq { + // If the entry is missing, corrupt, or doesn't align with the snapshot, truncate the WAL. + if err != nil || ae == nil || ae.pindex != index-1 || n.pindex != ae.pindex { + if err != nil { + n.warn("Could not load %d from WAL [%+v]: %v", index, state, err) + } else { + n.warn("Misaligned WAL, will truncate") + } + // Truncate to the snapshot or beginning if there is none. + truncateAndErr(n.pindex) + break + } + n.pterm, n.pindex = ae.pterm, ae.pindex + if ae.commit > 0 && ae.commit > n.commit { + n.commit = ae.commit + } + } if err != nil { n.warn("Could not load %d from WAL [%+v]: %v", index, state, err) // Truncate to the previous correct entry. @@ -902,6 +923,16 @@ func (n *raft) Propose(data []byte) error { if werr := n.werr; werr != nil { return werr } + + if n.isLeaderOverrun() { + var state StreamState + n.wal.FastState(&state) + n.warn("Leader falling behind, stepping down: pindex %d, commit %d, applied %d, WAL size %s", n.pindex, n.commit, n.applied, friendlyBytes(state.Bytes)) + // Stepdown without leader transfer, likely all replicas will be overrun, and we need time to recover. + n.stepdownLocked(noLeader) + n.overrunCount++ + return errNotLeader + } n.prop.push(newProposedEntry(newEntry(EntryNormal, data), _EMPTY_)) return nil } @@ -921,12 +952,39 @@ func (n *raft) ProposeMulti(entries []*Entry) error { if werr := n.werr; werr != nil { return werr } + + if n.isLeaderOverrun() { + var state StreamState + n.wal.FastState(&state) + n.warn("Leader falling behind, stepping down: pindex %d, commit %d, applied %d, WAL size %s", n.pindex, n.commit, n.applied, friendlyBytes(state.Bytes)) + // Stepdown without leader transfer, likely all replicas will be overrun, and we need time to recover. 
+ n.stepdownLocked(noLeader) + n.overrunCount++ + return errNotLeader + } for _, e := range entries { n.prop.push(newProposedEntry(e, _EMPTY_)) } return nil } +// isLeaderOverrun returns whether we are overrun and should step down due to continuously increasing +// uncommitted or unapplied entries. If triggered, this means we're being severely overrun by +// incoming proposals or the system is degraded such that it's too slow (or unable) to process them. +// Stepping down means the system gets to "breathe" for a bit, until a new leader can be elected. +// Lock should be held. +func (n *raft) isLeaderOverrun() bool { + applied := max(n.applied, n.papplied) + commit := max(n.commit, n.papplied) + // We only do this past a high threshold to protect ourselves. + // Worst-case we'll have 2x the threshold, once in uncommitted and once in unapplied entries. + // Either the number of uncommitted entries is over the threshold: we're not getting quorum from our followers. + uncommittedThreshold := n.pindex > commit && n.pindex-commit > pauseQuorumThreshold + // Or, the number of in-memory committed but not yet applied entries is over the threshold: we're slow to apply. + unappliedThreshold := commit > applied && commit-applied > pauseQuorumThreshold + return uncommittedThreshold || unappliedThreshold +} + // ForwardProposal will forward the proposal to the leader if known. // If we are the leader this is the same as calling propose. func (n *raft) ForwardProposal(entry []byte) error { @@ -1075,8 +1133,8 @@ func (n *raft) PauseApply() error { } func (n *raft) pauseApplyLocked() { - // If we are currently a candidate make sure we step down. - if n.State() == Candidate { + // If we are currently not a follower, make sure we step down. + if n.State() != Follower { n.stepdownLocked(noLeader) } @@ -1558,11 +1616,14 @@ func termAndIndexFromSnapFile(sn string) (term, index uint64, err error) { // setupLastSnapshot is called at startup to try and recover the last snapshot from // the disk if possible. We will try to recover the term, index and commit/applied // indices and then notify the upper layer what we found. Compacts the WAL if needed. -func (n *raft) setupLastSnapshot() { +func (n *raft) setupLastSnapshot() error { snapDir := filepath.Join(n.sd, snapshotsDir) psnaps, err := os.ReadDir(snapDir) if err != nil { - return + if os.IsNotExist(err) { + return errNoSnapAvailable + } + return err } var lterm, lindex uint64 @@ -1586,18 +1647,8 @@ func (n *raft) setupLastSnapshot() { os.Remove(sfile) } } - - // Now cleanup any old entries - for _, sf := range psnaps { - sfile := filepath.Join(snapDir, sf.Name()) - if sfile != latest { - n.debug("Removing old snapshot: %q", sfile) - os.Remove(sfile) - } - } - if latest == _EMPTY_ { - return + return nil } // Set latest snapshot we have. @@ -1607,13 +1658,7 @@ func (n *raft) setupLastSnapshot() { n.snapfile = latest snap, err := n.loadLastSnapshot() if err != nil { - // We failed to recover the last snapshot for some reason, so we will - // assume it has been corrupted and will try to delete it. - if n.snapfile != _EMPTY_ { - os.Remove(n.snapfile) - n.snapfile = _EMPTY_ - } - return + return err } // We successfully recovered the last snapshot from the disk. 
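isLeaderOverrun above trips on either of two gaps measured against the same threshold: uncommitted entries (no quorum from followers) or committed-but-unapplied entries (slow applies). A standalone illustration with made-up numbers, reusing the 100_000 value of the pauseQuorumThreshold constant added later in this patch:

package main

import "fmt"

const threshold = 100_000 // mirrors pauseQuorumThreshold from this patch

// overrun reproduces the shape of isLeaderOverrun for illustration only.
func overrun(pindex, commit, applied, papplied uint64) (uncommitted, unapplied bool) {
    a := max(applied, papplied)
    c := max(commit, papplied)
    uncommitted = pindex > c && pindex-c > threshold
    unapplied = c > a && c-a > threshold
    return
}

func main() {
    // No quorum: entries keep landing in the WAL but never commit.
    fmt.Println(overrun(250_000, 100_000, 100_000, 0)) // true false
    // Slow applies: commits race ahead of the state machine.
    fmt.Println(overrun(250_000, 250_000, 100_000, 0)) // false true
}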
@@ -1627,8 +1672,8 @@ func (n *raft) setupLastSnapshot() { n.papplied = snap.lastIndex // Restore the peerState ps, err := decodePeerState(snap.peerstate) - if err == nil { - n.processPeerState(ps) + if err != nil { + return err } n.processPeerState(ps) n.extSt = ps.domainExt @@ -1636,7 +1681,19 @@ func (n *raft) setupLastSnapshot() { n.apply.push(newCommittedEntry(n.commit, []*Entry{{EntrySnapshot, snap.data}})) if _, err := n.wal.Compact(snap.lastIndex + 1); err != nil { n.setWriteErrLocked(err) + return err } + + // Now cleanup any old entries. We only do this once we know that the + // latest snapshot was OK. + for _, sf := range psnaps { + if sfile := filepath.Join(snapDir, sf.Name()); sfile != latest { + n.debug("Removing old snapshot: %q", sfile) + os.Remove(sfile) + } + } + + return nil } // loadLastSnapshot will load and return our last snapshot. @@ -1652,14 +1709,10 @@ func (n *raft) loadLastSnapshot() (*snapshot, error) { if err != nil { n.warn("Error reading snapshot: %v", err) - os.Remove(n.snapfile) - n.snapfile = _EMPTY_ return nil, err } if len(buf) < minSnapshotLen { n.warn("Snapshot corrupt, too short") - os.Remove(n.snapfile) - n.snapfile = _EMPTY_ return nil, errSnapshotCorrupt } @@ -1671,8 +1724,6 @@ func (n *raft) loadLastSnapshot() (*snapshot, error) { var hb [highwayhash.Size64]byte if !bytes.Equal(lchk[:], n.hh.Sum(hb[:0])) { n.warn("Snapshot corrupt, checksums did not match") - os.Remove(n.snapfile) - n.snapfile = _EMPTY_ return nil, errSnapshotCorrupt } @@ -1686,12 +1737,12 @@ func (n *raft) loadLastSnapshot() (*snapshot, error) { } // We had a bug in 2.9.12 that would allow snapshots on last index of 0. - // Detect that here and return err. + // Detect that and continue anyway, nothing else we can do about it. if snap.lastIndex == 0 { n.warn("Snapshot with last index 0 is invalid, cleaning up") os.Remove(n.snapfile) n.snapfile = _EMPTY_ - return nil, errSnapshotCorrupt + return nil, errNoSnapAvailable } return snap, nil @@ -2733,9 +2784,9 @@ func decodeAppendEntry(msg []byte, sub *subscription, reply string) (*appendEntr ae.reply, ae.sub = reply, sub // Decode Entries. - ne, ri := int(le.Uint16(msg[40:])), uint64(42) + ne, ri := int(le.Uint16(msg[40:])), uint64(appendEntryBaseLen) for i, max := 0, uint64(len(msg)); i < ne; i++ { - if ri >= max-1 { + if max-ri < 4 { return nil, errBadAppendEntry } ml := uint64(le.Uint32(msg[ri:])) @@ -2867,20 +2918,44 @@ func (n *raft) handleForwardedProposal(sub *subscription, c *client, _ *Account, msg = copyBytes(msg) n.RLock() + prop := n.prop // Check state under lock, we might not be leader anymore. if n.State() != Leader || !n.leaderState.Load() { n.debug("Ignoring forwarded proposal, not leader") n.RUnlock() return } - prop, werr := n.prop, n.werr - n.RUnlock() // Ignore if we have had a write error previous. - if werr != nil { + if n.werr != nil { + n.RUnlock() return } + if n.isLeaderOverrun() { + n.RUnlock() + n.Lock() + defer n.Unlock() + // Now that we've reacquired as write lock, we need to make sure that everything we + // believed before is still true. Otherwise we've either stepped down already from + // another goroutine or we've stopped being overrun and shouldn't drop the entry. 
+ if n.State() != Leader || !n.leaderState.Load() { + return + } else if !n.isLeaderOverrun() { + prop.push(newProposedEntry(newEntry(EntryNormal, msg), reply)) + return + } + var state StreamState + n.wal.FastState(&state) + n.warn("Leader falling behind, stepping down: pindex %d, commit %d, applied %d, WAL size %s", n.pindex, n.commit, n.applied, friendlyBytes(state.Bytes)) + // Stepdown without leader transfer, likely all replicas will be overrun, and we need time to recover. + n.stepdownLocked(noLeader) + n.overrunCount++ + return + } + // Possible that we could fall through to here from multiple connections but if + // one does end up stepping down then the proposal queue gets drained anyway. + n.RUnlock() prop.push(newProposedEntry(newEntry(EntryNormal, msg), reply)) } @@ -3252,8 +3327,6 @@ func (n *raft) sendSnapshotToFollower(subject string) (uint64, error) { if err != nil { // We need to stepdown here when this happens. n.stepdownLocked(noLeader) - // We need to reset our state here as well. - n.resetWAL() return 0, err } // Go ahead and send the snapshot and peerstate here as first append entry to the catchup follower. @@ -4041,6 +4114,42 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { } } + // If commits are outpacing our applies, temporarily stop accepting new entries to avoid falling further behind. + // This encourages the leader to sync us via a snapshot instead. We use max(applied, papplied) to avoid + // incorrectly triggering this pause immediately after receiving a snapshot. + applied := max(n.applied, n.papplied) + commit := max(n.commit, n.papplied) + if sub != nil && (commit > applied || n.quorumPaused) { + diff := commit - applied + if n.quorumPaused { + if diff > paeWarnThreshold { + if catchingUp { + n.cancelCatchup() + } + n.Unlock() + return + } + // Once we're sufficiently below the threshold, we continue again. We'll likely receive a snapshot + // from the leader. + n.quorumPaused = false + var state StreamState + n.wal.FastState(&state) + n.warn("Quorum resumed: commit %d, applied %d, WAL size %s", commit, applied, friendlyBytes(state.Bytes)) + } else if diff > pauseQuorumThreshold { + // It takes a while until we reach the pause threshold, but once we do we enter a "cooldown period". + n.quorumPaused = true + n.overrunCount++ + var state StreamState + n.wal.FastState(&state) + n.warn("Quorum paused, falling behind: commit %d != applied %d, WAL size %s", commit, applied, friendlyBytes(state.Bytes)) + if catchingUp { + n.cancelCatchup() + } + n.Unlock() + return + } + } + if ae.pterm != n.pterm || ae.pindex != n.pindex { // Check if this is a lower or equal index than what we were expecting. if ae.pindex <= n.pindex { @@ -4086,9 +4195,26 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { } } else { // If terms mismatched, delete that entry and all others past it. - // Make sure to cancel any catchups in progress. - // Truncate will reset our pterm and pindex. Only do so if we have an entry. - n.truncateWAL(eae.pterm, eae.pindex) + // But only if we haven't already committed past this point. 
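The forwarded-proposal path above releases the read lock, takes the write lock, and then re-validates both leadership and the overrun condition before stepping down, since either may have changed while no lock was held. A generic sketch of that check, unlock, relock, recheck idiom with hypothetical names:

package main

import (
    "fmt"
    "sync"
)

type node struct {
    mu       sync.RWMutex
    leader   bool
    overrun  bool
    proposed []string
}

// submit does the cheap checks under the read lock; if an action needing the
// write lock is required, it re-verifies every assumption after upgrading,
// because another goroutine may have changed the state in between.
func (n *node) submit(entry string) {
    n.mu.RLock()
    if !n.leader {
        n.mu.RUnlock()
        return
    }
    if !n.overrun {
        n.mu.RUnlock()
        n.push(entry)
        return
    }
    n.mu.RUnlock()

    n.mu.Lock()
    defer n.mu.Unlock()
    switch {
    case !n.leader:
        // Lost leadership while unlocked; drop the entry.
    case !n.overrun:
        // Pressure eased while unlocked; accept the entry after all.
        n.proposed = append(n.proposed, entry)
    default:
        n.leader = false // step down, analogous to stepdownLocked
    }
}

func (n *node) push(entry string) {
    n.mu.Lock()
    n.proposed = append(n.proposed, entry)
    n.mu.Unlock()
}

func main() {
    n := &node{leader: true}
    n.submit("hello")
    fmt.Println(n.proposed)
}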
+ if eae.pindex < n.commit { + success = true + assert.Unreachable("Truncate to earlier entry would lose commits", map[string]any{ + "n.accName": n.accName, + "n.group": n.group, + "n.id": n.id, + "n.term": n.term, + "n.pindex": n.pindex, + "n.commit": n.commit, + "n.applied": n.applied, + "ae.pindex": ae.pindex, + "ae.pterm": ae.pterm, + "ae.commit": ae.commit, + "eae.pterm": eae.pterm, + "eae.pindex": eae.pindex, + }) + } else { + n.truncateWAL(eae.pterm, eae.pindex) + } } // Cancel regardless if unsuccessful. if !success { @@ -4420,9 +4546,10 @@ func (n *raft) storeToWAL(ae *appendEntry) error { } const ( - paeDropThreshold = 20_000 - paeWarnThreshold = 10_000 - paeWarnModulo = 5_000 + pauseQuorumThreshold = 100_000 + paeDropThreshold = 20_000 + paeWarnThreshold = 10_000 + paeWarnModulo = 5_000 ) func (n *raft) sendAppendEntry(entries []*Entry) { @@ -4699,11 +4826,18 @@ func (n *raft) setWriteErrLocked(err error) { } // If this is a not found report but do not disable. if os.IsNotExist(err) { - n.error("Resource not found: %v", err) + n.warn("Resource not found: %v", err) return } n.error("Critical write error: %v", err) n.werr = err + n.shutdown() + assert.Unreachable("Raft encountered write error", map[string]any{ + "n.accName": n.accName, + "n.group": n.group, + "n.id": n.id, + "err": err, + }) if isPermissionError(err) { go n.s.handleWritePermissionError() @@ -4720,6 +4854,13 @@ func (n *raft) isClosed() bool { return n.State() == Closed } +// GetWriteErr returns the write error (if any). +func (n *raft) GetWriteErr() error { + n.RLock() + defer n.RUnlock() + return n.werr +} + // Capture our write error if any and hold. func (n *raft) setWriteErr(err error) { n.Lock() @@ -5033,6 +5174,8 @@ func (n *raft) switchToCandidate() { // Increment the term. n.term++ n.vote = noVote + // Reset quorum paused. If it was previously set, we checked above that we've applied all committed entries. + n.quorumPaused = false // Clear current Leader. n.updateLeader(noLeader) n.switchState(Candidate) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/reload.go b/vendor/github.com/nats-io/nats-server/v2/server/reload.go index c2d467af65..5ef8e6f593 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/reload.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/reload.go @@ -1,4 +1,4 @@ -// Copyright 2017-2025 The NATS Authors +// Copyright 2017-2026 The NATS Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -740,6 +740,35 @@ func (jso jetStreamOption) IsStatszChange() bool { return true } +type jetStreamLimitsOption struct { + noopOption + newMaxMemory int64 + newMaxStore int64 +} + +func (jso *jetStreamLimitsOption) Apply(s *Server) { + js := s.getJetStream() + if js == nil { + return + } + js.mu.Lock() + if jso.newMaxMemory > 0 { + js.config.MaxMemory = jso.newMaxMemory + atomic.StoreInt64(&js.memMax, js.config.MaxMemory) + s.Noticef("Reloaded: JetStream max_mem_store = %s", friendlyBytes(jso.newMaxMemory)) + } + if jso.newMaxStore > 0 { + js.config.MaxStore = jso.newMaxStore + atomic.StoreInt64(&js.storeMax, js.config.MaxStore) + s.Noticef("Reloaded: JetStream max_file_store = %s", friendlyBytes(jso.newMaxStore)) + } + js.mu.Unlock() +} + +func (jso *jetStreamLimitsOption) IsStatszChange() bool { + return true +} + type defaultSentinelOption struct { noopOption newValue string @@ -872,9 +901,207 @@ func (o *profBlockRateReload) Apply(s *Server) { type leafNodeOption struct { noopOption + tlsFirstChanged bool + compressionChanged bool + // These are for the remotes + added []*RemoteLeafOpts + changed map[*leafNodeCfg]*remoteLeafOption +} + +type remoteLeafOption struct { tlsFirstChanged bool compressionChanged bool disabledChanged bool + opts *RemoteLeafOpts +} + +// Given `old` and `new` Leafnode options, this function will return the structure +// used for applying the configuration, or an error is there are changes that +// are not supported. +func getLeafNodeOptionsChanges(s *Server, old, new *LeafNodeOpts) (*leafNodeOption, error) { + + // We can't use DeepEqual for `Users` field, so do custom check. + if usersHaveChanged(old.Users, new.Users) { + return nil, fmt.Errorf("field \"Users\": old=%v, new=%v", old.Users, new.Users) + } + + // Check the main leafnodes{} block to see if there are any changes that are + // not supported. We provide a list of fields to ignore (we already checked, + // allow them to be modified or will check later). + if err := checkConfigsEqual(old, new, []string{ + "Compression", + "Remotes", + "TLSHandshakeFirst", + "TLSHandshakeFirstFallback", + "TLSConfig", + "Users", + }); err != nil { + return nil, err + } + + const ( + remoteErrFormat = "remote %s: %s" + maxAttempts = 20 + ) + var ( + nlo *leafNodeOption + // Track whether any existing remote was not found (i.e. removed). + removed bool + ) + +forLoop: + for failed := range maxAttempts { + removed = false + if failed > 0 { + // If we failed once, we will wait a bit before trying again the remotes. + // This could give enough time for connections that were in progress to complete. + select { + case <-time.After(50 * time.Millisecond): + case <-s.quitCh: + return nil, ErrServerNotRunning + } + } + nlo = &leafNodeOption{ + tlsFirstChanged: (old.TLSHandshakeFirst != new.TLSHandshakeFirst || old.TLSHandshakeFirstFallback != new.TLSHandshakeFirstFallback), + compressionChanged: !old.Compression.equals(&new.Compression), + // Start with all, will update when processing existing ones. + // Since the list will be modified, we need to clone it. + added: slices.Clone(new.Remotes), + } + + s.mu.RLock() + // Go through the list of existing remote configurations. + for lrc := range s.leafRemoteCfgs { + var rlo *RemoteLeafOpts + // Look for the corresponding `*RemoteLeafOpts` in the `nlo.added` + // list. If it is found, that function returns an updated list + // with the element removed from it. 
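The scan in getLeafNodeOptionsChanges above retries up to maxAttempts times, pausing 50ms between attempts but aborting if the server quit channel fires. A minimal standalone version of that wait-or-quit retry shape (function and error names are invented for the example):

package main

import (
    "errors"
    "fmt"
    "time"
)

var errShuttingDown = errors.New("shutting down")

// retryUntil runs fn up to attempts times, waiting between tries, but gives
// up immediately if quit is closed, similar in shape to the reload retry
// loop that waits on s.quitCh.
func retryUntil(attempts int, wait time.Duration, quit <-chan struct{}, fn func() bool) error {
    for i := 0; i < attempts; i++ {
        if i > 0 {
            select {
            case <-time.After(wait):
            case <-quit:
                return errShuttingDown
            }
        }
        if fn() {
            return nil
        }
    }
    return errors.New("still busy, try again")
}

func main() {
    quit := make(chan struct{})
    calls := 0
    err := retryUntil(20, 10*time.Millisecond, quit, func() bool {
        calls++
        return calls >= 3 // succeeds on the third attempt
    })
    fmt.Println(calls, err)
}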
+ lrc.RLock() + rlo, nlo.added = getRemoteLeafOpts(lrc.name(), nlo.added) + if rlo == nil { + // Not found, will be removed in leafNodeOption.Apply(). + removed = true + lrc.RUnlock() + continue + } + // Now we need to make sure that there are no changes that we don't + // support for a RemoteLeafOpts. + err := checkConfigsEqual(lrc.RemoteLeafOpts, rlo, []string{ + "Compression", + "Disabled", + "TLS", + "TLSHandshakeFirst", + "TLSConfig", + }) + if err != nil { + lrc.RUnlock() + s.mu.RUnlock() + return nil, fmt.Errorf(remoteErrFormat, rlo.safeName(), err) + } + disabledChanged := lrc.Disabled != rlo.Disabled + // If this remote was disabled and is now enabled, we need to make sure + // that there is no connect in progress. If that is the case, either + // try again (if it is the first failure) or return an error. + if disabledChanged && lrc.Disabled && lrc.connInProgress { + lrc.RUnlock() + s.mu.RUnlock() + if failed < maxAttempts-1 { + continue forLoop + } + return nil, fmt.Errorf(remoteErrFormat, rlo.safeName(), + "cannot be enabled at the moment, try again") + } + // Since we will use the new `rlo.TLSConfig` later on, consider all + // existing remote configs as "changed" and store them in the + // `nlo.changed` map. + if nlo.changed == nil { + nlo.changed = make(map[*leafNodeCfg]*remoteLeafOption) + } + lnro := &remoteLeafOption{ + tlsFirstChanged: lrc.TLSHandshakeFirst != rlo.TLSHandshakeFirst, + compressionChanged: !lrc.Compression.equals(&rlo.Compression), + disabledChanged: disabledChanged, + opts: rlo, + } + lrc.RUnlock() + nlo.changed[lrc] = lnro + } + if len(nlo.added) > 0 { + // Go through the added list and check if an added was recently removed and, + // if that is the case, is it still in the `s.rmLeafRemoteCfgs` map, which + // may mean that there was a connect-in-progress that did not complete yet. + // Either try again (if it is the first failure) or return an error. + for _, rlo := range nlo.added { + if _, cip := s.rmLeafRemoteCfgs[rlo.name()]; cip { + s.mu.RUnlock() + if failed < maxAttempts-1 { + continue forLoop + } + return nil, fmt.Errorf(remoteErrFormat, rlo.safeName(), + "cannot be added at the moment, try again") + } + } + } + s.mu.RUnlock() + break + } + + // Now we want to make sure that there were actual changes, so that we don't + // cause a reload of leafnodes for nothing. However, if one has (or all have) + // been removed we still need to invoke leafNodeOption.Apply(). + if !nlo.tlsFirstChanged && !nlo.compressionChanged && !removed && len(nlo.added) == 0 && len(nlo.changed) == 0 { + return nil, nil + } + + return nlo, nil +} + +func usersHaveChanged(ousers, nusers []*User) bool { + if len(ousers) != len(nusers) { + return true + } + // We did not do a strict list order check in the past, so maintain this to + // avoid possible breaking changes. + oua := make(map[string]*User, len(ousers)) + nua := make(map[string]*User, len(nusers)) + for _, u := range ousers { + oua[u.Username] = u + } + for _, u := range nusers { + nua[u.Username] = u + } + for uname, u := range oua { + // If we can not find new one with same name, consider that they have changed. + nu, ok := nua[uname] + if !ok { + return true + } + // Same if password or account has changed. + if u.Password != nu.Password || u.Account.GetName() != nu.Account.GetName() { + return true + } + } + return false +} + +// Given the `search` remote leafnode options name, searches for a match in the `list`. 
+// If found, returns the `*RemoteLeafOpts` from the list, and the updated list +// without the element in it. If not found, returns `nil` and the unmodified list. +func getRemoteLeafOpts(search string, list []*RemoteLeafOpts) (*RemoteLeafOpts, []*RemoteLeafOpts) { + for i, rlo := range list { + if search == rlo.name() { + lastIdx := len(list) - 1 + if lastIdx == 0 { + return rlo, nil + } + if i < lastIdx { + list[i] = list[lastIdx] + } + list = list[:lastIdx] + return rlo, list + } + } + return nil, list } func (l *leafNodeOption) Apply(s *Server) { @@ -882,99 +1109,179 @@ func (l *leafNodeOption) Apply(s *Server) { if l.tlsFirstChanged { s.Noticef("Reloaded: LeafNode TLS HandshakeFirst value is: %v", opts.LeafNode.TLSHandshakeFirst) s.Noticef("Reloaded: LeafNode TLS HandshakeFirstFallback value is: %v", opts.LeafNode.TLSHandshakeFirstFallback) - for _, r := range opts.LeafNode.Remotes { - s.Noticef("Reloaded: LeafNode Remote to %v TLS HandshakeFirst value is: %v", r.URLs, r.TLSHandshakeFirst) - } } - if l.compressionChanged || l.disabledChanged { - var leafs []*client - var solicit []*leafNodeCfg - acceptSideCompOpts := &opts.LeafNode.Compression + if l.compressionChanged { + s.Noticef("Reloaded: LeafNode Compression value is: %v", opts.LeafNode.Compression) + } - s.mu.RLock() - // First, update our internal leaf remote configurations with the new - // compress options. - // Since changing the remotes (as in adding/removing) is currently not - // supported, we know that we should have the same number in Options - // than in leafRemoteCfgs, but to be sure, use the max size. - max := len(opts.LeafNode.Remotes) - if l := len(s.leafRemoteCfgs); l < max { - max = l - } - for i := range max { - lr := s.leafRemoteCfgs[i] - or := opts.LeafNode.Remotes[i] - lr.Lock() - lr.Compression = or.Compression - if lr.Disabled && !or.Disabled { - solicit = append(solicit, lr) + var close []*client + var enable []*leafNodeCfg + var removed bool + + s.mu.Lock() + acceptSideCompOpts := &opts.LeafNode.Compression + // First go over existing leafnode remote configurations and + // either remove if no longer present, or update the config. + for lrc := range s.leafRemoteCfgs { + rlo := l.changed[lrc] + if rlo == nil { + delete(s.leafRemoteCfgs, lrc) + removed = true + if s.rmLeafRemoteCfgs == nil { + s.rmLeafRemoteCfgs = make(map[string]*leafNodeCfg) } - lr.Disabled = or.Disabled - lr.Unlock() + s.rmLeafRemoteCfgs[lrc.name()] = lrc + lrc.markAsRemoved() + s.Noticef("Reloaded: LeafNode Remote %s removed", lrc.RemoteLeafOpts.safeName()) + // We will close the existing connection in the next for-loop. + continue } - - for _, l := range s.leafs { - var co *CompressionOpts - - l.mu.Lock() - if r := l.leaf.remote; r != nil { - // If newly marked as disabled, collect and ignore the rest. - if r.Disabled { - l.flags.set(noReconnect) - leafs = append(leafs, l) - l.mu.Unlock() - continue - } - co = &r.Compression + lrc.Lock() + // TLSConfig is always applied. + lrc.TLSConfig = rlo.opts.TLSConfig.Clone() + // Now update what has been detected has changed. + if rlo.tlsFirstChanged { + lrc.TLSHandshakeFirst = rlo.opts.TLSHandshakeFirst + s.Noticef("Reloaded: LeafNode Remote %s TLS HandshakeFirst value is: %v", + lrc.RemoteLeafOpts.safeName(), rlo.opts.TLSHandshakeFirst) + } + if rlo.compressionChanged { + lrc.Compression = rlo.opts.Compression + s.Noticef("Reloaded: LeafNode Remote %s Compression value is: %v", + lrc.RemoteLeafOpts.safeName(), rlo.opts.Compression) + } + if rlo.disabledChanged { + // Change to new value. 
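getRemoteLeafOpts above removes the matched remote by moving the last slice element into its slot and shrinking the slice, leaving the caller with only the unmatched (newly added) remotes. A tiny usage-style sketch of the same unordered swap-remove over plain strings instead of *RemoteLeafOpts:

package main

import "fmt"

// takeOut returns the matched element and the list without it. Like the
// patch's helper, it does not preserve order: the last element is moved
// into the vacated slot before the slice is shortened.
func takeOut(name string, list []string) (string, []string) {
    for i, v := range list {
        if v == name {
            last := len(list) - 1
            list[i] = list[last]
            return v, list[:last]
        }
    }
    return "", list
}

func main() {
    remotes := []string{"hub", "edge-1", "edge-2"}
    found, rest := takeOut("hub", remotes)
    fmt.Println(found, rest) // hub [edge-2 edge-1]
}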
+ lrc.Disabled = rlo.opts.Disabled + if lrc.Disabled { + lrc.notifyQuitChannel() } else { - co = acceptSideCompOpts + enable = append(enable, lrc) } - newMode := co.Mode - // Skip leaf connections that are "not supported" (because they - // will never do compression) or the ones that have already the - // new compression mode. - if l.leaf.compression == CompressionNotSupported || l.leaf.compression == newMode { - l.mu.Unlock() + s.Noticef("Reloaded: LeafNode Remote %s Disabled value is: %v", + lrc.RemoteLeafOpts.safeName(), rlo.opts.Disabled) + } + lrc.Unlock() + } + // Second, go over existing leaf connections and apply compression + // changes (if applicable) and collect connections that need to be + // closed and/or disabled. + for _, c := range s.leafs { + var co *CompressionOpts + + c.mu.Lock() + if r := c.leaf.remote; r != nil { + rlo := l.changed[r] + // If the config is not in the `changed` map, or the new config says that + // the connection is disabled, collect so we can close it after the server + // lock is released. + if rlo == nil || (rlo.disabledChanged && rlo.opts.Disabled) { + c.flags.set(noReconnect) + close = append(close, c) + c.mu.Unlock() continue } - // We need to close the connections if it had compression "off" or the new - // mode is compression "off", or if the new mode is "accept", because - // these require negotiation. - if l.leaf.compression == CompressionOff || newMode == CompressionOff || newMode == CompressionAccept { - leafs = append(leafs, l) - } else if newMode == CompressionS2Auto { - // If the mode is "s2_auto", we need to check if there is really - // need to change, and at any rate, we want to save the actual - // compression level here, not s2_auto. - l.updateS2AutoCompressionLevel(co, &l.leaf.compression) - } else { - // Simply change the compression writer - l.out.cw = s2.NewWriter(nil, s2WriterOptions(newMode)...) - l.leaf.compression = newMode + if rlo.compressionChanged { + co = &r.Compression } - l.mu.Unlock() + } else if l.compressionChanged { + co = acceptSideCompOpts } - s.mu.RUnlock() - // Close the connections for which negotiation is required, or that - // have been disabled. - for _, l := range leafs { - l.closeConnection(ClientClosed) + if co != nil && applyCompressionChanges(c, co) { + close = append(close, c) } - if l.compressionChanged { - s.Noticef("Reloaded: LeafNode compression settings") + c.mu.Unlock() + } + s.mu.Unlock() + + // Close the connections for which negotiation is required, have been disabled + // or simply removed. + for _, c := range close { + c.closeConnection(ClientClosed) + } + // Start the ones that have been enabled. + for _, r := range enable { + s.connectToRemoteLeafNodeAsynchronously(r, true) + } + // Finally, deal with the ones that have been added. + if len(l.added) > 0 { + s.solicitLeafNodeRemotes(l.added) + } + // Deal with removed configs. Make sure there are no connect-in-progress. + // If there are still some, have a go routine to check in the background. + if removed { + if checkAgain := checkRemovedLeafNodeCfgs(s); checkAgain { + checkRemovedLeafNodeCfgsAsync(s) } - if l.disabledChanged { - if len(leafs) > 0 { - s.Noticef("Reloaded: LeafNode(s) disabled") - } - if len(solicit) > 0 { - for _, remote := range solicit { - s.startGoRoutine(func() { s.connectToRemoteLeafNode(remote, true) }) + } +} + +// Go through the removed remote leafnode configs map to check if the +// connect-in-progress flag is set. If not, remove from the map. +// Returns `true` if there are still some that are in progress. 
+func checkRemovedLeafNodeCfgs(s *Server) bool { + var inProgress int + s.mu.Lock() + for rn, r := range s.rmLeafRemoteCfgs { + if r.isConnectInProgress() { + inProgress++ + } else { + delete(s.rmLeafRemoteCfgs, rn) + } + } + s.mu.Unlock() + // Needs to be called again if inProgress > 0 + return inProgress > 0 +} + +// Will start a go routine that will periodically call `checkRemovedLeafNodeCfgs`. +// When the removed map has been emptied, the go routine will end. It is ok for +// this function to be invoked multiple times and have multiple instances running +// concurrently. +func checkRemovedLeafNodeCfgsAsync(s *Server) { + s.startGoRoutine(func() { + defer s.grWG.Done() + tick := time.NewTicker(50 * time.Millisecond) + defer tick.Stop() + for { + select { + case <-tick.C: + if checkAgain := checkRemovedLeafNodeCfgs(s); !checkAgain { + return } - s.Noticef("Reloaded: LeafNode(s) enabled") + case <-s.quitCh: + return } } + }) +} + +// The `co` compression options are applied to the given leaf connection `c`. +// If a "restart" of the connection is needed, will return true, false otherwise. +func applyCompressionChanges(c *client, co *CompressionOpts) bool { + newMode := co.Mode + // Skip leaf connections that are "not supported" (because they + // will never do compression) or the ones that have already the + // new compression mode. + if c.leaf.compression == CompressionNotSupported || c.leaf.compression == newMode { + return false } + // We need to close the connections if it had compression "off" or the new + // mode is compression "off", or if the new mode is "accept", because + // these require negotiation. + if c.leaf.compression == CompressionOff || newMode == CompressionOff || newMode == CompressionAccept { + return true + } else if newMode == CompressionS2Auto { + // If the mode is "s2_auto", we need to check if there is really + // need to change, and at any rate, we want to save the actual + // compression level here, not s2_auto. + c.updateS2AutoCompressionLevel(co, &c.leaf.compression) + } else { + // Simply change the compression writer + c.out.cw = s2.NewWriter(nil, s2WriterOptions(newMode)...) + c.leaf.compression = newMode + } + return false } type noFastProdStallReload struct { @@ -1031,7 +1338,7 @@ func (s *Server) recheckPinnedCerts(curOpts *Options, newOpts *Options) { } }) } - if s.gateway.enabled && reflect.DeepEqual(newOpts.Gateway.TLSPinnedCerts, curOpts.Gateway.TLSPinnedCerts) { + if s.gateway.enabled && !reflect.DeepEqual(newOpts.Gateway.TLSPinnedCerts, curOpts.Gateway.TLSPinnedCerts) { gw := s.gateway gw.RLock() for _, c := range gw.out { @@ -1115,11 +1422,6 @@ func (s *Server) ReloadOptions(newOpts *Options) error { curOpts := s.getOpts() - // Wipe trusted keys if needed when we have an operator. - if len(curOpts.TrustedOperators) > 0 && len(curOpts.TrustedKeys) > 0 { - curOpts.TrustedKeys = nil - } - clientOrgPort := curOpts.Port clusterOrgPort := curOpts.Cluster.Port gatewayOrgPort := curOpts.Gateway.Port @@ -1215,15 +1517,18 @@ func (s *Server) reloadOptions(curOpts, newOpts *Options) error { newOpts.CustomClientAuthentication = curOpts.CustomClientAuthentication newOpts.CustomRouterAuthentication = curOpts.CustomRouterAuthentication - changed, err := s.diffOptions(newOpts) - if err != nil { + // Do the validation before checking for differences. We need to ensure + // that the new options are valid. Note that there are possible side + // effects of calling validateOptions(), in that some default values + // may be set, etc... 
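checkRemovedLeafNodeCfgsAsync above polls on a 50ms ticker until the removed-remotes map drains, and exits early on server shutdown. A compact standalone sketch of that poll-until-done goroutine, with made-up names:

package main

import (
    "fmt"
    "sync"
    "time"
)

// pollUntilDrained re-runs check on a fixed interval until it reports there
// is nothing left in progress, or until quit closes. It is safe to start
// several of these; each stops once check returns false.
func pollUntilDrained(wg *sync.WaitGroup, interval time.Duration, quit <-chan struct{}, check func() bool) {
    wg.Add(1)
    go func() {
        defer wg.Done()
        tick := time.NewTicker(interval)
        defer tick.Stop()
        for {
            select {
            case <-tick.C:
                if !check() {
                    return
                }
            case <-quit:
                return
            }
        }
    }()
}

func main() {
    var wg sync.WaitGroup
    pending := 3
    pollUntilDrained(&wg, 10*time.Millisecond, nil, func() bool {
        pending--
        return pending > 0
    })
    wg.Wait()
    fmt.Println("pending:", pending)
}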
but that should be ok since the current options + // went through the same process on startup/previous reload. + if err := validateOptions(newOpts); err != nil { return err } - if len(changed) != 0 { - if err := validateOptions(newOpts); err != nil { - return err - } + changed, err := s.diffOptions(newOpts) + if err != nil { + return err } // Create a context that is used to pass special info that we may need @@ -1258,7 +1563,7 @@ func imposeOrder(value any) error { slices.Sort(value.AllowedOrigins) case string, bool, uint8, uint16, uint64, int, int32, int64, time.Duration, float64, nil, LeafNodeOpts, ClusterOpts, *tls.Config, PinnedCertSet, *URLAccResolver, *MemAccResolver, *DirAccResolver, *CacheDirAccResolver, Authentication, MQTTOpts, jwt.TagList, - *OCSPConfig, map[string]string, JSLimitOpts, StoreCipher, *OCSPResponseCacheConfig, *ProxiesConfig, WriteTimeoutPolicy: + *OCSPConfig, map[string]string, map[string]bool, JSLimitOpts, StoreCipher, *OCSPResponseCacheConfig, *ProxiesConfig, WriteTimeoutPolicy: // explicitly skipped types case *AuthCallout: case JSTpmOpts: @@ -1275,9 +1580,11 @@ func imposeOrder(value any) error { // error. func (s *Server) diffOptions(newOpts *Options) ([]option, error) { var ( - oldConfig = reflect.ValueOf(s.getOpts()).Elem() + oldOpts = s.getOpts() + oldConfig = reflect.ValueOf(oldOpts).Elem() newConfig = reflect.ValueOf(newOpts).Elem() diffOpts = []option{} + skipTKeys = len(oldOpts.TrustedOperators) > 0 && len(oldOpts.TrustedKeys) > 0 // Need to keep track of whether JS is being disabled // to prevent changing limits at runtime. @@ -1286,6 +1593,7 @@ func (s *Server) diffOptions(newOpts *Options) ([]option, error) { jsMemLimitsChanged bool jsFileLimitsChanged bool jsStoreDirChanged bool + jsLimitsUpdate *jetStreamLimitsOption ) for i := 0; i < oldConfig.NumField(); i++ { field := oldConfig.Type().Field(i) @@ -1294,6 +1602,17 @@ func (s *Server) diffOptions(newOpts *Options) ([]option, error) { if field.PkgPath != _EMPTY_ { continue } + optName := strings.ToLower(field.Name) + if skipTKeys && optName == "trustedkeys" { + // TrustedOperators and TrustedKeys change is not supported. During options + // validation, if they are both specified, a conflict error is returned. + // If only TrustedOperators is specified, the TrustedKeys is filled with + // the operators' signing keys. So here, if we detect that the current + // options have operators, we don't do the trusted keys comparison, so + // we can fail with the "not supported for TrustedOperators" config reload + // error instead of TrustedKeys (that the user would not have set). + continue + } var ( oldValue = oldConfig.Field(i).Interface() newValue = newConfig.Field(i).Interface() @@ -1305,7 +1624,6 @@ func (s *Server) diffOptions(newOpts *Options) ([]option, error) { return nil, err } - optName := strings.ToLower(field.Name) // accounts and users (referencing accounts) will always differ as accounts // contain internal state, say locks etc..., so we don't bother here. 
// This also avoids races with atomic stats counters @@ -1374,7 +1692,7 @@ func (s *Server) diffOptions(newOpts *Options) ([]option, error) { co := &clusterOption{ newValue: newClusterOpts, permsChanged: !reflect.DeepEqual(newClusterOpts.Permissions, oldClusterOpts.Permissions), - compressChanged: !compressOptsEqual(&oldClusterOpts.Compression, &newClusterOpts.Compression), + compressChanged: !oldClusterOpts.Compression.equals(&newClusterOpts.Compression), } co.diffPoolAndAccounts(&oldClusterOpts) // If there are added accounts, first make sure that we can look them up. @@ -1445,6 +1763,11 @@ func (s *Server) diffOptions(newOpts *Options) ([]option, error) { tmpOld.tlsConfigOpts = nil tmpNew.tlsConfigOpts = nil + // Allow TLSPinnedCerts through reload, existing connections + // are checked in recheckPinnedCerts + tmpOld.TLSPinnedCerts = nil + tmpNew.TLSPinnedCerts = nil + // Need to do the same for remote gateways' TLS configs. // But we can't just set remotes' TLSConfig to nil otherwise this // would lose the real TLS configuration. @@ -1458,149 +1781,19 @@ func (s *Server) diffOptions(newOpts *Options) ([]option, error) { field.Name, oldValue, newValue) } case "leafnode": - // Similar to gateways tmpOld := oldValue.(LeafNodeOpts) tmpNew := newValue.(LeafNodeOpts) - tmpOld.TLSConfig = nil - tmpNew.TLSConfig = nil - tmpOld.tlsConfigOpts = nil - tmpNew.tlsConfigOpts = nil - // We will allow TLSHandshakeFirst to be config reloaded. First, - // we just want to detect if there was a change in the leafnodes{} - // block, and if not, we will check the remotes. - handshakeFirstChanged := tmpOld.TLSHandshakeFirst != tmpNew.TLSHandshakeFirst || - tmpOld.TLSHandshakeFirstFallback != tmpNew.TLSHandshakeFirstFallback - // If changed, set them (in the temporary variables) to false so that the - // rest of the comparison does not fail. - if handshakeFirstChanged { - tmpOld.TLSHandshakeFirst, tmpNew.TLSHandshakeFirst = false, false - tmpOld.TLSHandshakeFirstFallback, tmpNew.TLSHandshakeFirstFallback = 0, 0 - } else if len(tmpOld.Remotes) == len(tmpNew.Remotes) { - // Since we don't support changes in the remotes, we will do a - // simple pass to see if there was a change of this field. - for i := 0; i < len(tmpOld.Remotes); i++ { - if tmpOld.Remotes[i].TLSHandshakeFirst != tmpNew.Remotes[i].TLSHandshakeFirst { - handshakeFirstChanged = true - break - } - } - } - // We also support config reload for compression. Check if it changed before - // blanking them out for the deep-equal check at the end. - compressionChanged := !compressOptsEqual(&tmpOld.Compression, &tmpNew.Compression) - if compressionChanged { - tmpOld.Compression, tmpNew.Compression = CompressionOpts{}, CompressionOpts{} - } else if len(tmpOld.Remotes) == len(tmpNew.Remotes) { - // Same that for tls first check, do the remotes now. - for i := range len(tmpOld.Remotes) { - if !compressOptsEqual(&tmpOld.Remotes[i].Compression, &tmpNew.Remotes[i].Compression) { - compressionChanged = true - break - } - } - } - // Check if the "disabled" option of each remote has changed. - var disabledChanged bool - for i := range len(tmpOld.Remotes) { - if tmpOld.Remotes[i].Disabled != tmpNew.Remotes[i].Disabled { - disabledChanged = true - break - } - } - - // Need to do the same for remote leafnodes' TLS configs. - // But we can't just set remotes' TLSConfig to nil otherwise this - // would lose the real TLS configuration. 
- tmpOld.Remotes = copyRemoteLNConfigForReloadCompare(tmpOld.Remotes) - tmpNew.Remotes = copyRemoteLNConfigForReloadCompare(tmpNew.Remotes) - - // Special check for leafnode remotes changes which are not supported right now. - leafRemotesChanged := func(a, b LeafNodeOpts) bool { - if len(a.Remotes) != len(b.Remotes) { - return true - } - - // Check whether all remotes URLs are still the same. - for _, oldRemote := range a.Remotes { - var found bool - - if oldRemote.LocalAccount == _EMPTY_ { - oldRemote.LocalAccount = globalAccountName - } - - for _, newRemote := range b.Remotes { - // Bind to global account in case not defined. - if newRemote.LocalAccount == _EMPTY_ { - newRemote.LocalAccount = globalAccountName - } - - if reflect.DeepEqual(oldRemote, newRemote) { - found = true - break - } - } - if !found { - return true - } - } - return false + lno, err := getLeafNodeOptionsChanges(s, &tmpOld, &tmpNew) + // If there was an unsupported change, we will get an error with the name + // of the (first) field and its old and new value. + if err != nil { + return nil, fmt.Errorf("config reload not supported for %s: %v", field.Name, err) } - - // First check whether remotes changed at all. If they did not, - // skip them in the complete equal check. - if !leafRemotesChanged(tmpOld, tmpNew) { - tmpOld.Remotes = nil - tmpNew.Remotes = nil - } - - // Special check for auth users to detect changes. - // If anything is off will fall through and fail below. - // If we detect they are semantically the same we nil them out - // to pass the check below. - if tmpOld.Users != nil || tmpNew.Users != nil { - if len(tmpOld.Users) == len(tmpNew.Users) { - oua := make(map[string]*User, len(tmpOld.Users)) - nua := make(map[string]*User, len(tmpOld.Users)) - for _, u := range tmpOld.Users { - oua[u.Username] = u - } - for _, u := range tmpNew.Users { - nua[u.Username] = u - } - same := true - for uname, u := range oua { - // If we can not find new one with same name, drop through to fail. - nu, ok := nua[uname] - if !ok { - same = false - break - } - // If username or password or account different break. - if u.Username != nu.Username || u.Password != nu.Password || u.Account.GetName() != nu.Account.GetName() { - same = false - break - } - } - // We can nil out here. - if same { - tmpOld.Users, tmpNew.Users = nil, nil - } - } + // If there was an actual change... + if lno != nil { + diffOpts = append(diffOpts, lno) } - - // If there is really a change prevents reload. - if !reflect.DeepEqual(tmpOld, tmpNew) { - // See TODO(ik) note below about printing old/new values. - return nil, fmt.Errorf("config reload not supported for %s: old=%v, new=%v", - field.Name, oldValue, newValue) - } - - diffOpts = append(diffOpts, &leafNodeOption{ - tlsFirstChanged: handshakeFirstChanged, - compressionChanged: compressionChanged, - disabledChanged: disabledChanged, - }) case "jetstream": new := newValue.(bool) old := oldValue.(bool) @@ -1636,26 +1829,35 @@ func (s *Server) diffOptions(newOpts *Options) ([]option, error) { fromSet = !fromUnset toUnset = new == -1 toSet = !toUnset + increased = fromSet && toSet && new > old ) if jsEnabled && modified { // Cannot change limits from dynamic storage at runtime. switch { + case increased: + // Allowed to increase, but not decrease. 
+ if jsLimitsUpdate == nil { + jsLimitsUpdate = &jetStreamLimitsOption{} + diffOpts = append(diffOpts, jsLimitsUpdate) + } + if optName == "jetstreammaxmemory" { + jsLimitsUpdate.newMaxMemory = new + } else { + jsLimitsUpdate.newMaxStore = new + } case fromSet && toUnset: // Limits changed but it may mean that JS is being disabled, // keep track of the change and error in case it is not. - switch optName { - case "jetstreammaxmemory": + if optName == "jetstreammaxmemory" { jsMemLimitsChanged = true - case "jetstreammaxstore": + } else { jsFileLimitsChanged = true - default: - return nil, fmt.Errorf("config reload not supported for jetstream max memory and store") } case fromUnset && toSet: // Prevent changing from dynamic max memory / file at runtime. return nil, fmt.Errorf("config reload not supported for jetstream dynamic max memory and store") default: - return nil, fmt.Errorf("config reload not supported for jetstream max memory and store") + return nil, fmt.Errorf("config reload not supported for decreasing jetstream max memory and store") } } case "jetstreammetacompact", "jetstreammetacompactsize", "jetstreammetacompactsync": @@ -1801,32 +2003,6 @@ func copyRemoteGWConfigsWithoutTLSConfig(current []*RemoteGatewayOpts) []*Remote return rgws } -func copyRemoteLNConfigForReloadCompare(current []*RemoteLeafOpts) []*RemoteLeafOpts { - l := len(current) - if l == 0 { - return nil - } - rlns := make([]*RemoteLeafOpts, 0, l) - for _, rcfg := range current { - cp := *rcfg - cp.TLSConfig = nil - cp.tlsConfigOpts = nil - cp.TLSHandshakeFirst = false - // This is set only when processing a CONNECT, so reset here so that we - // don't fail the DeepEqual comparison. - cp.TLS = false - // For now, remove DenyImports/Exports since those get modified at runtime - // to add JS APIs. - cp.DenyImports, cp.DenyExports = nil, nil - // Remove compression mode - cp.Compression = CompressionOpts{} - // Reset disabled status - cp.Disabled = false - rlns = append(rlns, &cp) - } - return rlns -} - func (s *Server) applyOptions(ctx *reloadContext, opts []option) { var ( reloadLogging = false @@ -1896,15 +2072,12 @@ func (s *Server) applyOptions(ctx *reloadContext, opts []option) { s.sendStatszUpdate() } - // For remote gateways and leafnodes, make sure that their TLS configuration + // For remote gateways, make sure that their TLS configuration // is updated (since the config is "captured" early and changes would otherwise // not be visible). if s.gateway.enabled { s.gateway.updateRemotesTLSConfig(newOpts) } - if len(newOpts.LeafNode.Remotes) > 0 { - s.updateRemoteLeafNodesTLSConfig(newOpts) - } // Always restart OCSP monitoring on reload. if err := s.reloadOCSP(); err != nil { @@ -2650,3 +2823,38 @@ func diffProxiesTrustedKeys(old, new []*ProxyConfig) ([]string, []string) { } return add, del } + +// This function calls `reflect.DeepEqual` on all public fields that are +// not part of the `ignoreFields` list. If they are all equal, returns nil, +// otherwise returns an error that will contain the name of the first field +// that fails the comparison, along with its old and new values. +func checkConfigsEqual(c1, c2 any, ignoreFields []string) error { + oldConfig := reflect.ValueOf(c1).Elem() + newConfig := reflect.ValueOf(c2).Elem() + for i := 0; i < oldConfig.NumField(); i++ { + field := oldConfig.Type().Field(i) + // field.PkgPath is empty for exported fields, and is not for unexported ones. + // We skip the unexported fields. 
+ if field.PkgPath != _EMPTY_ { + continue + } + // If it is in the set of fields to ignore, move to the next. + // We expect the number of ignore fields to be small. + var ignored bool + for _, f := range ignoreFields { + if f == field.Name { + ignored = true + break + } + } + if ignored { + continue + } + oldValue := oldConfig.Field(i).Interface() + newValue := newConfig.Field(i).Interface() + if !reflect.DeepEqual(oldValue, newValue) { + return fmt.Errorf("field %q: old=%v, new=%v", field.Name, oldValue, newValue) + } + } + return nil +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/scheduler.go b/vendor/github.com/nats-io/nats-server/v2/server/scheduler.go index d827c39451..69bf0155bd 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/scheduler.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/scheduler.go @@ -19,6 +19,7 @@ import ( "io" "math" "slices" + "strings" "time" "github.com/nats-io/nats-server/v2/server/thw" @@ -77,6 +78,18 @@ func (ms *MsgScheduling) init(seq uint64, subj string, ts int64) { delete(ms.inflight, subj) } +func (ms *MsgScheduling) update(subj string, ts int64) { + if sched, ok := ms.schedules[subj]; ok { + // Remove and add separately, it's for the same sequence, but if replicated + // this server could not know the previous timestamp yet. + ms.ttls.Remove(sched.seq, sched.ts) + ms.ttls.Add(sched.seq, ts) + sched.ts = ts + delete(ms.inflight, subj) + ms.resetTimer() + } +} + func (ms *MsgScheduling) markInflight(subj string) { if _, ok := ms.schedules[subj]; ok { ms.inflight[subj] = struct{}{} @@ -90,8 +103,7 @@ func (ms *MsgScheduling) isInflight(subj string) bool { func (ms *MsgScheduling) remove(seq uint64) { if subj, ok := ms.seqToSubj[seq]; ok { - delete(ms.seqToSubj, seq) - delete(ms.schedules, subj) + ms.removeSubject(subj) } } @@ -100,6 +112,7 @@ func (ms *MsgScheduling) removeSubject(subj string) { ms.ttls.Remove(sched.seq, sched.ts) delete(ms.schedules, subj) delete(ms.seqToSubj, sched.seq) + delete(ms.inflight, subj) } } @@ -142,11 +155,12 @@ func (ms *MsgScheduling) resetTimer() { } } -func (ms *MsgScheduling) getScheduledMessages(loadMsg func(seq uint64, smv *StoreMsg) *StoreMsg) []*inMsg { +func (ms *MsgScheduling) getScheduledMessages(loadMsg func(seq uint64, smv *StoreMsg) *StoreMsg, loadLast func(subj string, smv *StoreMsg) *StoreMsg) []*inMsg { var ( - smv StoreMsg - sm *StoreMsg - msgs []*inMsg + smv StoreMsg + srcSmv StoreMsg + sm *StoreMsg + msgs []*inMsg ) ms.ttls.ExpireTasks(func(seq uint64, ts int64) bool { // Need to grab the message for the specified sequence, and check @@ -155,10 +169,26 @@ func (ms *MsgScheduling) getScheduledMessages(loadMsg func(seq uint64, smv *Stor if sm != nil { // If already inflight, don't duplicate a scheduled message. The stream could // be replicated and the scheduled message could take some time to propagate. - if ms.isInflight(sm.subj) { + subj := sm.subj + if ms.isInflight(subj) { return false } // Validate the contents are correct if not, we just remove it from THW. 
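The reflection-based checkConfigsEqual helper added in the reload changes above compares only exported fields, skips an explicit ignore list, and reports the first mismatching field in its error. A usage sketch against a throwaway struct (not a real server options type):

package main

import (
    "fmt"
    "reflect"
)

// fieldsEqual is a stripped-down stand-in for checkConfigsEqual: it walks the
// exported fields of two structs of the same type and reports the first field
// outside the ignore list whose values differ.
func fieldsEqual(a, b any, ignore []string) error {
    av, bv := reflect.ValueOf(a).Elem(), reflect.ValueOf(b).Elem()
    for i := 0; i < av.NumField(); i++ {
        f := av.Type().Field(i)
        if !f.IsExported() {
            continue
        }
        skip := false
        for _, name := range ignore {
            if name == f.Name {
                skip = true
                break
            }
        }
        if skip {
            continue
        }
        if !reflect.DeepEqual(av.Field(i).Interface(), bv.Field(i).Interface()) {
            return fmt.Errorf("field %q: old=%v, new=%v", f.Name, av.Field(i).Interface(), bv.Field(i).Interface())
        }
    }
    return nil
}

type demoOpts struct {
    Host        string
    Port        int
    Compression string
}

func main() {
    oldOpts := &demoOpts{Host: "a", Port: 4222, Compression: "off"}
    newOpts := &demoOpts{Host: "a", Port: 4223, Compression: "s2_auto"}
    // Compression changes are handled elsewhere, so ignore that field here.
    fmt.Println(fieldsEqual(oldOpts, newOpts, []string{"Compression"}))
    // field "Port": old=4222, new=4223
}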
+ pattern := bytesToString(sliceHeader(JSSchedulePattern, sm.hdr)) + if pattern == _EMPTY_ { + ms.remove(seq) + return true + } + loc, apiErr := loadMessageScheduleLocation(sm.hdr) + if apiErr != nil { + ms.remove(seq) + return true + } + next, repeat, ok := parseMsgSchedule(pattern, loc, ts) + if !ok { + ms.remove(seq) + return true + } ttl, ok := getMessageScheduleTTL(sm.hdr) if !ok { ms.remove(seq) @@ -169,27 +199,43 @@ func (ms *MsgScheduling) getScheduledMessages(loadMsg func(seq uint64, smv *Stor ms.remove(seq) return true } + rollup := getMessageScheduleRollup(sm.hdr) + source := getMessageScheduleSource(sm.hdr) + if source != _EMPTY_ { + // Fall back to the scheduled message's own content if the source has no last message. + if srcSm := loadLast(source, &srcSmv); srcSm != nil { + sm = srcSm + } + } // Copy, as this is retrieved directly from storage, and we'll need to keep hold of this for some time. // And in the case of headers, we'll copy all of them, but make changes. hdr, msg := copyBytes(sm.hdr), copyBytes(sm.msg) - // Strip headers specific to the schedule. - hdr = removeHeaderIfPresent(hdr, JSSchedulePattern) - hdr = removeHeaderIfPrefixPresent(hdr, "Nats-Schedule-") + // Strip headers specific to message scheduling. + // Covers Nats-Schedule, Nats-Schedule-*, and Nats-Scheduler. + hdr = removeHeaderIfPrefixPresent(hdr, "Nats-Schedule") + // Strip headers that could prevent persisting this scheduled message. hdr = removeHeaderIfPrefixPresent(hdr, "Nats-Expected-") hdr = removeHeaderIfPresent(hdr, JSMsgId) hdr = removeHeaderIfPresent(hdr, JSMessageTTL) hdr = removeHeaderIfPresent(hdr, JSMsgRollup) // Add headers for the scheduled message. - hdr = genHeader(hdr, JSScheduler, sm.subj) - hdr = genHeader(hdr, JSScheduleNext, JSScheduleNextPurge) // Purge the schedule message itself. + hdr = genHeader(hdr, JSScheduler, subj) + if !repeat { + hdr = genHeader(hdr, JSScheduleNext, JSScheduleNextPurge) // Purge the schedule message itself. + } else { + hdr = genHeader(hdr, JSScheduleNext, next.Format(time.RFC3339)) // Next time the schedule fires. + } if ttl != _EMPTY_ { hdr = genHeader(hdr, JSMessageTTL, ttl) } + if rollup != _EMPTY_ { + hdr = genHeader(hdr, JSMsgRollup, rollup) + } msgs = append(msgs, &inMsg{seq: seq, subj: target, hdr: hdr, msg: msg}) - ms.markInflight(sm.subj) + ms.markInflight(subj) return false } ms.remove(seq) @@ -261,3 +307,70 @@ func (ms *MsgScheduling) decode(b []byte) (uint64, error) { } return stamp, nil } + +// parseMsgSchedule parses a message schedule pattern and returns the time +// to fire, whether it is a repeating schedule, and whether the pattern was valid. +func parseMsgSchedule(pattern string, loc *time.Location, ts int64) (time.Time, bool, bool) { + if pattern == _EMPTY_ { + return time.Time{}, false, true + } + // Exact time. + if strings.HasPrefix(pattern, "@at ") { + // Time zone is not supported for @at. + if loc != nil { + return time.Time{}, false, false + } + t, err := time.Parse(time.RFC3339, pattern[4:]) + return t, false, err == nil + } + // Repeating on a simple interval. + if strings.HasPrefix(pattern, "@every ") { + // Time zone is not supported for @every. + if loc != nil { + return time.Time{}, false, false + } + dur, err := time.ParseDuration(pattern[7:]) + if err != nil { + return time.Time{}, false, false + } + // Only allow intervals of at least a second. 
+ if dur.Seconds() < 1 { + return time.Time{}, false, false + } + // If this schedule would trigger multiple times, for example after a restart, skip ahead and only fire once. + next := time.Unix(0, ts).UTC().Round(time.Second).Add(dur) + if now := time.Now().UTC(); next.Before(now) { + next = now.Round(time.Second).Add(dur) + } + return next, true, true + } + + // Predefined schedules for cron. + switch pattern { + case "@yearly", "@annually": + pattern = "0 0 0 1 1 *" + case "@monthly": + pattern = "0 0 0 1 * *" + case "@weekly": + pattern = "0 0 0 * * 0" + case "@daily", "@midnight": + pattern = "0 0 0 * * *" + case "@hourly": + pattern = "0 0 * * * *" + } + + // Parse the cron pattern. + next, err := parseCron(pattern, loc, ts) + if err != nil { + return time.Time{}, false, false + } + // If this schedule would trigger multiple times, for example after a restart, skip ahead and only fire once. + if now := time.Now().UTC(); next.Before(now) { + ts = now.Round(time.Second).UnixNano() + next, err = parseCron(pattern, loc, ts) + if err != nil { + return time.Time{}, false, false + } + } + return next, true, true +} diff --git a/vendor/github.com/nats-io/nats-server/v2/server/server.go b/vendor/github.com/nats-io/nats-server/v2/server/server.go index aa28534dc1..9612c868c1 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/server.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/server.go @@ -31,7 +31,6 @@ import ( "os" "path" "path/filepath" - "reflect" "regexp" "runtime" "runtime/pprof" @@ -234,7 +233,8 @@ type Server struct { resolver netResolver dialTimeout time.Duration } - leafRemoteCfgs []*leafNodeCfg + leafRemoteCfgs map[*leafNodeCfg]struct{} + rmLeafRemoteCfgs map[string]*leafNodeCfg leafRemoteAccounts sync.Map leafNodeEnabled bool leafDisableConnect bool // Used in test only @@ -367,7 +367,8 @@ type Server struct { syncOutSem chan struct{} // Queue to process JS API requests that come from routes (or gateways) - jsAPIRoutedReqs *ipQueue[*jsAPIRoutedReq] + jsAPIRoutedReqs *ipQueue[*jsAPIRoutedReq] + jsAPIRoutedInfoReqs *ipQueue[*jsAPIRoutedReq] // Delayed API responses. delayedAPIResponses *ipQueue[*delayedAPIResponse] @@ -645,32 +646,6 @@ func selectS2AutoModeBasedOnRTT(rtt time.Duration, rttThresholds []time.Duration return CompressionS2Best } -func compressOptsEqual(c1, c2 *CompressionOpts) bool { - if c1 == c2 { - return true - } - if (c1 == nil && c2 != nil) || (c1 != nil && c2 == nil) { - return false - } - if c1.Mode != c2.Mode { - return false - } - // For s2_auto, if one has an empty RTTThresholds, it is equivalent - // to the defaultCompressionS2AutoRTTThresholds array, so compare with that. - if c1.Mode == CompressionS2Auto { - if len(c1.RTTThresholds) == 0 && !reflect.DeepEqual(c2.RTTThresholds, defaultCompressionS2AutoRTTThresholds) { - return false - } - if len(c2.RTTThresholds) == 0 && !reflect.DeepEqual(c1.RTTThresholds, defaultCompressionS2AutoRTTThresholds) { - return false - } - if !reflect.DeepEqual(c1.RTTThresholds, c2.RTTThresholds) { - return false - } - } - return true -} - // Returns an array of s2 WriterOption based on the route compression mode. // So far we return a single option, but this way we can call s2.NewWriter() // with a nil []s2.WriterOption, but not with a nil s2.WriterOption, so @@ -1956,12 +1931,7 @@ func (s *Server) registerAccount(acc *Account) *Account { // Helper to set the sublist based on preferences. 
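The @every handling above skips ahead when the next computed fire time is already in the past, for example after a restart, so the schedule fires once rather than replaying every missed interval. A small standalone illustration of that catch-up rule with an invented anchor timestamp:

package main

import (
    "fmt"
    "time"
)

// nextEvery mirrors the @every catch-up: compute the slot after the anchor
// timestamp, and if that slot is already behind the current time, jump to
// one interval after "now" instead of firing for every missed slot.
func nextEvery(anchor time.Time, every time.Duration, now time.Time) time.Time {
    next := anchor.UTC().Round(time.Second).Add(every)
    if next.Before(now) {
        next = now.UTC().Round(time.Second).Add(every)
    }
    return next
}

func main() {
    now := time.Now()
    anchor := now.Add(-1 * time.Hour) // the schedule last fired an hour ago
    next := nextEvery(anchor, time.Minute, now)
    // Roughly one minute from now, not sixty catch-up firings.
    fmt.Println(next.Sub(now).Round(time.Second))
}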
func (s *Server) setAccountSublist(acc *Account) { if acc != nil && acc.sl == nil { - opts := s.getOpts() - if opts != nil && opts.NoSublistCache { - acc.sl = NewSublistNoCache() - } else { - acc.sl = NewSublistWithCache() - } + acc.sl = NewSublistForServer(s) } } @@ -2293,6 +2263,7 @@ func (s *Server) Start() { s.Noticef(" Node: %s", getHash(s.info.Name)) } s.Noticef(" ID: %s", s.info.ID) + s.printFeatureFlags(opts) defer s.Noticef("Server is ready") @@ -3586,7 +3557,7 @@ func (s *Server) saveClosedClient(c *client, nc net.Conn, subs map[string]*subsc if c.acc != nil && c.acc.Name != globalAccountName { cc.acc = c.acc.Name } - cc.JWT = c.opts.JWT + cc.JWT = redactBearerJWT(c.opts.JWT) cc.IssuerKey = issuerForClient(c) cc.Tags = c.tags cc.NameTag = c.nameTag diff --git a/vendor/github.com/nats-io/nats-server/v2/server/store.go b/vendor/github.com/nats-io/nats-server/v2/server/store.go index 35b4696706..31a834efac 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/store.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/store.go @@ -92,15 +92,15 @@ type ProcessJetStreamMsgHandler func(*inMsg) type StreamStore interface { StoreMsg(subject string, hdr, msg []byte, ttl int64) (uint64, int64, error) - StoreRawMsg(subject string, hdr, msg []byte, seq uint64, ts int64, ttl int64) error + StoreRawMsg(subject string, hdr, msg []byte, seq uint64, ts int64, ttl int64, discardNewCheck bool) error SkipMsg(seq uint64) (uint64, error) SkipMsgs(seq uint64, num uint64) error - FlushAllPending() + FlushAllPending() error LoadMsg(seq uint64, sm *StoreMsg) (*StoreMsg, error) LoadNextMsg(filter string, wc bool, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) LoadNextMsgMulti(sl *gsl.SimpleSublist, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) LoadLastMsg(subject string, sm *StoreMsg) (*StoreMsg, error) - LoadPrevMsg(start uint64, smp *StoreMsg) (sm *StoreMsg, err error) + LoadPrevMsg(filter string, wc bool, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) LoadPrevMsgMulti(sl *gsl.SimpleSublist, start uint64, smp *StoreMsg) (sm *StoreMsg, skip uint64, err error) RemoveMsg(seq uint64) (bool, error) EraseMsg(seq uint64) (bool, error) @@ -109,7 +109,7 @@ type StreamStore interface { Compact(seq uint64) (uint64, error) Truncate(seq uint64) error GetSeqFromTime(t time.Time) uint64 - FilteredState(seq uint64, subject string) SimpleState + FilteredState(seq uint64, subject string) (SimpleState, error) SubjectsState(filterSubject string) map[string]SimpleState SubjectsTotals(filterSubject string) map[string]uint64 AllLastSeqs() ([]uint64, error) @@ -120,7 +120,7 @@ type StreamStore interface { State() StreamState FastState(*StreamState) EncodedStreamState(failed uint64) (enc []byte, err error) - SyncDeleted(dbs DeleteBlocks) + SyncDeleted(dbs DeleteBlocks) error Type() StorageType RegisterStorageUpdates(StorageUpdateHandler) RegisterStorageRemoveMsg(StorageRemoveMsgHandler) @@ -360,11 +360,13 @@ func (dbs DeleteBlocks) NumDeleted() (total uint64) { type ConsumerStore interface { SetStarting(sseq uint64) error UpdateStarting(sseq uint64) + Reset(sseq uint64) error HasState() bool UpdateDelivered(dseq, sseq, dc uint64, ts int64) error UpdateAcks(dseq, sseq uint64) error UpdateConfig(cfg *ConsumerConfig) error Update(*ConsumerState) error + ForceUpdate(*ConsumerState) error State() (*ConsumerState, error) BorrowState() (*ConsumerState, error) EncodedState() ([]byte, error) @@ -464,13 +466,6 @@ type Pending struct { Timestamp int64 } -// 
TemplateStore stores templates. -// Deprecated: stream templates are deprecated and will be removed in a future version. -type TemplateStore interface { - Store(*streamTemplate) error - Delete(*streamTemplate) error -} - const ( limitsPolicyJSONString = `"limits"` interestPolicyJSONString = `"interest"` @@ -602,15 +597,17 @@ func (st *StorageType) UnmarshalJSON(data []byte) error { } const ( - ackNonePolicyJSONString = `"none"` - ackAllPolicyJSONString = `"all"` - ackExplicitPolicyJSONString = `"explicit"` + ackNonePolicyJSONString = `"none"` + ackAllPolicyJSONString = `"all"` + ackExplicitPolicyJSONString = `"explicit"` + ackFlowControlPolicyJSONString = `"flow_control"` ) var ( - ackNonePolicyJSONBytes = []byte(ackNonePolicyJSONString) - ackAllPolicyJSONBytes = []byte(ackAllPolicyJSONString) - ackExplicitPolicyJSONBytes = []byte(ackExplicitPolicyJSONString) + ackNonePolicyJSONBytes = []byte(ackNonePolicyJSONString) + ackAllPolicyJSONBytes = []byte(ackAllPolicyJSONString) + ackExplicitPolicyJSONBytes = []byte(ackExplicitPolicyJSONString) + ackFlowControlPolicyJSONBytes = []byte(ackFlowControlPolicyJSONString) ) func (ap AckPolicy) MarshalJSON() ([]byte, error) { @@ -621,6 +618,8 @@ func (ap AckPolicy) MarshalJSON() ([]byte, error) { return ackAllPolicyJSONBytes, nil case AckExplicit: return ackExplicitPolicyJSONBytes, nil + case AckFlowControl: + return ackFlowControlPolicyJSONBytes, nil default: return nil, fmt.Errorf("can not marshal %v", ap) } @@ -634,6 +633,8 @@ func (ap *AckPolicy) UnmarshalJSON(data []byte) error { *ap = AckAll case ackExplicitPolicyJSONString: *ap = AckExplicit + case ackFlowControlPolicyJSONString: + *ap = AckFlowControl default: return fmt.Errorf("can not unmarshal %q", data) } @@ -741,7 +742,7 @@ func isOutOfSpaceErr(err error) bool { var errFirstSequenceMismatch = errors.New("first sequence mismatch") func isClusterResetErr(err error) bool { - return err == errLastSeqMismatch || err == ErrStoreEOF || err == errFirstSequenceMismatch || errors.Is(err, errCatchupAbortedNoLeader) || err == errCatchupTooManyRetries + return err == errLastSeqMismatch || err == ErrStoreEOF || err == errFirstSequenceMismatch || errors.Is(err, errCatchupAbortedNoLeader) || err == errCatchupTooManyRetries || err == errAlreadyLeader } // Copy all fields. diff --git a/vendor/github.com/nats-io/nats-server/v2/server/stream.go b/vendor/github.com/nats-io/nats-server/v2/server/stream.go index 43df7fc564..5698f3cb2a 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/stream.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/stream.go @@ -33,6 +33,7 @@ import ( "sync/atomic" "time" + "github.com/antithesishq/antithesis-sdk-go/assert" "github.com/klauspost/compress/s2" "github.com/nats-io/nats-server/v2/server/gsl" "github.com/nats-io/nuid" @@ -63,7 +64,6 @@ type StreamConfig struct { Storage StorageType `json:"storage"` Replicas int `json:"num_replicas"` NoAck bool `json:"no_ack,omitempty"` - Template string `json:"template_owner,omitempty"` // Deprecated: stream templates are deprecated and will be removed in a future version. Duplicates time.Duration `json:"duplicate_window,omitempty"` Placement *Placement `json:"placement,omitempty"` Mirror *StreamSource `json:"mirror,omitempty"` @@ -122,6 +122,9 @@ type StreamConfig struct { // PersistMode allows to opt-in to different persistence mode settings. PersistMode PersistModeType `json:"persist_mode,omitempty"` + // AllowBatchPublish allows fast batch publishing into the stream. 
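The store changes above teach AckPolicy's JSON handling a new "flow_control" value alongside "none", "all", and "explicit". A self-contained miniature of the same enum-to-JSON-string round trip, using a local type rather than the server's AckPolicy:

package main

import (
    "encoding/json"
    "fmt"
)

type ackPolicy int

const (
    ackNone ackPolicy = iota
    ackAll
    ackExplicit
    ackFlowControl
)

var ackNames = map[ackPolicy]string{
    ackNone:        "none",
    ackAll:         "all",
    ackExplicit:    "explicit",
    ackFlowControl: "flow_control",
}

func (p ackPolicy) MarshalJSON() ([]byte, error) {
    name, ok := ackNames[p]
    if !ok {
        return nil, fmt.Errorf("can not marshal %v", int(p))
    }
    return json.Marshal(name)
}

func (p *ackPolicy) UnmarshalJSON(data []byte) error {
    var name string
    if err := json.Unmarshal(data, &name); err != nil {
        return err
    }
    for v, n := range ackNames {
        if n == name {
            *p = v
            return nil
        }
    }
    return fmt.Errorf("can not unmarshal %q", data)
}

func main() {
    b, _ := json.Marshal(ackFlowControl)
    fmt.Println(string(b)) // "flow_control"
    var p ackPolicy
    _ = json.Unmarshal([]byte(`"flow_control"`), &p)
    fmt.Println(p == ackFlowControl) // true
}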
+ AllowBatchPublish bool `json:"allow_batched,omitempty"` + // Metadata is additional metadata for the Stream. Metadata map[string]string `json:"metadata,omitempty"` } @@ -274,6 +277,68 @@ type CounterValue struct { // e.g. {"stream":{"subject":"123"}} type CounterSources map[string]map[string]string +// BatchFlowAck is used for flow control when fast batch publishing into a stream. +// This message is vital to handling acknowledgements and flow control. +// These may technically be lost without the client receiving it. The client can retrieve +// these by using the "ping" operation if it's expecting acks but not receiving any. +type BatchFlowAck struct { + // Type: "ack" + Type string `json:"type"` + // Sequence is the sequence of the message that triggered the ack. + // If "gap: fail" this means the messages up to and including Sequence were persisted. + // If "gap: ok" this means _some_ of the messages up to and including Sequence were persisted. + // But there could have been gaps. + Sequence uint64 `json:"seq"` + // Messages indicates acknowledgements will be sent every N messages. + Messages uint16 `json:"msgs"` +} + +func (ack BatchFlowAck) MarshalJSON() ([]byte, error) { + type Alias BatchFlowAck + a := Alias(ack) + a.Type = "ack" + return json.Marshal(a) +} + +// BatchFlowGap is used for reporting gaps when fast batch publishing into a stream. +// This message is purely informational and could technically be lost without the client receiving it. +type BatchFlowGap struct { + // Type: "gap" + Type string `json:"type"` + // ExpectedLastSequence is the sequence expected to be received next. + // Messages starting from ExpectedLastSequence up to (but not including) CurrentSequence were lost. + ExpectedLastSequence uint64 `json:"last_seq"` + // CurrentSequence is the sequence of the message that just came in and detected the gap. + CurrentSequence uint64 `json:"seq"` +} + +func (gap BatchFlowGap) MarshalJSON() ([]byte, error) { + type Alias BatchFlowGap + a := Alias(gap) + a.Type = "gap" + return json.Marshal(a) +} + +// BatchFlowErr is used for reporting errors when fast batch publishing into a stream. +// This message is purely informational and could technically be lost without the client receiving it. +type BatchFlowErr struct { + // Type: "err" + Type string `json:"type"` + // Sequence is the sequence of the message that triggered the error. + // There are no (relative) guarantees whatsoever about whether the messages up to this sequence were persisted. + // Such guarantees require the use of "gap: fail" and listening for BatchFlowAck and PubAck. + Sequence uint64 `json:"seq"` + // Error is used to return the error for the Sequence. + Error *ApiError `json:"error"` +} + +func (err BatchFlowErr) MarshalJSON() ([]byte, error) { + type Alias BatchFlowErr + a := Alias(err) + a.Type = "err" + return json.Marshal(a) +} + // StreamInfo shows config and current state for this stream. type StreamInfo struct { Config StreamConfig `json:"config"` @@ -345,11 +410,18 @@ type StreamSource struct { FilterSubject string `json:"filter_subject,omitempty"` SubjectTransforms []SubjectTransformConfig `json:"subject_transforms,omitempty"` External *ExternalStream `json:"external,omitempty"` + Consumer *StreamConsumerSource `json:"consumer,omitempty"` // Internal iname string // For indexing when stream names are the same for multiple sources. } +// StreamConsumerSource dictates a durable consumer with a specific name is used for sourcing. 
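+// A hypothetical source entry using it (names are placeholders) could look like:
+//   {"name":"ORIGIN","consumer":{"name":"DUR_SRC","deliver_subject":"deliver.origin.src"}}
+// The deliver subject must be a literal, valid subject, and the source may not also
+// set a start sequence, start time or filter subject.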
+type StreamConsumerSource struct { + Name string `json:"name,omitempty"` + DeliverSubject string `json:"deliver_subject,omitempty"` +} + // ExternalStream allows you to qualify access to a stream source in another account or domain. type ExternalStream struct { ApiPrefix string `json:"api"` @@ -372,17 +444,25 @@ const ( // For managing stream batches. const ( - streamDefaultMaxBatchInflightPerStream = 50 - streamDefaultMaxBatchInflightTotal = 1000 - streamDefaultMaxBatchSize = 1000 - streamDefaultMaxBatchTimeout = 10 * time.Second + streamDefaultMaxBatchTimeout = 10 * time.Second + // Atomic batches. + streamDefaultMaxAtomicBatchInflightPerStream = 50 + streamDefaultMaxAtomicBatchInflightTotal = 1000 + streamDefaultMaxAtomicBatchSize = 1000 + // Fast batches. + streamDefaultMaxFastBatchInflightPerStream = 1000 + streamDefaultMaxFastBatchInflightTotal = 50_000 ) var ( - streamMaxBatchInflightPerStream = streamDefaultMaxBatchInflightPerStream - streamMaxBatchInflightTotal = streamDefaultMaxBatchInflightTotal - streamMaxBatchSize = streamDefaultMaxBatchSize - streamMaxBatchTimeout = streamDefaultMaxBatchTimeout + streamMaxBatchTimeout = streamDefaultMaxBatchTimeout + // Atomic batches. + streamMaxAtomicBatchInflightPerStream = streamDefaultMaxAtomicBatchInflightPerStream + streamMaxAtomicBatchInflightTotal = streamDefaultMaxAtomicBatchInflightTotal + streamMaxAtomicBatchSize = streamDefaultMaxAtomicBatchSize + // Fast batches. + streamMaxFastBatchInflightPerStream = streamDefaultMaxFastBatchInflightPerStream + streamMaxFastBatchInflightTotal = streamDefaultMaxFastBatchInflightTotal ) // Stream is a jetstream stream of messages. When we receive a message internally destined @@ -437,8 +517,8 @@ type stream struct { sourcesConsumerSetup *time.Timer smsgs *ipQueue[*inMsg] // Intra-process queue for all incoming sourced messages. - // Indicates we have direct consumers. - directs int + // Indicates we have direct/sourcing consumers. + sourcingConsumers int // For input subject transform. itr *subjectTransform @@ -475,8 +555,10 @@ type stream struct { lqsent time.Time // The time at which the last lost quorum advisory was sent. Used to rate limit. uch chan struct{} // The channel to signal updates to the monitor routine. inMonitor bool // True if the monitor routine has been started. + werr error // If a write error was encountered, and if so what error. inflight map[string]*inflightSubjectRunningTotal // Inflight message sizes per subject. + inflightTransform map[uint64]string // Inflight message's optional transformed subject. clusteredCounterTotal map[string]*msgCounterRunningTotal // Inflight counter totals. expectedPerSubjectSequence map[uint64]string // Inflight 'expected per subject' subjects per clseq. expectedPerSubjectInProcess map[string]struct{} // Current 'expected per subject' subjects in process. @@ -563,8 +645,11 @@ const ( JSBatchSeq = "Nats-Batch-Sequence" JSBatchCommit = "Nats-Batch-Commit" JSSchedulePattern = "Nats-Schedule" + JSScheduleTimeZone = "Nats-Schedule-Time-Zone" JSScheduleTTL = "Nats-Schedule-TTL" + JSScheduleRollup = "Nats-Schedule-Rollup" JSScheduleTarget = "Nats-Schedule-Target" + JSScheduleSource = "Nats-Schedule-Source" ) // Headers for published KV messages. @@ -738,13 +823,6 @@ func (a *Account) addStreamWithAssignment(config *StreamConfig, fsConfig *FileSt js.mu.RUnlock() jsa.mu.Lock() } - // Check for template ownership if present. 
- if cfg.Template != _EMPTY_ && jsa.account != nil { - if !jsa.checkTemplateOwnership(cfg.Template, cfg.Name) { - jsa.mu.Unlock() - return nil, fmt.Errorf("stream not owned by template") - } - } // If mirror, check if the transforms (if any) are valid. if cfg.Mirror != nil { @@ -978,7 +1056,7 @@ func (a *Account) addStreamWithAssignment(config *StreamConfig, fsConfig *FileSt suppress = true } } else if sa != nil { - suppress = sa.responded + suppress = sa.hasResponded() } if !suppress { mset.sendCreateAdvisory() @@ -1002,6 +1080,9 @@ func (ssi *StreamSource) composeIName() string { if ssi.External != nil { iName = iName + ":" + getHash(ssi.External.ApiPrefix) } + if ssi.Consumer != nil { + iName = iName + ":C=" + getHash(ssi.Consumer.Name) + } source := ssi.FilterSubject destination := fwcs @@ -1040,6 +1121,23 @@ func (ssi *StreamSource) setIndexName() { ssi.iname = ssi.composeIName() } +// Composes the consumer index name. Contains the stream name and consumer name used for durable sourcing (if any). +// When the stream is external we will use the api prefix as part of the index name +// (as the same stream and consumer names could be used in multiple JS domains) +func (ssi *StreamSource) composeCName() string { + var iName = ssi.Name + + if ssi.External != nil { + iName = iName + ":" + getHash(ssi.External.ApiPrefix) + } + var c string + if ssi.Consumer != nil { + c = ssi.Consumer.Name + } + + return strings.Join([]string{iName, c}, " ") +} + func (mset *stream) streamAssignment() *streamAssignment { mset.mu.RLock() defer mset.mu.RUnlock() @@ -1171,6 +1269,16 @@ func (mset *stream) setLeader(isLeader bool) error { mset.mu.Unlock() return err } + + // Reset any inflight fast batches. We were likely a follower before and need + // to send an ack to the publishers so they know we're still there. + if mset.batches != nil { + mset.batches.mu.Lock() + for batchId, b := range mset.batches.fast { + mset.batches.fastBatchReset(mset, batchId, b) + } + mset.batches.mu.Unlock() + } } else { // cancel timer to create the source consumers if not fired yet if mset.sourcesConsumerSetup != nil { @@ -1307,8 +1415,13 @@ func (mset *stream) autoTuneFileStorageBlockSize(fsCfg *FileStoreConfig) { // headers and msgId in them. Would need signaling from the storage layer. // mset.mu and mset.ddMu locks should be held. func (mset *stream) rebuildDedupe() { + duplicates := mset.cfg.Duplicates + if duplicates <= 0 { + return + } + // We have some messages. Lookup starting sequence by duplicate time window. - sseq := mset.store.GetSeqFromTime(time.Now().Add(-mset.cfg.Duplicates)) + sseq := mset.store.GetSeqFromTime(time.Now().Add(-duplicates)) if sseq == 0 { return } @@ -1360,16 +1473,9 @@ func (mset *stream) lastSeq() uint64 { return mset.lseq } -// Set last seq. -// Write lock should be held. 
-func (mset *stream) setLastSeq(lseq uint64) { - mset.lseq = lseq -} - func (mset *stream) sendCreateAdvisory() { mset.mu.RLock() name := mset.cfg.Name - template := mset.cfg.Template outq := mset.outq srv := mset.srv mset.mu.RUnlock() @@ -1385,10 +1491,9 @@ func (mset *stream) sendCreateAdvisory() { ID: nuid.Next(), Time: time.Now().UTC(), }, - Stream: name, - Action: CreateEvent, - Template: template, - Domain: srv.getOpts().JetStreamDomain, + Stream: name, + Action: CreateEvent, + Domain: srv.getOpts().JetStreamDomain, } j, err := json.Marshal(m) @@ -1411,10 +1516,9 @@ func (mset *stream) sendDeleteAdvisoryLocked() { ID: nuid.Next(), Time: time.Now().UTC(), }, - Stream: mset.cfg.Name, - Action: DeleteEvent, - Template: mset.cfg.Template, - Domain: mset.srv.getOpts().JetStreamDomain, + Stream: mset.cfg.Name, + Action: DeleteEvent, + Domain: mset.srv.getOpts().JetStreamDomain, } j, err := json.Marshal(m) @@ -1518,8 +1622,8 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account, pedantic boo if config == nil { return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration invalid")) } - if !isValidName(config.Name) { - return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("stream name is required and can not contain '.', '*', '>'")) + if !isValidAssetName(config.Name) { + return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("stream name is required and can not contain '.', '*', '>', '\\', '/'")) } if len(config.Name) > JSMaxNameLen { return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("stream name is too long, maximum allowed is %d", JSMaxNameLen)) @@ -1601,7 +1705,8 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account, pedantic boo if cfg.MaxAge != 0 && cfg.MaxAge < 100*time.Millisecond { return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("max age needs to be >= 100ms")) } - if cfg.Duplicates == 0 && cfg.Mirror == nil { + + if cfg.Duplicates == 0 && cfg.Mirror == nil && len(cfg.Sources) == 0 { maxWindow := StreamDefaultDuplicatesWindow if lim.Duplicates > 0 && maxWindow > lim.Duplicates { if pedantic { @@ -1746,9 +1851,26 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account, pedantic boo if cfg.AllowAtomicPublish { return StreamConfig{}, NewJSMirrorWithAtomicPublishError() } + if cfg.AllowBatchPublish { + return StreamConfig{}, NewJSMirrorWithBatchPublishError() + } if cfg.AllowMsgSchedules { return StreamConfig{}, NewJSMirrorWithMsgSchedulesError() } + if c := cfg.Mirror.Consumer; c != nil { + if !isValidAssetName(c.Name) { + return StreamConfig{}, NewJSMirrorDurableConsumerCfgInvalidError() + } + if !subjectIsLiteral(c.DeliverSubject) || !IsValidSubject(c.DeliverSubject) { + return StreamConfig{}, NewJSMirrorDurableConsumerCfgInvalidError() + } + if cfg.Mirror.OptStartSeq != 0 || cfg.Mirror.OptStartTime != nil { + return StreamConfig{}, NewJSMirrorDurableConsumerCfgInvalidError() + } + if cfg.Mirror.FilterSubject != _EMPTY_ { + return StreamConfig{}, NewJSMirrorDurableConsumerCfgInvalidError() + } + } if cfg.Mirror.FilterSubject != _EMPTY_ && len(cfg.Mirror.SubjectTransforms) != 0 { return StreamConfig{}, NewJSMirrorMultipleFiltersNotAllowedError() } @@ -1778,7 +1900,7 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account, pedantic boo // Do not perform checks if External is provided, as it could lead to // checking against itself (if sourced stream name is the same on different JetStream) if cfg.Mirror.External == nil { - if !isValidName(cfg.Mirror.Name) 
{ + if !isValidAssetName(cfg.Mirror.Name) { return StreamConfig{}, NewJSMirrorInvalidStreamNameError() } // We do not require other stream to exist anymore, but if we can see it check payloads. @@ -1832,8 +1954,9 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account, pedantic boo // check sources for duplicates var iNames = make(map[string]struct{}) + var cNames = make(map[string]struct{}) for _, src := range cfg.Sources { - if src == nil || !isValidName(src.Name) { + if src == nil || !isValidAssetName(src.Name) { return StreamConfig{}, NewJSSourceInvalidStreamNameError() } if _, ok := iNames[src.composeIName()]; !ok { @@ -1865,6 +1988,27 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account, pedantic boo } } + if c := src.Consumer; c != nil { + if !isValidAssetName(c.Name) { + return StreamConfig{}, NewJSSourceDurableConsumerCfgInvalidError() + } + if !subjectIsLiteral(c.DeliverSubject) || !IsValidSubject(c.DeliverSubject) { + return StreamConfig{}, NewJSSourceDurableConsumerCfgInvalidError() + } + if src.OptStartSeq != 0 || src.OptStartTime != nil { + return StreamConfig{}, NewJSSourceDurableConsumerCfgInvalidError() + } + if src.FilterSubject != _EMPTY_ { + return StreamConfig{}, NewJSSourceDurableConsumerCfgInvalidError() + } + // Reusing the same consumer for multiple sources of the same stream isn't allowed. + if _, ok := cNames[src.composeCName()]; !ok { + cNames[src.composeCName()] = struct{}{} + } else { + return StreamConfig{}, NewJSSourceDurableConsumerDuplicateDetectedError() + } + } + // Do not perform checks if External is provided, as it could lead to // checking against itself (if sourced stream name is the same on different JetStream) if src.External == nil { @@ -2126,13 +2270,6 @@ func (jsa *jsAccount) configUpdateCheck(old, new *StreamConfig, s *Server, pedan return nil, NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration update can not change retention policy to/from workqueue")) } } - // Can not have a template owner for now. - if old.Template != _EMPTY_ { - return nil, NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration update not allowed on template owned stream")) - } - if cfg.Template != _EMPTY_ { - return nil, NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration update can not be owned by a template")) - } // Can not change from true to false. if !cfg.Sealed && old.Sealed { return nil, NewJSStreamInvalidConfigError(fmt.Errorf("stream configuration update can not unseal a sealed stream")) @@ -2313,9 +2450,13 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool, jsa.mu.RUnlock() mset.mu.Lock() - if mset.isLeader() { + if mset.active { // Check for mirror promotion. if ocfg.Mirror != nil && cfg.Mirror == nil { + // Only try deleting the sourcing consumer if one wasn't provided to us. + if ocfg.Mirror.Consumer == nil { + mset.tryDeleteMirrorConsumer(ocfg.Mirror) + } mset.cancelMirrorConsumer() mset.mirror = nil } @@ -2357,10 +2498,22 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool, // Check for Sources. 
if len(cfg.Sources) > 0 || len(ocfg.Sources) > 0 { currentIName := make(map[string]struct{}) + currentConsumers := make(map[string]*StreamSource) needsStartingSeqNum := make(map[string]struct{}) + getSourcingConsumerIName := func(ssi *StreamSource, sources []*StreamSource) string { + var iName = ssi.Name + if ssi.External != nil { + iName = iName + ":" + getHash(ssi.External.ApiPrefix) + } + return fmt.Sprintf("%s %s", iName, mset.createSourcingConsumerHash(ssi, sources)) + } for _, s := range ocfg.Sources { currentIName[s.iname] = struct{}{} + // Only track the sourcing consumer for deletion if one wasn't provided to us. + if s.Consumer == nil { + currentConsumers[getSourcingConsumerIName(s, ocfg.Sources)] = s + } } for _, s := range cfg.Sources { s.setIndexName() @@ -2397,6 +2550,16 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool, // source already exists delete(currentIName, s.iname) } + + // Remove the source if it still exists, but only if not using a pre-existing consumer. + if s.Consumer == nil { + delete(currentConsumers, getSourcingConsumerIName(s, cfg.Sources)) + } + } + // Delete source consumers if any aren't used anymore. + for _, s := range currentConsumers { + id := mset.createSourcingConsumerHash(s, ocfg.Sources) + mset.tryDeleteSourceConsumer(id, s) } // What is left in currentIName needs to be deleted. for iName := range currentIName { @@ -2503,9 +2666,16 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool, // If atomic publish is disabled, delete any in-progress batches. if !cfg.AllowAtomicPublish { - mset.deleteInflightBatches(false) + mset.deleteAtomicBatches(false) mset.deleteBatchApplyState() } + // If fast batch publish is disabled, delete any in-progress batches. + if !cfg.AllowBatchPublish { + mset.deleteFastBatches() + } + if !cfg.AllowAtomicPublish && !cfg.AllowBatchPublish { + mset.batches = nil + } // Now update config and store's version of our config. // Although we are under the stream write lock, we will also assign the new @@ -2568,6 +2738,80 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool, return nil } +// tryDeleteMirrorConsumer is a best-effort single try to delete a consumer used for stream mirroring. +// Lock should be held. +func (mset *stream) tryDeleteMirrorConsumer(mirror *StreamSource) { + id := mset.createStableConsumerHash() + consumerName := fmt.Sprintf("JS_MIRROR_%s", id) + log := mset.mirror != nil && mset.mirror.cname == consumerName + mset.tryDeleteSourcingConsumer("mirror", mirror, consumerName, log) +} + +// tryDeleteSourceConsumer is a best-effort single try to delete a consumer used for stream sourcing. +// Lock should be held. +func (mset *stream) tryDeleteSourceConsumer(id string, source *StreamSource) { + consumerName := fmt.Sprintf("JS_SRC_%s", id) + si := mset.sources[source.iname] + log := si != nil && si.cname == consumerName + mset.tryDeleteSourcingConsumer("source", source, consumerName, log) +} + +// tryDeleteSourcingConsumer is a best-effort single try to delete a sourcing consumer. +// Lock should be held. 
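+// As a sketch (names illustrative), deleting the consumer JS_SRC_<id> of source stream
+// ORIGIN sends a request on $JS.API.CONSUMER.DELETE.ORIGIN.JS_SRC_<id>, with the $JS.API
+// prefix swapped for the external API prefix when the source lives in another domain.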
+func (mset *stream) tryDeleteSourcingConsumer(kind string, source *StreamSource, consumerName string, log bool) { + acc := mset.acc + accName, streamName, sourceName := acc.Name, mset.cfg.Name, source.Name + subject := fmt.Sprintf(JSApiConsumerDeleteT, sourceName, consumerName) + if source.External != nil { + subject = strings.Replace(subject, JSApiPrefix, source.External.ApiPrefix, 1) + subject = strings.ReplaceAll(subject, "..", ".") + } + s := mset.srv + go func() { + warn := func(err error) { + if log { + s.Warnf("Cleanup of %s consumer '%s > %s' failed for stream '%s > %s': %v", kind, sourceName, consumerName, accName, streamName, err) + } + } + + respCh := make(chan *JSApiConsumerDeleteResponse, 1) + reply := infoReplySubject() + cdSub, err := acc.subscribeInternal(reply, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { + _, msg := c.msgParts(rmsg) + + var cdr JSApiConsumerDeleteResponse + if err := json.Unmarshal(msg, &cdr); err != nil { + warn(err) + return + } + select { + case respCh <- &cdr: + default: + } + }) + if err != nil { + warn(err) + return + } + defer acc.unsubscribeInternal(cdSub) + + // Send the delete request. + err = s.sendInternalAccountMsgWithReply(acc, subject, reply, nil, nil, false) + if err != nil { + warn(err) + return + } + select { + case cdr := <-respCh: + if cdr.Error != nil { + warn(cdr.Error) + } + case <-time.After(sourceHealthCheckInterval): + warn(errors.New("timed out")) + } + }() +} + // Small helper to return the Name field from mset.cfg, protected by // the mset.cfgMu mutex. This is simply because we have several places // in consumer.go where we need it. @@ -2611,7 +2855,7 @@ func (mset *stream) purgeLocked(preq *JSApiStreamPurgeRequest, needLock bool) (p // Check if our last has moved past what our original last sequence was, if so reset. if lseq > mlseq { - mset.setLastSeq(lseq) + mset.lseq = lseq } // Clear any pending acks below first seq. @@ -2620,7 +2864,10 @@ func (mset *stream) purgeLocked(preq *JSApiStreamPurgeRequest, needLock bool) (p // Purge consumers. // Check for filtered purge. if preq != nil && preq.Subject != _EMPTY_ { - ss := store.FilteredState(fseq, preq.Subject) + ss, err := store.FilteredState(fseq, preq.Subject) + if err != nil { + return purged, err + } fseq = ss.First } @@ -2767,9 +3014,17 @@ func (mset *stream) retryDisconnectedSyncConsumers() { return } + // Not client.isClosed(): internal account clients have nil nc, which would make isClosed always true here. + clientClosed := func(c *client) bool { + return c != nil && (c.flags.isSet(closeConnection) || c.flags.isSet(connMarkedClosed)) + } + // Stale sources need to be reset: we expect a heartbeat every sourceHealthHB, so missing a couple + // is a strong signal the remote delivery is no longer reaching us and a retry is warranted. + stale := func(si *sourceInfo) bool { + return time.Since(time.Unix(0, si.last.Load())) > 2*sourceHealthHB + } shouldRetry := func(si *sourceInfo) bool { - if si != nil && (si.sip || si.sub == nil || (si.sub.client != nil && si.sub.client.isClosed())) { - // Need to reset + if si != nil && (si.sip || si.sub == nil || clientClosed(si.sub.client) || stale(si)) { si.fails, si.sip = 0, false mset.cancelSourceInfo(si) return true @@ -2875,8 +3130,8 @@ func (mset *stream) processMirrorMsgs(mirror *sourceInfo, ready *sync.WaitGroup) // Checks that the message is from our current direct consumer. We can not depend on sub comparison // since cross account imports break. 
-func (si *sourceInfo) isCurrentSub(reply string) bool { - return si.cname != _EMPTY_ && strings.HasPrefix(reply, jsAckPre) && si.cname == tokenAt(reply, 4) +func (si *sourceInfo) isCurrentSub(cname string) bool { + return si.cname != _EMPTY_ && si.cname == cname } // processInboundMirrorMsg handles processing messages bound for a stream. @@ -2893,10 +3148,11 @@ func (mset *stream) processInboundMirrorMsg(m *inMsg) bool { } isControl := m.isControlMsg() + cname := consumerFromAckReply(m.rply) // Ignore from old subscriptions. // The reason we can not just compare subs is that on cross account imports they will not match. - if !mset.mirror.isCurrentSub(m.rply) && !isControl { + if !mset.mirror.isCurrentSub(cname) && !isControl { mset.mu.Unlock() return false } @@ -2906,12 +3162,12 @@ func (mset *stream) processInboundMirrorMsg(m *inMsg) bool { var needsRetry bool // Flow controls have reply subjects. if m.rply != _EMPTY_ { - mset.handleFlowControl(m) + mset.handleFlowControl(m, mset.mirror.dseq, mset.mirror.sseq) } else { // For idle heartbeats make sure we did not miss anything and check if we are considered stalled. - if ldseq := parseInt64(getHeader(JSLastConsumerSeq, m.hdr)); ldseq > 0 && uint64(ldseq) != mset.mirror.dseq { + if ldseq := parseInt64(sliceHeader(JSLastConsumerSeq, m.hdr)); ldseq > 0 && uint64(ldseq) != mset.mirror.dseq { needsRetry = true - } else if fcReply := getHeader(JSConsumerStalled, m.hdr); len(fcReply) > 0 { + } else if fcReply := sliceHeader(JSConsumerStalled, m.hdr); len(fcReply) > 0 { // Other side thinks we are stalled, so send flow control reply. mset.outq.sendMsg(string(fcReply), nil) } @@ -2923,7 +3179,7 @@ func (mset *stream) processInboundMirrorMsg(m *inMsg) bool { return !needsRetry } - sseq, dseq, dc, ts, pending := replyInfo(m.rply) + sseq, dseq, dc, ts, pending := ackReplyInfo(m.rply) if dc > 1 { mset.mu.Unlock() @@ -2932,15 +3188,19 @@ func (mset *stream) processInboundMirrorMsg(m *inMsg) bool { // Mirror info tracking. olag, osseq, odseq := mset.mirror.lag, mset.mirror.sseq, mset.mirror.dseq - if sseq == mset.mirror.sseq+1 { - mset.mirror.dseq = dseq - mset.mirror.sseq++ - } else if sseq <= mset.mirror.sseq { + if sseq <= mset.mirror.sseq { // Ignore older messages. + // If the deliver sequence matches, we only update delivered accounting. + if dseq == mset.mirror.dseq+1 { + mset.mirror.dseq++ + } mset.mu.Unlock() return true + } else if sseq == mset.mirror.sseq+1 { + mset.mirror.dseq = dseq + mset.mirror.sseq++ } else if mset.mirror.cname == _EMPTY_ { - mset.mirror.cname = tokenAt(m.rply, 4) + mset.mirror.cname = cname mset.mirror.dseq, mset.mirror.sseq = dseq, sseq } else { // If the deliver sequence matches then the upstream stream has expired or deleted messages. @@ -3014,10 +3274,10 @@ func (mset *stream) processInboundMirrorMsg(m *inMsg) bool { accName, sname, err) } else { // We may have missed messages, restart. - if sseq <= mset.lastSeq() { + if lseq := mset.lastSeq(); sseq <= lseq { mset.mu.Lock() mset.mirror.lag = olag - mset.mirror.sseq = osseq + mset.mirror.sseq = lseq mset.mirror.dseq = odseq mset.mu.Unlock() return false @@ -3073,8 +3333,13 @@ func (mset *stream) skipMsgs(start, end uint64) { return } - // FIXME (dlc) - We should allow proposals of DeleteRange, but would need to make sure all peers support. - // With syncRequest was easy to add bool into request. + // Must only be enabled once every peer in the cluster supports receiving + // deleteRangeOp in the normal apply path; older peers panic on unknown ops. 
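+	// With the flag enabled, skipping e.g. sequences 11..20 becomes a single
+	// DeleteRange{First: 11, Num: 10} proposal instead of ten per-sequence skip
+	// entries as in the fallback loop below.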
+ if mset.srv.getOpts().getFeatureFlag(FeatureFlagJsRaftDeleteRange) { + node.Propose(encodeDeleteRange(&DeleteRange{First: start, Num: end - start + 1})) + return + } + var entries []*Entry for seq := start; seq <= end; seq++ { entries = append(entries, newEntry(EntryNormal, encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq-1, 0, false))) @@ -3185,9 +3450,12 @@ func (mset *stream) setupMirrorConsumer() error { // Determine subjects etc. var deliverSubject string + var durableDeliverSubject string ext := mset.cfg.Mirror.External - - if ext != nil && ext.DeliverPrefix != _EMPTY_ { + if mset.cfg.Mirror.Consumer != nil { + durableDeliverSubject = mset.cfg.Mirror.Consumer.DeliverSubject + mirror.cname = mset.cfg.Mirror.Consumer.Name + } else if ext != nil && ext.DeliverPrefix != _EMPTY_ { deliverSubject = strings.ReplaceAll(ext.DeliverPrefix+syncSubject(".M"), "..", ".") } else { deliverSubject = syncSubject("$JS.M") @@ -3199,9 +3467,18 @@ func (mset *stream) setupMirrorConsumer() error { var state StreamState mset.store.FastState(&state) + id := mset.createStableConsumerHash() + metadata := map[string]string{} + metadata["_nats.mirror.stream"] = mset.cfg.Name + metadata["_nats.mirror.acc"] = mset.acc.Name + if domain := mset.srv.getOpts().JetStreamDomain; domain != _EMPTY_ { + metadata["_nats.mirror.domain"] = domain + } + req := &CreateConsumerRequest{ Stream: mset.cfg.Mirror.Name, Config: ConsumerConfig{ + Name: fmt.Sprintf("JS_MIRROR_%s", id), DeliverSubject: deliverSubject, DeliverPolicy: DeliverByStartSequence, OptStartSeq: state.LastSeq + 1, @@ -3211,7 +3488,9 @@ func (mset *stream) setupMirrorConsumer() error { Heartbeat: sourceHealthHB, FlowControl: true, Direct: true, + Sourcing: true, InactiveThreshold: sourceHealthCheckInterval, + Metadata: metadata, }, } @@ -3264,7 +3543,6 @@ func (mset *stream) setupMirrorConsumer() error { respCh := make(chan *JSApiConsumerCreateResponse, 1) reply := infoReplySubject() crSub, err := mset.subscribeInternal(reply, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - mset.unsubscribe(sub) _, msg := c.msgParts(rmsg) var ccr JSApiConsumerCreateResponse @@ -3284,28 +3562,40 @@ func (mset *stream) setupMirrorConsumer() error { return nil } - var subject string - if req.Config.FilterSubject != _EMPTY_ { - req.Config.Name = fmt.Sprintf("mirror-%s", createConsumerName()) - subject = fmt.Sprintf(JSApiConsumerCreateExT, mset.cfg.Mirror.Name, req.Config.Name, req.Config.FilterSubject) - } else { - subject = fmt.Sprintf(JSApiConsumerCreateT, mset.cfg.Mirror.Name) - } - if ext != nil { - subject = strings.Replace(subject, JSApiPrefix, ext.ApiPrefix, 1) - subject = strings.ReplaceAll(subject, "..", ".") + generateSubject := func() (subject string) { + if durableDeliverSubject != _EMPTY_ { + // If we're using a pre-existing consumer, we'll send a consumer reset request instead. + subject = fmt.Sprintf(JSApiConsumerResetT, mset.cfg.Mirror.Name, mirror.cname) + } else if req.Config.FilterSubject != _EMPTY_ { + subject = fmt.Sprintf(JSApiConsumerCreateExT, mset.cfg.Mirror.Name, req.Config.Name, req.Config.FilterSubject) + } else { + subject = fmt.Sprintf(JSApiConsumerCreateT, mset.cfg.Mirror.Name) + } + if ext != nil { + subject = strings.Replace(subject, JSApiPrefix, ext.ApiPrefix, 1) + subject = strings.ReplaceAll(subject, "..", ".") + } + return subject } - - // Marshal now that we are done with `req`. 
- b, _ := json.Marshal(req) + subject := generateSubject() // Reset mirror.msgs = nil mirror.err = nil mirror.sip = true - // Send the consumer create request - mset.outq.send(newJSPubMsg(subject, _EMPTY_, reply, nil, b, nil, 0)) + if durableDeliverSubject != _EMPTY_ { + // Send the consumer reset request + mset.outq.send(newJSPubMsg(subject, _EMPTY_, reply, nil, nil, nil, 0)) + } else { + // Marshal now that we are done with `req`. + b, _ := json.Marshal(req) + + // Send the consumer create request + // Confirm the server supports API level 4, which contains durable sourcing, AckFlowControl, and consumer reset. + hdr := genHeader(nil, JSRequiredApiLevel, "4") + mset.outq.send(newJSPubMsg(subject, _EMPTY_, reply, hdr, b, nil, 0)) + } go func() { @@ -3335,95 +3625,128 @@ func (mset *stream) setupMirrorConsumer() error { mset.mu.Lock() if mset.mirror == nil { // Mirror config has been removed. + mset.unsubscribe(crSub) mset.mu.Unlock() return - } else { - wg := &mset.mirror.wg - mset.mu.Unlock() - wg.Wait() } + wg := &mset.mirror.wg + mset.mu.Unlock() + wg.Wait() + SELECT: select { case ccr := <-respCh: mset.mu.Lock() // Mirror config has been removed. if mset.mirror == nil { + mset.unsubscribe(crSub) mset.mu.Unlock() return } ready := sync.WaitGroup{} mirror := mset.mirror mirror.err = nil + if ccr.Error != nil || ccr.ConsumerInfo == nil { + // If the responding server doesn't support sourcing consumers, retry without it. + if req.Config.Sourcing && ccr.Error != nil && + (ccr.Error.ErrCode == uint16(JSRequiredApiLevelErr) || ccr.Error.ErrCode == uint16(JSInvalidJSONErr)) { + // Unset for retry. + req.Config.Sourcing = false + // Specify a unique consumer name, as the other end will not know to do this. + req.Config.Name = fmt.Sprintf("JS_MIRROR_%s_%s", id, createConsumerName()) + b, _ := json.Marshal(req) + // Regenerate subject since the previous name could've been included in it. + subject = generateSubject() + mset.outq.send(newJSPubMsg(subject, _EMPTY_, reply, nil, b, nil, 0)) + mset.mu.Unlock() + goto SELECT + } + mset.unsubscribe(crSub) mset.srv.Warnf("JetStream error response for create mirror consumer: %+v", ccr.Error) mirror.err = ccr.Error // Let's retry as soon as possible, but we are gated by sourceConsumerRetryThreshold retry = true mset.mu.Unlock() return - } else { - // Setup actual subscription to process messages from our source. - qname := fmt.Sprintf("[ACC:%s] stream mirror '%s' of '%s' msgs", mset.acc.Name, mset.cfg.Name, mset.cfg.Mirror.Name) - // Create a new queue each time - mirror.msgs = newIPQueue[*inMsg](mset.srv, qname) - msgs := mirror.msgs - sub, err := mset.subscribeInternal(deliverSubject, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - hdr, msg := c.msgParts(copyBytes(rmsg)) // Need to copy. - if len(hdr) > 0 { - // Remove any Nats-Expected- headers as we don't want to validate them. - hdr = removeHeaderIfPrefixPresent(hdr, "Nats-Expected-") - // Remove any Nats-Batch- headers, batching is not supported when mirroring. - hdr = removeHeaderIfPrefixPresent(hdr, "Nats-Batch-") - } - mset.queueInbound(msgs, subject, reply, hdr, msg, nil, nil) - mirror.last.Store(time.Now().UnixNano()) - }) - if err != nil { - mirror.err = NewJSMirrorConsumerSetupFailedError(err, Unless(err)) - retry = true - mset.mu.Unlock() - return - } - // Save our sub. - mirror.sub = sub + } - // When an upstream stream expires messages or in general has messages that we want - // that are no longer available we need to adjust here. 
- var state StreamState - mset.store.FastState(&state) + // If using durable sourcing, we need the consumer to use acks based on flow control. + if durableDeliverSubject != _EMPTY_ && ccr.ConsumerInfo.Config.AckPolicy != AckFlowControl { + mset.unsubscribe(crSub) + mirror.err = NewJSMirrorConsumerRequiresAckFCError() + retry = true + mset.mu.Unlock() + return + } - // Check if we need to skip messages. - if state.LastSeq != ccr.ConsumerInfo.Delivered.Stream { - // Check to see if delivered is past our last and we have no msgs. This will help the - // case when mirroring a stream that has a very high starting sequence number. - if state.Msgs == 0 && ccr.ConsumerInfo.Delivered.Stream > state.LastSeq { - mset.store.PurgeEx(_EMPTY_, ccr.ConsumerInfo.Delivered.Stream+1, 0) - mset.lseq = ccr.ConsumerInfo.Delivered.Stream - } else { - mset.skipMsgs(state.LastSeq+1, ccr.ConsumerInfo.Delivered.Stream) - } + // We can now unsubscribe. + mset.unsubscribe(crSub) + + // Setup actual subscription to process messages from our source. + qname := fmt.Sprintf("[ACC:%s] stream mirror '%s' of '%s' msgs", mset.acc.Name, mset.cfg.Name, mset.cfg.Mirror.Name) + // Create a new queue each time + mirror.msgs = newIPQueue[*inMsg](mset.srv, qname) + msgs := mirror.msgs + if durableDeliverSubject != _EMPTY_ { + deliverSubject = durableDeliverSubject + } else { + deliverSubject = ccr.ConsumerInfo.Config.DeliverSubject + } + sub, err := mset.subscribeInternal(deliverSubject, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { + hdr, msg := c.msgParts(copyBytes(rmsg)) // Need to copy. + if len(hdr) > 0 { + // Remove any Nats-Expected- headers as we don't want to validate them. + hdr = removeHeaderIfPrefixPresent(hdr, "Nats-Expected-") + // Remove any Nats-Batch- headers, batching is not supported when mirroring. + hdr = removeHeaderIfPrefixPresent(hdr, "Nats-Batch-") } + mset.queueInbound(msgs, subject, reply, hdr, msg, nil, nil) + mirror.last.Store(time.Now().UnixNano()) + }) + if err != nil { + mirror.err = NewJSMirrorConsumerSetupFailedError(err, Unless(err)) + retry = true + mset.mu.Unlock() + return + } + // Save our sub. + mirror.sub = sub - // Capture consumer name. - mirror.cname = ccr.ConsumerInfo.Name - mirror.dseq = 0 - mirror.sseq = ccr.ConsumerInfo.Delivered.Stream - mirror.qch = make(chan struct{}) - mirror.wg.Add(1) - ready.Add(1) - if !mset.srv.startGoRoutine( - func() { mset.processMirrorMsgs(mirror, &ready) }, - pprofLabels{ - "type": "mirror", - "account": mset.acc.Name, - "stream": mset.cfg.Name, - "consumer": mirror.cname, - }, - ) { - mirror.wg.Done() - ready.Done() + // Check if we need to skip messages. + // Re-capture state since the previous may be stale. + state = StreamState{} + mset.store.FastState(&state) + if state.LastSeq < ccr.ConsumerInfo.Delivered.Stream { + // Check to see if delivered is past our last and we have no msgs. This will help the + // case when mirroring a stream that has a very high starting sequence number. + if state.Msgs == 0 && ccr.ConsumerInfo.Delivered.Stream > state.LastSeq { + mset.store.PurgeEx(_EMPTY_, ccr.ConsumerInfo.Delivered.Stream+1, 0) + mset.lseq = ccr.ConsumerInfo.Delivered.Stream + } else { + mset.skipMsgs(state.LastSeq+1, ccr.ConsumerInfo.Delivered.Stream) } } + + // Capture consumer name. 
+ mirror.cname = ccr.ConsumerInfo.Name + mirror.dseq = 0 + mirror.sseq = max(ccr.ConsumerInfo.Delivered.Stream, state.LastSeq) + mirror.qch = make(chan struct{}) + mirror.wg.Add(1) + ready.Add(1) + if !mset.srv.startGoRoutine( + func() { mset.processMirrorMsgs(mirror, &ready) }, + pprofLabels{ + "type": "mirror", + "account": mset.acc.Name, + "stream": mset.cfg.Name, + "consumer": mirror.cname, + }, + ) { + mirror.wg.Done() + ready.Done() + } mset.mu.Unlock() ready.Wait() case <-time.After(srcConsumerWaitTime): @@ -3563,17 +3886,29 @@ func (mset *stream) trySetupSourceConsumer(iname string, seq uint64, startTime t // Determine subjects etc. var deliverSubject string + var durableDeliverSubject string ext := ssi.External - - if ext != nil && ext.DeliverPrefix != _EMPTY_ { + if ssi.Consumer != nil { + durableDeliverSubject = ssi.Consumer.DeliverSubject + si.cname = ssi.Consumer.Name + } else if ext != nil && ext.DeliverPrefix != _EMPTY_ { deliverSubject = strings.ReplaceAll(ext.DeliverPrefix+syncSubject(".S"), "..", ".") } else { deliverSubject = syncSubject("$JS.S") } + id := mset.createSourcingConsumerHash(ssi, mset.cfg.Sources) + metadata := map[string]string{} + metadata["_nats.src.stream"] = mset.cfg.Name + metadata["_nats.src.acc"] = mset.acc.Name + if domain := mset.srv.getOpts().JetStreamDomain; domain != _EMPTY_ { + metadata["_nats.src.domain"] = domain + } + req := &CreateConsumerRequest{ Stream: si.name, Config: ConsumerConfig{ + Name: fmt.Sprintf("JS_SRC_%s", id), DeliverSubject: deliverSubject, AckPolicy: AckNone, AckWait: 22 * time.Hour, @@ -3581,7 +3916,9 @@ func (mset *stream) trySetupSourceConsumer(iname string, seq uint64, startTime t Heartbeat: sourceHealthHB, FlowControl: true, Direct: true, + Sourcing: true, InactiveThreshold: sourceHealthCheckInterval, + Metadata: metadata, }, } @@ -3628,7 +3965,6 @@ func (mset *stream) trySetupSourceConsumer(iname string, seq uint64, startTime t respCh := make(chan *JSApiConsumerCreateResponse, 1) reply := infoReplySubject() crSub, err := mset.subscribeInternal(reply, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - mset.unsubscribe(sub) _, msg := c.msgParts(rmsg) var ccr JSApiConsumerCreateResponse if err := json.Unmarshal(msg, &ccr); err != nil { @@ -3646,35 +3982,46 @@ func (mset *stream) trySetupSourceConsumer(iname string, seq uint64, startTime t return } - var subject string - if req.Config.FilterSubject != _EMPTY_ { - req.Config.Name = fmt.Sprintf("src-%s", createConsumerName()) - subject = fmt.Sprintf(JSApiConsumerCreateExT, si.name, req.Config.Name, req.Config.FilterSubject) - } else if len(req.Config.FilterSubjects) == 1 { - req.Config.Name = fmt.Sprintf("src-%s", createConsumerName()) - // It is necessary to switch to using FilterSubject here as the extended consumer - // create API checks for it, so as to not accidentally allow multiple filtered subjects. - req.Config.FilterSubject = req.Config.FilterSubjects[0] - req.Config.FilterSubjects = nil - subject = fmt.Sprintf(JSApiConsumerCreateExT, si.name, req.Config.Name, req.Config.FilterSubject) - } else { - subject = fmt.Sprintf(JSApiConsumerCreateT, si.name) - } - if ext != nil { - subject = strings.Replace(subject, JSApiPrefix, ext.ApiPrefix, 1) - subject = strings.ReplaceAll(subject, "..", ".") + generateSubject := func() (subject string) { + if durableDeliverSubject != _EMPTY_ { + // If we're using a pre-existing consumer, we'll send a consumer reset request instead. 
+ subject = fmt.Sprintf(JSApiConsumerResetT, si.name, si.cname) + } else if req.Config.FilterSubject != _EMPTY_ { + subject = fmt.Sprintf(JSApiConsumerCreateExT, si.name, req.Config.Name, req.Config.FilterSubject) + } else if len(req.Config.FilterSubjects) == 1 { + // It is necessary to switch to using FilterSubject here as the extended consumer + // create API checks for it, so as to not accidentally allow multiple filtered subjects. + req.Config.FilterSubject = req.Config.FilterSubjects[0] + req.Config.FilterSubjects = nil + subject = fmt.Sprintf(JSApiConsumerCreateExT, si.name, req.Config.Name, req.Config.FilterSubject) + } else { + subject = fmt.Sprintf(JSApiConsumerCreateT, si.name) + } + if ext != nil { + subject = strings.Replace(subject, JSApiPrefix, ext.ApiPrefix, 1) + subject = strings.ReplaceAll(subject, "..", ".") + } + return subject } - - // Marshal request. - b, _ := json.Marshal(req) + subject := generateSubject() // Reset si.msgs = nil si.err = nil si.sip = true - // Send the consumer create request - mset.outq.send(newJSPubMsg(subject, _EMPTY_, reply, nil, b, nil, 0)) + if durableDeliverSubject != _EMPTY_ { + // Send the consumer reset request + mset.outq.send(newJSPubMsg(subject, _EMPTY_, reply, nil, nil, nil, 0)) + } else { + // Marshal request. + b, _ := json.Marshal(req) + + // Send the consumer create request + // Confirm the server supports API level 4, which contains durable sourcing, AckFlowControl, and consumer reset. + hdr := genHeader(nil, JSRequiredApiLevel, "4") + mset.outq.send(newJSPubMsg(subject, _EMPTY_, reply, hdr, b, nil, 0)) + } go func() { @@ -3699,13 +4046,32 @@ func (mset *stream) trySetupSourceConsumer(iname string, seq uint64, startTime t mset.mu.Unlock() }() + SELECT: select { case ccr := <-respCh: mset.mu.Lock() // Check that it has not been removed or canceled (si.sub would be nil) - if si := mset.sources[iname]; si != nil { + if si := mset.sources[iname]; si == nil { + mset.unsubscribe(crSub) + } else { si.err = nil + if ccr.Error != nil || ccr.ConsumerInfo == nil { + // If the responding server doesn't support sourcing consumers, retry without it. + if req.Config.Sourcing && ccr.Error != nil && + (ccr.Error.ErrCode == uint16(JSRequiredApiLevelErr) || ccr.Error.ErrCode == uint16(JSInvalidJSONErr)) { + // Unset for retry. + req.Config.Sourcing = false + // Specify a unique consumer name, as the other end will not know to do this. + req.Config.Name = fmt.Sprintf("JS_SRC_%s_%s", id, createConsumerName()) + b, _ := json.Marshal(req) + // Regenerate subject since the previous name could've been included in it. + subject = generateSubject() + mset.outq.send(newJSPubMsg(subject, _EMPTY_, reply, nil, b, nil, 0)) + mset.mu.Unlock() + goto SELECT + } + mset.unsubscribe(crSub) // Note: this warning can happen a few times when starting up the server when sourcing streams are // defined, this is normal as the streams are re-created in no particular order and it is possible // that a stream sourcing another could come up before all of its sources have been recreated. @@ -3715,48 +4081,65 @@ func (mset *stream) trySetupSourceConsumer(iname string, seq uint64, startTime t retry = true mset.mu.Unlock() return - } else { - // Check if our shared msg queue and go routine is running or not. 
- if mset.smsgs == nil { - qname := fmt.Sprintf("[ACC:%s] stream sources '%s' msgs", mset.acc.Name, mset.cfg.Name) - mset.smsgs = newIPQueue[*inMsg](mset.srv, qname) - mset.srv.startGoRoutine(func() { mset.processAllSourceMsgs() }, - pprofLabels{ - "type": "source", - "account": mset.acc.Name, - "stream": mset.cfg.Name, - }, - ) - } + } - // Setup actual subscription to process messages from our source. - if si.sseq != ccr.ConsumerInfo.Delivered.Stream { - si.sseq = ccr.ConsumerInfo.Delivered.Stream + 1 - } - // Capture consumer name. - si.cname = ccr.ConsumerInfo.Name + // If using durable sourcing, we need the consumer to use acks based on flow control. + if durableDeliverSubject != _EMPTY_ && ccr.ConsumerInfo.Config.AckPolicy != AckFlowControl { + mset.unsubscribe(crSub) + si.err = NewJSSourceConsumerRequiresAckFCError() + retry = true + mset.mu.Unlock() + return + } - // Do not set si.sseq to seq here. si.sseq will be set in processInboundSourceMsg - si.dseq = 0 - si.qch = make(chan struct{}) - // Set the last seen as now so that we don't fail at the first check. - si.last.Store(time.Now().UnixNano()) + // We can now unsubscribe. + mset.unsubscribe(crSub) + + // Check if our shared msg queue and go routine is running or not. + if mset.smsgs == nil { + qname := fmt.Sprintf("[ACC:%s] stream sources '%s' msgs", mset.acc.Name, mset.cfg.Name) + mset.smsgs = newIPQueue[*inMsg](mset.srv, qname) + mset.srv.startGoRoutine(func() { mset.processAllSourceMsgs() }, + pprofLabels{ + "type": "source", + "account": mset.acc.Name, + "stream": mset.cfg.Name, + }, + ) + } - msgs := mset.smsgs - sub, err := mset.subscribeInternal(deliverSubject, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { - hdr, msg := c.msgParts(copyBytes(rmsg)) // Need to copy. - mset.queueInbound(msgs, subject, reply, hdr, msg, si, nil) - si.last.Store(time.Now().UnixNano()) - }) - if err != nil { - si.err = NewJSSourceConsumerSetupFailedError(err, Unless(err)) - retry = true - mset.mu.Unlock() - return - } - // Save our sub. - si.sub = sub + // Setup actual subscription to process messages from our source. + if si.sseq < ccr.ConsumerInfo.Delivered.Stream { + si.sseq = ccr.ConsumerInfo.Delivered.Stream + } + // Capture consumer name. + si.cname = ccr.ConsumerInfo.Name + + // Do not set si.sseq to seq here. si.sseq will be set in processInboundSourceMsg + si.dseq = 0 + si.qch = make(chan struct{}) + // Set the last seen as now so that we don't fail at the first check. + si.last.Store(time.Now().UnixNano()) + + msgs := mset.smsgs + if durableDeliverSubject != _EMPTY_ { + deliverSubject = durableDeliverSubject + } else { + deliverSubject = ccr.ConsumerInfo.Config.DeliverSubject + } + sub, err := mset.subscribeInternal(deliverSubject, func(sub *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { + hdr, msg := c.msgParts(copyBytes(rmsg)) // Need to copy. + mset.queueInbound(msgs, subject, reply, hdr, msg, si, nil) + si.last.Store(time.Now().UnixNano()) + }) + if err != nil { + si.err = NewJSSourceConsumerSetupFailedError(err, Unless(err)) + retry = true + mset.mu.Unlock() + return } + // Save our sub. + si.sub = sub } mset.mu.Unlock() case <-time.After(srcConsumerWaitTime): @@ -3860,20 +4243,39 @@ func (m *inMsg) isControlMsg() bool { // Sends a reply to a flow control request. // Lock should be held. 
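+// The reply now carries the current delivered and stream sequences as headers,
+// e.g. (sequence values illustrative, header names per JSLastConsumerSeq/JSLastStreamSeq):
+//   NATS/1.0
+//   Nats-Last-Consumer: 42
+//   Nats-Last-Stream: 1042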
-func (mset *stream) sendFlowControlReply(reply string) { +func (mset *stream) sendFlowControlReply(reply string, hdr []byte) { if mset.isLeader() && mset.outq != nil { - mset.outq.sendMsg(reply, nil) + dseq := parseInt64(sliceHeader(JSLastConsumerSeq, hdr)) + sseq := parseInt64(sliceHeader(JSLastStreamSeq, hdr)) + + // If we're responding to flow control without being delivered messages (for example after a restart), + // we'll only have the stream sequence. + if sseq > 0 { + if dseq < 0 { + dseq = 0 + } + const t = "NATS/1.0\r\n%s: %d\r\n%s: %d\r\n\r\n" + hdr = fmt.Appendf(nil, t, JSLastConsumerSeq, dseq, JSLastStreamSeq, sseq) + mset.outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + } else { + mset.outq.sendMsg(reply, nil) + } } } // handleFlowControl will properly handle flow control messages for both R==1 and R>1. // Lock should be held. -func (mset *stream) handleFlowControl(m *inMsg) { +func (mset *stream) handleFlowControl(m *inMsg, dseq, sseq uint64) { // If we are clustered we will send the flow control message through the replication stack. if mset.isClustered() { + // Append the current delivery and stream sequences, to be sent after replication. + m.hdr = genHeader(m.hdr, JSLastConsumerSeq, strconv.FormatUint(dseq, 10)) + m.hdr = genHeader(m.hdr, JSLastStreamSeq, strconv.FormatUint(sseq, 10)) mset.node.Propose(encodeStreamMsg(_EMPTY_, m.rply, m.hdr, nil, 0, 0, false)) } else { - mset.outq.sendMsg(m.rply, nil) + const t = "NATS/1.0\r\n%s: %d\r\n%s: %d\r\n\r\n" + hdr := fmt.Appendf(nil, t, JSLastConsumerSeq, dseq, JSLastStreamSeq, sseq) + mset.outq.send(newJSPubMsg(m.rply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) } } @@ -3888,9 +4290,10 @@ func (mset *stream) processInboundSourceMsg(si *sourceInfo, m *inMsg) bool { } isControl := m.isControlMsg() + cname := consumerFromAckReply(m.rply) // Ignore from old subscriptions. - if !si.isCurrentSub(m.rply) && !isControl { + if !si.isCurrentSub(cname) && !isControl { mset.mu.Unlock() return false } @@ -3900,13 +4303,13 @@ func (mset *stream) processInboundSourceMsg(si *sourceInfo, m *inMsg) bool { var needsRetry bool // Flow controls have reply subjects. if m.rply != _EMPTY_ { - mset.handleFlowControl(m) + mset.handleFlowControl(m, si.dseq, si.sseq) } else { // For idle heartbeats make sure we did not miss anything. - if ldseq := parseInt64(getHeader(JSLastConsumerSeq, m.hdr)); ldseq > 0 && uint64(ldseq) != si.dseq { + if ldseq := parseInt64(sliceHeader(JSLastConsumerSeq, m.hdr)); ldseq > 0 && uint64(ldseq) != si.dseq { needsRetry = true mset.retrySourceConsumerAtSeq(si.iname, si.sseq+1) - } else if fcReply := getHeader(JSConsumerStalled, m.hdr); len(fcReply) > 0 { + } else if fcReply := sliceHeader(JSConsumerStalled, m.hdr); len(fcReply) > 0 { // Other side thinks we are stalled, so send flow control reply. mset.outq.sendMsg(string(fcReply), nil) } @@ -3915,7 +4318,7 @@ func (mset *stream) processInboundSourceMsg(si *sourceInfo, m *inMsg) bool { return !needsRetry } - sseq, dseq, dc, _, pending := replyInfo(m.rply) + sseq, dseq, dc, _, pending := ackReplyInfo(m.rply) if dc > 1 { mset.mu.Unlock() @@ -3923,12 +4326,21 @@ func (mset *stream) processInboundSourceMsg(si *sourceInfo, m *inMsg) bool { } // Tracking is done here. - if dseq == si.dseq+1 { + osseq, odseq := si.sseq, si.dseq + if sseq <= si.sseq { + // Ignore older messages. + // If the deliver sequence matches, we only update delivered accounting. 
+ if dseq == si.dseq+1 { + si.dseq++ + } + mset.mu.Unlock() + return true + } else if dseq == si.dseq+1 { si.dseq++ si.sseq = sseq } else if dseq > si.dseq { if si.cname == _EMPTY_ { - si.cname = tokenAt(m.rply, 4) + si.cname = cname si.dseq, si.sseq = dseq, sseq } else { mset.retrySourceConsumerAtSeq(si.iname, si.sseq+1) @@ -3995,34 +4407,27 @@ func (mset *stream) processInboundSourceMsg(si *sourceInfo, m *inMsg) bool { // Can happen temporarily all the time during normal operations when the sourcing stream is discard new // (example use case is for sourcing into a work queue) // TODO - Maybe improve sourcing to WQ with limit and new to use flow control rather than re-creating the consumer. - if errors.Is(err, ErrMaxMsgs) || errors.Is(err, ErrMaxBytes) || errors.Is(err, ErrMaxMsgsPerSubject) { - // Do not need to do a full retry that includes finding the last sequence in the stream - // for that source. Just re-create starting with the seq we couldn't store instead. - mset.mu.Lock() - mset.retrySourceConsumerAtSeq(iName, si.sseq) - mset.mu.Unlock() - } else { - // Log some warning for errors other than errLastSeqMismatch. - if !errors.Is(err, errLastSeqMismatch) && !errors.Is(err, errMsgIdDuplicate) { - s.RateLimitWarnf("Error processing inbound source %q for '%s' > '%s': %v", - iName, accName, sname, err) - } - // Retry in all type of errors we do not want to skip if we are still leader. - if mset.isLeader() { - if !errors.Is(err, errMsgIdDuplicate) { - // This will make sure the source is still in mset.sources map, - // find the last sequence and then call setupSourceConsumer. - iNameMap := map[string]struct{}{iName: {}} - mset.setStartingSequenceForSources(iNameMap) - mset.mu.Lock() - mset.retrySourceConsumerAtSeq(iName, si.sseq+1) - mset.mu.Unlock() - } else { - // skipping the message but keep processing the rest of the batch - return true - } - } + discardNew := errors.Is(err, ErrMaxMsgs) || errors.Is(err, ErrMaxBytes) || errors.Is(err, ErrMaxMsgsPerSubject) + + // Log some warning for errors. + if !discardNew && !errors.Is(err, errLastSeqMismatch) && !errors.Is(err, errMsgIdDuplicate) { + s.RateLimitWarnf("Error processing inbound source %q for '%s' > '%s': %v", + iName, accName, sname, err) + } + + // Duplicates can be skipped, continue to the next message. + if errors.Is(err, errMsgIdDuplicate) { + return true } + + // Do not need to do a full retry that includes finding the last sequence in the stream + // for that source. Just re-create starting with the seq we couldn't store instead. + // Especially if we're replicated, we could have inflight proposals that will not be in our store yet. + mset.mu.Lock() + si.dseq = odseq + si.sseq = osseq + mset.retrySourceConsumerAtSeq(iName, sseq) + mset.mu.Unlock() } return false } @@ -4038,7 +4443,7 @@ func (si *sourceInfo) genSourceHeader(orig, reply string) string { b.WriteString(iNameParts[0]) b.WriteByte(' ') // Grab sequence as text here from reply subject. 
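+	// The ack reply comes in two layouts (token names are placeholders):
+	//   v1: $JS.ACK.<stream>.<consumer>.<delivery count>.<stream seq>.<consumer seq>.<ts>.<pending>
+	//   v2: $JS.ACK.<domain>.<account hash>.<stream>.<consumer>.<delivery count>.<stream seq>.<consumer seq>.<ts>.<pending> (possibly followed by more tokens)
+	// hence the stream sequence is tokens[5] for v1 replies and tokens[7] for v2 replies below.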
- var tsa [expectedNumReplyTokens]string + var tsa [expectedNumReplyTokensV2]string start, tokens := 0, tsa[:0] for i := 0; i < len(reply); i++ { if reply[i] == btsep { @@ -4047,8 +4452,12 @@ func (si *sourceInfo) genSourceHeader(orig, reply string) string { } tokens = append(tokens, reply[start:]) seq := "1" // Default - if len(tokens) == expectedNumReplyTokens && tokens[0] == "$JS" && tokens[1] == "ACK" { - seq = tokens[5] + if tokens[0] == "$JS" && tokens[1] == "ACK" { + if len(tokens) == expectedNumReplyTokensV1 { + seq = tokens[5] + } else if len(tokens) >= expectedNumReplyTokensV2 { + seq = tokens[7] + } } b.WriteString(seq) @@ -4063,7 +4472,7 @@ func (si *sourceInfo) genSourceHeader(orig, reply string) string { // Original version of header that stored ack reply direct. func streamAndSeqFromAckReply(reply string) (string, string, uint64) { - tsa := [expectedNumReplyTokens]string{} + tsa := [expectedNumReplyTokensV2]string{} start, tokens := 0, tsa[:0] for i := 0; i < len(reply); i++ { if reply[i] == btsep { @@ -4071,10 +4480,33 @@ func streamAndSeqFromAckReply(reply string) (string, string, uint64) { } } tokens = append(tokens, reply[start:]) - if len(tokens) != expectedNumReplyTokens || tokens[0] != "$JS" || tokens[1] != "ACK" { + if (len(tokens) != expectedNumReplyTokensV1 && len(tokens) < expectedNumReplyTokensV2) || tokens[0] != "$JS" || tokens[1] != "ACK" { return _EMPTY_, _EMPTY_, 0 } - return tokens[2], _EMPTY_, uint64(parseAckReplyNum(tokens[5])) + offset := 2 + if len(tokens) >= expectedNumReplyTokensV2 { + offset = 4 + } + return tokens[offset], _EMPTY_, uint64(parseAckReplyNum(tokens[offset+3])) +} + +func consumerFromAckReply(reply string) string { + tsa := [expectedNumReplyTokensV2]string{} + start, tokens := 0, tsa[:0] + for i := 0; i < len(reply); i++ { + if reply[i] == btsep { + tokens, start = append(tokens, reply[start:i]), i+1 + } + } + tokens = append(tokens, reply[start:]) + if (len(tokens) != expectedNumReplyTokensV1 && len(tokens) < expectedNumReplyTokensV2) || tokens[0] != "$JS" || tokens[1] != "ACK" { + return _EMPTY_ + } + offset := 3 + if len(tokens) >= expectedNumReplyTokensV2 { + offset = 5 + } + return tokens[offset] } // Extract the stream name, the source index name and the message sequence number from the source header. @@ -4535,14 +4967,26 @@ func (mset *stream) unsubscribeToStream(stopping, shuttingDown bool) error { if len(mset.sources) > 0 { mset.stopSourceConsumers() + mset.sources = nil } // Clear batching state. - mset.deleteInflightBatches(shuttingDown) - - if stopping { - // In case we had a direct get subscriptions. - mset.unsubscribeToDirect() - mset.unsubscribeToMirrorDirect() + mset.deleteAtomicBatches(shuttingDown) + if stopping || shuttingDown { + mset.deleteFastBatches() + } + if mset.batches != nil { + mset.batches.mu.Lock() + reset := len(mset.batches.atomic) == 0 && len(mset.batches.fast) == 0 + mset.batches.mu.Unlock() + if reset { + mset.batches = nil + } + } + + if stopping { + // In case we had a direct get subscriptions. + mset.unsubscribeToDirect() + mset.unsubscribeToMirrorDirect() } if mset.directLeaderSub == nil { @@ -4556,10 +5000,10 @@ func (mset *stream) unsubscribeToStream(stopping, shuttingDown bool) error { } // Lock should be held. 
-func (mset *stream) deleteInflightBatches(shuttingDown bool) { +func (mset *stream) deleteAtomicBatches(shuttingDown bool) { if mset.batches != nil { mset.batches.mu.Lock() - for batchId, b := range mset.batches.group { + for batchId, b := range mset.batches.atomic { // If shutting down, do fixup during startup. In-memory batches don't require manual cleanup. if shuttingDown { b.stopLocked() @@ -4567,8 +5011,8 @@ func (mset *stream) deleteInflightBatches(shuttingDown bool) { b.cleanupLocked(batchId, mset.batches) } } + mset.batches.atomic = nil mset.batches.mu.Unlock() - mset.batches = nil } } @@ -4583,6 +5027,18 @@ func (mset *stream) deleteBatchApplyState() { } } +// Lock should be held. +func (mset *stream) deleteFastBatches() { + if mset.batches != nil { + mset.batches.mu.Lock() + for batchId, b := range mset.batches.fast { + b.cleanupLocked(batchId, mset.batches) + } + mset.batches.fast = nil + mset.batches.mu.Unlock() + } +} + // Lock does NOT need to be held, we set the client on setup and never change it at this point. func (mset *stream) subscribeInternal(subject string, cb msgHandler) (*subscription, error) { if mset.closed.Load() { @@ -4636,7 +5092,6 @@ func (mset *stream) unsubscribeInternal(subject string) error { return nil } -// Lock should be held. func (mset *stream) unsubscribe(sub *subscription) { if sub == nil || mset.closed.Load() { return @@ -4688,10 +5143,10 @@ func (mset *stream) setupStore(fsCfg *FileStoreConfig) error { mset.store.RegisterProcessJetStreamMsg(func(im *inMsg) { if mset.IsClustered() { if mset.IsLeader() { - mset.processClusteredInboundMsg(im.subj, im.rply, im.hdr, im.msg, im.mt, false) + mset.processClusteredInboundMsg(im.subj, im.rply, im.hdr, im.msg, im.mt, true) } } else { - mset.processJetStreamMsg(im.subj, im.rply, im.hdr, im.msg, 0, 0, im.mt, false, true) + mset.processJetStreamMsg(im.subj, im.rply, im.hdr, im.msg, 0, 0, im.mt, true, true) } }) mset.mu.Unlock() @@ -4807,6 +5262,11 @@ func (mset *stream) storeMsgId(dde *ddentry) { // storeMsgIdLocked will store the message id for duplicate detection. // mset.ddMu lock should be held. func (mset *stream) storeMsgIdLocked(dde *ddentry) { + // Zero means disabled. + if mset.cfg.Duplicates <= 0 { + return + } + if mset.ddmap == nil { mset.ddmap = make(map[string]*ddentry) } @@ -4913,19 +5373,46 @@ func getMessageIncr(hdr []byte) (*big.Int, bool) { } // Fast lookup of message schedule. -func getMessageSchedule(hdr []byte) (time.Time, bool) { +func getMessageSchedule(hdr []byte) (time.Time, *ApiError) { if len(hdr) == 0 { - return time.Time{}, true + return time.Time{}, nil + } + return nextMessageSchedule(hdr, time.Now().UTC().UnixNano()) +} + +// Fast lookup and calculation of next message schedule. +func nextMessageSchedule(hdr []byte, ts int64) (time.Time, *ApiError) { + if len(hdr) == 0 { + return time.Time{}, nil + } + loc, apiErr := loadMessageScheduleLocation(hdr) + if apiErr != nil { + return time.Time{}, apiErr } val := bytesToString(sliceHeader(JSSchedulePattern, hdr)) - if val == _EMPTY_ { - return time.Time{}, true + schedule, _, ok := parseMsgSchedule(val, loc, ts) + if !ok { + return time.Time{}, NewJSMessageSchedulesPatternInvalidError() + } + return schedule, nil +} + +// loadMessageScheduleLocation returns the *time.Location for the schedule's +// time zone header. Returns nil loc when the header is absent. A header that +// is present but empty or names an unknown zone yields a TimeZoneInvalid error. 
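+// For example, a message published with the header
+//   Nats-Schedule-Time-Zone: Europe/Amsterdam
+// has its schedule evaluated in that IANA zone (resolved via time.LoadLocation),
+// while an empty or unknown zone name is rejected with a TimeZoneInvalid error.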
+func loadMessageScheduleLocation(hdr []byte) (*time.Location, *ApiError) { + tz := sliceHeader(JSScheduleTimeZone, hdr) + if tz == nil { + return nil, nil } - if !strings.HasPrefix(val, "@at ") { - return time.Time{}, false + if len(tz) == 0 { + return nil, NewJSMessageSchedulesTimeZoneInvalidError() + } + loc, err := time.LoadLocation(bytesToString(tz)) + if err != nil { + return nil, NewJSMessageSchedulesTimeZoneInvalidError() } - t, err := time.Parse(time.RFC3339, val[4:]) - return t, err == nil + return loc, nil } // Fast lookup of the message schedule TTL from headers. @@ -4941,6 +5428,11 @@ func getMessageScheduleTTL(hdr []byte) (string, bool) { return string(ttl), true } +// Fast lookup of the message schedule rollup from headers. +func getMessageScheduleRollup(hdr []byte) string { + return string(sliceHeader(JSScheduleRollup, hdr)) +} + // Fast lookup of message schedule target. func getMessageScheduleTarget(hdr []byte) string { if len(hdr) == 0 { @@ -4949,6 +5441,14 @@ func getMessageScheduleTarget(hdr []byte) string { return string(getHeader(JSScheduleTarget, hdr)) } +// Fast lookup of message schedule source. +func getMessageScheduleSource(hdr []byte) string { + if len(hdr) == 0 { + return _EMPTY_ + } + return string(getHeader(JSScheduleSource, hdr)) +} + // Fast lookup of message scheduler. func getMessageScheduler(hdr []byte) string { if len(hdr) == 0 { @@ -4962,7 +5462,149 @@ func getBatchId(hdr []byte) string { if len(hdr) == 0 { return _EMPTY_ } - return string(getHeader(JSBatchId, hdr)) + if atomicBatchId := sliceHeader(JSBatchId, hdr); atomicBatchId != nil { + return string(atomicBatchId) + } + return _EMPTY_ +} + +type FastBatch struct { + id string + seq uint64 + flow uint16 + ping bool + gapOk bool + commit bool + commitEob bool +} + +const ( + FastBatchSuffix = ".$FI" + FastBatchGapFail = "fail" + FastBatchGapOk = "ok" +) + +const ( + FastBatchOpStart = iota + FastBatchOpAppend + FastBatchOpCommit + FastBatchOpCommitEob + FastBatchOpPing +) + +var fastBatchPool sync.Pool + +func getFastBatchFromPool() *FastBatch { + idx := fastBatchPool.Get() + if idx != nil { + return idx.(*FastBatch) + } + return new(FastBatch) +} + +func (b *FastBatch) returnToPool() { + if b == nil { + return + } + // Nil out all values. + *b = FastBatch{} + fastBatchPool.Put(b) +} + +// getFastBatch gets fast batch info from the reply subject in the form: +// ......$FI +func getFastBatch(reply string, hdr []byte) (*FastBatch, bool) { + lreply := len(reply) + if lreply <= 4 || reply[lreply-4:] != FastBatchSuffix { + if !isServiceReply(stringToBytes(reply)) { + return nil, false + } + // If account imports/exports are used, the reply might be internal. + // Check the client header for the original reply subject. + ci := sliceHeader(ClientInfoHdr, hdr) + if ci == nil { + return nil, false + } + var cis ClientInfo + if err := json.Unmarshal(ci, &cis); err != nil || cis.Reply == _EMPTY_ { + return nil, false + } + reply = cis.Reply + lreply = len(reply) + if lreply <= 4 || reply[lreply-4:] != FastBatchSuffix { + return nil, false + } + } + + n := lreply - 4 // Move to just before the dot + o := strings.LastIndexByte(reply[:n], '.') + if o == -1 { + return nil, true + } + // Batch operation. 
+ ops := reply[o+1 : n] + op := parseInt64(stringToBytes(ops)) + if op < FastBatchOpStart || op > FastBatchOpPing { + return nil, true + } + + b := getFastBatchFromPool() + b.ping = op == FastBatchOpPing + b.commitEob = op == FastBatchOpCommitEob + b.commit = b.commitEob || op == FastBatchOpCommit + p := o + + // Batch seq. + if o = strings.LastIndexByte(reply[:o], '.'); o == -1 { + return nil, true + } + a := parseInt64(stringToBytes(reply[o+1 : p])) + if a < 1 { + return nil, true + } + b.seq = uint64(a) + p = o + if b.seq <= 0 { + return nil, true + } else if b.seq == 1 && b.commitEob { + return nil, true + } + if op == FastBatchOpStart && b.seq != 1 { + return nil, true + } else if op == FastBatchOpAppend && b.seq <= 1 { + return nil, true + } + + // Gap mode. + if o = strings.LastIndexByte(reply[:o], '.'); o == -1 { + return nil, true + } + gapMode := reply[o+1 : p] + if gapMode != FastBatchGapFail && gapMode != FastBatchGapOk { + return nil, true // Not recognized. + } + b.gapOk = gapMode == FastBatchGapOk + p = o + + // Ack flow. + if o = strings.LastIndexByte(reply[:o], '.'); o == -1 { + return nil, true + } + a = parseInt64(stringToBytes(reply[o+1 : p])) + if a <= 0 { + a = 10 + } else if a > math.MaxUint16 { + a = math.MaxUint16 + } + b.flow = uint16(a) + p = o + + // Batch id. + if o = strings.LastIndexByte(reply[:o], '.'); o == -1 { + return nil, true + } + b.id = reply[o+1 : p] + return b, false } // Fast lookup of batch sequence. @@ -5421,7 +6063,7 @@ func (mset *stream) processInboundJetStreamMsg(_ *subscription, c *client, _ *Ac // to prevent a trace event to be generated when a stored message // is delivered to a consumer and routed. if !traceOnly { - disableTraceHeaders(c, hdr) + hdr = setHeader(MsgTraceDest, MsgTraceDestDisabled, hdr) } // This will add the jetstream event while in the client read loop. // Since the event will be updated in a different go routine, the @@ -5442,7 +6084,11 @@ var ( ) // processJetStreamMsg is where we try to actually process the stream msg. -func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, lseq uint64, ts int64, mt *msgTrace, sourced bool, needLock bool) (retErr error) { +func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, lseq uint64, ts int64, mt *msgTrace, sourced bool, needLock bool) error { + return mset.processJetStreamMsgWithBatch(subject, reply, hdr, msg, lseq, ts, mt, sourced, needLock, nil) +} + +func (mset *stream) processJetStreamMsgWithBatch(subject, reply string, hdr, msg []byte, lseq uint64, ts int64, mt *msgTrace, sourced bool, needLock bool, fastBatch *FastBatch) (retErr error) { if mt != nil { // Only the leader/standalone will have mt!=nil. On exit, send the // message trace event. @@ -5495,19 +6141,35 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, numConsumers := len(mset.consumers) interestRetention := mset.cfg.Retention == InterestPolicy allowMsgCounter, allowMsgSchedules := mset.cfg.AllowMsgCounter, mset.cfg.AllowMsgSchedules + allowRollupPurge := mset.cfg.AllowRollup && !mset.cfg.DenyPurge // Snapshot if we are the leader and if we can respond. 
isLeader, isSealed := mset.isLeaderNodeState(), mset.cfg.Sealed - isClustered := mset.isClustered() + isClustered, isMirror := mset.isClustered(), mset.cfg.Mirror != nil canConsistencyCheck := !isClustered || traceOnly canRespond := doAck && len(reply) > 0 && isLeader outq := mset.outq var resp = &JSPubAckResponse{} - var batchId string - var batchSeq uint64 - if len(hdr) > 0 { - // Populate batch details. + var ( + batchId string + batchSeq uint64 + ) + // Populate batch details. + if fastBatch != nil { + // For R1 we can reuse without regenerating. + batchId, batchSeq = fastBatch.id, fastBatch.seq + // Disable consistency checking if this was already done + // earlier as part of the batch consistency check. + canConsistencyCheck = traceOnly + } else if fastBatch, _ = getFastBatch(reply, hdr); fastBatch != nil { + defer fastBatch.returnToPool() + batchId, batchSeq = fastBatch.id, fastBatch.seq + // Disable consistency checking if this was already done + // earlier as part of the batch consistency check. + canConsistencyCheck = traceOnly + } + if len(hdr) > 0 && batchId == _EMPTY_ { if batchId = getBatchId(hdr); batchId != _EMPTY_ { batchSeq, _ = getBatchSequence(hdr) // Disable consistency checking if this was already done @@ -5543,11 +6205,13 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, isMisMatch := true // We may be able to recover here if we have no state whatsoever, or we are a mirror. // See if we have to adjust our starting sequence. - if mset.lseq == 0 || mset.cfg.Mirror != nil { + if mset.lseq == 0 || isMirror { var state StreamState mset.store.FastState(&state) if state.FirstSeq == 0 { - mset.store.Compact(lseq + 1) + if _, err := mset.store.Compact(lseq + 1); err != nil { + return err + } mset.lseq = lseq isMisMatch = false } @@ -5715,8 +6379,9 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, } // Message scheduling. - if schedule, ok := getMessageSchedule(hdr); !ok { - apiErr := NewJSMessageSchedulesPatternInvalidError() + if sourced { + // noop, sourced messages were already validated by the origin stream. 
+ } else if schedule, apiErr := getMessageSchedule(hdr); apiErr != nil { if !allowMsgSchedules { apiErr = NewJSMessageSchedulesDisabledError() } @@ -5746,6 +6411,15 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, outq.sendMsg(reply, b) } return apiErr + } else if scheduleRollup := getMessageScheduleRollup(hdr); scheduleRollup != _EMPTY_ && scheduleRollup != JSMsgRollupSubject { + apiErr := NewJSMessageSchedulesRollupInvalidError() + if canRespond { + resp.PubAck = &PubAck{Stream: name} + resp.Error = apiErr + b, _ := json.Marshal(resp) + outq.sendMsg(reply, b) + } + return apiErr } else if scheduleTtl != _EMPTY_ && !mset.cfg.AllowMsgTTL { if canRespond { resp.PubAck = &PubAck{Stream: name} @@ -5755,7 +6429,7 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, } return errMsgTTLDisabled } else if scheduleTarget := getMessageScheduleTarget(hdr); scheduleTarget == _EMPTY_ || - !IsValidPublishSubject(scheduleTarget) || SubjectsCollide(scheduleTarget, subject) { + !IsValidPublishSubject(scheduleTarget) || scheduleTarget == subject { apiErr := NewJSMessageSchedulesTargetInvalidError() if canRespond { resp.PubAck = &PubAck{Stream: name} @@ -5764,6 +6438,16 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, outq.sendMsg(reply, b) } return apiErr + } else if scheduleSource := getMessageScheduleSource(hdr); scheduleSource != _EMPTY_ && + (scheduleSource == scheduleTarget || scheduleSource == subject || !IsValidPublishSubject(scheduleSource)) { + apiErr := NewJSMessageSchedulesSourceInvalidError() + if canRespond { + resp.PubAck = &PubAck{Stream: name} + resp.Error = apiErr + b, _ := json.Marshal(resp) + outq.sendMsg(reply, b) + } + return apiErr } else { match := slices.ContainsFunc(mset.cfg.Subjects, func(subj string) bool { return SubjectsCollide(subj, scheduleTarget) @@ -5778,6 +6462,21 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, } return apiErr } + if scheduleSource != _EMPTY_ { + match = slices.ContainsFunc(mset.cfg.Subjects, func(subj string) bool { + return SubjectsCollide(subj, scheduleSource) + }) + if !match { + apiErr := NewJSMessageSchedulesSourceInvalidError() + if canRespond { + resp.PubAck = &PubAck{Stream: name} + resp.Error = apiErr + b, _ := json.Marshal(resp) + outq.sendMsg(reply, b) + } + return apiErr + } + } // Add a rollup sub header if it doesn't already exist. // Otherwise, it must exist already as a rollup on the subject. @@ -5795,13 +6494,60 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, } } } + if scheduleNext := sliceHeader(JSScheduleNext, hdr); len(scheduleNext) > 0 && !sourced { + // Clients may only use Nats-Schedule-Next to purge a schedule. + if bytesToString(scheduleNext) != JSScheduleNextPurge { + apiErr := NewJSMessageSchedulesSchedulerInvalidError() + if canRespond { + resp.PubAck = &PubAck{Stream: name} + resp.Error = apiErr + b, _ := json.Marshal(resp) + outq.sendMsg(reply, b) + } + return apiErr + } + // Nats-Scheduler must accompany the purge and: + // - it must NOT be empty. + // - it must NOT match the publish subject. 
+ if scheduler := sliceHeader(JSScheduler, hdr); len(scheduler) == 0 || + bytesToString(scheduler) == subject || !IsValidPublishSubject(bytesToString(scheduler)) { + apiErr := NewJSMessageSchedulesSchedulerInvalidError() + if canRespond { + resp.PubAck = &PubAck{Stream: name} + resp.Error = apiErr + b, _ := json.Marshal(resp) + outq.sendMsg(reply, b) + } + return apiErr + } else if !allowMsgSchedules { + apiErr := NewJSMessageSchedulesDisabledError() + if canRespond { + resp.PubAck = &PubAck{Stream: name} + resp.Error = apiErr + b, _ := json.Marshal(resp) + outq.sendMsg(reply, b) + } + return apiErr + } + } else if !sourced && len(sliceHeader(JSScheduler, hdr)) > 0 { + // Clients may only use Nats-Scheduler alongside Nats-Schedule-Next. + apiErr := NewJSMessageSchedulesSchedulerInvalidError() + if canRespond { + resp.PubAck = &PubAck{Stream: name} + resp.Error = apiErr + b, _ := json.Marshal(resp) + outq.sendMsg(reply, b) + } + return apiErr + } } // Dedupe detection. This is done at the cluster level for dedupe detection above the // lower layers. But we still need to pull out the msgId. if msgId = getMsgId(hdr); msgId != _EMPTY_ { // Do real check only if not clustered or traceOnly flag is set. - if canConsistencyCheck { + // If we're mirroring we can't deduplicate on our own. + if canConsistencyCheck && !isMirror { var seq uint64 mset.ddMu.Lock() dde := mset.checkMsgId(msgId) @@ -5836,7 +6582,7 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, } // Check for any rollups. if rollup := getRollup(hdr); rollup != _EMPTY_ { - if canConsistencyCheck && (!mset.cfg.AllowRollup || mset.cfg.DenyPurge) { + if canConsistencyCheck && !allowRollupPurge && !sourced { err := errors.New("rollup not permitted") if canRespond { resp.PubAck = &PubAck{Stream: name} @@ -6058,6 +6804,26 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, if msgId != _EMPTY_ { mset.storeMsgId(&ddentry{msgId, mset.lseq, ts}) } + if allowRollupPurge { + if err = mset.processJetStreamMsgWithRollup(subject, rollupSub, rollupAll, hdr, 0); err != nil { + return err + } + } + // If using fast batch publish, we occasionally send flow control messages. + // And, we need to ensure a PubAck is sent if the commit happens through EOB. + if fastBatch != nil { + if mset.batches == nil { + mset.batches = &batching{} + } + mset.batches.mu.Lock() + // Check full leader state so we only send the client an update once we're caught up. + commit := mset.batches.fastBatchRegisterSequences(mset, reply, mset.lseq, mset.isLeader(), fastBatch) + mset.batches.mu.Unlock() + if !commit { + reply = _EMPTY_ + canRespond = false + } + } if canRespond { response = append(pubAck, strconv.FormatUint(mset.lseq, 10)...) if batchId != _EMPTY_ { @@ -6137,11 +6903,7 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, } else { // Make sure to take into account any message assignments that we had to skip (clfs). seq = lseq + 1 - clfs - // Check for preAcks and the need to clear it. 
- if mset.hasAllPreAcks(seq, subject) { - mset.clearAllPreAcks(seq) - } - err = store.StoreRawMsg(subject, hdr, msg, seq, ts, ttl) + err = store.StoreRawMsg(subject, hdr, msg, seq, ts, ttl, canConsistencyCheck) } if err != nil { @@ -6152,17 +6914,22 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, mset.srv.Warnf("Filesystem permission denied while writing msg, disabling JetStream: %v", err) return err } - // If we did not succeed increment clfs in case we are clustered. - bumpCLFS() switch err { case ErrMaxMsgs, ErrMaxBytes, ErrMaxMsgsPerSubject, ErrMsgTooLarge: s.RateLimitDebugf("JetStream failed to store a msg on stream '%s > %s': %v", accName, name, err) case ErrStoreClosed: default: - s.Errorf("JetStream failed to store a msg on stream '%s > %s': %v", accName, name, err) + // We don't want to respond back to the user, and definitely not up CLFS either. + // This was likely an IO issue, so only log and return the error. This will stop + // the stream if it was replicated. + s.RateLimitErrorf("JetStream failed to store a msg on stream '%s > %s': %v", accName, name, err) + mset.setWriteErrLocked(err) + return err } + // If we did not succeed increment clfs in case we are clustered. + bumpCLFS() if canRespond { resp.PubAck = &PubAck{Stream: name} resp.Error = NewJSStreamStoreFailedError(err, Unless(err)) @@ -6172,6 +6939,18 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, return err } + // Check for preAcks and the need to clear it. + if mset.hasAllPreAcks(seq, subject) { + mset.clearAllPreAcks(seq) + // If we're clustered and the stream leader, we can now propose deleting this message. + // We still store it below, so we remain properly synchronized with our followers. + // If this proposal fails, we retry out-of-band. + if isClustered && isLeader { + md := streamMsgDelete{Seq: seq, NoErase: true, Stream: mset.cfg.Name} + _ = mset.node.Propose(encodeMsgDelete(&md)) + } + } + // If here we succeeded in storing the message. mset.lmsgId = msgId mset.lseq = seq @@ -6194,15 +6973,9 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, } // No errors, this is the normal path. - if rollupSub { - mset.purgeLocked(&JSApiStreamPurgeRequest{Subject: subject, Keep: 1}, false) - } else if rollupAll { - mset.purgeLocked(&JSApiStreamPurgeRequest{Keep: 1}, false) - } else if scheduleNext := sliceHeader(JSScheduleNext, hdr); len(scheduleNext) > 0 && bytesToString(scheduleNext) == JSScheduleNextPurge { - // Purge the message schedule. - scheduler := getMessageScheduler(hdr) - if scheduler != _EMPTY_ { - mset.purgeLocked(&JSApiStreamPurgeRequest{Subject: scheduler}, false) + if allowRollupPurge { + if err = mset.processJetStreamMsgWithRollup(subject, rollupSub, rollupAll, hdr, 1); err != nil { + return err } } @@ -6235,6 +7008,22 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, outq.send(newJSPubMsg(tsubj, _EMPTY_, _EMPTY_, hdr, rpMsg, nil, seq)) } + // If using fast batch publish, we occasionally send flow control messages. + // And, we need to ensure a PubAck is sent if the commit happens through EOB. + if fastBatch != nil { + if mset.batches == nil { + mset.batches = &batching{} + } + mset.batches.mu.Lock() + // Check full leader state so we only send the client an update once we're caught up. 
+ commit := mset.batches.fastBatchRegisterSequences(mset, reply, mset.lseq, mset.isLeader(), fastBatch) + mset.batches.mu.Unlock() + if !commit { + reply = _EMPTY_ + canRespond = false + } + } + // Send response here. if canRespond { response = append(pubAck, strconv.FormatUint(seq, 10)...) @@ -6262,17 +7051,37 @@ func (mset *stream) processJetStreamMsg(subject, reply string, hdr, msg []byte, return nil } -// processJetStreamBatchMsg processes a JetStream message that's part of an atomic batch publish. -// Handles constraints around the batch, storing messages, doing consistency checks, and performing the commit. -func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr, msg []byte, mt *msgTrace) (retErr error) { - // For possible error response. - var response []byte +// Lock should be held. +func (mset *stream) processJetStreamMsgWithRollup(subject string, rollupSub, rollupAll bool, hdr []byte, keep uint64) error { + if rollupSub { + if _, err := mset.purgeLocked(&JSApiStreamPurgeRequest{Subject: subject, Keep: keep}, false); err != nil { + return err + } + } else if rollupAll { + if _, err := mset.purgeLocked(&JSApiStreamPurgeRequest{Keep: keep}, false); err != nil { + return err + } + } + if scheduleNext := sliceHeader(JSScheduleNext, hdr); len(scheduleNext) > 0 && bytesToString(scheduleNext) == JSScheduleNextPurge { + // Purge the message schedule. + scheduler := getMessageScheduler(hdr) + if scheduler != _EMPTY_ { + if _, err := mset.purgeLocked(&JSApiStreamPurgeRequest{Subject: scheduler}, false); err != nil { + return err + } + } + } + return nil +} +// processJetStreamAtomicBatchMsg processes a JetStream message that's part of an atomic batch publish. +// Handles constraints around the batch, storing messages, doing consistency checks, and performing the commit. +func (mset *stream) processJetStreamAtomicBatchMsg(batchId, subject, reply string, hdr, msg []byte, mt *msgTrace) (retErr error) { mset.mu.RLock() canRespond := !mset.cfg.NoAck && len(reply) > 0 name, stype := mset.cfg.Name, mset.cfg.Storage discard, discardNewPer, maxMsgs, maxMsgsPer, maxBytes := mset.cfg.Discard, mset.cfg.DiscardNewPer, mset.cfg.MaxMsgs, mset.cfg.MaxMsgsPer, mset.cfg.MaxBytes - s, js, jsa, st, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node + s, js, jsa, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Replicas, mset.tier, mset.outq, mset.node maxMsgSize, lseq := int(mset.cfg.MaxMsgSize), mset.lseq isLeader, isClustered, isSealed, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules, allowAtomicPublish := mset.isLeader(), mset.isClustered(), mset.cfg.Sealed, mset.cfg.AllowRollup, mset.cfg.DenyPurge, mset.cfg.AllowMsgTTL, mset.cfg.AllowMsgCounter, mset.cfg.AllowMsgSchedules, mset.cfg.AllowAtomicPublish mset.mu.RUnlock() @@ -6294,41 +7103,36 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr return NewJSClusterNotLeaderError() } + respondError := func(apiErr *ApiError) error { + if canRespond { + buf, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: apiErr}) + outq.sendMsg(reply, buf) + } + return apiErr + } + // Bail here if sealed. 
if isSealed { - var resp = JSPubAckResponse{PubAck: &PubAck{Stream: mset.name()}, Error: NewJSStreamSealedError()} - b, _ := json.Marshal(resp) - mset.outq.sendMsg(reply, b) - return NewJSStreamSealedError() + return respondError(NewJSStreamSealedError()) } // Check here pre-emptively if we have exceeded this server limits. if js.limitsExceeded(stype) { s.resourcesExceededError(stype) - if canRespond { - b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: NewJSInsufficientResourcesError()}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0)) - } // Stepdown regardless. if node := mset.raftNode(); node != nil { node.StepDown() } - return NewJSInsufficientResourcesError() + return respondError(NewJSInsufficientResourcesError()) } // Check here pre-emptively if we have exceeded our account limits. - if exceeded, err := jsa.wouldExceedLimits(st, tierName, r, subject, hdr, msg); exceeded { + if exceeded, err := jsa.wouldExceedLimits(stype, tierName, r, subject, hdr, msg); exceeded { if err == nil { err = NewJSAccountResourcesExceededError() } s.RateLimitWarnf("JetStream account limits exceeded for '%s': %s", jsa.acc().GetName(), err.Error()) - if canRespond { - var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} - resp.Error = err - response, _ = json.Marshal(resp) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) - } - return err + return respondError(err) } // Check msgSize if we have a limit set there. Again this works if it goes through but better to be pre-emptive. @@ -6336,87 +7140,63 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr if maxMsgSize >= 0 && (len(hdr) > maxMsgSize || len(msg) > maxMsgSize-len(hdr)) { err := fmt.Errorf("JetStream message size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) s.RateLimitWarnf("%s", err.Error()) - if canRespond { - var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}} - resp.Error = NewJSStreamMessageExceedsMaximumError() - response, _ = json.Marshal(resp) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, response, nil, 0)) - } + _ = respondError(NewJSStreamMessageExceedsMaximumError()) return err } if !allowAtomicPublish { - err := NewJSAtomicPublishDisabledError() - if canRespond { - b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: err}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0)) - } - return err + return respondError(NewJSAtomicPublishDisabledError()) } // Batch ID is too long. if len(batchId) > 64 { - err := NewJSAtomicPublishInvalidBatchIDError() - if canRespond { - b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: err}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0)) - } - return err + return respondError(NewJSAtomicPublishInvalidBatchIDError()) } batchSeq, exists := getBatchSequence(hdr) if !exists { - err := NewJSAtomicPublishMissingSeqError() - if canRespond { - b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: err}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0)) - } - return err + return respondError(NewJSAtomicPublishMissingSeqError()) } + jsa.mu.RLock() + storeDir := jsa.storeDir + jsa.mu.RUnlock() + mset.mu.Lock() if mset.batches == nil { - mset.batches = &batching{ - group: make(map[string]*batchGroup, 1), - } + mset.batches = &batching{} } batches := mset.batches - mset.mu.Unlock() + // Acquire the batches lock. 
+ // Can't release the stream lock now, we need to keep holding it while we hold the batches lock. + // Re-acquiring the stream lock with the batches lock already held would be a lock inversion. + batches.mu.Lock() respondIncompleteBatch := func() error { - err := NewJSAtomicPublishIncompleteBatchError() - if canRespond { - buf, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: err}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, buf, nil, 0)) - } - return err + return respondError(NewJSAtomicPublishIncompleteBatchError()) } // Get batch. - batches.mu.Lock() - b, ok := batches.group[batchId] + b, ok := batches.atomic[batchId] if !ok { if batchSeq != 1 { batches.mu.Unlock() - maxBatchSize := streamMaxBatchSize + mset.mu.Unlock() + maxBatchSize := streamMaxAtomicBatchSize opts := s.getOpts() if opts.JetStreamLimits.MaxBatchSize > 0 { maxBatchSize = opts.JetStreamLimits.MaxBatchSize } if batchSeq > uint64(maxBatchSize) { err := NewJSAtomicPublishTooLargeBatchError(maxBatchSize) - if canRespond { - buf, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: err}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, buf, nil, 0)) - } - return err + return respondError(err) } return respondIncompleteBatch() } // Limits. - maxInflightPerStream := streamMaxBatchInflightPerStream - maxInflightTotal := streamMaxBatchInflightTotal + maxInflightPerStream := streamMaxAtomicBatchInflightPerStream + maxInflightTotal := streamMaxAtomicBatchInflightTotal opts := s.getOpts() if opts.JetStreamLimits.MaxBatchInflightPerStream > 0 { maxInflightPerStream = opts.JetStreamLimits.MaxBatchInflightPerStream @@ -6426,39 +7206,44 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr } // Confirm we can facilitate an additional batch. - if len(batches.group)+1 > maxInflightPerStream { + if len(batches.atomic)+1 > maxInflightPerStream { batches.mu.Unlock() - return respondIncompleteBatch() + mset.mu.Unlock() + return respondError(NewJSAtomicPublishTooManyInflightError()) } // Confirm we'll not exceed the server limit. - if globalInflightBatches.Add(1) > int32(maxInflightTotal) { - globalInflightBatches.Add(-1) + if globalInflightAtomicBatches.Add(1) > int64(maxInflightTotal) { + globalInflightAtomicBatches.Add(-1) batches.mu.Unlock() - return respondIncompleteBatch() + mset.mu.Unlock() + return respondError(NewJSAtomicPublishTooManyInflightError()) } var err error - if b, err = batches.newBatchGroup(mset, batchId); err != nil { - globalInflightBatches.Add(-1) + b, err = batches.newAtomicBatch(mset, batchId, r, stype, storeDir, name) + if err != nil { + globalInflightAtomicBatches.Add(-1) batches.mu.Unlock() + mset.mu.Unlock() return respondIncompleteBatch() } - batches.group[batchId] = b + if batches.atomic == nil { + batches.atomic = make(map[string]*atomicBatch, 1) + } + batches.atomic[batchId] = b } - var commit bool + var commit, commitEob bool if c := sliceHeader(JSBatchCommit, hdr); c != nil { + commitEob = bytes.Equal(c, []byte("eob")) // Reject the batch if the commit is not recognized. 
- if !bytes.Equal(c, []byte("1")) { + if !commitEob && !bytes.Equal(c, []byte("1")) { b.cleanupLocked(batchId, batches) batches.mu.Unlock() + mset.mu.Unlock() err := NewJSAtomicPublishInvalidBatchCommitError() - if canRespond { - b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: err}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0)) - } - return err + return respondError(err) } commit = true } @@ -6468,40 +7253,39 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr if errorOnRequiredApiLevel(hdr) { b.cleanupLocked(batchId, batches) batches.mu.Unlock() + mset.mu.Unlock() + mset.sendStreamBatchAbandonedAdvisory(batchId, BatchRequirementsNotMet) err := NewJSRequiredApiLevelError() - if canRespond { - b, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: err}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, b, nil, 0)) - } - return err + return respondError(err) } hdr = removeHeaderIfPresent(hdr, JSRequiredApiLevel) } + // If cleanup has already happened, we can't continue. + cleanup := !b.resetCleanupTimer(mset) + // Detect gaps. b.lseq++ - if b.lseq != batchSeq { + if b.lseq != batchSeq || cleanup || (batchSeq == 1 && commitEob) { b.cleanupLocked(batchId, batches) batches.mu.Unlock() + mset.mu.Unlock() mset.sendStreamBatchAbandonedAdvisory(batchId, BatchIncomplete) return respondIncompleteBatch() } // Confirm the batch doesn't exceed the allowed size. - maxSize := streamMaxBatchSize + maxSize := streamMaxAtomicBatchSize if maxBatchSize := s.getOpts().JetStreamLimits.MaxBatchSize; maxBatchSize > 0 { maxSize = maxBatchSize } if batchSeq > uint64(maxSize) { b.cleanupLocked(batchId, batches) batches.mu.Unlock() + mset.mu.Unlock() mset.sendStreamBatchAbandonedAdvisory(batchId, BatchLarge) err := NewJSAtomicPublishTooLargeBatchError(maxSize) - if canRespond { - buf, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: err}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, buf, nil, 0)) - } - return err + return respondError(err) } // Persist, but optimize if we're committing because we already know last. @@ -6511,24 +7295,27 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr if err != nil || seq != batchSeq { b.cleanupLocked(batchId, batches) batches.mu.Unlock() + mset.mu.Unlock() mset.sendStreamBatchAbandonedAdvisory(batchId, BatchIncomplete) return respondIncompleteBatch() } } if !commit { batches.mu.Unlock() + mset.mu.Unlock() // Send empty ack to let them know we've persisted the data prior to commit. if canRespond { - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, nil, nil, 0)) + outq.sendMsg(reply, nil) } return nil } // Ensure the batch is prepared for the commit and will not be cleaned up while committing. - if !b.readyForCommit() { + if abandonReason := b.readyForCommit(); abandonReason != nil { // Don't do cleanup, this is already done. batches.mu.Unlock() - mset.sendStreamBatchAbandonedAdvisory(batchId, BatchTimeout) + mset.mu.Unlock() + mset.sendStreamBatchAbandonedAdvisory(batchId, *abandonReason) return respondIncompleteBatch() } @@ -6547,45 +7334,26 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr // We only use mset.clseq for clustering and in case we run ahead of actual commits. // Check if we need to set initial value here if isClustered && (mset.clseq == 0 || mset.clseq < lseq+mset.clfs) { - // Need to unlock and re-acquire the locks in the proper order. 
- mset.clMu.Unlock() - // Locking order is stream -> batchMu -> clMu - mset.mu.RLock() - batch := mset.batchApply - var batchCount uint64 - if batch != nil { - batch.mu.Lock() - batchCount = batch.count - } - mset.clMu.Lock() - // Re-capture - lseq = mset.lseq - mset.clseq = lseq + mset.clfs + batchCount - // Keep hold of the mset.clMu, but unlock the others. - if batch != nil { - batch.mu.Unlock() - } - mset.mu.RUnlock() + lseq = recalculateClusteredSeq(mset, false) } - rollback := func(seq uint64) { + oclseq := mset.clseq + rollback := func() { if isClustered { // Only need to move the clustered sequence back if the batch fails to commit. // Other changes were staged but not applied, so this is the only thing we need to do. - mset.clseq -= seq - 1 + mset.clseq = oclseq } mset.clMu.Unlock() } - errorOnUnsupported := func(seq uint64, header string) *ApiError { + errorOnUnsupported := func(header string) *ApiError { apiErr := NewJSAtomicPublishUnsupportedHeaderBatchError(header) - rollback(seq) + rollback() b.cleanupLocked(batchId, batches) batches.mu.Unlock() - if canRespond { - buf, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: apiErr}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, buf, nil, 0)) - } + mset.mu.Unlock() + _ = respondError(apiErr) return apiErr } @@ -6601,32 +7369,46 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr sz int ) + // If the commit ends with an "End Of Batch" message, we don't store this. + if commitEob { + batchSeq-- + } + diff := &batchStagedDiff{} for seq := uint64(1); seq <= batchSeq; seq++ { - if seq == batchSeq && b.store.Type() != FileStorage { + if seq == batchSeq && !commitEob && b.store.Type() != FileStorage { bsubj, bhdr, bmsg = subject, hdr, msg } else if sm, err = b.store.LoadMsg(seq, &smv); sm != nil && err == nil { bsubj, bhdr, bmsg = sm.subj, sm.hdr, sm.msg } else { - rollback(seq) + rollback() b.cleanupLocked(batchId, batches) batches.mu.Unlock() + mset.mu.Unlock() return respondIncompleteBatch() } + // Apply the input subject transform if any + csubj := bsubj + if mset.itr != nil { + ts, err := mset.itr.Match(csubj) + if err == nil { + // no filtering: if the subject doesn't map the source of the transform, don't change it + csubj = ts + } + } + // Reject unsupported headers. 
if getExpectedLastMsgId(bhdr) != _EMPTY_ { - return errorOnUnsupported(seq, JSExpectedLastMsgId) + return errorOnUnsupported(JSExpectedLastMsgId) } - if bhdr, bmsg, _, apiErr, err = checkMsgHeadersPreClusteredProposal(diff, mset, bsubj, bhdr, bmsg, false, name, jsa, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules, discard, discardNewPer, maxMsgSize, maxMsgs, maxMsgsPer, maxBytes); err != nil { - rollback(seq) + if bhdr, bmsg, _, apiErr, err = checkMsgHeadersPreClusteredProposal(diff, mset, csubj, bsubj, bhdr, bmsg, false, name, jsa, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules, discard, discardNewPer, maxMsgSize, maxMsgs, maxMsgsPer, maxBytes); err != nil { + rollback() b.cleanupLocked(batchId, batches) batches.mu.Unlock() - if canRespond { - buf, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: apiErr}) - outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, nil, buf, nil, 0)) - } + mset.mu.Unlock() + _ = respondError(apiErr) return err } @@ -6635,6 +7417,10 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr isCommit := seq == batchSeq if isCommit { _reply = reply + // If committed by EOB, the last message must get the normal commit header. + if commitEob { + bhdr = genHeader(bhdr, JSBatchCommit, "1") + } } esm := encodeStreamMsgAllowCompressAndBatch(bsubj, _reply, bhdr, bmsg, mset.clseq, ts, false, batchId, seq, isCommit) entries = append(entries, newEntry(EntryNormal, esm)) @@ -6651,9 +7437,9 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr // Ensure the whole batch is fully isolated, and reads // can only happen after the full batch is committed. - mset.mu.Lock() + // We keep holding the stream lock. for seq := uint64(1); seq <= batchSeq; seq++ { - if seq == batchSeq && b.store.Type() != FileStorage { + if seq == batchSeq && !commitEob && b.store.Type() != FileStorage { bsubj, bhdr, bmsg = subject, hdr, msg } else if sm, err = b.store.LoadMsg(seq, &smv); sm != nil && err == nil { bsubj, bhdr, bmsg = sm.subj, sm.hdr, sm.msg @@ -6666,29 +7452,370 @@ func (mset *stream) processJetStreamBatchMsg(batchId, subject, reply string, hdr var _reply string if seq == batchSeq { _reply = reply + // If committed by EOB, the last message must get the normal commit header. + if commitEob { + bhdr = genHeader(bhdr, JSBatchCommit, "1") + } } - mset.processJetStreamMsg(bsubj, _reply, bhdr, bmsg, 0, 0, mt, false, false) + _ = mset.processJetStreamMsg(bsubj, _reply, bhdr, bmsg, 0, 0, mt, false, false) } mset.mu.Unlock() } else { + mset.mu.Unlock() // Do a single multi proposal. This ensures we get to push all entries to the proposal queue in-order // and not interleaved with other proposals. - diff.commit(mset) - _ = node.ProposeMulti(entries) - // The proposal can fail, but we always account for trying. - mset.trackReplicationTraffic(node, sz, r) + if err = node.ProposeMulti(entries); err == nil { + diff.commit(mset) + mset.trackReplicationTraffic(node, sz, r) - // Check to see if we are being overrun. - // TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured. - if mset.clseq-(lseq+mset.clfs) > streamLagWarnThreshold { - lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name) - s.RateLimitWarnf("%s", lerr.Error()) + // Check to see if we are being overrun. + // TODO(dlc) - Make this a limit where we drop messages to protect ourselves, but allow to be configured. 
+ if mset.clseq-(lseq+mset.clfs) > streamLagWarnThreshold { + lerr := fmt.Errorf("JetStream stream '%s > %s' has high message lag", jsa.acc().Name, name) + s.RateLimitWarnf("%s", lerr.Error()) + } + } else { + mset.clseq = oclseq } mset.clMu.Unlock() } b.cleanupLocked(batchId, batches) batches.mu.Unlock() - return nil + return err +} + +// processJetStreamFastBatchMsg processes a JetStream message that's part of an atomic batch publish. +// Handles constraints around the batch, storing messages, doing consistency checks, and performing the commit. +func (mset *stream) processJetStreamFastBatchMsg(batch *FastBatch, subject, reply string, hdr, msg []byte, mt *msgTrace) (retErr error) { + mset.mu.RLock() + canRespond := !mset.cfg.NoAck && len(reply) > 0 + name, stype := mset.cfg.Name, mset.cfg.Storage + discard, discardNewPer, maxMsgs, maxMsgsPer, maxBytes := mset.cfg.Discard, mset.cfg.DiscardNewPer, mset.cfg.MaxMsgs, mset.cfg.MaxMsgsPer, mset.cfg.MaxBytes + s, js, jsa, st, r, tierName, outq, node := mset.srv, mset.js, mset.jsa, mset.cfg.Storage, mset.cfg.Replicas, mset.tier, mset.outq, mset.node + maxMsgSize, lseq := int(mset.cfg.MaxMsgSize), mset.lseq + isLeader, isClustered, isSealed, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules, allowBatchPublish := mset.isLeader(), mset.isClustered(), mset.cfg.Sealed, mset.cfg.AllowRollup, mset.cfg.DenyPurge, mset.cfg.AllowMsgTTL, mset.cfg.AllowMsgCounter, mset.cfg.AllowMsgSchedules, mset.cfg.AllowBatchPublish + + // Apply the input subject transform if any + csubject := subject + if mset.itr != nil { + ts, err := mset.itr.Match(csubject) + if err == nil { + // no filtering: if the subject doesn't map the source of the transform, don't change it + csubject = ts + } + } + mset.mu.RUnlock() + + // If message tracing (with message delivery), we will need to send the + // event on exit in case there was an error (if message was not proposed). + // Otherwise, the event will be sent from processJetStreamMsg when + // invoked by the leader (from applyStreamEntries). + if mt != nil { + defer func() { + if retErr != nil { + mt.sendEventFromJetStream(retErr) + } + }() + } + + // Check that we are the leader. This can be false if we have scaled up from an R1 that had inbound queued messages. + if !isLeader { + return NewJSClusterNotLeaderError() + } + + respondError := func(apiErr *ApiError) error { + if canRespond { + buf, _ := json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: apiErr}) + outq.sendMsg(reply, buf) + } + return apiErr + } + + // Bail here if sealed. + if isSealed { + return respondError(NewJSStreamSealedError()) + } + + // Check here pre-emptively if we have exceeded this server limits. + if js.limitsExceeded(stype) { + s.resourcesExceededError(stype) + // Stepdown regardless. + if node := mset.raftNode(); node != nil { + node.StepDown() + } + return respondError(NewJSInsufficientResourcesError()) + } + + // Check here pre-emptively if we have exceeded our account limits. + if exceeded, err := jsa.wouldExceedLimits(st, tierName, r, csubject, hdr, msg); exceeded { + if err == nil { + err = NewJSAccountResourcesExceededError() + } + s.RateLimitWarnf("JetStream account limits exceeded for '%s': %s", jsa.acc().GetName(), err.Error()) + return respondError(err) + } + + // Check msgSize if we have a limit set there. Again this works if it goes through but better to be pre-emptive. + // Subtract to prevent against overflows. 
+ if maxMsgSize >= 0 && (len(hdr) > maxMsgSize || len(msg) > maxMsgSize-len(hdr)) { + err := fmt.Errorf("JetStream message size exceeds limits for '%s > %s'", jsa.acc().Name, mset.cfg.Name) + s.RateLimitWarnf("%s", err.Error()) + _ = respondError(NewJSStreamMessageExceedsMaximumError()) + return err + } + + if !allowBatchPublish { + return respondError(NewJSBatchPublishDisabledError()) + } + + if batch == nil { + return respondError(NewJSBatchPublishInvalidPatternError()) + } + + // Batch ID is too long. + if len(batch.id) > 64 { + return respondError(NewJSBatchPublishInvalidBatchIDError()) + } + + mset.mu.Lock() + if mset.batches == nil { + mset.batches = &batching{} + } + batches := mset.batches + // Acquire the batches lock. + // Can't release the stream lock now, we need to keep holding it while we hold the batches lock. + // Re-acquiring the stream lock with the batches lock already held would be a lock inversion. + batches.mu.Lock() + + // Get batch. + b, ok := batches.fast[batch.id] + if !ok { + if batch.seq != 1 { + batches.mu.Unlock() + mset.mu.Unlock() + return respondError(NewJSBatchPublishUnknownBatchIDError()) + } + + // Limits. + maxInflightPerStream := streamMaxFastBatchInflightPerStream + maxInflightTotal := streamMaxFastBatchInflightTotal + opts := s.getOpts() + if opts.JetStreamLimits.MaxBatchInflightPerStream > 0 { + maxInflightPerStream = opts.JetStreamLimits.MaxBatchInflightPerStream + } + if opts.JetStreamLimits.MaxBatchInflightTotal > 0 { + maxInflightTotal = opts.JetStreamLimits.MaxBatchInflightTotal + } + + // Confirm we can facilitate an additional batch. + if len(batches.fast)+1 > maxInflightPerStream { + batches.mu.Unlock() + mset.mu.Unlock() + return respondError(NewJSBatchPublishTooManyInflightError()) + } + + // Confirm we'll not exceed the server limit. + if globalInflightFastBatches.Add(1) > int64(maxInflightTotal) { + globalInflightFastBatches.Add(-1) + batches.mu.Unlock() + mset.mu.Unlock() + return respondError(NewJSBatchPublishTooManyInflightError()) + } + + // We'll need a copy as we'll use it as a key and later for cleanup. + batchId := copyString(batch.id) + b = batches.newFastBatch(mset, batchId, batch.gapOk, batch.flow) + } + + // The required API level can have the batch be rejected. But the header is always removed. + if len(sliceHeader(JSRequiredApiLevel, hdr)) != 0 { + if errorOnRequiredApiLevel(hdr) { + b.cleanupLocked(batch.id, batches) + batches.mu.Unlock() + mset.mu.Unlock() + return respondError(NewJSRequiredApiLevelError()) + } + hdr = removeHeaderIfPresent(hdr, JSRequiredApiLevel) + } + + // Fast publishing resets the cleanup timer. + // If cleanup has already happened, we can't continue. + cleanup := !b.resetCleanupTimer(mset) + + // A ping operation confirms we've received a minimum amount of data and resends ack messages. + if batch.ping { + sendFlowControl := true + // Detect a gap or if the batch was cleaned up in the meantime. + if batch.seq > b.lseq || cleanup { + // If a gap is detected, we always report about it. + buf, _ := BatchFlowGap{ExpectedLastSequence: b.lseq + 1, CurrentSequence: batch.seq + 1}.MarshalJSON() + outq.sendMsg(reply, buf) + // If the gap is okay, we can continue without rejecting. 
+ if b.gapOk && !cleanup { + b.lseq = batch.seq + if b.pending == 0 { + b.pseq = b.lseq + } + sendFlowControl = !b.checkFlowControl(mset, reply, batches) + } else if cleanup = batches.fastBatchCommit(b, batch.id, mset, reply); cleanup { + b.cleanupLocked(batch.id, batches) + sendFlowControl = false + } + } + if sendFlowControl { + b.sendFlowControl(b.fseq, mset, reply) + } + batches.mu.Unlock() + mset.mu.Unlock() + return nil + } + + // If the batch is committing, due to an error, we can't add more messages. + // We simply skip, since the client will be waiting for the PubAck. + if b.commit { + // MUST NOT clean up, that will happen when the commit completes. + batches.mu.Unlock() + mset.mu.Unlock() + return nil + } + + // Detect gaps. + b.lseq++ + if b.lseq != batch.seq || cleanup { + // If a forward gap is detected, we always report about it. + if batch.seq > b.lseq { + buf, _ := BatchFlowGap{ExpectedLastSequence: b.lseq, CurrentSequence: batch.seq}.MarshalJSON() + outq.sendMsg(reply, buf) + } + // If the forward gap is okay, we can continue without rejecting. + if b.gapOk && !cleanup && batch.seq > b.lseq { + b.lseq = batch.seq + } else { + // We've reached either a backward gap, or were cleaned up already, or it's gap-fail mode. + // Revert, since we incremented for the gap check. + b.lseq-- + if cleanup = batches.fastBatchCommit(b, batch.id, mset, reply); cleanup { + b.cleanupLocked(batch.id, batches) + } + batches.mu.Unlock() + mset.mu.Unlock() + return nil + } + } + + if batch.commit { + if batch.commitEob { + // Revert, since we incremented for the gap check. + b.lseq-- + // If there is none pending, correct the persisted sequence as we need to commit below. + if b.pending == 0 { + b.pseq = b.lseq + } + } + // We'll try to immediately send a PubAck if we can. + // Only possible if EOB is used and the last message was already persisted + // Otherwise, this sets up the commit for the last message we're about to propose. + cleanup = batches.fastBatchCommit(b, batch.id, mset, reply) + if batch.commitEob { + if cleanup { + b.cleanupLocked(batch.id, batches) + } + batches.mu.Unlock() + mset.mu.Unlock() + return nil + } + } + + // The first message in the batch responds with the settings used for flow control. + // If committing immediately, we only send the PubAck. + if batch.seq == 1 && canRespond && !batch.commit { + buf, _ := BatchFlowAck{Sequence: 0, Messages: b.ackMessages}.MarshalJSON() + outq.sendMsg(reply, buf) + } + + // Proceed with proposing this message. + + // We only use mset.clseq for clustering and in case we run ahead of actual commits. + // Check if we need to set initial value here + mset.clMu.Lock() + if mset.clseq == 0 || mset.clseq < lseq+mset.clfs { + lseq = recalculateClusteredSeq(mset, false) + } + // We can now unlock, since we've potentially recalculated the clustered seq above. + mset.mu.Unlock() + + var ( + dseq uint64 + apiErr *ApiError + err error + ) + diff := &batchStagedDiff{} + if hdr, msg, dseq, apiErr, err = checkMsgHeadersPreClusteredProposal(diff, mset, csubject, subject, hdr, msg, false, name, jsa, allowRollup, denyPurge, allowTTL, allowMsgCounter, allowMsgSchedules, discard, discardNewPer, maxMsgSize, maxMsgs, maxMsgsPer, maxBytes); err != nil { + mset.clMu.Unlock() + + // If the message is a duplicate, and we have no pending messages, we should check if we need to + // send the flow control message here. 
+ if err == errMsgIdDuplicate { + if b.pending == 0 { + b.pseq = batch.seq + b.checkFlowControl(mset, reply, batches) + } + if !batch.commit { + // Otherwise, just skip. + batches.mu.Unlock() + return err + } + } + + // If a batch immediately errors, we send the same response as we would a normal publish. + if !batch.gapOk && b.lseq == 1 { + var response []byte + if err == errMsgIdDuplicate && dseq > 0 { + var buf [256]byte + response = append(buf[:0], mset.pubAck...) + response = append(response, strconv.FormatUint(dseq, 10)...) + response = append(response, fmt.Sprintf(",\"duplicate\": true,\"batch\":%q,\"count\":%d}", batch.id, batch.seq)...) + } else { + response, _ = json.Marshal(&JSPubAckResponse{PubAck: &PubAck{Stream: name}, Error: apiErr}) + } + b.cleanupLocked(batch.id, batches) + batches.mu.Unlock() + outq.sendMsg(reply, response) + return err + } + + // We always return the error to the client, unless it's a duplicate. + if err != errMsgIdDuplicate { + buf, _ := BatchFlowErr{Sequence: batch.seq, Error: apiErr}.MarshalJSON() + outq.sendMsg(reply, buf) + } + + // If gaps are okay, we just allow them to continue. + if batch.gapOk { + batches.mu.Unlock() + return err + } + + // Revert the last sequence, we might be able to immediately return the PubAck as part of the commit. + // Otherwise, the batch is cleaned up automatically later. + if err != errMsgIdDuplicate { + b.lseq-- + } + if cleanup = batches.fastBatchCommit(b, batch.id, mset, reply); cleanup { + b.cleanupLocked(batch.id, batches) + } + batches.mu.Unlock() + return err + } + b.pending++ + batches.mu.Unlock() + if !isClustered { + mset.clMu.Unlock() + return mset.processJetStreamMsgWithBatch(subject, reply, hdr, msg, 0, 0, mt, false, true, batch) + } + err = commitSingleMsg(diff, mset, subject, reply, hdr, msg, name, jsa, mt, node, r, lseq) + mset.clMu.Unlock() + return err } // Used to signal inbound message to registered consumers. @@ -6986,8 +8113,11 @@ func (mset *stream) internalLoop() { ims := msgs.pop() for _, im := range ims { // If we are clustered we need to propose this message to the underlying raft group. - if batchId := getBatchId(im.hdr); batchId != _EMPTY_ { - mset.processJetStreamBatchMsg(batchId, im.subj, im.rply, im.hdr, im.msg, im.mt) + if batch, err := getFastBatch(im.rply, im.hdr); batch != nil || err { + mset.processJetStreamFastBatchMsg(batch, im.subj, im.rply, im.hdr, im.msg, im.mt) + batch.returnToPool() + } else if batchId := getBatchId(im.hdr); batchId != _EMPTY_ { + mset.processJetStreamAtomicBatchMsg(batchId, im.subj, im.rply, im.hdr, im.msg, im.mt) } else if isClustered { mset.processClusteredInboundMsg(im.subj, im.rply, im.hdr, im.msg, im.mt, false) } else { @@ -7074,6 +8204,20 @@ func (mset *stream) stop(deleteFlag, advisory bool) error { // Mark closed. mset.closed.Store(true) + // Both flags set mean a delete where we are the stream leader. + // Try to clean up any consumers used for sourcing (if one wasn't provided to us). + if deleteFlag && advisory { + if mset.cfg.Mirror != nil && mset.cfg.Mirror.Consumer == nil { + mset.tryDeleteMirrorConsumer(mset.cfg.Mirror) + } + for _, s := range mset.cfg.Sources { + if s.Consumer == nil { + id := mset.createSourcingConsumerHash(s, mset.cfg.Sources) + mset.tryDeleteSourceConsumer(id, s) + } + } + } + // Signal to the monitor loop. // Can't use qch here. if mset.mqch != nil { @@ -7253,9 +8397,11 @@ func (mset *stream) getConsumers() []*consumer { return append([]*consumer(nil), mset.cList...) 
} +// numLimitableConsumers returns the number of consumers that are not direct/sourcing consumers. +// Used to limit the number of consumers for MaxConsumers limits or WQ exclusivity. // Lock should be held for this one. -func (mset *stream) numPublicConsumers() int { - return len(mset.consumers) - mset.directs +func (mset *stream) numLimitableConsumers() int { + return len(mset.consumers) - mset.sourcingConsumers } // This returns all consumers that are not DIRECT. @@ -7352,8 +8498,8 @@ func (mset *stream) setConsumer(o *consumer) { if len(o.subjf) > 0 { mset.numFilter++ } - if o.cfg.Direct { - mset.directs++ + if o.cfg.Direct || o.cfg.Sourcing { + mset.sourcingConsumers++ } // Now update consumers list as well mset.clsMu.Lock() @@ -7372,8 +8518,8 @@ func (mset *stream) removeConsumer(o *consumer) { if o.cfg.FilterSubject != _EMPTY_ && mset.numFilter > 0 { mset.numFilter-- } - if o.cfg.Direct && mset.directs > 0 { - mset.directs-- + if (o.cfg.Direct || o.cfg.Sourcing) && mset.sourcingConsumers > 0 { + mset.sourcingConsumers-- } if mset.consumers != nil { delete(mset.consumers, o.name) @@ -7495,7 +8641,7 @@ func (mset *stream) Store() StreamStore { return mset.store } -// Determines if the new proposed partition is unique amongst all consumers. +// Determines if the new proposed partition is unique amongst all public consumers. // Lock should be held. func (mset *stream) partitionUnique(name string, partitions []string) bool { for _, partition := range partitions { @@ -7505,6 +8651,11 @@ func (mset *stream) partitionUnique(name string, partitions []string) bool { continue } o.mu.RLock() + // Ignore direct/sourcing consumers. + if o.cfg.Direct || o.cfg.Sourcing { + o.mu.RUnlock() + continue + } if o.subjf == nil { o.mu.RUnlock() return false @@ -7625,6 +8776,17 @@ func (mset *stream) clearAllPreAcksBelowFloor(floor uint64) { } } +// Clear all preAcks in [first, last]. Iterates the preAcks map, not the +// range, so callers can pass very wide ranges cheaply. +// Write lock should be held. +func (mset *stream) clearAllPreAcksInRange(first, last uint64) { + for seq := range mset.preAcks { + if seq >= first && seq <= last { + delete(mset.preAcks, seq) + } + } +} + // This will register an ack for a consumer if it arrives before the actual message. func (mset *stream) registerPreAckLock(o *consumer, seq uint64) { mset.mu.Lock() @@ -7718,9 +8880,13 @@ func (mset *stream) ackMsg(o *consumer, seq uint64) bool { return true } - // Only propose message deletion to the stream if we're consumer leader, otherwise all followers would also propose. - // We must be the consumer leader, since we know for sure we've stored the message and don't register as pre-ack. - if o != nil && !o.IsLeader() { + // Only propose message deletion to the stream if we're the leader, otherwise followers would also propose. + // We must be the stream leader, since we are the only one that can guarantee message ordering and ack handling. + // Either we've stored the message, and we know for sure all consumers have acked the message. + // Or, we've not stored the message yet (rare), and all consumers have registered as pre-acks, + // then we do the message delete proposal after we've stored the message instead. + // Except for a Direct AckNone consumer, as that has a nil consumer here, we still forward the delete proposal. + if o != nil && !mset.isLeader() { // Currently, interest-based streams can race on "no interest" because consumer creates/updates go over // the meta layer and published messages go over the stream layer. 
Some servers could then either store // or not store some initial set of messages that gained new interest. To get the stream back in sync, @@ -7739,6 +8905,7 @@ func (mset *stream) ackMsg(o *consumer, seq uint64) bool { } md := streamMsgDelete{Seq: seq, NoErase: true, Stream: mset.cfg.Name} + // Directly proposes if stream leader, otherwise forwards it. mset.node.ForwardProposal(encodeMsgDelete(&md)) mset.mu.Unlock() return true @@ -7889,11 +9056,6 @@ func (a *Account) RestoreStream(ncfg *StreamConfig, r io.Reader) (*stream, error return nil, err } - if cfg.Template != _EMPTY_ { - if err := jsa.addStreamNameToTemplate(cfg.Template, cfg.Name); err != nil { - return nil, err - } - } mset, err := a.addStream(&cfg) if err != nil { // Make sure to clean up after ourselves here. @@ -8043,6 +9205,43 @@ func (mset *stream) isMonitorRunning() bool { return mset.inMonitor } +// setWriteErr stores the write error in the stream. +func (mset *stream) setWriteErr(err error) { + mset.mu.Lock() + defer mset.mu.Unlock() + mset.setWriteErrLocked(err) +} + +func (mset *stream) setWriteErrLocked(err error) { + if mset.werr != nil { + return + } + // Ignore non-write errors. + if err == ErrStoreClosed { + return + } + mset.srv.Errorf("JetStream stream '%s > %s' critical write error: %v", mset.acc.Name, mset.cfg.Name, err) + mset.werr = err + assert.Unreachable("Stream encountered write error", map[string]any{ + "account": mset.acc.Name, + "stream": mset.cfg.Name, + "err": err, + }) + + // If stream is replicated, put it in observer mode to make sure another server can pick it up. + if node := mset.node; node != nil { + node.StepDown() + node.SetObserver(true) + } +} + +// getWriteErr returns the write error stored in the stream (if any). +func (mset *stream) getWriteErr() error { + mset.mu.RLock() + defer mset.mu.RUnlock() + return mset.werr +} + // Adjust accounting for sent messages as part of replication. func (mset *stream) trackReplicationTraffic(node RaftNode, sz int, r int) { // If we are using the system account for NRG, add in the extra sent msgs and bytes to our account diff --git a/vendor/github.com/nats-io/nats-server/v2/server/sublist.go b/vendor/github.com/nats-io/nats-server/v2/server/sublist.go index 2589cd2129..7423be0b20 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/sublist.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/sublist.go @@ -121,6 +121,18 @@ func NewSublist(enableCache bool) *Sublist { return &Sublist{root: newLevel()} } +// NewSublistForServer will create a default sublist with caching enabled determined +// by the server options. +func NewSublistForServer(srv *Server) *Sublist { + if srv == nil { + return NewSublistNoCache() // Probably just unit tests. + } + if opts := srv.getOpts(); opts != nil { + return NewSublist(!opts.NoSublistCache) + } + return NewSublistNoCache() +} + // NewSublistWithCache will create a default sublist with caching enabled. 
func NewSublistWithCache() *Sublist { return NewSublist(true) diff --git a/vendor/github.com/nats-io/nats-server/v2/server/thw/thw.go b/vendor/github.com/nats-io/nats-server/v2/server/thw/thw.go index bcbc16e9a3..ff6ac24cb2 100644 --- a/vendor/github.com/nats-io/nats-server/v2/server/thw/thw.go +++ b/vendor/github.com/nats-io/nats-server/v2/server/thw/thw.go @@ -155,8 +155,11 @@ func (hw *HashWheel) expireTasks(ts int64, callback func(seq uint64, expires int slotLowest := int64(math.MaxInt64) for seq, expires := range s.entries { if expires <= ts && callback(seq, expires) { - delete(s.entries, seq) - hw.count-- + // Only remove if not done so already by the callback. + if _, ok := s.entries[seq]; ok { + delete(s.entries, seq) + hw.count-- + } continue } if expires < slotLowest { diff --git a/vendor/github.com/onsi/gomega/CHANGELOG.md b/vendor/github.com/onsi/gomega/CHANGELOG.md index 91e65521b4..9c94d0e6cf 100644 --- a/vendor/github.com/onsi/gomega/CHANGELOG.md +++ b/vendor/github.com/onsi/gomega/CHANGELOG.md @@ -1,3 +1,11 @@ +## 1.40.0 + +We're adopting a new release strategy to minimize dependency bloat in projects that consume Gomega. It is a limitation of the go mod toolchain that _test_ subdependencies of your project's direct dependencies get pulled in as *indirect* dependencies. In the case of Gomega, this ends up pulling in all of Ginkgo into your `go.mod` even if you are only using Gomega (Gomega uses Ginkgo for its own tests). + +Going forward, releases will strip out all tests, tidy up the `go.mod` and then push this stripped down version to a new `master-lite` branch. These stripped-down versions will receive the `vx.y.z` git tag and will be picked up by the go toolchain. + +Please open an issue if this new release process causes unexpected changes for your projects. + ## 1.39.1 Update all dependencies. This auto-updated the required version of Go to 1.24, consistent with the fact that Go 1.23 has been out of support for almost six months. diff --git a/vendor/github.com/onsi/gomega/gomega_dsl.go b/vendor/github.com/onsi/gomega/gomega_dsl.go index 87c70692bf..af1341bdbd 100644 --- a/vendor/github.com/onsi/gomega/gomega_dsl.go +++ b/vendor/github.com/onsi/gomega/gomega_dsl.go @@ -22,7 +22,7 @@ import ( "github.com/onsi/gomega/types" ) -const GOMEGA_VERSION = "1.39.1" +const GOMEGA_VERSION = "1.40.0" const nilGomegaPanic = `You are trying to make an assertion, but haven't registered Gomega's fail handler. If you're using Ginkgo then you probably forgot to put your assertion in an It(). diff --git a/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/storageprovider/storageprovider.go b/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/storageprovider/storageprovider.go index e14907025f..8a50c9c43f 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/storageprovider/storageprovider.go +++ b/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/storageprovider/storageprovider.go @@ -744,16 +744,6 @@ func (s *Service) Delete(ctx context.Context, req *provider.DeleteRequest) (*pro } ctx = ctxpkg.ContextSetLockID(ctx, req.LockId) - - // check DeleteRequest for any known opaque properties. - // FIXME these should be part of the DeleteRequest object - if req.Opaque != nil { - if _, ok := req.Opaque.Map["deleting_shared_resource"]; ok { - // it is a binary key; its existence signals true. Although, do not assume. 
- ctx = appctx.WithDeletingSharedResource(ctx) - } - } - md, err := s.Storage.GetMD(ctx, req.Ref, []string{}, []string{"id", "status"}) if err != nil { return &provider.DeleteResponse{ diff --git a/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/userprovider/userprovider.go b/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/userprovider/userprovider.go index 8bf5fbbe11..85952e94f8 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/userprovider/userprovider.go +++ b/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/userprovider/userprovider.go @@ -182,9 +182,12 @@ func (s *service) GetUser(ctx context.Context, req *userpb.GetUserRequest) (*use user, err := s.usermgr.GetUser(ctx, req.UserId, req.SkipFetchingUserGroups) if err != nil { res := &userpb.GetUserResponse{} - if _, ok := err.(errtypes.NotFound); ok { + switch err.(type) { + case errtypes.NotFound: res.Status = status.NewNotFound(ctx, "user not found") - } else { + case errtypes.Unavailable: + res.Status = status.NewUnavailable(ctx, "user provider temporarily unavailable") + default: res.Status = status.NewInternal(ctx, "error getting user") } return res, nil @@ -205,9 +208,12 @@ func (s *service) GetUserByClaim(ctx context.Context, req *userpb.GetUserByClaim user, err := s.usermgr.GetUserByClaim(ctx, req.Claim, req.Value, tenantID, req.SkipFetchingUserGroups) if err != nil { res := &userpb.GetUserByClaimResponse{} - if _, ok := err.(errtypes.NotFound); ok { + switch err.(type) { + case errtypes.NotFound: res.Status = status.NewNotFound(ctx, fmt.Sprintf("user not found %s %s", req.Claim, req.Value)) - } else { + case errtypes.Unavailable: + res.Status = status.NewUnavailable(ctx, "user provider temporarily unavailable") + default: res.Status = status.NewInternal(ctx, "error getting user by claim") } return res, nil diff --git a/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/usershareprovider/usershareprovider.go b/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/usershareprovider/usershareprovider.go index eaab359763..1b801f5812 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/usershareprovider/usershareprovider.go +++ b/vendor/github.com/opencloud-eu/reva/v2/internal/grpc/services/usershareprovider/usershareprovider.go @@ -84,9 +84,9 @@ type service struct { allowedPathsForShares []*regexp.Regexp } -func getShareManager(c *config) (share.Manager, error) { +func getShareManager(c *config, logger *zerolog.Logger) (share.Manager, error) { if f, ok := registry.NewFuncs[c.Driver]; ok { - return f(c.Drivers[c.Driver]) + return f(c.Drivers[c.Driver], logger) } return nil, errtypes.NotFound("driver not found: " + c.Driver) } @@ -114,7 +114,7 @@ func parseConfig(m map[string]interface{}) (*config, error) { } // New creates a new user share provider svc initialized from defaults -func NewDefault(m map[string]interface{}, ss *grpc.Server, _ *zerolog.Logger) (rgrpc.Service, error) { +func NewDefault(m map[string]any, ss *grpc.Server, logger *zerolog.Logger) (rgrpc.Service, error) { c, err := parseConfig(m) if err != nil { @@ -123,7 +123,7 @@ func NewDefault(m map[string]interface{}, ss *grpc.Server, _ *zerolog.Logger) (r c.init() - sm, err := getShareManager(c) + sm, err := getShareManager(c, logger) if err != nil { return nil, err } diff --git a/vendor/github.com/opencloud-eu/reva/v2/internal/http/services/owncloud/ocdav/proppatch.go b/vendor/github.com/opencloud-eu/reva/v2/internal/http/services/owncloud/ocdav/proppatch.go 
index 1c1f9797b9..3a2df9496a 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/internal/http/services/owncloud/ocdav/proppatch.go +++ b/vendor/github.com/opencloud-eu/reva/v2/internal/http/services/owncloud/ocdav/proppatch.go @@ -155,6 +155,13 @@ func (s *svc) handleProppatch(ctx context.Context, w http.ResponseWriter, r *htt } for j := range patches[i].Props { propNameXML := patches[i].Props[j].XMLName + + // favorites are now managed by the Graph API and can no longer be set using PROPPATCH. To avoid confusion, we return a 403 Forbidden when clients try to set the oc:favorites property + if propNameXML.Local == "favorite" { + w.WriteHeader(http.StatusForbidden) + return nil, nil, false + } + // don't use path.Join. It removes the double slash! concatenate with a / key := fmt.Sprintf("%s/%s", patches[i].Props[j].XMLName.Space, patches[i].Props[j].XMLName.Local) value := string(patches[i].Props[j].InnerXML) diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/appctx/appctx.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/appctx/appctx.go index 049ac9df46..336c5618d1 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/appctx/appctx.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/appctx/appctx.go @@ -27,16 +27,6 @@ import ( "go.opentelemetry.io/otel/trace" ) -// deletingSharedResource flags to a storage a shared resource is being deleted not by the owner. -type deletingSharedResource struct{} - -func WithDeletingSharedResource(ctx context.Context) context.Context { - return context.WithValue(ctx, deletingSharedResource{}, struct{}{}) -} -func DeletingSharedResourceFromContext(ctx context.Context) bool { - return ctx.Value(deletingSharedResource{}) != nil -} - // WithLogger returns a context with an associated logger. func WithLogger(ctx context.Context, l *zerolog.Logger) context.Context { return l.WithContext(ctx) diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/errtypes/errtypes.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/errtypes/errtypes.go index a02399e406..3b349f82f4 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/errtypes/errtypes.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/errtypes/errtypes.go @@ -203,6 +203,15 @@ func (e TooEarly) Error() string { return "error: too early: " + string(e) } // IsTooEarly implements the IsTooEarly interface. func (e TooEarly) IsTooEarly() {} +// Unavailable is the error to use when a backend service (e.g. LDAP, database) is +// temporarily unreachable. Callers should treat this as a transient failure and retry. +type Unavailable string + +func (e Unavailable) Error() string { return "error: unavailable: " + string(e) } + +// IsUnavailable implements the IsUnavailable interface. +func (e Unavailable) IsUnavailable() {} + // IsNotFound is the interface to implement // to specify that a resource is not found. type IsNotFound interface { @@ -293,6 +302,12 @@ type IsTooEarly interface { IsTooEarly() } +// IsUnavailable is the interface to implement to specify that a backend service is +// temporarily unavailable and the caller should retry. 
+type IsUnavailable interface { + IsUnavailable() +} + // NewErrtypeFromStatus maps a rpc status to an errtype func NewErrtypeFromStatus(status *rpc.Status) error { switch status.Code { @@ -329,6 +344,8 @@ func NewErrtypeFromStatus(status *rpc.Status) error { return BadRequest(status.Message) case rpc.Code_CODE_TOO_EARLY: return TooEarly(status.Message) + case rpc.Code_CODE_UNAVAILABLE: + return Unavailable(status.Message) default: return InternalError(status.Message) } @@ -363,6 +380,8 @@ func NewErrtypeFromHTTPStatusCode(code int, message string) error { return PartialContent(message) case http.StatusTooEarly: return TooEarly(message) + case http.StatusServiceUnavailable: + return Unavailable(message) case StatusChecksumMismatch: return ChecksumMismatch(message) default: @@ -399,6 +418,8 @@ func NewHTTPStatusCodeFromErrtype(err error) int { return http.StatusPartialContent case TooEarly: return http.StatusTooEarly + case Unavailable: + return http.StatusServiceUnavailable case ChecksumMismatch: return StatusChecksumMismatch default: diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/events/raw/raw.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/events/raw/raw.go index 5f24f45c14..5126bc1d6b 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/events/raw/raw.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/events/raw/raw.go @@ -71,9 +71,8 @@ type RawStream struct { c Config } -func FromConfig(ctx context.Context, name string, cfg Config) (Stream, error) { - var s Stream - b := backoff.NewExponentialBackOff() +func JetStream(ctx context.Context, name string, cfg Config) (jetstream.JetStream, error) { + var js jetstream.JetStream connect := func() error { var tlsConf *tls.Config @@ -120,27 +119,32 @@ func FromConfig(ctx context.Context, name string, cfg Config) (Stream, error) { return err } - jsConn, err := jetstream.New(conn) - if err != nil { - return err - } + js, err = jetstream.New(conn) + return err + } - js, err := jsConn.Stream(ctx, events.MainQueueName) - if err != nil { - return err - } + err := backoff.Retry(connect, backoff.NewExponentialBackOff()) + if err != nil { + return nil, errors.Wrap(err, "could not connect to nats jetstream") + } + return js, nil +} - s = &RawStream{ - js: js, - c: cfg, - } - return nil +func FromConfig(ctx context.Context, name string, cfg Config) (Stream, error) { + jsConn, err := JetStream(ctx, name, cfg) + if err != nil { + return nil, err } - err := backoff.Retry(connect, b) + + js, err := jsConn.Stream(ctx, events.MainQueueName) if err != nil { - return s, errors.Wrap(err, "could not connect to nats jetstream") + return nil, err } - return s, nil + + return &RawStream{ + js: js, + c: cfg, + }, nil } func (s *RawStream) Consume(group string, evs ...events.Unmarshaller) (<-chan Event, error) { diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/events/stream/nats.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/events/stream/nats.go index 24b47872df..4fbaa5cfee 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/events/stream/nats.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/events/stream/nats.go @@ -2,6 +2,7 @@ package stream import ( "bytes" + "context" "crypto/tls" "crypto/x509" "errors" @@ -11,7 +12,9 @@ import ( "github.com/cenkalti/backoff" "github.com/go-micro/plugins/v4/events/natsjs" + "github.com/nats-io/nats.go/jetstream" "github.com/opencloud-eu/reva/v2/pkg/events" + "github.com/opencloud-eu/reva/v2/pkg/events/raw" "github.com/opencloud-eu/reva/v2/pkg/logger" ) @@ -65,7 +68,38 @@ func NatsFromConfig(connName 
string, disableDurability bool, cfg NatsConfig) (ev opts = append(opts, natsjs.DisableDurableStreams()) } - return Nats(opts...) + s, err := Nats(opts...) + if err != nil { + return nil, err + } + + // apply a MaxAge to the main queue to prevent it from filling up + ctx := context.Background() + jsConn, err := raw.JetStream(ctx, connName, raw.Config{ + Endpoint: cfg.Endpoint, + Cluster: cfg.Cluster, + TLSInsecure: cfg.TLSInsecure, + TLSRootCACertificate: cfg.TLSRootCACertificate, + EnableTLS: cfg.EnableTLS, + AuthUsername: cfg.AuthUsername, + AuthPassword: cfg.AuthPassword, + }) + if err != nil { + return nil, err + } + streamCfg := jetstream.StreamConfig{ + Name: "main-queue", + MaxAge: 7 * 24 * time.Hour, + } + _, err = jsConn.CreateStream(ctx, streamCfg) + if err != nil { + // If the stream already exists, update its configuration + if err == jetstream.ErrStreamNameAlreadyInUse { + _, _ = jsConn.UpdateStream(ctx, streamCfg) + } + } + + return s, nil } // nats returns a nats streaming client diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/rgrpc/status/status.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/rgrpc/status/status.go index 0ee9b1029f..07eb32e253 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/rgrpc/status/status.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/rgrpc/status/status.go @@ -68,6 +68,15 @@ func NewInternal(ctx context.Context, msg string) *rpc.Status { } } +// NewUnavailable returns a Status with CODE_UNAVAILABLE. +func NewUnavailable(ctx context.Context, msg string) *rpc.Status { + return &rpc.Status{ + Code: rpc.Code_CODE_UNAVAILABLE, + Message: msg, + Trace: getTrace(ctx), + } +} + // NewUnauthenticated returns a Status with CODE_UNAUTHENTICATED. func NewUnauthenticated(ctx context.Context, err error, msg string) *rpc.Status { return &rpc.Status{ @@ -191,6 +200,10 @@ func NewStatusFromErrType(ctx context.Context, msg string, err error) *rpc.Statu return NewUnimplemented(ctx, err, msg+":"+err.Error()) case errtypes.BadRequest: return NewInvalid(ctx, msg+":"+err.Error()) + case errtypes.Unavailable: + return NewUnavailable(ctx, msg+": "+err.Error()) + case errtypes.IsUnavailable: + return NewUnavailable(ctx, msg+": "+err.Error()) } // map GRPC status codes coming from the auth middleware diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/jsoncs3.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/jsoncs3.go index ba995639b8..cf336ecb02 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/jsoncs3.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/jsoncs3.go @@ -36,9 +36,9 @@ import ( "github.com/opencloud-eu/reva/v2/pkg/errtypes" "github.com/opencloud-eu/reva/v2/pkg/events" "github.com/opencloud-eu/reva/v2/pkg/events/stream" - "github.com/opencloud-eu/reva/v2/pkg/logger" "github.com/opencloud-eu/reva/v2/pkg/rgrpc/todo/pool" "github.com/opencloud-eu/reva/v2/pkg/share" + migration "github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations" "github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/providercache" "github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/receivedsharecache" "github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/sharecache" @@ -48,6 +48,7 @@ import ( "github.com/opencloud-eu/reva/v2/pkg/storagespace" "github.com/opencloud-eu/reva/v2/pkg/utils" "github.com/pkg/errors" + "github.com/rs/zerolog" "go.opentelemetry.io/otel/codes" "golang.org/x/sync/errgroup" "google.golang.org/genproto/protobuf/field_mask" @@ 
-122,14 +123,20 @@ var ( ) type config struct { - GatewayAddr string `mapstructure:"gateway_addr"` - MaxConcurrency int `mapstructure:"max_concurrency"` - ProviderAddr string `mapstructure:"provider_addr"` - ServiceUserID string `mapstructure:"service_user_id"` - ServiceUserIdp string `mapstructure:"service_user_idp"` - MachineAuthAPIKey string `mapstructure:"machine_auth_apikey"` - CacheTTL int `mapstructure:"ttl"` - Events EventOptions `mapstructure:"events"` + GatewayAddr string `mapstructure:"gateway_addr"` + MaxConcurrency int `mapstructure:"max_concurrency"` + ProviderAddr string `mapstructure:"provider_addr"` + SystemUserID string `mapstructure:"system_user_id"` + SystemUserIdp string `mapstructure:"system_user_idp"` + MachineAuthAPIKey string `mapstructure:"machine_auth_apikey"` + ServiceAccountID string `mapstructure:"service_account_id"` + ServiceAccountSecret string `mapstructure:"service_account_secret"` + // ProviderRegistryAddr is the address of the storage registry used during + // migrations. Defaults to GatewayAddr when empty, because in the default + // OpenCloud deployment the registry is co-located with the gateway. + ProviderRegistryAddr string `mapstructure:"provider_registry_addr"` + CacheTTL int `mapstructure:"ttl"` + Events EventOptions `mapstructure:"events"` } // EventOptions are the configurable options for events @@ -145,8 +152,6 @@ type EventOptions struct { // Manager implements a share manager using a cs3 storage backend with local caching type Manager struct { - sync.RWMutex - Cache providercache.Cache // holds all shares, sharded by provider id and space id CreatedCache sharecache.Cache // holds the list of shares a user has created, sharded by user id GroupReceivedCache sharecache.Cache // holds the list of shares a group has access to, sharded by group id @@ -155,23 +160,25 @@ type Manager struct { storage metadata.Storage SpaceRoot *provider.ResourceId - initialized bool + ready chan struct{} // closed once initialize() has completed successfully + migrationsDone chan struct{} // closed once doMigrations() has returned on this instance MaxConcurrency int gatewaySelector pool.Selectable[gatewayv1beta1.GatewayAPIClient] eventStream events.Stream + logger *zerolog.Logger } // NewDefault returns a new manager instance with default dependencies -func NewDefault(m map[string]interface{}) (share.Manager, error) { +func NewDefault(m map[string]interface{}, logger *zerolog.Logger) (share.Manager, error) { c := &config{} if err := mapstructure.Decode(m, c); err != nil { err = errors.Wrap(err, "error creating a new manager") return nil, err } - s, err := metadata.NewCS3Storage(c.ProviderAddr, c.ProviderAddr, c.ServiceUserID, c.ServiceUserIdp, c.MachineAuthAPIKey) + s, err := metadata.NewCS3Storage(c.ProviderAddr, c.ProviderAddr, c.SystemUserID, c.SystemUserIdp, c.MachineAuthAPIKey) if err != nil { return nil, err } @@ -189,11 +196,34 @@ func NewDefault(m map[string]interface{}) (share.Manager, error) { } } - return New(s, gatewaySelector, c.CacheTTL, es, c.MaxConcurrency) + mgr, err := New(s, logger, gatewaySelector, c.CacheTTL, es, c.MaxConcurrency) + if err != nil { + return nil, err + } + providerRegistryAddr := c.ProviderRegistryAddr + if providerRegistryAddr == "" { + providerRegistryAddr = c.GatewayAddr + } + mgr.RunMigrations(migration.MigrationConfig{ + ServiceAccountID: c.ServiceAccountID, + ServiceAccountSecret: c.ServiceAccountSecret, + ProviderRegistryAddr: providerRegistryAddr, + }) + return mgr, nil } // New returns a new manager instance. 
-func New(s metadata.Storage, gatewaySelector pool.Selectable[gatewayv1beta1.GatewayAPIClient], ttlSeconds int, es events.Stream, maxconcurrency int) (*Manager, error) { +func New(s metadata.Storage, + logger *zerolog.Logger, + gatewaySelector pool.Selectable[gatewayv1beta1.GatewayAPIClient], + ttlSeconds int, + es events.Stream, + maxconcurrency int, +) (*Manager, error) { + if logger == nil { + nop := zerolog.Nop() + logger = &nop + } ttl := time.Duration(ttlSeconds) * time.Second m := &Manager{ @@ -205,13 +235,38 @@ func New(s metadata.Storage, gatewaySelector pool.Selectable[gatewayv1beta1.Gate gatewaySelector: gatewaySelector, eventStream: es, MaxConcurrency: maxconcurrency, + logger: logger, + ready: make(chan struct{}), + // migrationsDone is open (blocking) by default. It is closed by + // doMigrations when all migrations complete, or by SkipMigrations for + // callers (e.g. tests) that do not run migrations at all. + migrationsDone: make(chan struct{}), } + // Initialize the metadata storage connection in the background, retrying + // with exponential backoff if the backend is not yet available. + go func() { + backoff := time.Second + for { + if err := m.initialize(context.Background()); err != nil { + logger.Info().Err(err).Dur("backoff", backoff).Msg("share manager: metadata storage initialization failed, retrying") + time.Sleep(backoff) + if backoff < 30*time.Second { + backoff *= 2 + } + continue + } + logger.Debug().Msg("share manager: initialization succeeded") + close(m.ready) + return + } + }() + // listen for events if m.eventStream != nil { ch, err := events.Consume(m.eventStream, "jsoncs3sharemanager", _registeredEvents...) if err != nil { - appctx.GetLogger(context.Background()).Error().Err(err).Msg("error consuming events") + logger.Error().Err(err).Msg("error consuming events") } go m.ProcessEvents(ch) } @@ -219,23 +274,13 @@ func New(s metadata.Storage, gatewaySelector pool.Selectable[gatewayv1beta1.Gate return m, nil } +// initialize connects to the metadata storage backend and ensures the required +// directory structure exists. It is called once at startup from a background +// goroutine (see New) and must not be called concurrently. func (m *Manager) initialize(ctx context.Context) error { _, span := appctx.GetTracerProvider(ctx).Tracer(tracerName).Start(ctx, "initialize") defer span.End() - if m.initialized { - span.SetStatus(codes.Ok, "already initialized") - return nil - } - - m.Lock() - defer m.Unlock() - - if m.initialized { // check if initialization happened while grabbing the lock - span.SetStatus(codes.Ok, "initialized while grabbing lock") - return nil - } - ctx = context.Background() err := m.storage.Init(ctx, "jsoncs3-share-manager-metadata") if err != nil { span.RecordError(err) @@ -261,21 +306,85 @@ func (m *Manager) initialize(ctx context.Context) error { span.SetStatus(codes.Error, err.Error()) return err } + err = m.storage.MakeDirIfNotExist(ctx, "migrations") + if err != nil { + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) + return err + } - m.initialized = true span.SetStatus(codes.Ok, "initialized") return nil } +// waitForInit blocks until the background initialization goroutine has +// successfully completed, or until ctx is cancelled. 
+func (m *Manager) waitForInit(ctx context.Context) error { + select { + case <-m.ready: + return nil + case <-ctx.Done(): + return errors.Wrap(ctx.Err(), "share manager not yet initialized") + } +} + +// waitForMigrations blocks until both storage initialization and all data +// migrations have completed on this instance, or until ctx is cancelled. +// It is a strict superset of waitForInit and should be used by write operations +// to ensure no writes race with an in-progress migration. +func (m *Manager) waitForMigrations(ctx context.Context) error { + select { + case <-m.ready: + case <-ctx.Done(): + return errors.Wrap(ctx.Err(), "share manager not yet initialized") + } + select { + case <-m.migrationsDone: + return nil + case <-ctx.Done(): + return errors.Wrap(ctx.Err(), "share manager migrations not yet complete") + } +} + +// RunMigrations starts data migrations in a background goroutine. It should be +// called once after New() in production server startup. Callers that do not +// need migrations should call SkipMigrations instead to unblock write operations. +func (m *Manager) RunMigrations(cfg migration.MigrationConfig) { + go m.doMigrations(cfg) +} + +// SkipMigrations unblocks write operations on this instance without running +// any migrations. It must be called when RunMigrations will not be called, +// for example in tests. +func (m *Manager) SkipMigrations() { + close(m.migrationsDone) +} + +func (m *Manager) doMigrations(cfg migration.MigrationConfig) { + // Always close migrationsDone when this goroutine exits, whether migrations + // ran, were skipped, or failed. This unblocks write operations on this + // instance. Non-winning instances are held here by acquireLock until the + // winning instance finishes, so the close happens only after the storage + // state is fully migrated. 
+ defer close(m.migrationsDone) + if err := m.waitForInit(context.Background()); err != nil { + m.logger.Error().Err(err).Msg("share manager: aborting migrations, manager did not initialize") + return + } + m.logger.Debug().Msg("migrations start") + migrations := migration.New(*m.logger, m.gatewaySelector, m.storage, cfg, m, m) + migrations.RunMigrations() +} + func (m *Manager) ProcessEvents(ch <-chan events.Event) { - log := logger.New() + log := m.logger + ctx := context.Background() + if err := m.waitForInit(ctx); err != nil { + log.Error().Err(err).Msg("share manager: error waiting for initialization") + return + } for event := range ch { ctx := context.Background() - - if err := m.initialize(ctx); err != nil { - log.Error().Err(err).Msg("error initializing manager") - } - if ev, ok := event.Event.(events.SpaceDeleted); ok { log.Debug().Msgf("space deleted event: %v", ev) go func() { m.purgeSpace(ctx, ev.ID) }() @@ -287,7 +396,7 @@ func (m *Manager) ProcessEvents(ch <-chan events.Event) { func (m *Manager) Share(ctx context.Context, md *provider.ResourceInfo, g *collaboration.ShareGrant) (*collaboration.Share, error) { ctx, span := appctx.GetTracerProvider(ctx).Tracer(tracerName).Start(ctx, "Share") defer span.End() - if err := m.initialize(ctx); err != nil { + if err := m.waitForMigrations(ctx); err != nil { span.RecordError(err) span.SetStatus(codes.Error, err.Error()) return nil, err @@ -436,7 +545,7 @@ func (m *Manager) GetShare(ctx context.Context, ref *collaboration.ShareReferenc ctx, span := appctx.GetTracerProvider(ctx).Tracer(tracerName).Start(ctx, "GetShare") defer span.End() sublog := appctx.GetLogger(ctx).With().Str("id", ref.GetId().GetOpaqueId()).Str("key", ref.GetKey().String()).Str("driver", "jsoncs3").Str("handler", "GetShare").Logger() - if err := m.initialize(ctx); err != nil { + if err := m.waitForInit(ctx); err != nil { return nil, err } @@ -494,7 +603,7 @@ func (m *Manager) Unshare(ctx context.Context, ref *collaboration.ShareReference ctx, span := appctx.GetTracerProvider(ctx).Tracer(tracerName).Start(ctx, "Unshare") defer span.End() - if err := m.initialize(ctx); err != nil { + if err := m.waitForMigrations(ctx); err != nil { return err } @@ -511,7 +620,7 @@ func (m *Manager) UpdateShare(ctx context.Context, ref *collaboration.ShareRefer ctx, span := appctx.GetTracerProvider(ctx).Tracer(tracerName).Start(ctx, "UpdateShare") defer span.End() - if err := m.initialize(ctx); err != nil { + if err := m.waitForMigrations(ctx); err != nil { return nil, err } @@ -599,7 +708,7 @@ func (m *Manager) ListShares(ctx context.Context, filters []*collaboration.Filte ctx, span := appctx.GetTracerProvider(ctx).Tracer(tracerName).Start(ctx, "ListShares") defer span.End() - if err := m.initialize(ctx); err != nil { + if err := m.waitForInit(ctx); err != nil { return nil, err } @@ -816,7 +925,7 @@ func (m *Manager) ListReceivedShares(ctx context.Context, filters []*collaborati defer span.End() sublog := appctx.GetLogger(ctx).With().Str("driver", "jsoncs3").Str("handler", "ListReceivedShares").Logger() - if err := m.initialize(ctx); err != nil { + if err := m.waitForInit(ctx); err != nil { return nil, err } @@ -1012,7 +1121,7 @@ func (m *Manager) convert(ctx context.Context, userID string, s *collaboration.S // GetReceivedShare returns the information for a received share. 
func (m *Manager) GetReceivedShare(ctx context.Context, ref *collaboration.ShareReference) (*collaboration.ReceivedShare, error) { - if err := m.initialize(ctx); err != nil { + if err := m.waitForInit(ctx); err != nil { return nil, err } @@ -1056,7 +1165,7 @@ func (m *Manager) UpdateReceivedShare(ctx context.Context, receivedShare *collab ctx, span := appctx.GetTracerProvider(ctx).Tracer(tracerName).Start(ctx, "UpdateReceivedShare") defer span.End() - if err := m.initialize(ctx); err != nil { + if err := m.waitForMigrations(ctx); err != nil { return nil, err } @@ -1103,8 +1212,8 @@ func updateShareID(share *collaboration.Share) { // Load imports shares and received shares from channels (e.g. during migration) func (m *Manager) Load(ctx context.Context, shareChan <-chan *collaboration.Share, receivedShareChan <-chan share.ReceivedShareWithUser) error { - log := appctx.GetLogger(ctx) - if err := m.initialize(ctx); err != nil { + l := m.logger + if err := m.waitForInit(ctx); err != nil { return err } @@ -1119,14 +1228,14 @@ func (m *Manager) Load(ctx context.Context, shareChan <-chan *collaboration.Shar updateShareID(s) } if err := m.Cache.Add(context.Background(), s.GetResourceId().GetStorageId(), s.GetResourceId().GetSpaceId(), s.Id.OpaqueId, s); err != nil { - log.Error().Err(err).Interface("share", s).Msg("error persisting share") + l.Error().Err(err).Interface("share", s).Msg("error persisting share") } else { - log.Debug().Str("storageid", s.GetResourceId().GetStorageId()).Str("spaceid", s.GetResourceId().GetSpaceId()).Str("shareid", s.Id.OpaqueId).Msg("imported share") + l.Debug().Str("storageid", s.GetResourceId().GetStorageId()).Str("spaceid", s.GetResourceId().GetSpaceId()).Str("shareid", s.Id.OpaqueId).Msg("imported share") } if err := m.CreatedCache.Add(ctx, s.GetCreator().GetOpaqueId(), s.Id.OpaqueId); err != nil { - log.Error().Err(err).Interface("share", s).Msg("error persisting created cache") + l.Error().Err(err).Interface("share", s).Msg("error persisting created cache") } else { - log.Debug().Str("creatorid", s.GetCreator().GetOpaqueId()).Str("shareid", s.Id.OpaqueId).Msg("updated created cache") + l.Debug().Str("creatorid", s.GetCreator().GetOpaqueId()).Str("shareid", s.Id.OpaqueId).Msg("updated created cache") } } wg.Done() @@ -1137,18 +1246,19 @@ func (m *Manager) Load(ctx context.Context, shareChan <-chan *collaboration.Shar if !shareIsRoutable(s.ReceivedShare.GetShare()) { updateShareID(s.ReceivedShare.GetShare()) } - switch s.ReceivedShare.Share.Grantee.Type { - case provider.GranteeType_GRANTEE_TYPE_USER: - if err := m.UserReceivedStates.Add(context.Background(), s.ReceivedShare.GetShare().GetGrantee().GetUserId().GetOpaqueId(), s.ReceivedShare.GetShare().GetResourceId().GetSpaceId(), s.ReceivedShare); err != nil { - log.Error().Err(err).Interface("received share", s).Msg("error persisting received share for user") + if s.UserID != nil { + spaceid := s.ReceivedShare.GetShare().GetResourceId().GetStorageId() + shareid.IDDelimiter + s.ReceivedShare.GetShare().GetResourceId().GetSpaceId() + if err := m.UserReceivedStates.Add(context.Background(), s.UserID.GetOpaqueId(), spaceid, s.ReceivedShare); err != nil { + l.Error().Err(err).Interface("received share", s).Msg("error persisting received share for user") } else { - log.Debug().Str("userid", s.ReceivedShare.GetShare().GetGrantee().GetUserId().GetOpaqueId()).Str("spaceid", s.ReceivedShare.GetShare().GetResourceId().GetSpaceId()).Str("shareid", s.ReceivedShare.GetShare().Id.OpaqueId).Msg("updated received share 
userdata") + l.Debug().Str("userid", s.UserID.GetOpaqueId()).Str("spaceid", spaceid).Str("shareid", s.ReceivedShare.GetShare().Id.OpaqueId).Msg("updated received share userdata") } - case provider.GranteeType_GRANTEE_TYPE_GROUP: + } + if s.ReceivedShare.Share.Grantee.Type == provider.GranteeType_GRANTEE_TYPE_GROUP && s.UserID == nil { if err := m.GroupReceivedCache.Add(context.Background(), s.ReceivedShare.GetShare().GetGrantee().GetGroupId().GetOpaqueId(), s.ReceivedShare.GetShare().GetId().GetOpaqueId()); err != nil { - log.Error().Err(err).Interface("received share", s).Msg("error persisting received share to group cache") + l.Error().Err(err).Interface("received share", s).Msg("error persisting received share to group cache") } else { - log.Debug().Str("groupid", s.ReceivedShare.GetShare().GetGrantee().GetGroupId().GetOpaqueId()).Str("shareid", s.ReceivedShare.GetShare().Id.OpaqueId).Msg("updated received share group cache") + l.Debug().Str("groupid", s.ReceivedShare.GetShare().GetGrantee().GetGroupId().GetOpaqueId()).Str("shareid", s.ReceivedShare.GetShare().Id.OpaqueId).Msg("updated received share group cache") } } } @@ -1220,7 +1330,7 @@ func (m *Manager) removeShare(ctx context.Context, s *collaboration.Share, skipS func (m *Manager) CleanupStaleShares(ctx context.Context) { log := appctx.GetLogger(ctx) - if err := m.initialize(ctx); err != nil { + if err := m.waitForMigrations(ctx); err != nil { return } diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations/0001_import_spacemembers.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations/0001_import_spacemembers.go new file mode 100644 index 0000000000..8f15a563eb --- /dev/null +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations/0001_import_spacemembers.go @@ -0,0 +1,435 @@ +// Copyright 2026 OpenCloud GmbH +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// In applying this license, CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +package migration + +import ( + "context" + "encoding/json" + "fmt" + "sync" + "time" + + "github.com/cenkalti/backoff" + grouppb "github.com/cs3org/go-cs3apis/cs3/identity/group/v1beta1" + userpb "github.com/cs3org/go-cs3apis/cs3/identity/user/v1beta1" + rpc "github.com/cs3org/go-cs3apis/cs3/rpc/v1beta1" + collaboration "github.com/cs3org/go-cs3apis/cs3/sharing/collaboration/v1beta1" + provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1" + registry "github.com/cs3org/go-cs3apis/cs3/storage/registry/v1beta1" + typesv1beta1 "github.com/cs3org/go-cs3apis/cs3/types/v1beta1" + "github.com/google/uuid" + ctxpkg "github.com/opencloud-eu/reva/v2/pkg/ctx" + "github.com/opencloud-eu/reva/v2/pkg/errtypes" + "github.com/opencloud-eu/reva/v2/pkg/rgrpc/todo/pool" + "github.com/opencloud-eu/reva/v2/pkg/share" + "github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/shareid" + "github.com/opencloud-eu/reva/v2/pkg/utils" + "github.com/rs/zerolog" + "google.golang.org/grpc" +) + +// storageProvider is the narrow subset of provider.ProviderAPIClient that the +// migration actually uses. Keeping it narrow makes test stubs trivial to write. +type storageProvider interface { + ListGrants(ctx context.Context, in *provider.ListGrantsRequest, opts ...grpc.CallOption) (*provider.ListGrantsResponse, error) +} + +type ImportSpaceMembersMigration struct { + cfg config + sharesChan chan *collaboration.Share + receivedChan chan share.ReceivedShareWithUser + userCache map[string]*userpb.UserId + groupCache map[string]*grouppb.GroupId + providerResolver func(context.Context, *provider.StorageSpace) (storageProvider, error) +} + +func init() { + registerMigration(&ImportSpaceMembersMigration{}) +} + +func (m *ImportSpaceMembersMigration) Initialize(cfg config) { + m.cfg = cfg + m.sharesChan = make(chan *collaboration.Share) + m.receivedChan = make(chan share.ReceivedShareWithUser) + m.userCache = make(map[string]*userpb.UserId) + m.groupCache = make(map[string]*grouppb.GroupId) + m.providerResolver = func(ctx context.Context, space *provider.StorageSpace) (storageProvider, error) { + return m.storageProviderForSpace(ctx, space) + } +} + +func (m *ImportSpaceMembersMigration) Name() string { + return "import_space_members" +} + +func (m *ImportSpaceMembersMigration) Version() int { + return 1 +} + +func (m *ImportSpaceMembersMigration) Migrate() error { + gwc, err := m.cfg.gatewaySelector.Next() + if err != nil { + return err + } + + svcCtx, err := utils.GetServiceUserContextWithContext(context.Background(), gwc, m.cfg.serviceAccountID, m.cfg.serviceAccountSecret) + if err != nil { + m.cfg.logger.Error().Err(err).Msg("failed to get service user context for migration") + return err + } + // List all project spaces. 
+ listRes, err := gwc.ListStorageSpaces(svcCtx, &provider.ListStorageSpacesRequest{ + Opaque: utils.AppendPlainToOpaque(nil, "unrestricted", "true"), + Filters: []*provider.ListStorageSpacesRequest_Filter{ + { + Type: provider.ListStorageSpacesRequest_Filter_TYPE_SPACE_TYPE, + Term: &provider.ListStorageSpacesRequest_Filter_SpaceType{SpaceType: "project"}, + }, + }, + }) + if err != nil { + m.cfg.logger.Error().Err(err).Msg("space-membership migration: failed to list storage spaces") + return err + } + + if listRes.GetStatus().GetCode() != rpc.Code_CODE_OK { + m.cfg.logger.Error().Str("status", listRes.GetStatus().GetMessage()).Msg("space-membership migration: ListStorageSpaces returned non-OK status") + return errtypes.InternalError("ListStorageSpaces") + } + + spaces := listRes.GetStorageSpaces() + m.cfg.logger.Info().Int("spaces", len(spaces)).Msg("Starting migration") + + // loadCtx is cancelled when the producer finishes (or fails) so that the + // Load goroutine — which blocks reading from the channels — is not left + // waiting forever if we return early from an error. + loadCtx, cancelLoad := context.WithCancel(svcCtx) + defer cancelLoad() + + var wg sync.WaitGroup + var loaderError error + wg.Go(func() { + loaderError = m.cfg.loader.Load(loadCtx, m.sharesChan, m.receivedChan) + }) + + migrated := 0 + for _, space := range spaces { + sharesCreated, err := m.migrateSpace(loadCtx, space) + if err != nil { + m.cfg.logger.Error().Err(err).Str("space", space.GetId().GetOpaqueId()).Msg("failed to migrate space; continuing with remaining spaces") + continue + } + migrated++ + m.cfg.logger.Debug(). + Str("space", space.GetId().GetOpaqueId()). + Int("shares_created", sharesCreated). + Msg("space migrated") + if migrated%10 == 0 { + m.cfg.logger.Info(). + Int("migrated", migrated). + Int("total", len(spaces)). + Msg("migration progress") + } + } + close(m.receivedChan) + close(m.sharesChan) + + wg.Wait() + m.cfg.logger.Info().Err(loaderError).Int("migrated", migrated).Int("total", len(spaces)).Msg("Migration finished") + return loaderError +} + +func (m *ImportSpaceMembersMigration) migrateSpace(ctx context.Context, space *provider.StorageSpace) (int, error) { + spClient, err := m.providerResolver(ctx, space) + if err != nil { + return 0, err + } + + ref := &provider.Reference{ResourceId: space.GetRoot()} + grantsRes, err := spClient.ListGrants(ctx, &provider.ListGrantsRequest{Ref: ref}) + if err != nil { + return 0, err + } + if grantsRes.GetStatus().GetCode() != rpc.Code_CODE_OK { + return 0, errtypes.NewErrtypeFromStatus(grantsRes.GetStatus()) + } + + sharesCreated := 0 + for _, grant := range grantsRes.GetGrants() { + share, receivedShares, err := m.spaceGrantToShares(ctx, grant, space) + if err != nil { + m.cfg.logger.Error().Err(err). + Interface("grant", grant). + Msg("Failed to convert grant to shares") + continue + } + if share == nil { + // share already existed; nothing to import for this grant + continue + } + + select { + case m.sharesChan <- share: + case <-ctx.Done(): + return sharesCreated, ctx.Err() + } + for _, rs := range receivedShares { + select { + case m.receivedChan <- rs: + case <-ctx.Done(): + return sharesCreated, ctx.Err() + } + } + sharesCreated++ + } + return sharesCreated, nil +} + +// resolveRetries is the maximum number of times resolveUserID / resolveGroupID +// will retry after receiving an errtypes.Unavailable response (LDAP down). 
+const resolveRetries = 10 + +// retryOnUnavailable calls op, retrying with exponential backoff whenever op +// returns errtypes.Unavailable. Any other error (including context +// cancellation) stops the loop immediately and is returned as-is. +// Retries are capped at resolveRetries attempts and respect ctx cancellation. +func retryOnUnavailable(ctx context.Context, log zerolog.Logger, op func() error) error { + b := backoff.WithContext( + backoff.WithMaxRetries(backoff.NewExponentialBackOff(), resolveRetries), + ctx, + ) + notify := func(err error, d time.Duration) { + log.Warn().Err(err).Dur("retry_in", d).Msg("identity provider temporarily unavailable, retrying") + } + return backoff.RetryNotify(func() error { + err := op() + if err == nil { + return nil + } + if _, ok := err.(errtypes.Unavailable); ok { + return err // transient — keep retrying + } + return backoff.Permanent(err) // permanent — stop immediately + }, b, notify) +} + +func (m *ImportSpaceMembersMigration) resolveUserID(ctx context.Context, opaqueID string) (*userpb.UserId, error) { + if id, ok := m.userCache[opaqueID]; ok { + return id, nil + } + var id *userpb.UserId + err := retryOnUnavailable(ctx, m.cfg.logger, func() error { + gwc, err := m.cfg.gatewaySelector.Next() + if err != nil { + return err + } + res, err := gwc.GetUser(ctx, &userpb.GetUserRequest{ + UserId: &userpb.UserId{OpaqueId: opaqueID}, + SkipFetchingUserGroups: true, + }) + if err != nil { + return err + } + if res.GetStatus().GetCode() != rpc.Code_CODE_OK { + // errtypes.NewErrtypeFromStatus maps CODE_UNAVAILABLE → errtypes.Unavailable, + // which retryOnUnavailable will retry; all other codes are treated as permanent. + return errtypes.NewErrtypeFromStatus(res.GetStatus()) + } + id = res.GetUser().GetId() + return nil + }) + if err != nil { + return nil, err + } + m.userCache[opaqueID] = id + return id, nil +} + +func (m *ImportSpaceMembersMigration) resolveGroupID(ctx context.Context, opaqueID string) (*grouppb.GroupId, error) { + if id, ok := m.groupCache[opaqueID]; ok { + return id, nil + } + var id *grouppb.GroupId + err := retryOnUnavailable(ctx, m.cfg.logger, func() error { + gwc, err := m.cfg.gatewaySelector.Next() + if err != nil { + return err + } + res, err := gwc.GetGroup(ctx, &grouppb.GetGroupRequest{ + GroupId: &grouppb.GroupId{OpaqueId: opaqueID}, + SkipFetchingMembers: true, + }) + if err != nil { + return err + } + if res.GetStatus().GetCode() != rpc.Code_CODE_OK { + return errtypes.NewErrtypeFromStatus(res.GetStatus()) + } + id = res.GetGroup().GetId() + return nil + }) + if err != nil { + return nil, err + } + m.groupCache[opaqueID] = id + return id, nil +} + +func (m *ImportSpaceMembersMigration) spaceGrantToShares(ctx context.Context, grant *provider.Grant, space *provider.StorageSpace) (*collaboration.Share, []share.ReceivedShareWithUser, error) { + // The grantee ids as persisted on disk do not have an IDP or type stored as + // part of the userid/groupid. 
Resolve them via the gateway so we get the + // full userid + switch grant.GetGrantee().GetType() { + case provider.GranteeType_GRANTEE_TYPE_GROUP: + groupID, err := m.resolveGroupID(ctx, grant.GetGrantee().GetGroupId().GetOpaqueId()) + if err != nil { + return nil, nil, fmt.Errorf("resolve group %s: %w", grant.GetGrantee().GetGroupId().GetOpaqueId(), err) + } + grant.Grantee.Id = &provider.Grantee_GroupId{GroupId: groupID} + case provider.GranteeType_GRANTEE_TYPE_USER: + userID, err := m.resolveUserID(ctx, grant.GetGrantee().GetUserId().GetOpaqueId()) + if err != nil { + return nil, nil, fmt.Errorf("resolve user %s: %w", grant.GetGrantee().GetUserId().GetOpaqueId(), err) + } + grant.Grantee.Id = &provider.Grantee_UserId{UserId: userID} + } + + ref := &collaboration.ShareReference{ + Spec: &collaboration.ShareReference_Key{ + Key: &collaboration.ShareKey{ + ResourceId: space.GetRoot(), + Grantee: grant.GetGrantee(), + }, + }, + } + + ctx = ctxpkg.ContextSetUser(ctx, &userpb.User{Id: grant.Creator}) + if s, err := m.cfg.manager.GetShare(ctx, ref); err == nil { + // FIXME: Verify the actual grants? + m.cfg.logger.Debug().Interface("share", s).Msg("share already exists") + return nil, nil, nil + } + + ts := utils.TSNow() + shareID := shareid.Encode(space.GetRoot().GetStorageId(), space.GetRoot().GetSpaceId(), uuid.NewString()) + + creator := grant.GetCreator() + if creator.Type == userpb.UserType_USER_TYPE_INVALID { + creator = nil + } + newShare := &collaboration.Share{ + Id: &collaboration.ShareId{OpaqueId: shareID}, + ResourceId: space.GetRoot(), + Permissions: &collaboration.SharePermissions{Permissions: grant.GetPermissions()}, + Grantee: grant.GetGrantee(), + Expiration: grant.GetExpiration(), + Owner: creator, + Creator: creator, + Ctime: ts, + Mtime: ts, + } + + var newReceivedShares []share.ReceivedShareWithUser + switch grant.GetGrantee().GetType() { + case provider.GranteeType_GRANTEE_TYPE_GROUP: + gwc, err := m.cfg.gatewaySelector.Next() + if err != nil { + m.cfg.logger.Error().Err(err).Msg("Failed to get gateway client") + return nil, nil, err + } + + gr, err := gwc.GetMembers(ctx, &grouppb.GetMembersRequest{ + GroupId: grant.GetGrantee().GetGroupId(), + }) + if err != nil { + m.cfg.logger.Error().Err(err).Msg("Failed to expand group membership") + return nil, nil, err + } + if gr.GetStatus().GetCode() != rpc.Code_CODE_OK { + m.cfg.logger.Error().Str("Status", gr.GetStatus().GetMessage()).Msg("Failed to expand group membership") + return nil, nil, errtypes.NewErrtypeFromStatus(gr.GetStatus()) + } + for _, u := range gr.GetMembers() { + newReceivedShares = append(newReceivedShares, share.ReceivedShareWithUser{ + UserID: u, + ReceivedShare: &collaboration.ReceivedShare{ + Share: newShare, + State: collaboration.ShareState_SHARE_STATE_ACCEPTED, + }, + }) + } + // Also add a group-level entry (UserID == nil) so the group cache is populated. 
+ newReceivedShares = append(newReceivedShares, share.ReceivedShareWithUser{ + UserID: nil, + ReceivedShare: &collaboration.ReceivedShare{ + Share: newShare, + State: collaboration.ShareState_SHARE_STATE_ACCEPTED, + }, + }) + case provider.GranteeType_GRANTEE_TYPE_USER: + newReceivedShares = append(newReceivedShares, share.ReceivedShareWithUser{ + UserID: grant.GetGrantee().GetUserId(), + ReceivedShare: &collaboration.ReceivedShare{ + Share: newShare, + State: collaboration.ShareState_SHARE_STATE_ACCEPTED, + }, + }) + } + return newShare, newReceivedShares, nil +} + +// storageProviderForSpace resolves the storageprovider responsible for the +// given storage space and returns a dialled client. In the default opencloud +// deployment the storage registry is co-located with the gateway, so +// the GatewayAddr is used as the registry address. +func (m *ImportSpaceMembersMigration) storageProviderForSpace(ctx context.Context, space *provider.StorageSpace) (provider.ProviderAPIClient, error) { + + srClient, err := pool.GetStorageRegistryClient(m.cfg.providerRegistryAddr) + if err != nil { + return nil, fmt.Errorf("get storage registry client: %w", err) + } + + spaceJSON, err := json.Marshal(space) + if err != nil { + return nil, fmt.Errorf("marshal space: %w", err) + } + + res, err := srClient.GetStorageProviders(ctx, ®istry.GetStorageProvidersRequest{ + Opaque: &typesv1beta1.Opaque{ + Map: map[string]*typesv1beta1.OpaqueEntry{ + "space": { + Decoder: "json", + Value: spaceJSON, + }, + }, + }, + }) + if err != nil { + return nil, fmt.Errorf("GetStorageProviders: %w", err) + } + if len(res.GetProviders()) == 0 { + return nil, fmt.Errorf("no storage provider found for space %s", space.GetId().GetOpaqueId()) + } + + c, err := pool.GetStorageProviderServiceClient(res.GetProviders()[0].GetAddress()) + if err != nil { + return nil, fmt.Errorf("dial storage provider: %w", err) + } + return c, nil +} diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations/migration.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations/migration.go new file mode 100644 index 0000000000..94fd7af49b --- /dev/null +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations/migration.go @@ -0,0 +1,353 @@ +// Copyright 2026 OpenCloud GmbH +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// In applying this license, CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+
+package migration
+
+import (
+	"cmp"
+	"context"
+	"crypto/rand"
+	"encoding/json"
+	"fmt"
+	"os"
+	"os/signal"
+	"slices"
+	"syscall"
+	"time"
+
+	gatewayv1beta1 "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
+	"github.com/opencloud-eu/reva/v2/pkg/errtypes"
+	"github.com/opencloud-eu/reva/v2/pkg/rgrpc/todo/pool"
+	"github.com/opencloud-eu/reva/v2/pkg/share"
+	"github.com/opencloud-eu/reva/v2/pkg/storage/utils/metadata"
+	"github.com/rs/zerolog"
+)
+
+const stateFile = "migrations/state.json"
+
+const (
+	lockFile              = "migrations/lock.json"
+	lockTTL               = time.Minute
+	lockHeartbeatInterval = 20 * time.Second
+)
+
+// lockPollInterval is how long acquireLock sleeps between retries when the
+// lock is held by another instance. Declared as a variable so tests can
+// shorten it without rebuilding.
+var lockPollInterval = 5 * time.Second
+
+// lockData is the content written to the lock file.
+type lockData struct {
+	Timestamp  time.Time `json:"timestamp"`
+	InstanceID string    `json:"instance_id"`
+}
+
+type migration interface {
+	Name() string
+	Version() int
+	Initialize(config)
+	Migrate() error
+}
+
+// persistedState is the on-disk representation of the migration state.
+type persistedState struct {
+	Version int `json:"version"`
+}
+
+type state struct {
+	version int
+}
+
+// MigrationConfig holds all caller-supplied options for a migration run.
+// It is intentionally a plain struct so that new fields can be added without
+// changing function signatures throughout the call chain.
+type MigrationConfig struct {
+	ServiceAccountID     string
+	ServiceAccountSecret string
+	ProviderRegistryAddr string
+}
+
+type config struct {
+	logger               zerolog.Logger
+	gatewaySelector      pool.Selectable[gatewayv1beta1.GatewayAPIClient]
+	storage              metadata.Storage
+	serviceAccountID     string
+	serviceAccountSecret string
+	providerRegistryAddr string
+	manager              share.Manager
+	loader               share.LoadableManager
+}
+
+type Migrations struct {
+	config
+	state      state
+	instanceID string
+}
+
+var migrations []migration
+
+// registerMigration is only supposed to be called from init(), which runs sequentially,
+// so we don't need to protect migrations with a lock.
+func registerMigration(m migration) {
+	migrations = append(migrations, m)
+}
+
+func New(logger zerolog.Logger,
+	gatewaySelector pool.Selectable[gatewayv1beta1.GatewayAPIClient],
+	storage metadata.Storage,
+	cfg MigrationConfig,
+	manager share.Manager,
+	loader share.LoadableManager,
+) Migrations {
+
+	slices.SortFunc(migrations, func(a, b migration) int {
+		return cmp.Compare(a.Version(), b.Version())
+	})
+
+	b := make([]byte, 8)
+	_, _ = rand.Read(b)
+	instanceID := fmt.Sprintf("%x", b)
+
+	return Migrations{
+		config{
+			logger:               logger.With().Str("jsoncs3", "migrations").Logger(),
+			gatewaySelector:      gatewaySelector,
+			storage:              storage,
+			serviceAccountID:     cfg.ServiceAccountID,
+			serviceAccountSecret: cfg.ServiceAccountSecret,
+			providerRegistryAddr: cfg.ProviderRegistryAddr,
+			manager:              manager,
+			loader:               loader,
+		},
+		state{},
+		instanceID,
+	}
+}
+
+// acquireLock tries to atomically create the lock file, blocking until the lock
+// is obtained. It returns the etag of the lock file on success. It retries
+// indefinitely until ctx is cancelled. A lock whose timestamp is older than
+// lockTTL is considered stale and will be taken over.
+func (m *Migrations) acquireLock(ctx context.Context) (string, error) { + m.logger.Debug().Str("instance", m.instanceID).Msg("acquiring migration lock") + for { + // Fast path: create the lock file only if it does not exist yet. + data, err := json.Marshal(lockData{Timestamp: time.Now(), InstanceID: m.instanceID}) + if err != nil { + return "", err + } + res, err := m.storage.Upload(ctx, metadata.UploadRequest{ + Path: lockFile, + Content: data, + IfNoneMatch: []string{"*"}, + }) + if err == nil { + m.logger.Debug().Str("instance", m.instanceID).Msg("migration lock acquired") + return res.Etag, nil + } + + // Propagate context cancellation immediately. + select { + case <-ctx.Done(): + return "", ctx.Err() + default: + } + + // Any error other than a conflict means something unexpected happened. + if !isConflict(err) { + return "", err + } + + // Lock file already exists — read it to decide whether it is stale. + dl, err := m.storage.Download(ctx, metadata.DownloadRequest{Path: lockFile}) + if err != nil { + if _, ok := err.(errtypes.IsNotFound); ok { + // Lock was released between our upload attempt and the download; + // retry acquiring it immediately. + m.logger.Debug().Str("instance", m.instanceID).Msg("migration lock vanished during read; retrying") + continue + } + return "", err + } + + var existing lockData + stale := true + if err := json.Unmarshal(dl.Content, &existing); err == nil { + stale = time.Since(existing.Timestamp) > lockTTL + } + + if stale { + m.logger.Debug(). + Str("instance", m.instanceID). + Str("held_by", existing.InstanceID). + Time("lock_timestamp", existing.Timestamp). + Msg("migration lock is stale; attempting takeover") + + // Atomically take over the stale lock using the etag we just read. + newData, err := json.Marshal(lockData{Timestamp: time.Now(), InstanceID: m.instanceID}) + if err != nil { + return "", err + } + res, err := m.storage.Upload(ctx, metadata.UploadRequest{ + Path: lockFile, + Content: newData, + IfMatchEtag: dl.Etag, + }) + if err == nil { + m.logger.Debug().Str("instance", m.instanceID).Msg("migration lock acquired via stale takeover") + return res.Etag, nil + } + // Another instance took the stale lock before us; loop and retry. + m.logger.Debug().Str("instance", m.instanceID).Err(err).Msg("stale lock takeover lost race; retrying") + continue + } + + m.logger.Debug(). + Str("instance", m.instanceID). + Str("held_by", existing.InstanceID). + Time("lock_timestamp", existing.Timestamp). + Dur("poll_interval", lockPollInterval). + Msg("migration lock held by another instance; waiting") + + // Lock is fresh and held by another instance; wait before retrying. + select { + case <-ctx.Done(): + return "", ctx.Err() + case <-time.After(lockPollInterval): + } + } +} + +// startHeartbeat spawns a goroutine that periodically renews the lock file so +// that it is not considered stale while a long migration is running. Call the +// returned cancel function to stop the heartbeat. 
+func (m *Migrations) startHeartbeat(ctx context.Context, etag string) context.CancelFunc { + hbCtx, cancel := context.WithCancel(ctx) + go func() { + ticker := time.NewTicker(lockHeartbeatInterval) + defer ticker.Stop() + for { + select { + case <-hbCtx.Done(): + return + case <-ticker.C: + data, err := json.Marshal(lockData{Timestamp: time.Now(), InstanceID: m.instanceID}) + if err != nil { + m.logger.Warn().Err(err).Msg("failed to marshal heartbeat data for migration lock") + return + } + res, err := m.storage.Upload(hbCtx, metadata.UploadRequest{ + Path: lockFile, + Content: data, + IfMatchEtag: etag, + }) + if err != nil { + m.logger.Warn().Err(err).Msg("failed to renew migration lock; another instance may take over") + return + } + etag = res.Etag + } + } + }() + return cancel +} + +// releaseLock deletes the lock file unconditionally. +func (m *Migrations) releaseLock(ctx context.Context) { + if err := m.storage.Delete(ctx, lockFile); err != nil { + m.logger.Warn().Err(err).Msg("failed to release migration lock") + } +} + +// isConflict returns true for errors that signal a conditional-upload conflict, +// i.e. the lock file already exists or the etag did not match. +func isConflict(err error) bool { + switch err.(type) { + case errtypes.IsAlreadyExists, errtypes.IsAborted, errtypes.IsPreconditionFailed: + return true + } + return false +} + +// loadState reads the persisted migration version from storage. If no state +// file exists yet (fresh deployment) it returns version 0 without error. +func (m *Migrations) loadState(ctx context.Context) error { + data, err := m.storage.SimpleDownload(ctx, stateFile) + if err != nil { + if _, ok := err.(errtypes.IsNotFound); ok { + m.state = state{version: 0} + return nil + } + return err + } + var ps persistedState + if err := json.Unmarshal(data, &ps); err != nil { + return err + } + m.state = state{version: ps.Version} + return nil +} + +// saveState writes the current migration version to storage so that already- +// applied migrations are not re-run on the next server start. 
+func (m *Migrations) saveState(ctx context.Context) error { + data, err := json.Marshal(persistedState{Version: m.state.version}) + if err != nil { + return err + } + return m.storage.SimpleUpload(ctx, stateFile, data) +} + +func (m *Migrations) RunMigrations() { + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + etag, err := m.acquireLock(ctx) + if err != nil { + m.logger.Error().Err(err).Msg("failed to acquire migration lock; skipping migrations") + return + } + cancelHB := m.startHeartbeat(ctx, etag) + defer cancelHB() + defer m.releaseLock(ctx) + + if err := m.loadState(ctx); err != nil { + m.logger.Error().Err(err).Msg("failed to load migration state; skipping migrations") + return + } + + m.logger.Info().Int("current state", m.state.version).Msg("checking migrations") + + for _, mig := range migrations { + if mig.Version() > m.state.version { + m.logger.Info().Str("migration", mig.Name()).Int("version", mig.Version()).Msg("running migration") + mig.Initialize(m.config) + if err := mig.Migrate(); err != nil { + m.logger.Error().Err(err).Str("migration", mig.Name()).Msg("migration failed; stopping") + return + } + m.state.version = mig.Version() + if err := m.saveState(ctx); err != nil { + m.logger.Error().Err(err).Msg("failed to save migration state; stopping") + return + } + } else { + m.logger.Info().Str("migration", mig.Name()).Int("version", mig.Version()).Msg("skipping migration") + } + } +} diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/memory/memory.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/memory/memory.go index 8ade5926f1..0f34d3a68b 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/memory/memory.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/memory/memory.go @@ -28,6 +28,7 @@ import ( ctxpkg "github.com/opencloud-eu/reva/v2/pkg/ctx" "github.com/opencloud-eu/reva/v2/pkg/share" + "github.com/rs/zerolog" "google.golang.org/genproto/protobuf/field_mask" userv1beta1 "github.com/cs3org/go-cs3apis/cs3/identity/user/v1beta1" @@ -46,7 +47,7 @@ func init() { } // New returns a new manager. -func New(c map[string]interface{}) (share.Manager, error) { +func New(c map[string]any, _ *zerolog.Logger) (share.Manager, error) { state := map[string]map[*collaboration.ShareId]collaboration.ShareState{} mp := map[string]map[*collaboration.ShareId]*provider.Reference{} return &manager{ diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/registry/registry.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/registry/registry.go index 16fc627fff..6b22beb779 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/registry/registry.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/share/manager/registry/registry.go @@ -18,11 +18,14 @@ package registry -import "github.com/opencloud-eu/reva/v2/pkg/share" +import ( + "github.com/opencloud-eu/reva/v2/pkg/share" + "github.com/rs/zerolog" +) // NewFunc is the function that share managers // should register at init time. -type NewFunc func(map[string]interface{}) (share.Manager, error) +type NewFunc func(map[string]any, *zerolog.Logger) (share.Manager, error) // NewFuncs is a map containing all the registered share managers. 
var NewFuncs = map[string]NewFunc{} diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/cache/kv.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/cache/kv.go index 9ce096f1b1..94c134d3a6 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/cache/kv.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/cache/kv.go @@ -72,7 +72,7 @@ func NewNatsKeyValueFromJetStream(c Config, js jetstream.JetStream) (jetstream.K if err != nil { kvConfig := jetstream.KeyValueConfig{ Bucket: c.Database, - TTL: 0, // we don't do TTLs for this store + TTL: c.TTL, } if c.DisablePersistence { kvConfig.Storage = jetstream.MemoryStorage diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/idcache/idcache.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/idcache/idcache.go index 861e19180c..0228dd5d7d 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/idcache/idcache.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/idcache/idcache.go @@ -65,16 +65,16 @@ func (c *IDCache) DeleteByPath(ctx context.Context, path string) error { } else { err := c.kv.Purge(ctx, baseKey) if err != nil && err != nats.ErrKeyNotFound { - appctx.GetLogger(ctx).Error().Err(err).Str("record", path).Str("spaceID", spaceID).Str("nodeID", nodeID).Msg("could not get spaceID and nodeID from cache") + appctx.GetLogger(ctx).Error().Err(err).Str("record", baseKey).Str("spaceID", spaceID).Str("nodeID", nodeID).Msg("could not purge from cache") } err = c.kv.Purge(ctx, cacheKey(spaceID, nodeID)) if err != nil && err != nats.ErrKeyNotFound { - appctx.GetLogger(ctx).Error().Err(err).Str("record", path).Str("spaceID", spaceID).Str("nodeID", nodeID).Msg("could not get spaceID and nodeID from cache") + appctx.GetLogger(ctx).Error().Err(err).Str("record", cacheKey(spaceID, nodeID)).Str("spaceID", spaceID).Str("nodeID", nodeID).Msg("could not purge from cache") } } - watcher, err := c.kv.Watch(ctx, baseKey+".*") + watcher, err := c.kv.Watch(ctx, baseKey+".>") if err != nil { return err } @@ -85,7 +85,6 @@ func (c *IDCache) DeleteByPath(ctx context.Context, path string) error { break } key := update.Key() - spaceID, nodeID, ok := c.getByReverseCacheKey(ctx, key) if !ok { appctx.GetLogger(ctx).Error().Str("record", key).Msg("could not get spaceID and nodeID from cache") @@ -94,12 +93,12 @@ func (c *IDCache) DeleteByPath(ctx context.Context, path string) error { err := c.kv.Purge(ctx, key) if err != nil && err != nats.ErrKeyNotFound { - appctx.GetLogger(ctx).Error().Err(err).Str("record", key).Str("spaceID", spaceID).Str("nodeID", nodeID).Msg("could not get spaceID and nodeID from cache") + appctx.GetLogger(ctx).Error().Err(err).Str("record", key).Str("spaceID", spaceID).Str("nodeID", nodeID).Msg("could not purge from cache") } err = c.kv.Purge(ctx, cacheKey(spaceID, nodeID)) if err != nil && err != nats.ErrKeyNotFound { - appctx.GetLogger(ctx).Error().Err(err).Str("record", key).Str("spaceID", spaceID).Str("nodeID", nodeID).Msg("could not get spaceID and nodeID from cache") + appctx.GetLogger(ctx).Error().Err(err).Str("record", cacheKey(spaceID, nodeID)).Str("spaceID", spaceID).Str("nodeID", nodeID).Msg("could not purge from cache") } } return nil diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/posix.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/posix.go index ed807f9015..9c60f3a68e 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/posix.go +++ 
b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/posix.go @@ -24,6 +24,7 @@ import ( "os" "path/filepath" "syscall" + "time" "github.com/rs/zerolog" tusd "github.com/tus/tusd/v2/pkg/handler" @@ -58,7 +59,8 @@ func init() { type posixFS struct { storage.FS - um usermapper.Mapper + tree *tree.Tree + um usermapper.Mapper } // New returns an implementation to of the storage.FS interface that talk to @@ -70,6 +72,7 @@ func NewDefault(m map[string]interface{}, stream events.Stream, log *zerolog.Log } o.IDCache.Database += "_v2" // Use a versioned bucket name to avoid conflicts with previous implementations + o.IDCache.TTL = 0 // Disable TTL for the ID cache, as the posix driver relies on it for caching file IDs and we don't want them to expire kv, err := cache.NewNatsKeyValue(o.IDCache) if err != nil { return nil, errors.Wrap(err, "could not create nats key value store") @@ -80,6 +83,7 @@ func NewDefault(m map[string]interface{}, stream events.Stream, log *zerolog.Log } o.IDCache.Database += "_history" // Use a versioned bucket name to avoid conflicts with previous implementations + o.IDCache.TTL = 24 * 60 * time.Minute historyKv, err := cache.NewNatsKeyValue(o.IDCache) if err != nil { return nil, errors.Wrap(err, "could not create nats key value store") @@ -215,11 +219,17 @@ func New(o *options.Options, stream events.Stream, cache, historyCache *idcache. mw := middleware.NewFS(dfs, hooks...) fs.FS = mw + fs.tree = tp fs.um = um return fs, nil } +// WarmupIDCache allows triggering a posix fs scan and id cache warmup manually. +func (fs *posixFS) WarmupIDCache(root string, assimilate, onlyDirty bool) error { + return fs.tree.WarmupIDCache(root, assimilate, onlyDirty) +} + // ListUploadSessions returns the upload sessions matching the given filter func (fs *posixFS) ListUploadSessions(ctx context.Context, filter storage.UploadSessionFilter) ([]storage.UploadSession, error) { return fs.FS.(storage.UploadSessionLister).ListUploadSessions(ctx, filter) diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/tree/tree.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/tree/tree.go index f3f8fb9c15..ff510bbc26 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/tree/tree.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/tree/tree.go @@ -37,10 +37,8 @@ import ( "go.opentelemetry.io/otel/trace" "golang.org/x/sync/errgroup" - user "github.com/cs3org/go-cs3apis/cs3/identity/user/v1beta1" provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1" - "github.com/opencloud-eu/reva/v2/pkg/appctx" "github.com/opencloud-eu/reva/v2/pkg/errtypes" "github.com/opencloud-eu/reva/v2/pkg/events" "github.com/opencloud-eu/reva/v2/pkg/storage/fs/posix/blobstore" @@ -590,11 +588,6 @@ func (t *Tree) Delete(ctx context.Context, n *node.Node) error { } }() - if appctx.DeletingSharedResourceFromContext(ctx) { - src := filepath.Join(n.ParentPath(), n.Name) - return os.RemoveAll(src) - } - var sizeDiff int64 if n.IsDir(ctx) { treesize, err := n.GetTreeSize(ctx) @@ -819,11 +812,3 @@ func isLockFile(path string) bool { func isTrash(path string) bool { return strings.HasSuffix(path, ".trashinfo") || strings.HasSuffix(path, ".trashitem") || strings.Contains(path, ".Trash") } - -func (t *Tree) AddLabel(ctx context.Context, ref *provider.Reference, userID *user.UserId, label string) error { - return errtypes.NotSupported("AddLabel not implemented") -} - -func (t *Tree) RemoveLabel(ctx context.Context, ref *provider.Reference, userID 
*user.UserId, label string) error { - return errtypes.NotSupported("RemoveLabel not implemented") -} diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/pkg/decomposedfs/node/permissions.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/pkg/decomposedfs/node/permissions.go index 6b1beb37d3..9fe2a459fa 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/pkg/decomposedfs/node/permissions.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/pkg/decomposedfs/node/permissions.go @@ -101,6 +101,7 @@ func ServiceAccountPermissions() *provider.ResourcePermissions { Delete: true, // for cli restore command with replace option CreateContainer: true, // for space provisioning AddGrant: true, // for initial project space member assignment + ListGrants: true, // for initial project space member assignment } } diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/pkg/decomposedfs/tree/tree.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/pkg/decomposedfs/tree/tree.go index 47d6ab0586..c718b3f163 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/pkg/decomposedfs/tree/tree.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/pkg/decomposedfs/tree/tree.go @@ -429,11 +429,6 @@ func (t *Tree) Delete(ctx context.Context, n *node.Node) (err error) { // remove entry from cache immediately to avoid inconsistencies defer func() { _ = t.idCache.Delete(path) }() - if appctx.DeletingSharedResourceFromContext(ctx) { - src := filepath.Join(n.ParentPath(), n.Name) - return os.Remove(src) - } - // get the original path origin, err := t.lookup.Path(ctx, n, node.NoCheck) if err != nil { diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/utils/decomposedfs/tree/tree.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/utils/decomposedfs/tree/tree.go index 1162836ea6..6fdf2df3c5 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/utils/decomposedfs/tree/tree.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/utils/decomposedfs/tree/tree.go @@ -445,11 +445,6 @@ func (t *Tree) Delete(ctx context.Context, n *node.Node) (err error) { // remove entry from cache immediately to avoid inconsistencies defer func() { _ = t.idCache.Delete(path) }() - if appctx.DeletingSharedResourceFromContext(ctx) { - src := filepath.Join(n.ParentPath(), n.Name) - return os.Remove(src) - } - // get the original path origin, err := t.lookup.Path(ctx, n, node.NoCheck) if err != nil { diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/utils/metadata/disk.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/utils/metadata/disk.go index 6eae43e638..5caef632d3 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/utils/metadata/disk.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/storage/utils/metadata/disk.go @@ -93,6 +93,40 @@ func (disk *Disk) SimpleUpload(ctx context.Context, uploadpath string, content [ // Upload stores a file on disk func (disk *Disk) Upload(_ context.Context, req UploadRequest) (*UploadResponse, error) { p := disk.targetPath(req.Path) + + // IfNoneMatch: ["*"] means create the file only if it does not already + // exist. Use O_EXCL so the check and the create are atomic on the local + // filesystem. 
+ for _, tag := range req.IfNoneMatch { + if tag != "*" { + continue + } + f, err := os.OpenFile(p, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0644) + if err != nil { + if errors.Is(err, os.ErrExist) { + return nil, errtypes.AlreadyExists(p) + } + return nil, err + } + if _, err := f.Write(req.Content); err != nil { + _ = f.Close() + return nil, err + } + if err := f.Close(); err != nil { + return nil, err + } + info, err := os.Stat(p) + if err != nil { + return nil, err + } + res := &UploadResponse{} + res.Etag, err = calcEtag(info.ModTime(), info.Size()) + if err != nil { + return nil, err + } + return res, nil + } + if req.IfMatchEtag != "" { info, err := os.Stat(p) if err != nil && !errors.Is(err, os.ErrNotExist) { @@ -170,7 +204,10 @@ func (disk *Disk) Download(_ context.Context, req DownloadRequest) (*DownloadRes // SimpleDownload reads a file from disk func (disk *Disk) SimpleDownload(ctx context.Context, downloadpath string) ([]byte, error) { res, err := disk.Download(ctx, DownloadRequest{Path: downloadpath}) - return res.Content, err + if err != nil { + return nil, err + } + return res.Content, nil } // Delete deletes a path diff --git a/vendor/github.com/opencloud-eu/reva/v2/pkg/utils/ldap/identity.go b/vendor/github.com/opencloud-eu/reva/v2/pkg/utils/ldap/identity.go index 9e98041574..fcbd1ae57a 100644 --- a/vendor/github.com/opencloud-eu/reva/v2/pkg/utils/ldap/identity.go +++ b/vendor/github.com/opencloud-eu/reva/v2/pkg/utils/ldap/identity.go @@ -266,15 +266,10 @@ func (i *Identity) GetLDAPUserByFilter(ctx context.Context, lc ldap.Client, filt res, err := lc.Search(searchRequest) if err != nil { log.Debug().Str("backend", "ldap").Err(err).Str("userfilter", filter).Msg("Error looking up user by filter") - var errmsg string - if lerr, ok := err.(*ldap.Error); ok { - if lerr.ResultCode == ldap.LDAPResultSizeLimitExceeded { - errmsg = fmt.Sprintf("too many results searching for user '%s'", filter) - } - } - span.SetAttributes(attribute.String("ldap.error", errmsg)) - span.SetStatus(codes.Error, errmsg) - return nil, errtypes.NotFound(errmsg) + classified := classifySearchError(err, fmt.Sprintf("too many results searching for user '%s'", filter)) + span.SetAttributes(attribute.String("ldap.error", classified.Error())) + span.SetStatus(codes.Error, classified.Error()) + return nil, classified } if len(res.Entries) == 0 { return nil, errtypes.NotFound(filter) @@ -306,9 +301,10 @@ func (i *Identity) GetLDAPUserByDN(ctx context.Context, lc ldap.Client, dn strin res, err := lc.Search(searchRequest) if err != nil { log.Debug().Str("backend", "ldap").Err(err).Str("dn", dn).Msg("Error looking up user by DN") - span.SetAttributes(attribute.String("ldap.error", err.Error())) - span.SetStatus(codes.Error, "") - return nil, errtypes.NotFound(dn) + classified := classifySearchError(err, "") + span.SetAttributes(attribute.String("ldap.error", classified.Error())) + span.SetStatus(codes.Error, classified.Error()) + return nil, classified } span.SetStatus(codes.Ok, "") if len(res.Entries) == 0 { @@ -337,9 +333,10 @@ func (i *Identity) GetLDAPUsers(ctx context.Context, lc ldap.Client, query, tena sr, err := lc.Search(searchRequest) if err != nil { log.Debug().Str("backend", "ldap").Err(err).Str("filter", filter).Msg("Error searching users") - span.SetAttributes(attribute.String("ldap.error", err.Error())) - span.SetStatus(codes.Error, "") - return nil, errtypes.NotFound(query) + classified := classifySearchError(err, "") + span.SetAttributes(attribute.String("ldap.error", classified.Error())) + 
span.SetStatus(codes.Error, classified.Error()) + return nil, classified } span.SetAttributes(attribute.Int("ldap.result_count", len(sr.Entries))) @@ -376,7 +373,8 @@ func (i *Identity) IsLDAPUserInDisabledGroup(ctx context.Context, lc ldap.Client sr, err := lc.Search(searchRequest) if err != nil { log.Error().Str("backend", "ldap").Err(err).Str("filter", filter).Msg("Error looking up error group") - // Err on the side of caution. + // Err on the side of caution: treat search failures (including network + // errors) as if the user is in the disabled group. span.SetAttributes(attribute.String("ldap.error", err.Error())) span.SetStatus(codes.Error, "") return true @@ -423,10 +421,10 @@ func (i *Identity) GetLDAPUserGroups(ctx context.Context, lc ldap.Client, userEn // not having any groups in LDAP return []string{}, nil } - - span.SetAttributes(attribute.String("ldap.error", err.Error())) - span.SetStatus(codes.Error, "") - return []string{}, err + classified := classifySearchError(err, "") + span.SetAttributes(attribute.String("ldap.error", classified.Error())) + span.SetStatus(codes.Error, classified.Error()) + return nil, classified } span.SetStatus(codes.Ok, "") span.SetAttributes(attribute.Int("ldap.result_count", len(sr.Entries))) @@ -504,15 +502,10 @@ func (i *Identity) GetLDAPGroupByFilter(ctx context.Context, lc ldap.Client, fil res, err := lc.Search(searchRequest) if err != nil { log.Debug().Str("backend", "ldap").Err(err).Str("filter", filter).Msg("Error looking up group by filter") - var errmsg string - if lerr, ok := err.(*ldap.Error); ok { - if lerr.ResultCode == ldap.LDAPResultSizeLimitExceeded { - errmsg = fmt.Sprintf("too many results searching for group '%s'", filter) - } - } - span.SetAttributes(attribute.String("ldap.error", errmsg)) - span.SetStatus(codes.Error, "") - return nil, errtypes.NotFound(errmsg) + classified := classifySearchError(err, fmt.Sprintf("too many results searching for group '%s'", filter)) + span.SetAttributes(attribute.String("ldap.error", classified.Error())) + span.SetStatus(codes.Error, classified.Error()) + return nil, classified } if len(res.Entries) == 0 { return nil, errtypes.NotFound(filter) @@ -543,10 +536,11 @@ func (i *Identity) GetLDAPGroups(ctx context.Context, lc ldap.Client, query stri setLDAPSearchSpanAttributes(span, searchRequest) sr, err := lc.Search(searchRequest) if err != nil { - span.SetAttributes(attribute.String("ldap.error", err.Error())) - span.SetStatus(codes.Error, "") log.Debug().Str("backend", "ldap").Err(err).Str("query", query).Msg("Error search for groups") - return nil, errtypes.NotFound(query) + classified := classifySearchError(err, "") + span.SetAttributes(attribute.String("ldap.error", classified.Error())) + span.SetStatus(codes.Error, classified.Error()) + return nil, classified } span.SetStatus(codes.Ok, "") return sr.Entries, nil @@ -919,15 +913,10 @@ func (i *Identity) GetLDAPTenantByFilter(ctx context.Context, lc ldap.Client, fi res, err := lc.Search(searchRequest) if err != nil { log.Debug().Str("backend", "ldap").Err(err).Str("tenantfilter", filter).Msg("Error looking up tenant by filter") - var errmsg string - if lerr, ok := err.(*ldap.Error); ok { - if lerr.ResultCode == ldap.LDAPResultSizeLimitExceeded { - errmsg = fmt.Sprintf("too many results searching for tenant '%s'", filter) - } - } - span.SetAttributes(attribute.String("ldap.error", errmsg)) - span.SetStatus(codes.Error, errmsg) - return nil, errtypes.NotFound(errmsg) + classified := classifySearchError(err, fmt.Sprintf("too many results 
searching for tenant '%s'", filter)) + span.SetAttributes(attribute.String("ldap.error", classified.Error())) + span.SetStatus(codes.Error, classified.Error()) + return nil, classified } if len(res.Entries) == 0 { return nil, errtypes.NotFound(filter) @@ -980,6 +969,24 @@ func (i *Identity) getTenantAttributeFilter(attribute, value string) (string, er ), nil } +// classifySearchError maps a raw error from lc.Search to the appropriate +// errtypes value: +// - ldap.ErrorNetwork → errtypes.Unavailable (transient; caller should retry) +// - ldap.LDAPResultSizeLimitExceeded → errtypes.NotFound(sizeExceededMsg) +// - anything else → errtypes.NotFound("") (preserving prior behaviour) +// +// The sizeExceededMsg is only used for the SizeLimitExceeded case; pass an +// empty string if the caller does not need a custom message for that case. +func classifySearchError(err error, sizeExceededMsg string) error { + if ldap.IsErrorWithCode(err, ldap.ErrorNetwork) { + return errtypes.Unavailable("ldap server unreachable: " + err.Error()) + } + if sizeExceededMsg != "" && ldap.IsErrorWithCode(err, ldap.LDAPResultSizeLimitExceeded) { + return errtypes.NotFound(sizeExceededMsg) + } + return errtypes.NotFound("") +} + func setLDAPSearchSpanAttributes(span trace.Span, request *ldap.SearchRequest) { span.SetAttributes( attribute.String("ldap.basedn", request.BaseDN), diff --git a/vendor/golang.org/x/crypto/ssh/cipher.go b/vendor/golang.org/x/crypto/ssh/cipher.go index 7554ed57a9..ad2b370578 100644 --- a/vendor/golang.org/x/crypto/ssh/cipher.go +++ b/vendor/golang.org/x/crypto/ssh/cipher.go @@ -586,7 +586,7 @@ func (c *cbcCipher) writeCipherPacket(seqNum uint32, w io.Writer, rand io.Reader // Length of encrypted portion of the packet (header, payload, padding). // Enforce minimum padding and packet size. - encLength := maxUInt32(prefixLen+len(packet)+cbcMinPaddingSize, cbcMinPaddingSize) + encLength := maxUInt32(prefixLen+len(packet)+cbcMinPaddingSize, cbcMinPacketSize) // Enforce block size. encLength = (encLength + effectiveBlockSize - 1) / effectiveBlockSize * effectiveBlockSize diff --git a/vendor/golang.org/x/crypto/ssh/client_auth.go b/vendor/golang.org/x/crypto/ssh/client_auth.go index 3127e49903..4f2f75c367 100644 --- a/vendor/golang.org/x/crypto/ssh/client_auth.go +++ b/vendor/golang.org/x/crypto/ssh/client_auth.go @@ -274,10 +274,14 @@ func pickSignatureAlgorithm(signer Signer, extensions map[string][]byte) (MultiA } // Filter algorithms based on those supported by MultiAlgorithmSigner. + // Iterate over the signer's algorithms first to preserve its preference order. 
+ supportedKeyAlgos := algorithmsForKeyFormat(keyFormat) var keyAlgos []string - for _, algo := range algorithmsForKeyFormat(keyFormat) { - if slices.Contains(as.Algorithms(), underlyingAlgo(algo)) { - keyAlgos = append(keyAlgos, algo) + for _, signerAlgo := range as.Algorithms() { + if idx := slices.IndexFunc(supportedKeyAlgos, func(algo string) bool { + return underlyingAlgo(algo) == signerAlgo + }); idx >= 0 { + keyAlgos = append(keyAlgos, supportedKeyAlgos[idx]) } } diff --git a/vendor/golang.org/x/tools/go/packages/golist.go b/vendor/golang.org/x/tools/go/packages/golist.go index 680a70ca8f..a6c17cf634 100644 --- a/vendor/golang.org/x/tools/go/packages/golist.go +++ b/vendor/golang.org/x/tools/go/packages/golist.go @@ -61,13 +61,42 @@ func (r *responseDeduper) addAll(dr *DriverResponse) { } func (r *responseDeduper) addPackage(p *Package) { - if r.seenPackages[p.ID] != nil { + if prev := r.seenPackages[p.ID]; prev != nil { + // Package already seen in a previous response. Merge the file lists, + // removing duplicates. This can happen when the same package appears + // in multiple driver responses that are being merged together. + prev.GoFiles = appendUniqueStrings(prev.GoFiles, p.GoFiles) + prev.CompiledGoFiles = appendUniqueStrings(prev.CompiledGoFiles, p.CompiledGoFiles) + prev.OtherFiles = appendUniqueStrings(prev.OtherFiles, p.OtherFiles) + prev.IgnoredFiles = appendUniqueStrings(prev.IgnoredFiles, p.IgnoredFiles) + prev.EmbedFiles = appendUniqueStrings(prev.EmbedFiles, p.EmbedFiles) + prev.EmbedPatterns = appendUniqueStrings(prev.EmbedPatterns, p.EmbedPatterns) return } r.seenPackages[p.ID] = p r.dr.Packages = append(r.dr.Packages, p) } +// appendUniqueStrings appends elements from src to dst, skipping duplicates. +func appendUniqueStrings(dst, src []string) []string { + if len(src) == 0 { + return dst + } + + seen := make(map[string]bool, len(dst)) + for _, s := range dst { + seen[s] = true + } + + for _, s := range src { + if !seen[s] { + dst = append(dst, s) + } + } + + return dst +} + func (r *responseDeduper) addRoot(id string) { if r.seenRoots[id] { return @@ -832,6 +861,8 @@ func golistargs(cfg *Config, words []string, goVersion int) []string { // go list doesn't let you pass -test and -find together, // probably because you'd just get the TestMain. 
fmt.Sprintf("-find=%t", !cfg.Tests && cfg.Mode&findFlags == 0 && !usesExportData(cfg)), + // VCS information is not needed when not printing Stale or StaleReason fields + "-buildvcs=false", } // golang/go#60456: with go1.21 and later, go list serves pgo variants, which diff --git a/vendor/golang.org/x/tools/go/packages/packages.go b/vendor/golang.org/x/tools/go/packages/packages.go index b249a5c7ef..412ba06b56 100644 --- a/vendor/golang.org/x/tools/go/packages/packages.go +++ b/vendor/golang.org/x/tools/go/packages/packages.go @@ -403,6 +403,10 @@ func mergeResponses(responses ...*DriverResponse) *DriverResponse { if len(responses) == 0 { return nil } + // No dedup needed + if len(responses) == 1 { + return responses[0] + } response := newDeduper() response.dr.NotHandled = false response.dr.Compiler = responses[0].Compiler diff --git a/vendor/modules.txt b/vendor/modules.txt index 062a535c8a..d007dea584 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -4,8 +4,8 @@ contrib.go.opencensus.io/exporter/prometheus # dario.cat/mergo v1.0.2 ## explicit; go 1.13 dario.cat/mergo -# filippo.io/edwards25519 v1.1.1 -## explicit; go 1.20 +# filippo.io/edwards25519 v1.2.0 +## explicit; go 1.24.0 filippo.io/edwards25519 filippo.io/edwards25519/field # github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c @@ -93,8 +93,8 @@ github.com/alexedwards/argon2id # github.com/amoghe/go-crypt v0.0.0-20220222110647-20eada5f5964 ## explicit github.com/amoghe/go-crypt -# github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op -## explicit; go 1.20 +# github.com/antithesishq/antithesis-sdk-go v0.7.0-default-no-op +## explicit; go 1.24.0 github.com/antithesishq/antithesis-sdk-go/assert github.com/antithesishq/antithesis-sdk-go/internal # github.com/armon/go-radix v1.0.0 @@ -624,8 +624,8 @@ github.com/go-redis/redis/v8/internal/util ## explicit; go 1.23.0 github.com/go-resty/resty/v2 github.com/go-resty/resty/v2/shellescape -# github.com/go-sql-driver/mysql v1.9.3 -## explicit; go 1.21.0 +# github.com/go-sql-driver/mysql v1.10.0 +## explicit; go 1.24.0 github.com/go-sql-driver/mysql # github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 ## explicit; go 1.13 @@ -1050,7 +1050,7 @@ github.com/mileusna/useragent # github.com/minio/crc64nvme v1.1.1 ## explicit; go 1.22 github.com/minio/crc64nvme -# github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 +# github.com/minio/highwayhash v1.0.4 ## explicit; go 1.15 github.com/minio/highwayhash # github.com/minio/md5-simd v1.1.2 @@ -1158,7 +1158,7 @@ github.com/munnerz/goautoneg # github.com/nats-io/jwt/v2 v2.8.1 ## explicit; go 1.25.0 github.com/nats-io/jwt/v2 -# github.com/nats-io/nats-server/v2 v2.12.6 +# github.com/nats-io/nats-server/v2 v2.14.0 ## explicit; go 1.25.0 github.com/nats-io/nats-server/v2/conf github.com/nats-io/nats-server/v2/internal/fastrand @@ -1273,7 +1273,7 @@ github.com/onsi/ginkgo/v2/internal/reporters github.com/onsi/ginkgo/v2/internal/testingtproxy github.com/onsi/ginkgo/v2/reporters github.com/onsi/ginkgo/v2/types -# github.com/onsi/gomega v1.39.1 +# github.com/onsi/gomega v1.40.0 ## explicit; go 1.24.0 github.com/onsi/gomega github.com/onsi/gomega/format @@ -1371,7 +1371,7 @@ github.com/opencloud-eu/icap-client # github.com/opencloud-eu/libre-graph-api-go v1.0.8-0.20260310090739-853d972b282d ## explicit; go 1.18 github.com/opencloud-eu/libre-graph-api-go -# github.com/opencloud-eu/reva/v2 v2.43.1-0.20260424125411-c5db28365753 +# github.com/opencloud-eu/reva/v2 v2.43.1-0.20260512061040-cd4be86c66b0 ## 
explicit; go 1.25.0 github.com/opencloud-eu/reva/v2/cmd/revad/internal/grace github.com/opencloud-eu/reva/v2/cmd/revad/runtime @@ -1606,6 +1606,7 @@ github.com/opencloud-eu/reva/v2/pkg/share/cache github.com/opencloud-eu/reva/v2/pkg/share/cache/warmup/loader github.com/opencloud-eu/reva/v2/pkg/share/cache/warmup/registry github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3 +github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/migrations github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/providercache github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/receivedsharecache github.com/opencloud-eu/reva/v2/pkg/share/manager/jsoncs3/sharecache @@ -2413,7 +2414,7 @@ go.yaml.in/yaml/v2 # go.yaml.in/yaml/v3 v3.0.4 ## explicit; go 1.16 go.yaml.in/yaml/v3 -# golang.org/x/crypto v0.49.0 +# golang.org/x/crypto v0.50.0 ## explicit; go 1.25.0 golang.org/x/crypto/argon2 golang.org/x/crypto/bcrypt @@ -2465,8 +2466,8 @@ golang.org/x/image/vector golang.org/x/image/vp8 golang.org/x/image/vp8l golang.org/x/image/webp -# golang.org/x/mod v0.33.0 -## explicit; go 1.24.0 +# golang.org/x/mod v0.34.0 +## explicit; go 1.25.0 golang.org/x/mod/internal/lazyregexp golang.org/x/mod/module golang.org/x/mod/semver @@ -2515,10 +2516,10 @@ golang.org/x/sys/windows/registry golang.org/x/sys/windows/svc golang.org/x/sys/windows/svc/eventlog golang.org/x/sys/windows/svc/mgr -# golang.org/x/term v0.41.0 +# golang.org/x/term v0.42.0 ## explicit; go 1.25.0 golang.org/x/term -# golang.org/x/text v0.35.0 +# golang.org/x/text v0.36.0 ## explicit; go 1.25.0 golang.org/x/text/cases golang.org/x/text/collate @@ -2548,8 +2549,8 @@ golang.org/x/text/width # golang.org/x/time v0.15.0 ## explicit; go 1.25.0 golang.org/x/time/rate -# golang.org/x/tools v0.42.0 -## explicit; go 1.24.0 +# golang.org/x/tools v0.43.0 +## explicit; go 1.25.0 golang.org/x/tools/cover golang.org/x/tools/go/ast/astutil golang.org/x/tools/go/ast/edge From ef24b9fbff9927eb1720cdd9156b0c3c4d5c3482 Mon Sep 17 00:00:00 2001 From: Ralf Haferkamp Date: Mon, 11 May 2026 16:34:56 +0200 Subject: [PATCH 2/2] feat(sharing): Require service account to be configured for sharing service In order to be able to run migrations, the "sharing" service now needs the "service_account_id" and -"_secret" to be configured. This is a "breaking/backwards incompatible" change. 
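Configs freshly created with "opencloud init" now include a service account
for the sharing service. Existing deployments can configure it via the new
options, for example through the environment (the values below are
placeholders, not real credentials):

    OC_SERVICE_ACCOUNT_ID=<service-account-id>
    OC_SERVICE_ACCOUNT_SECRET=<service-account-secret>

or, scoped to the sharing service only:

    SHARING_SERVICE_ACCOUNT=<service-account-id>
    SHARING_SERVICE_ACCOUNT_SECRET=<service-account-secret>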
--- opencloud/pkg/command/shares.go | 10 +++++----- opencloud/pkg/init/init.go | 3 +++ opencloud/pkg/init/structs.go | 3 ++- services/sharing/pkg/config/config.go | 10 +++++++++- services/sharing/pkg/config/parser/parse.go | 8 ++++++++ services/sharing/pkg/revaconfig/config.go | 16 +++++++++------- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/opencloud/pkg/command/shares.go b/opencloud/pkg/command/shares.go index 59e391ff2a..98cee2da4a 100644 --- a/opencloud/pkg/command/shares.go +++ b/opencloud/pkg/command/shares.go @@ -85,12 +85,16 @@ func cleanup(_ *cobra.Command, cfg *config.Config) error { return configlog.ReturnError(errors.New("cleanup is only implemented for the jsoncs3 share manager")) } + l := logger() + + zerolog.SetGlobalLevel(zerolog.InfoLevel) + rcfg := revaShareConfig(cfg.Sharing) f, ok := registry.NewFuncs[driver] if !ok { return configlog.ReturnError(errors.New("Unknown share manager type '" + driver + "'")) } - mgr, err := f(rcfg[driver].(map[string]any)) + mgr, err := f(rcfg[driver].(map[string]any), l) if err != nil { return configlog.ReturnError(err) } @@ -115,10 +119,6 @@ func cleanup(_ *cobra.Command, cfg *config.Config) error { if err != nil { return configlog.ReturnError(err) } - - l := logger() - - zerolog.SetGlobalLevel(zerolog.InfoLevel) serviceUserCtx = l.WithContext(serviceUserCtx) mgr.(*jsoncs3.Manager).CleanupStaleShares(serviceUserCtx) diff --git a/opencloud/pkg/init/init.go b/opencloud/pkg/init/init.go index 6d53b78f4d..938205e6dc 100644 --- a/opencloud/pkg/init/init.go +++ b/opencloud/pkg/init/init.go @@ -272,6 +272,9 @@ func CreateConfig(insecure, forceOverwrite, diff bool, configPath, adminPassword Activitylog: Activitylog{ ServiceAccount: serviceAccount, }, + Sharing: Sharing{ + ServiceAccount: serviceAccount, + }, } if insecure { diff --git a/opencloud/pkg/init/structs.go b/opencloud/pkg/init/structs.go index b49ae8726a..3e6d0ae4f9 100644 --- a/opencloud/pkg/init/structs.go +++ b/opencloud/pkg/init/structs.go @@ -204,7 +204,8 @@ type SettingsService struct { // Sharing is the configuration for the sharing service type Sharing struct { - Events Events + Events Events + ServiceAccount ServiceAccount `yaml:"service_account"` } // StorageRegistry is the configuration for the storage registry diff --git a/services/sharing/pkg/config/config.go b/services/sharing/pkg/config/config.go index 0ba4a330fc..2f3f7178ec 100644 --- a/services/sharing/pkg/config/config.go +++ b/services/sharing/pkg/config/config.go @@ -18,7 +18,8 @@ type Config struct { Reva *shared.Reva `yaml:"reva"` Events Events `yaml:"events"` - SkipUserGroupsInToken bool `yaml:"skip_user_groups_in_token" env:"SHARING_SKIP_USER_GROUPS_IN_TOKEN" desc:"Disables the loading of user's group memberships from the reva access token." introductionVersion:"1.0.0"` + ServiceAccount ServiceAccount `yaml:"service_account"` + SkipUserGroupsInToken bool `yaml:"skip_user_groups_in_token" env:"SHARING_SKIP_USER_GROUPS_IN_TOKEN" desc:"Disables the loading of user's group memberships from the reva access token." introductionVersion:"1.0.0"` UserSharingDriver string `yaml:"user_sharing_driver" env:"SHARING_USER_DRIVER" desc:"Driver to be used to persist shares. Supported values are 'jsoncs3', 'json', 'cs3' (deprecated) and 'owncloudsql'." 
introductionVersion:"1.0.0"` UserSharingDrivers UserSharingDrivers `yaml:"user_sharing_drivers"` @@ -100,6 +101,13 @@ type UserSharingJSONCS3Driver struct { CacheTTL int `yaml:"cache_ttl" env:"SHARING_USER_JSONCS3_CACHE_TTL" desc:"TTL for the internal caches in seconds." introductionVersion:"1.0.0"` MaxConcurrency int `yaml:"max_concurrency" env:"OC_MAX_CONCURRENCY;SHARING_USER_JSONCS3_MAX_CONCURRENCY" desc:"Maximum number of concurrent go-routines. Higher values can potentially get work done faster but will also cause more load on the system. Values of 0 or below will be ignored and the default value will be used." introductionVersion:"1.0.0"` } + +// ServiceAccount is the configuration for the used service account +type ServiceAccount struct { + ServiceAccountID string `yaml:"service_account_id" env:"OC_SERVICE_ACCOUNT_ID;SHARING_SERVICE_ACCOUNT" desc:"The ID of the service account the service should use. See the 'auth-service' service description for more details." introductionVersion:"%%NEXT%%"` + ServiceAccountSecret string `yaml:"service_account_secret" env:"OC_SERVICE_ACCOUNT_SECRET;SHARING_SERVICE_ACCOUNT_SECRET" desc:"The service account secret." introductionVersion:"%%NEXT%%"` +} + type PublicSharingDrivers struct { JSON PublicSharingJSONDriver `yaml:"json"` JSONCS3 PublicSharingJSONCS3Driver `yaml:"jsoncs3"` diff --git a/services/sharing/pkg/config/parser/parse.go b/services/sharing/pkg/config/parser/parse.go index 678f50d256..96adde279d 100644 --- a/services/sharing/pkg/config/parser/parse.go +++ b/services/sharing/pkg/config/parser/parse.go @@ -58,5 +58,13 @@ func Validate(cfg *config.Config) error { return shared.MissingSystemUserID(cfg.Service.Name) } + if cfg.ServiceAccount.ServiceAccountID == "" { + return shared.MissingServiceAccountID(cfg.Service.Name) + } + + if cfg.ServiceAccount.ServiceAccountSecret == "" { + return shared.MissingServiceAccountSecret(cfg.Service.Name) + } + return nil } diff --git a/services/sharing/pkg/revaconfig/config.go b/services/sharing/pkg/revaconfig/config.go index 57b12b9b02..b7244e37b8 100644 --- a/services/sharing/pkg/revaconfig/config.go +++ b/services/sharing/pkg/revaconfig/config.go @@ -71,13 +71,15 @@ func SharingConfigFromStruct(cfg *config.Config, logger log.Logger) (map[string] "machine_auth_apikey": cfg.UserSharingDrivers.CS3.SystemUserAPIKey, }, "jsoncs3": map[string]any{ - "gateway_addr": cfg.Reva.Address, - "provider_addr": cfg.UserSharingDrivers.JSONCS3.ProviderAddr, - "service_user_id": cfg.UserSharingDrivers.JSONCS3.SystemUserID, - "service_user_idp": cfg.UserSharingDrivers.JSONCS3.SystemUserIDP, - "machine_auth_apikey": cfg.UserSharingDrivers.JSONCS3.SystemUserAPIKey, - "ttl": cfg.UserSharingDrivers.JSONCS3.CacheTTL, - "max_concurrency": cfg.UserSharingDrivers.JSONCS3.MaxConcurrency, + "gateway_addr": cfg.Reva.Address, + "provider_addr": cfg.UserSharingDrivers.JSONCS3.ProviderAddr, + "system_user_id": cfg.UserSharingDrivers.JSONCS3.SystemUserID, + "system_user_idp": cfg.UserSharingDrivers.JSONCS3.SystemUserIDP, + "machine_auth_apikey": cfg.UserSharingDrivers.JSONCS3.SystemUserAPIKey, + "ttl": cfg.UserSharingDrivers.JSONCS3.CacheTTL, + "max_concurrency": cfg.UserSharingDrivers.JSONCS3.MaxConcurrency, + "service_account_id": cfg.ServiceAccount.ServiceAccountID, + "service_account_secret": cfg.ServiceAccount.ServiceAccountSecret, "events": map[string]any{ "natsaddress": cfg.Events.Addr, "natsclusterid": cfg.Events.ClusterID,