From 1c0f2de5f9279bbe303d9358119f6babb49cae42 Mon Sep 17 00:00:00 2001
From: slach
Date: Wed, 13 May 2026 22:53:17 +0500
Subject: [PATCH 1/7] fix https://github.com/Altinity/clickhouse-backup/issues/1356

---
 pkg/backup/delete.go | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pkg/backup/delete.go b/pkg/backup/delete.go
index fd12ca12..41495a56 100644
--- a/pkg/backup/delete.go
+++ b/pkg/backup/delete.go
@@ -10,6 +10,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/Altinity/clickhouse-backup/v2/pkg/common"
 	"github.com/Altinity/clickhouse-backup/v2/pkg/pidlock"
 
 	"github.com/Altinity/clickhouse-backup/v2/pkg/clickhouse"
@@ -19,6 +20,7 @@ import (
 	"github.com/Altinity/clickhouse-backup/v2/pkg/storage/object_disk"
 	"github.com/Altinity/clickhouse-backup/v2/pkg/utils"
 
+	"github.com/eapache/go-resiliency/retrier"
 	"github.com/pkg/errors"
 	"github.com/rs/zerolog/log"
 )
@@ -405,6 +407,7 @@ func (b *Backuper) cleanBackupObjectDisks(ctx context.Context, backupName string
 	batchSize := b.cfg.General.DeleteBatchSize
 	log.Info().Msgf("cleanBackupObjectDisks: starting batch deletion for object disk backup %s using %s (batch_size=%d)", backupName, b.dst.Kind(), batchSize)
 
+	retry := retrier.New(retrier.ExponentialBackoff(b.cfg.General.RetriesOnFailure, common.AddRandomJitter(b.cfg.General.RetriesDuration, b.cfg.General.RetriesJitter)), b)
 	// Process deletion in batches to avoid loading all keys in memory
 	var keysToDelete []string
@@ -420,7 +423,9 @@ func (b *Backuper) cleanBackupObjectDisks(ctx context.Context, backupName string
 
 		// When we've collected enough keys, delete them as a batch
 		if len(keysToDelete) >= batchSize {
-			deleteErr := batchDeleter.DeleteKeysFromObjectDiskBackupBatch(ctx, keysToDelete)
+			deleteErr := retry.RunCtx(ctx, func(ctx context.Context) error {
+				return batchDeleter.DeleteKeysFromObjectDiskBackupBatch(ctx, keysToDelete)
+			})
 			if deleteErr != nil {
 				return deleteErr
 			}
@@ -436,7 +441,9 @@ func (b *Backuper) cleanBackupObjectDisks(ctx context.Context, backupName string
 
 	// Delete remaining keys
 	if len(keysToDelete) > 0 {
-		deleteErr := batchDeleter.DeleteKeysFromObjectDiskBackupBatch(ctx, keysToDelete)
+		deleteErr := retry.RunCtx(ctx, func(ctx context.Context) error {
+			return batchDeleter.DeleteKeysFromObjectDiskBackupBatch(ctx, keysToDelete)
+		})
 		if deleteErr != nil {
 			return totalDeleted, deleteErr
 		}

From 6b3b9f4e568836b96c3141d60cde2bc6e1c6e619 Mon Sep 17 00:00:00 2001
From: slach
Date: Wed, 13 May 2026 22:53:17 +0500
Subject: [PATCH 2/7] docs: add ChangeLog entry for v2.6.44

---
 ChangeLog.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ChangeLog.md b/ChangeLog.md
index 315aa70f..cc70c7b5 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,8 @@
+# v2.6.44
+
+BUG FIXES
+- fix [1356](https://github.com/Altinity/clickhouse-backup/issues/1356), retry batch deletion in `cleanBackupObjectDisks` so transient errors (e.g. GCS 503) during retention no longer leave orphaned objects in `object_disks_path`
+
 # v2.6.43
 
 NEW FEATURES

From 0b9d6b9dc08b99c448b6f7699b9b72ab4cd059d7 Mon Sep 17 00:00:00 2001
From: slach
Date: Wed, 15 Apr 2026 21:32:55 +0300
Subject: [PATCH 3/7] fix TestFIPS

Signed-off-by: slach
---
 test/integration/fips_test.go | 45 +++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/test/integration/fips_test.go b/test/integration/fips_test.go
index 844ff22e..4d91afe8 100644
--- a/test/integration/fips_test.go
+++ b/test/integration/fips_test.go
@@ -18,7 +18,7 @@ import (
 
 func TestFIPS(t *testing.T) {
 	if compareVersion(os.Getenv("CLICKHOUSE_VERSION"), "19.17") <= 0 {
-		t.Skip("go 1.25 with boringcrypto stop works for 19.17, works only for 20.1+")
+		t.Skip("go 1.26 with boringcrypto stops working for 19.17, works only for 20.1+")
 	}
 	if os.Getenv("QA_AWS_ACCESS_KEY") == "" {
 		t.Skip("QA_AWS_ACCESS_KEY is empty, TestFIPS will skip")
 	}
 	env.connectWithWait(t, r, 1*time.Second, 1*time.Second, 1*time.Minute)
 	fipsBackupName := fmt.Sprintf("fips_backup_%d", rand.Int())
 	env.DockerExecNoError(r, "clickhouse", "rm", "-fv", "/etc/apt/sources.list.d/clickhouse.list")
-	env.InstallDebIfNotExists(r, "clickhouse", "ca-certificates", "curl", "gettext-base", "bsdmainutils", "dnsutils", "git")
+	env.InstallDebIfNotExists(r, "clickhouse", "ca-certificates", "curl", "gettext-base", "binutils", "bsdmainutils", "dnsutils", "git")
 	env.DockerExecNoError(r, "clickhouse", "update-ca-certificates")
 	r.NoError(env.DockerCP("configs/config-s3-fips.yml", "clickhouse:/etc/clickhouse-backup/config.yml.fips-template"))
 	env.DockerExecNoError(r, "clickhouse", "git", "clone", "--depth", "1", "--branch", "v3.2rc3", "https://github.com/drwetter/testssl.sh.git", "/opt/testssl")
 	env.DockerExecNoError(r, "clickhouse", "chmod", "+x", "/opt/testssl/testssl.sh")
 
+	// P0: Verify --version output reports FIPS 140-3 support
+	fipsVersion, err := env.DockerExecOut("clickhouse", "bash", "-ce", "clickhouse-backup-fips --version 2>&1")
+	r.NoError(err, "unexpected clickhouse-backup-fips --version error: %v", err)
+	r.Contains(fipsVersion, "FIPS 140-3:\t true", "FIPS binary version output should contain 'FIPS 140-3: true', got: %s", fipsVersion)
+
+	// P0: Integrity self-check — binary starts without panic in FIPS mode
+	fipsSelfCheck, err := env.DockerExecOut("clickhouse", "bash", "-ce", "GODEBUG=fips140=on clickhouse-backup-fips --version 2>&1")
+	r.NoError(err, "unexpected FIPS self-check error: %v", err)
+	r.NotContains(fipsSelfCheck, "panic", "FIPS binary should not panic during integrity self-check")
+
+	// P0: Verify crypto/fips140.Enabled() reports true via version output
+	r.Contains(fipsSelfCheck, "FIPS 140-3:\t true", "FIPS 140-3 should be enabled when GODEBUG=fips140=on, got: %s", fipsSelfCheck)
+
+	// P1: Verify GODEBUG=fips140=only (strict mode) — non-FIPS crypto returns errors
+	fipsOnlyCheck, err := env.DockerExecOut("clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips --version 2>&1")
+	r.NoError(err, "unexpected FIPS only mode error: %v", err)
+	r.NotContains(fipsOnlyCheck, "panic", "FIPS binary should not panic in fips140=only mode")
+	r.Contains(fipsOnlyCheck, "FIPS 140-3:\t true", "FIPS 140-3 should be enabled in fips140=only mode, got: %s", fipsOnlyCheck)
+
+	// P2: Verify binary contains fips140 symbols
+	fipsSymbols, err := env.DockerExecOut("clickhouse", "bash", "-ce", "strings /usr/bin/clickhouse-backup-fips | grep -c 'crypto/internal/fips140'")
+	r.NoError(err, "unexpected strings/grep error: %v", err)
+	fipsSymbolCount, convErr := strconv.Atoi(strings.TrimSpace(fipsSymbols))
+	r.NoError(convErr, "unexpected Atoi error for fipsSymbols=%q: %v", fipsSymbols, convErr)
+	r.Greater(fipsSymbolCount, 0, "binary should contain crypto/internal/fips140 symbols")
+
 	generateCerts := func(certType, keyLength, curveType string) {
 		env.DockerExecNoError(r, "clickhouse", "bash", "-xce", "openssl rand -out /root/.rnd 2048")
 		switch certType {
@@ -93,6 +119,12 @@ func TestFIPS(t *testing.T) {
 	r.NoError(err, "%s\nunexpected grep testssl.csv error: %v", out, err)
 	r.Equal(strconv.Itoa(len(cipherList)), strings.Trim(out, " \t\r\n"))
 
+	// P1: Negative test — non-FIPS ciphers (RC4, DES, 3DES, CHACHA, NULL) should NOT be offered
+	nonFipsOut, nonFipsErr := env.DockerExecOut("clickhouse", "bash", "-ce", "grep -v 'not offered' /tmp/testssl.csv | grep -c -i -E '(RC4|TRIPLEDES|DES-CBC|CHACHA|NULL)' || true")
+	r.NoError(nonFipsErr, "%s\nunexpected grep non-FIPS ciphers error: %v", nonFipsOut, nonFipsErr)
+	nonFipsCount, _ := strconv.Atoi(strings.TrimSpace(nonFipsOut))
+	r.Equal(0, nonFipsCount, "non-FIPS ciphers (RC4/DES/3DES/CHACHA/NULL) should not be offered by FIPS server, found %d matches", nonFipsCount)
+
 	inProgressActions := make([]struct {
 		Command string `ch:"command"`
 		Status  string `ch:"status"`
@@ -104,6 +136,15 @@ func TestFIPS(t *testing.T) {
 		r.Equal(0, len(inProgressActions), "inProgressActions=%+v", inProgressActions)
 		env.DockerExecNoError(r, "clickhouse", "pkill", "-n", "-f", "clickhouse-backup-fips")
 	}
+	// P1: Test create_remote + restore_remote in strict GODEBUG=fips140=only mode
+	// @todo think about FIPS clickhouse-server, which is not supported by GODEBUG=fips140=only
+	// fipsOnlyBackupName := fmt.Sprintf("fips_only_backup_%d", rand.Int())
+	//env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml create_remote --tables="+t.Name()+".fips_table "+fipsOnlyBackupName)
+	//env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml delete local "+fipsOnlyBackupName)
+	//env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml restore_remote --tables="+t.Name()+".fips_table "+fipsOnlyBackupName)
+	//env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml delete local "+fipsOnlyBackupName)
+	//env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml delete remote "+fipsOnlyBackupName)
+
 	// https://www.perplexity.ai/search/0920f1e8-59ec-4e14-b779-ba7b2e037196
 	testTLSCerts("rsa", "4096", "", "ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-RSA-AES256-GCM-SHA384", "AES_128_GCM_SHA256", "AES_256_GCM_SHA384")
 	testTLSCerts("ecdsa", "", "prime256v1", "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES256-GCM-SHA384")

From a4830ae32f83c168dd7ded29b0efc5b5f53d179d Mon Sep 17 00:00:00 2001
From: slach
Date: Fri, 17 Apr 2026 21:23:22 +0300
Subject: [PATCH 4/7] fix flaky TestFIPS: wait for API server readiness instead of 1s sleep

Replace time.Sleep(1s) after starting clickhouse-backup-fips server with
active polling of both the /backup/actions HTTP endpoint and the
system.backup_actions integration table.
Under CI load 1s was not enough for the server to create the URL-engine
table and start listening on :7172, producing "Table
system.backup_actions does not exist" (Code 60).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 test/integration/fips_test.go | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/test/integration/fips_test.go b/test/integration/fips_test.go
index 4d91afe8..7b1a83af 100644
--- a/test/integration/fips_test.go
+++ b/test/integration/fips_test.go
@@ -88,7 +88,23 @@ func TestFIPS(t *testing.T) {
 
 	log.Debug().Msg("Run `clickhouse-backup-fips server` in background")
 	env.DockerExecBackgroundNoError(r, "clickhouse", "bash", "-ce", "AWS_USE_FIPS_ENDPOINT=true clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml server &>>/tmp/clickhouse-backup-server-fips.log")
-	time.Sleep(1 * time.Second)
+
+	// Wait until the API server is fully ready: HTTP listener on :7172 is up AND
+	// `system.backup_actions` integration table has been registered. Otherwise the
+	// client `INSERT INTO system.backup_actions` below races the server startup and
+	// fails with `Code 60: UNKNOWN_TABLE`.
+	fipsReadyDeadline := time.Now().Add(30 * time.Second)
+	for {
+		httpOut, _ := env.DockerExecOut("clickhouse", "bash", "-ce", "curl -sf -o /dev/null -w '%{http_code}' http://localhost:7172/backup/actions || true")
+		tblOut, _ := env.DockerExecOut("clickhouse", "bash", "-ce", "clickhouse client -q 'EXISTS TABLE system.backup_actions' 2>/dev/null || true")
+		if strings.HasPrefix(strings.TrimSpace(httpOut), "2") && strings.TrimSpace(tblOut) == "1" {
+			break
+		}
+		if time.Now().After(fipsReadyDeadline) {
+			r.FailNow("clickhouse-backup-fips server did not become ready in 30s", "http=%q table_exists=%q", httpOut, tblOut)
+		}
+		time.Sleep(200 * time.Millisecond)
+	}
 
 	runClickHouseClientInsertSystemBackupActions(r, env, []string{fmt.Sprintf("create_remote --tables="+t.Name()+".fips_table %s", fipsBackupName)}, true)
 	runClickHouseClientInsertSystemBackupActions(r, env, []string{fmt.Sprintf("delete local %s", fipsBackupName)}, false)

From 8ca570786684a41d5df72dda466ce210465d5839 Mon Sep 17 00:00:00 2001
From: slach
Date: Sat, 18 Apr 2026 22:18:08 +0300
Subject: [PATCH 5/7] fix TestFIPS readiness probe: TCP connect instead of HTTP curl against HTTPS server

The HTTP probe curled http://localhost:7172 against an HTTPS server
(secure: true in config-s3-fips.yml with required client cert auth), so
Go's TLS server replied 400 "Client sent an HTTP request to an HTTPS
server" and the 2xx prefix check could never succeed. Also,
CreateIntegrationTables runs before ListenAndServeTLS in api.Restart(),
so table existence alone does not prove the listener is bound.

Swap curl for a pure TCP probe (bash /dev/tcp): a successful TCP
connect proves the TLS listener is bound.
---
 test/integration/fips_test.go | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/test/integration/fips_test.go b/test/integration/fips_test.go
index 7b1a83af..8be13829 100644
--- a/test/integration/fips_test.go
+++ b/test/integration/fips_test.go
@@ -89,19 +89,22 @@ func TestFIPS(t *testing.T) {
 	log.Debug().Msg("Run `clickhouse-backup-fips server` in background")
 	env.DockerExecBackgroundNoError(r, "clickhouse", "bash", "-ce", "AWS_USE_FIPS_ENDPOINT=true clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml server &>>/tmp/clickhouse-backup-server-fips.log")
 
-	// Wait until the API server is fully ready: HTTP listener on :7172 is up AND
-	// `system.backup_actions` integration table has been registered. Otherwise the
-	// client `INSERT INTO system.backup_actions` below races the server startup and
-	// fails with `Code 60: UNKNOWN_TABLE`.
+	// Wait until the API server is fully ready: TLS listener on :7172 is bound AND
+	// `system.backup_actions` integration table has been registered. The server is
+	// HTTPS with required client cert auth, so we only probe TCP reachability (a
+	// successful TCP connect proves ListenAndServeTLS has started). Without this
+	// probe, the client `INSERT INTO system.backup_actions` below races server
+	// startup and fails with `Code 60: UNKNOWN_TABLE` — table creation runs before
+	// the listener is bound.
 	fipsReadyDeadline := time.Now().Add(30 * time.Second)
 	for {
-		httpOut, _ := env.DockerExecOut("clickhouse", "bash", "-ce", "curl -sf -o /dev/null -w '%{http_code}' http://localhost:7172/backup/actions || true")
+		tcpOut, _ := env.DockerExecOut("clickhouse", "bash", "-ce", "if timeout 2 bash -c '</dev/tcp/localhost/7172' &>/dev/null; then echo open; else echo closed; fi")
 		tblOut, _ := env.DockerExecOut("clickhouse", "bash", "-ce", "clickhouse client -q 'EXISTS TABLE system.backup_actions' 2>/dev/null || true")
-		if strings.HasPrefix(strings.TrimSpace(httpOut), "2") && strings.TrimSpace(tblOut) == "1" {
+		if strings.TrimSpace(tcpOut) == "open" && strings.TrimSpace(tblOut) == "1" {
 			break
 		}
 		if time.Now().After(fipsReadyDeadline) {
-			r.FailNow("clickhouse-backup-fips server did not become ready in 30s", "http=%q table_exists=%q", httpOut, tblOut)
+			r.FailNow("clickhouse-backup-fips server did not become ready in 30s", "tcp=%q table_exists=%q", tcpOut, tblOut)
 		}
 		time.Sleep(200 * time.Millisecond)
 	}

From 778390b537003572e321877cb637375f42949b8e Mon Sep 17 00:00:00 2001
From: slach
Date: Thu, 14 May 2026 09:19:29 +0500
Subject: [PATCH 6/7] trigger CI

From fb12a07b040e360a958e5968baa40acd63e10737 Mon Sep 17 00:00:00 2001
From: slach
Date: Thu, 14 May 2026 12:32:34 +0500
Subject: [PATCH 7/7] fix(s3): also delete delete-markers and stop creating new ones on retry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

getObjectAllVersions now returns both Versions and DeleteMarkers as an
ObjectIdentifier list, so each delete-marker is removed by VersionId.

deleteKeys / deleteKey: when the identifiers list is empty (key already
gone, e.g. on retry) skip instead of issuing a key-only DeleteObject —
which on a versioned bucket would create a fresh empty delete-marker.

Refs: avoid delete-marker accumulation on versioned S3 buckets when
batch delete is retried after a partial failure.
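For illustration, a minimal runnable sketch of the versioned-bucket
semantics described above (bucket and key names are hypothetical, the
calls are plain aws-sdk-go-v2, credentials come from the default chain):
a key-only DeleteObject only appends a delete-marker, while deleting
every version AND delete-marker by VersionId actually drains the key,
mirroring what getObjectAllVersions + deleteKey do after this change.

    package main

    import (
    	"context"
    	"fmt"

    	"github.com/aws/aws-sdk-go-v2/aws"
    	"github.com/aws/aws-sdk-go-v2/config"
    	"github.com/aws/aws-sdk-go-v2/service/s3"
    )

    func main() {
    	ctx := context.Background()
    	cfg, err := config.LoadDefaultConfig(ctx)
    	if err != nil {
    		panic(err)
    	}
    	client := s3.NewFromConfig(cfg)
    	bucket, key := "my-versioned-bucket", "object_disks/shard0/key.bin" // hypothetical

    	// Key-only delete: on a versioned bucket this removes no data, it only
    	// appends a new delete-marker. Retrying it appends yet another one.
    	if _, err := client.DeleteObject(ctx, &s3.DeleteObjectInput{
    		Bucket: aws.String(bucket), Key: aws.String(key),
    	}); err != nil {
    		panic(err)
    	}

    	// Draining the key: delete every version and delete-marker by VersionId.
    	pager := s3.NewListObjectVersionsPaginator(client, &s3.ListObjectVersionsInput{
    		Bucket: aws.String(bucket), Prefix: aws.String(key),
    	})
    	for pager.HasMorePages() {
    		page, err := pager.NextPage(ctx)
    		if err != nil {
    			panic(err)
    		}
    		for _, v := range page.Versions {
    			if v.Key != nil && *v.Key == key {
    				if _, err := client.DeleteObject(ctx, &s3.DeleteObjectInput{
    					Bucket: aws.String(bucket), Key: aws.String(key), VersionId: v.VersionId,
    				}); err != nil {
    					panic(err)
    				}
    			}
    		}
    		for _, m := range page.DeleteMarkers {
    			if m.Key != nil && *m.Key == key {
    				if _, err := client.DeleteObject(ctx, &s3.DeleteObjectInput{
    					Bucket: aws.String(bucket), Key: aws.String(key), VersionId: m.VersionId,
    				}); err != nil {
    					panic(err)
    				}
    			}
    		}
    	}
    	fmt.Println("key drained: no versions and no delete-markers remain")
    }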
--- pkg/storage/s3.go | 59 ++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/pkg/storage/s3.go b/pkg/storage/s3.go index 0c8c9608..fcc4a7f1 100644 --- a/pkg/storage/s3.go +++ b/pkg/storage/s3.go @@ -358,19 +358,19 @@ func (s *S3) deleteKey(ctx context.Context, key string) error { params.RequestPayer = s3types.RequestPayer(s.Config.RequestPayer) } if s.versioning { - objVersions, err := s.getObjectAllVersions(ctx, key) + identifiers, err := s.getObjectAllVersions(ctx, key) if err != nil { return errors.Wrapf(err, "deleteKey, obtaining object version bucket: %s key: %s", s.Config.Bucket, key) } - for _, objVersion := range objVersions { - params.VersionId = &objVersion + for _, id := range identifiers { + params.VersionId = id.VersionId if _, err := s.client.DeleteObject(ctx, params); err != nil { return errors.Wrapf(err, "deleteKey, deleting object bucket: %s key: %s version: %v", s.Config.Bucket, key, params.VersionId) } } - if len(objVersions) > 0 { - return nil - } + // Either we drained all versions+markers, or there was nothing to delete. + // In both cases avoid a key-only DeleteObject which would create a fresh delete-marker. + return nil } if _, err := s.client.DeleteObject(ctx, params); err != nil { return errors.Wrapf(err, "deleteKey, deleting object bucket: %s key: %s version: %v", s.Config.Bucket, key, params.VersionId) @@ -434,7 +434,7 @@ func (s *S3) deleteKeys(ctx context.Context, keys []string) error { for _, key := range keys { key := key g.Go(func() error { - versions, err := s.getObjectAllVersions(ctx, key) + identifiers, err := s.getObjectAllVersions(ctx, key) if err != nil { // If we can't list versions, try deleting without version ID log.Warn().Msgf("S3 deleteKeys: can't get versions for %s: %v, will try without version", key, err) @@ -445,21 +445,13 @@ func (s *S3) deleteKeys(ctx context.Context, keys []string) error { mu.Unlock() return nil } - mu.Lock() - if len(versions) == 0 { - // No versions found, delete without version ID - objectsToDelete = append(objectsToDelete, s3types.ObjectIdentifier{ - Key: aws.String(key), - }) - } else { - // Add each version as a separate object to delete - for _, version := range versions { - objectsToDelete = append(objectsToDelete, s3types.ObjectIdentifier{ - Key: aws.String(key), - VersionId: aws.String(version), - }) - } + if len(identifiers) == 0 { + // Already absent (e.g. retry of an earlier successful delete). + // Skip — issuing a key-only delete here would create a new delete-marker. + return nil } + mu.Lock() + objectsToDelete = append(objectsToDelete, identifiers...) mu.Unlock() return nil }) @@ -593,7 +585,11 @@ func (s *S3) isVersioningEnabled(ctx context.Context) bool { return output.Status == s3types.BucketVersioningStatusEnabled } -func (s *S3) getObjectAllVersions(ctx context.Context, key string) ([]string, error) { +// getObjectAllVersions returns ObjectIdentifier entries for every object version +// AND every delete-marker that belongs exactly to the given key. +// Deleting delete-markers by VersionId removes them, preventing accumulation +// of empty delete-markers across retries and prior partial deletions. 
+func (s *S3) getObjectAllVersions(ctx context.Context, key string) ([]s3types.ObjectIdentifier, error) { listParams := &s3.ListObjectVersionsInput{ Bucket: aws.String(s.Config.Bucket), Prefix: aws.String(key), @@ -601,7 +597,7 @@ func (s *S3) getObjectAllVersions(ctx context.Context, key string) ([]string, er if s.Config.RequestPayer != "" { listParams.RequestPayer = s3types.RequestPayer(s.Config.RequestPayer) } - var versions []string + var identifiers []s3types.ObjectIdentifier pager := s3.NewListObjectVersionsPaginator(s.client, listParams) for pager.HasMorePages() { page, err := pager.NextPage(ctx) @@ -609,12 +605,23 @@ func (s *S3) getObjectAllVersions(ctx context.Context, key string) ([]string, er return nil, errors.Wrapf(err, "listing object versions bucket: %s key: %s", s.Config.Bucket, key) } for _, version := range page.Versions { - if *version.Key == key { - versions = append(versions, *version.VersionId) + if version.Key != nil && *version.Key == key { + identifiers = append(identifiers, s3types.ObjectIdentifier{ + Key: aws.String(key), + VersionId: version.VersionId, + }) + } + } + for _, marker := range page.DeleteMarkers { + if marker.Key != nil && *marker.Key == key { + identifiers = append(identifiers, s3types.ObjectIdentifier{ + Key: aws.String(key), + VersionId: marker.VersionId, + }) } } } - return versions, nil + return identifiers, nil } func (s *S3) StatFile(ctx context.Context, key string) (RemoteFile, error) {
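
For reference, a minimal runnable sketch of the retry pattern PATCH 1/7
wraps around batch deletion, using the same
github.com/eapache/go-resiliency/retrier API. The failing operation and
the backoff values here are illustrative assumptions; the real code
retries DeleteKeysFromObjectDiskBackupBatch, passes the Backuper as the
retrier's Classifier, and jitters the delay with common.AddRandomJitter.

    package main

    import (
    	"context"
    	"errors"
    	"fmt"
    	"time"

    	"github.com/eapache/go-resiliency/retrier"
    )

    func main() {
    	attempts := 0
    	// Stand-in for a batch delete that hits a transient storage error
    	// (think GCS 503 during retention) twice, then succeeds.
    	deleteBatch := func(ctx context.Context) error {
    		attempts++
    		if attempts < 3 {
    			return errors.New("503 Service Unavailable (transient)")
    		}
    		return nil
    	}

    	// Up to 5 retries with exponential backoff starting at 100ms. A nil
    	// Classifier retries every error; the patch supplies the Backuper as
    	// Classifier so it can decide which errors are retryable.
    	retry := retrier.New(retrier.ExponentialBackoff(5, 100*time.Millisecond), nil)
    	if err := retry.RunCtx(context.Background(), deleteBatch); err != nil {
    		fmt.Println("batch deletion failed permanently:", err)
    		return
    	}
    	fmt.Printf("batch deletion succeeded after %d attempts\n", attempts)
    }

Run locally, this prints "batch deletion succeeded after 3 attempts": the
first two simulated 503s are absorbed by the backoff schedule instead of
aborting retention and leaving orphaned objects behind.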