diff --git a/ChangeLog.md b/ChangeLog.md
index 315aa70f..cc70c7b5 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,8 @@
+# v2.6.44
+
+BUG FIXES
+- fix [1356](https://github.com/Altinity/clickhouse-backup/issues/1356), retry batch deletion in `cleanBackupObjectDisks` so transient errors (e.g. GCS 503) during retention no longer leave orphaned objects in `object_disks_path`
+
 # v2.6.43
 
 NEW FEATURES
diff --git a/pkg/backup/delete.go b/pkg/backup/delete.go
index fd12ca12..41495a56 100644
--- a/pkg/backup/delete.go
+++ b/pkg/backup/delete.go
@@ -10,6 +10,7 @@ import (
     "strings"
     "time"
 
+    "github.com/Altinity/clickhouse-backup/v2/pkg/common"
     "github.com/Altinity/clickhouse-backup/v2/pkg/pidlock"
 
     "github.com/Altinity/clickhouse-backup/v2/pkg/clickhouse"
@@ -19,6 +20,7 @@ import (
     "github.com/Altinity/clickhouse-backup/v2/pkg/storage/object_disk"
     "github.com/Altinity/clickhouse-backup/v2/pkg/utils"
 
+    "github.com/eapache/go-resiliency/retrier"
     "github.com/pkg/errors"
     "github.com/rs/zerolog/log"
 )
@@ -405,6 +407,7 @@ func (b *Backuper) cleanBackupObjectDisks(ctx context.Context, backupName string
     batchSize := b.cfg.General.DeleteBatchSize
     log.Info().Msgf("cleanBackupObjectDisks: starting batch deletion for object disk backup %s using %s (batch_size=%d)", backupName, b.dst.Kind(), batchSize)
+    retry := retrier.New(retrier.ExponentialBackoff(b.cfg.General.RetriesOnFailure, common.AddRandomJitter(b.cfg.General.RetriesDuration, b.cfg.General.RetriesJitter)), b)
 
     // Process deletion in batches to avoid loading all keys in memory
     var keysToDelete []string
@@ -420,7 +423,9 @@ func (b *Backuper) cleanBackupObjectDisks(ctx context.Context, backupName string
 
         // When we've collected enough keys, delete them as a batch
         if len(keysToDelete) >= batchSize {
-            deleteErr := batchDeleter.DeleteKeysFromObjectDiskBackupBatch(ctx, keysToDelete)
+            deleteErr := retry.RunCtx(ctx, func(ctx context.Context) error {
+                return batchDeleter.DeleteKeysFromObjectDiskBackupBatch(ctx, keysToDelete)
+            })
             if deleteErr != nil {
                 return deleteErr
             }
@@ -436,7 +441,9 @@ func (b *Backuper) cleanBackupObjectDisks(ctx context.Context, backupName string
 
     // Delete remaining keys
     if len(keysToDelete) > 0 {
-        deleteErr := batchDeleter.DeleteKeysFromObjectDiskBackupBatch(ctx, keysToDelete)
+        deleteErr := retry.RunCtx(ctx, func(ctx context.Context) error {
+            return batchDeleter.DeleteKeysFromObjectDiskBackupBatch(ctx, keysToDelete)
+        })
         if deleteErr != nil {
             return totalDeleted, deleteErr
         }
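Note on the `pkg/backup/delete.go` change above: `retrier.New` takes a backoff schedule plus a `Classifier` (here the `Backuper` itself, which therefore has to implement `Classify` to decide which errors are worth retrying), and `RunCtx` re-invokes the closure until it succeeds or the schedule is exhausted. A minimal self-contained sketch of the same go-resiliency pattern, with a `nil` classifier (retry any non-nil error) and a counter standing in for `DeleteKeysFromObjectDiskBackupBatch`:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/eapache/go-resiliency/retrier"
)

func main() {
	// 5 retries with exponential backoff starting at 100ms; a nil Classifier
	// means every non-nil error is retried (the patch passes the Backuper as
	// Classifier instead, so it can single out transient errors like GCS 503).
	retry := retrier.New(retrier.ExponentialBackoff(5, 100*time.Millisecond), nil)

	attempt := 0
	err := retry.RunCtx(context.Background(), func(ctx context.Context) error {
		attempt++
		if attempt < 3 {
			return fmt.Errorf("simulated transient 503 on attempt %d", attempt)
		}
		return nil // the batch delete finally succeeded
	})
	fmt.Println("attempts:", attempt, "err:", err)
}
```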
diff --git a/pkg/storage/s3.go b/pkg/storage/s3.go
index 0c8c9608..fcc4a7f1 100644
--- a/pkg/storage/s3.go
+++ b/pkg/storage/s3.go
@@ -358,19 +358,19 @@ func (s *S3) deleteKey(ctx context.Context, key string) error {
         params.RequestPayer = s3types.RequestPayer(s.Config.RequestPayer)
     }
     if s.versioning {
-        objVersions, err := s.getObjectAllVersions(ctx, key)
+        identifiers, err := s.getObjectAllVersions(ctx, key)
         if err != nil {
             return errors.Wrapf(err, "deleteKey, obtaining object version bucket: %s key: %s", s.Config.Bucket, key)
         }
-        for _, objVersion := range objVersions {
-            params.VersionId = &objVersion
+        for _, id := range identifiers {
+            params.VersionId = id.VersionId
             if _, err := s.client.DeleteObject(ctx, params); err != nil {
                 return errors.Wrapf(err, "deleteKey, deleting object bucket: %s key: %s version: %v", s.Config.Bucket, key, params.VersionId)
             }
         }
-        if len(objVersions) > 0 {
-            return nil
-        }
+        // Either we drained all versions+markers, or there was nothing to delete.
+        // In both cases avoid a key-only DeleteObject which would create a fresh delete-marker.
+        return nil
     }
     if _, err := s.client.DeleteObject(ctx, params); err != nil {
         return errors.Wrapf(err, "deleteKey, deleting object bucket: %s key: %s version: %v", s.Config.Bucket, key, params.VersionId)
     }
@@ -434,7 +434,7 @@ func (s *S3) deleteKeys(ctx context.Context, keys []string) error {
     for _, key := range keys {
         key := key
         g.Go(func() error {
-            versions, err := s.getObjectAllVersions(ctx, key)
+            identifiers, err := s.getObjectAllVersions(ctx, key)
             if err != nil {
                 // If we can't list versions, try deleting without version ID
                 log.Warn().Msgf("S3 deleteKeys: can't get versions for %s: %v, will try without version", key, err)
@@ -445,21 +445,13 @@ func (s *S3) deleteKeys(ctx context.Context, keys []string) error {
                 mu.Unlock()
                 return nil
             }
-            mu.Lock()
-            if len(versions) == 0 {
-                // No versions found, delete without version ID
-                objectsToDelete = append(objectsToDelete, s3types.ObjectIdentifier{
-                    Key: aws.String(key),
-                })
-            } else {
-                // Add each version as a separate object to delete
-                for _, version := range versions {
-                    objectsToDelete = append(objectsToDelete, s3types.ObjectIdentifier{
-                        Key: aws.String(key),
-                        VersionId: aws.String(version),
-                    })
-                }
+            if len(identifiers) == 0 {
+                // Already absent (e.g. retry of an earlier successful delete).
+                // Skip — issuing a key-only delete here would create a new delete-marker.
+                return nil
             }
+            mu.Lock()
+            objectsToDelete = append(objectsToDelete, identifiers...)
             mu.Unlock()
             return nil
         })
@@ -593,7 +585,11 @@ func (s *S3) isVersioningEnabled(ctx context.Context) bool {
     return output.Status == s3types.BucketVersioningStatusEnabled
 }
 
-func (s *S3) getObjectAllVersions(ctx context.Context, key string) ([]string, error) {
+// getObjectAllVersions returns ObjectIdentifier entries for every object version
+// AND every delete-marker that belongs exactly to the given key.
+// Deleting delete-markers by VersionId removes them, preventing accumulation
+// of empty delete-markers across retries and prior partial deletions.
+func (s *S3) getObjectAllVersions(ctx context.Context, key string) ([]s3types.ObjectIdentifier, error) {
     listParams := &s3.ListObjectVersionsInput{
         Bucket: aws.String(s.Config.Bucket),
         Prefix: aws.String(key),
@@ -601,7 +597,7 @@ func (s *S3) getObjectAllVersions(ctx context.Context, key string) ([]string, error) {
     if s.Config.RequestPayer != "" {
         listParams.RequestPayer = s3types.RequestPayer(s.Config.RequestPayer)
     }
-    var versions []string
+    var identifiers []s3types.ObjectIdentifier
     pager := s3.NewListObjectVersionsPaginator(s.client, listParams)
     for pager.HasMorePages() {
         page, err := pager.NextPage(ctx)
@@ -609,12 +605,23 @@ func (s *S3) getObjectAllVersions(ctx context.Context, key string) ([]string, error) {
             return nil, errors.Wrapf(err, "listing object versions bucket: %s key: %s", s.Config.Bucket, key)
         }
         for _, version := range page.Versions {
-            if *version.Key == key {
-                versions = append(versions, *version.VersionId)
+            if version.Key != nil && *version.Key == key {
+                identifiers = append(identifiers, s3types.ObjectIdentifier{
+                    Key:       aws.String(key),
+                    VersionId: version.VersionId,
+                })
+            }
+        }
+        for _, marker := range page.DeleteMarkers {
+            if marker.Key != nil && *marker.Key == key {
+                identifiers = append(identifiers, s3types.ObjectIdentifier{
+                    Key:       aws.String(key),
+                    VersionId: marker.VersionId,
+                })
             }
         }
     }
-    return versions, nil
+    return identifiers, nil
 }
 
 func (s *S3) StatFile(ctx context.Context, key string) (RemoteFile, error) {
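What eventually consumes the collected `[]s3types.ObjectIdentifier` slice is outside this hunk (presumably a `DeleteObjects` call later in `deleteKeys`). For reference, a hypothetical drain loop, not the repo's code, that respects the S3 hard limit of 1000 identifiers per `DeleteObjects` request; `deleteIdentifiers` and `maxBatch` are illustrative names:

```go
package main

import (
	"context"
	"fmt"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/service/s3"
	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
)

// deleteIdentifiers deletes version-qualified identifiers in chunks of at
// most 1000, the per-request limit of the DeleteObjects API.
func deleteIdentifiers(ctx context.Context, client *s3.Client, bucket string, ids []s3types.ObjectIdentifier) error {
	const maxBatch = 1000
	for len(ids) > 0 {
		n := len(ids)
		if n > maxBatch {
			n = maxBatch
		}
		out, err := client.DeleteObjects(ctx, &s3.DeleteObjectsInput{
			Bucket: aws.String(bucket),
			Delete: &s3types.Delete{Objects: ids[:n]},
		})
		if err != nil {
			return err
		}
		// Per-key failures are reported in out.Errors even when the call itself succeeds.
		if len(out.Errors) > 0 {
			e := out.Errors[0]
			return fmt.Errorf("delete %s (version %s) failed: %s",
				aws.ToString(e.Key), aws.ToString(e.VersionId), aws.ToString(e.Message))
		}
		ids = ids[n:]
	}
	return nil
}
```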
diff --git a/test/integration/fips_test.go b/test/integration/fips_test.go
index 844ff22e..8be13829 100644
--- a/test/integration/fips_test.go
+++ b/test/integration/fips_test.go
@@ -18,7 +18,7 @@ import (
 
 func TestFIPS(t *testing.T) {
     if compareVersion(os.Getenv("CLICKHOUSE_VERSION"), "19.17") <= 0 {
-        t.Skip("go 1.25 with boringcrypto stop works for 19.17, works only for 20.1+")
+        t.Skip("go 1.26 with boringcrypto stops working for 19.17, works only for 20.1+")
     }
     if os.Getenv("QA_AWS_ACCESS_KEY") == "" {
         t.Skip("QA_AWS_ACCESS_KEY is empty, TestFIPS will skip")
@@ -27,12 +27,38 @@ func TestFIPS(t *testing.T) {
     env.connectWithWait(t, r, 1*time.Second, 1*time.Second, 1*time.Minute)
     fipsBackupName := fmt.Sprintf("fips_backup_%d", rand.Int())
     env.DockerExecNoError(r, "clickhouse", "rm", "-fv", "/etc/apt/sources.list.d/clickhouse.list")
-    env.InstallDebIfNotExists(r, "clickhouse", "ca-certificates", "curl", "gettext-base", "bsdmainutils", "dnsutils", "git")
+    env.InstallDebIfNotExists(r, "clickhouse", "ca-certificates", "curl", "gettext-base", "binutils", "bsdmainutils", "dnsutils", "git")
     env.DockerExecNoError(r, "clickhouse", "update-ca-certificates")
     r.NoError(env.DockerCP("configs/config-s3-fips.yml", "clickhouse:/etc/clickhouse-backup/config.yml.fips-template"))
     env.DockerExecNoError(r, "clickhouse", "git", "clone", "--depth", "1", "--branch", "v3.2rc3", "https://github.com/drwetter/testssl.sh.git", "/opt/testssl")
     env.DockerExecNoError(r, "clickhouse", "chmod", "+x", "/opt/testssl/testssl.sh")
 
+    // P0: Verify --version output reports FIPS 140-3 mode
+    fipsVersion, err := env.DockerExecOut("clickhouse", "bash", "-ce", "clickhouse-backup-fips --version 2>&1")
+    r.NoError(err, "unexpected clickhouse-backup-fips --version error: %v", err)
+    r.Contains(fipsVersion, "FIPS 140-3:\t true", "FIPS binary version should contain 'FIPS 140-3: true', got: %s", fipsVersion)
+
+    // P0: Integrity self-check — binary starts without panic in FIPS mode
+    fipsSelfCheck, err := env.DockerExecOut("clickhouse", "bash", "-ce", "GODEBUG=fips140=on clickhouse-backup-fips --version 2>&1")
+    r.NoError(err, "unexpected FIPS self-check error: %v", err)
+    r.NotContains(fipsSelfCheck, "panic", "FIPS binary should not panic during integrity self-check")
+
+    // P0: Verify crypto/fips140.Enabled() reports true via version output
+    r.Contains(fipsSelfCheck, "FIPS 140-3:\t true", "FIPS 140-3 should be enabled when GODEBUG=fips140=on, got: %s", fipsSelfCheck)
+
+    // P1: Verify GODEBUG=fips140=only (strict mode) — non-FIPS crypto returns errors
+    fipsOnlyCheck, err := env.DockerExecOut("clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips --version 2>&1")
+    r.NoError(err, "unexpected FIPS only mode error: %v", err)
+    r.NotContains(fipsOnlyCheck, "panic", "FIPS binary should not panic in fips140=only mode")
+    r.Contains(fipsOnlyCheck, "FIPS 140-3:\t true", "FIPS 140-3 should be enabled in fips140=only mode, got: %s", fipsOnlyCheck)
+
+    // P2: Verify binary contains fips140 symbols
+    fipsSymbols, err := env.DockerExecOut("clickhouse", "bash", "-ce", "strings /usr/bin/clickhouse-backup-fips | grep -c 'crypto/internal/fips140'")
+    r.NoError(err, "unexpected strings/grep error: %v", err)
+    fipsSymbolCount, convErr := strconv.Atoi(strings.TrimSpace(fipsSymbols))
+    r.NoError(convErr, "unexpected Atoi error for fipsSymbols=%q: %v", fipsSymbols, convErr)
+    r.Greater(fipsSymbolCount, 0, "binary should contain crypto/internal/fips140 symbols")
+
     generateCerts := func(certType, keyLength, curveType string) {
         env.DockerExecNoError(r, "clickhouse", "bash", "-xce", "openssl rand -out /root/.rnd 2048")
         switch certType {
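The assertions above key on a literal `FIPS 140-3:\t true` line in the `--version` output. How clickhouse-backup produces that line is not part of this diff; with Go 1.24+ it could be derived from the standard `crypto/fips140` package, roughly like this hypothetical snippet:

```go
package main

import (
	"crypto/fips140"
	"fmt"
)

func main() {
	// fips140.Enabled (Go 1.24+) reports whether the Go Cryptographic Module
	// is operating in FIPS 140-3 mode, i.e. the process was started with
	// GODEBUG=fips140=on or GODEBUG=fips140=only.
	fmt.Printf("FIPS 140-3:\t %v\n", fips140.Enabled())
}
```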
2>&1") + r.NoError(err, "unexpected FIPS self-check error: %v", err) + r.NotContains(fipsSelfCheck, "panic", "FIPS binary should not panic during integrity self-check") + + // P0: Verify crypto/fips140.Enabled() reports true via version output + r.Contains(fipsSelfCheck, "FIPS 140-3:\t true", "FIPS 140-3 should be enabled when GODEBUG=fips140=on, got: %s", fipsSelfCheck) + + // P1: Verify GODEBUG=fips140=only (strict mode) — non-FIPS crypto returns errors + fipsOnlyCheck, err := env.DockerExecOut("clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips --version 2>&1") + r.NoError(err, "unexpected FIPS only mode error: %v", err) + r.NotContains(fipsOnlyCheck, "panic", "FIPS binary should not panic in fips140=only mode") + r.Contains(fipsOnlyCheck, "FIPS 140-3:\t true", "FIPS 140-3 should be enabled in fips140=only mode, got: %s", fipsOnlyCheck) + + // P2: Verify binary contains fips140 symbols + fipsSymbols, err := env.DockerExecOut("clickhouse", "bash", "-ce", "strings /usr/bin/clickhouse-backup-fips | grep -c 'crypto/internal/fips140'") + r.NoError(err, "unexpected strings/grep error: %v", err) + fipsSymbolCount, convErr := strconv.Atoi(strings.TrimSpace(fipsSymbols)) + r.NoError(convErr, "unexpected Atoi error for fipsSymbols=%q: %v", fipsSymbols, convErr) + r.Greater(fipsSymbolCount, 0, "binary should contain crypto/internal/fips140 symbols") + generateCerts := func(certType, keyLength, curveType string) { env.DockerExecNoError(r, "clickhouse", "bash", "-xce", "openssl rand -out /root/.rnd 2048") switch certType { @@ -62,7 +88,26 @@ func TestFIPS(t *testing.T) { log.Debug().Msg("Run `clickhouse-backup-fips server` in background") env.DockerExecBackgroundNoError(r, "clickhouse", "bash", "-ce", "AWS_USE_FIPS_ENDPOINT=true clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml server &>>/tmp/clickhouse-backup-server-fips.log") - time.Sleep(1 * time.Second) + + // Wait until the API server is fully ready: TLS listener on :7172 is bound AND + // `system.backup_actions` integration table has been registered. The server is + // HTTPS with required client cert auth, so we only probe TCP reachability (a + // successful TCP connect proves ListenAndServeTLS has started). Without this + // probe, the client `INSERT INTO system.backup_actions` below races server + // startup and fails with `Code 60: UNKNOWN_TABLE` — table creation runs before + // the listener is bound. 
@@ -93,6 +138,12 @@ func TestFIPS(t *testing.T) {
         r.NoError(err, "%s\nunexpected grep testssl.csv error: %v", out, err)
         r.Equal(strconv.Itoa(len(cipherList)), strings.Trim(out, " \t\r\n"))
 
+        // P1: Negative test — non-FIPS ciphers (RC4, DES, 3DES, CHACHA, NULL) should NOT be offered
+        nonFipsOut, nonFipsErr := env.DockerExecOut("clickhouse", "bash", "-ce", "grep -v 'not offered' /tmp/testssl.csv | grep -c -i -E '(RC4|TRIPLEDES|DES-CBC|CHACHA|NULL)' || true")
+        r.NoError(nonFipsErr, "%s\nunexpected grep non-FIPS ciphers error: %v", nonFipsOut, nonFipsErr)
+        nonFipsCount, _ := strconv.Atoi(strings.TrimSpace(nonFipsOut))
+        r.Equal(0, nonFipsCount, "non-FIPS ciphers (RC4/DES/3DES/CHACHA/NULL) should not be offered by FIPS server, found %d matches", nonFipsCount)
+
         inProgressActions := make([]struct {
             Command string `ch:"command"`
             Status  string `ch:"status"`
@@ -104,6 +155,15 @@ func TestFIPS(t *testing.T) {
         r.Equal(0, len(inProgressActions), "inProgressActions=%+v", inProgressActions)
         env.DockerExecNoError(r, "clickhouse", "pkill", "-n", "-f", "clickhouse-backup-fips")
     }
+    // P1: Test create_remote + restore_remote in strict GODEBUG=fips140=only mode
+    // @todo think about FIPS clickhouse-server, which is not supported by GODEBUG=fips140=only
+    // fipsOnlyBackupName := fmt.Sprintf("fips_only_backup_%d", rand.Int())
+    //env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml create_remote --tables="+t.Name()+".fips_table "+fipsOnlyBackupName)
+    //env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml delete local "+fipsOnlyBackupName)
+    //env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml restore_remote --tables="+t.Name()+".fips_table "+fipsOnlyBackupName)
+    //env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml delete local "+fipsOnlyBackupName)
+    //env.DockerExecNoError(r, "clickhouse", "bash", "-ce", "GODEBUG=fips140=only clickhouse-backup-fips -c /etc/clickhouse-backup/config-s3-fips.yml delete remote "+fipsOnlyBackupName)
+    // https://www.perplexity.ai/search/0920f1e8-59ec-4e14-b779-ba7b2e037196
     testTLSCerts("rsa", "4096", "", "ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-RSA-AES256-GCM-SHA384", "AES_128_GCM_SHA256", "AES_256_GCM_SHA384")
     testTLSCerts("ecdsa", "", "prime256v1", "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES256-GCM-SHA384")
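The commented-out strict-mode tests rely on `GODEBUG=fips140=only` rejecting non-approved algorithms at runtime; per the `@todo`, the blocker is that clickhouse-server itself is not supported under that mode. A tiny standalone probe, not from the repo, to observe the strict-mode rejection under Go 1.24+:

```go
package main

import (
	"crypto/md5"
	"crypto/sha256"
	"fmt"
)

func main() {
	// SHA-256 is FIPS-approved and works in every fips140 mode.
	fmt.Printf("sha256: %x\n", sha256.Sum256([]byte("ok")))

	// MD5 is not approved: run with GODEBUG=fips140=only and this call
	// should be rejected (panic), while GODEBUG=fips140=on still allows it.
	fmt.Printf("md5: %x\n", md5.Sum([]byte("ok")))
}
```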