From 6d67d088d0537659c740f75413c3ed7c8face806 Mon Sep 17 00:00:00 2001 From: Yuval Mandelboum Date: Tue, 12 May 2026 15:07:28 -0700 Subject: [PATCH 1/6] fix(releases): Prevent row-lock contention on last_seen bump for ReleaseProjectEnvironment and ReleaseEnvironment High-volume releases cause concurrent workers to pile up on the same row's UPDATE for last_seen, hitting statement_timeout (SENTRY-5HQZ). This adds a cache-based distributed lock so only one worker per row per 60s attempts the DB update, and catches OperationalError so a failed bump doesn't abort the rest of the event save pipeline. Fixes SENTRY-5HQZ --- src/sentry/models/releaseenvironment.py | 24 ++++--- .../models/releaseprojectenvironment.py | 25 +++++-- .../sentry/models/test_releaseenvironment.py | 69 +++++++++++++++++++ .../models/test_releaseprojectenvironment.py | 69 +++++++++++++++++++ 4 files changed, 171 insertions(+), 16 deletions(-) diff --git a/src/sentry/models/releaseenvironment.py b/src/sentry/models/releaseenvironment.py index acb0ae86be09..29ea7b8cd663 100644 --- a/src/sentry/models/releaseenvironment.py +++ b/src/sentry/models/releaseenvironment.py @@ -1,6 +1,7 @@ from datetime import timedelta from django.db import models +from django.db.utils import OperationalError from django.utils import timezone from sentry.backup.scopes import RelocationScope @@ -63,16 +64,21 @@ def _get_or_create_impl(cls, project, release, environment, datetime, metric_tag metric_tags["created"] = "true" if created else "false" - # TODO(dcramer): this would be good to buffer, but until then we minimize - # updates to once a minute, and allow Postgres to optimistically skip - # it even if we can't + bump_key = f"releaseenv_bump:{instance.id}" if not created and instance.last_seen < datetime - timedelta(seconds=60): - metric_tags["bumped"] = "true" - cls.objects.filter( - id=instance.id, last_seen__lt=datetime - timedelta(seconds=60) - ).update(last_seen=datetime) - instance.last_seen = datetime - cache.set(cache_key, instance, 3600) + if cache.add(bump_key, "1", timeout=60): + try: + cls.objects.filter( + id=instance.id, last_seen__lt=datetime - timedelta(seconds=60) + ).update(last_seen=datetime) + except OperationalError: + metric_tags["bumped"] = "error" + return instance + instance.last_seen = datetime + cache.set(cache_key, instance, 3600) + metric_tags["bumped"] = "true" + else: + metric_tags["bumped"] = "skipped" else: metric_tags["bumped"] = "false" diff --git a/src/sentry/models/releaseprojectenvironment.py b/src/sentry/models/releaseprojectenvironment.py index eec01d046b28..1f3dba2487be 100644 --- a/src/sentry/models/releaseprojectenvironment.py +++ b/src/sentry/models/releaseprojectenvironment.py @@ -5,6 +5,7 @@ from typing import TypedDict from django.db import models +from django.db.utils import OperationalError from django.utils import timezone from sentry.backup.scopes import RelocationScope @@ -82,14 +83,24 @@ def _get_or_create_impl(cls, release, project, environment, datetime, metrics_ta metrics_tags["created"] = "true" if created else "false" - # Same as releaseenvironment model. Minimizes last_seen updates to once a minute + # Same as releaseenvironment model. Minimizes last_seen updates to once a minute. + # The cache.add acts as a distributed lock so only one worker attempts the + # DB update per row per 60s, avoiding row-lock contention on hot releases. + bump_key = f"releaseprojectenv_bump:{instance.id}" if not created and instance.last_seen < datetime - timedelta(seconds=60): - cls.objects.filter( - id=instance.id, last_seen__lt=datetime - timedelta(seconds=60) - ).update(last_seen=datetime) - instance.last_seen = datetime - cache.set(cache_key, instance, 3600) - metrics_tags["bumped"] = "true" + if cache.add(bump_key, "1", timeout=60): + try: + cls.objects.filter( + id=instance.id, last_seen__lt=datetime - timedelta(seconds=60) + ).update(last_seen=datetime) + except OperationalError: + metrics_tags["bumped"] = "error" + return instance + instance.last_seen = datetime + cache.set(cache_key, instance, 3600) + metrics_tags["bumped"] = "true" + else: + metrics_tags["bumped"] = "skipped" else: metrics_tags["bumped"] = "false" diff --git a/tests/sentry/models/test_releaseenvironment.py b/tests/sentry/models/test_releaseenvironment.py index d572797573bb..bd0a1ece54bc 100644 --- a/tests/sentry/models/test_releaseenvironment.py +++ b/tests/sentry/models/test_releaseenvironment.py @@ -1,5 +1,7 @@ from datetime import timedelta +from unittest.mock import patch +from django.db.utils import OperationalError from django.utils import timezone from sentry.models.environment import Environment @@ -52,3 +54,70 @@ def test_simple(self) -> None: ) assert relenv.id == relenv2.id assert ReleaseEnvironment.objects.get(id=relenv.id).last_seen == relenv2.last_seen + + def test_bump_skipped_when_cache_lock_held(self) -> None: + """Second worker skips the DB update when another worker already holds the bump lock.""" + project = self.create_project(name="foo") + datetime_now = timezone.now() + + release = Release.objects.create(organization_id=project.organization_id, version="abcdef") + release.add_project(project) + env = Environment.objects.create(organization_id=project.organization_id, name="prod") + + ReleaseEnvironment.get_or_create( + project=project, release=release, environment=env, datetime=datetime_now + ) + + datetime_next = datetime_now + timedelta(days=1) + + from sentry.utils.cache import cache + + re = ReleaseEnvironment.objects.get( + organization_id=project.organization_id, release=release, environment=env + ) + bump_key = f"releaseenv_bump:{re.id}" + cache.set(bump_key, "1", timeout=60) + + with patch.object( + ReleaseEnvironment.objects, + "filter", + wraps=ReleaseEnvironment.objects.filter, + ) as mock_filter: + result = ReleaseEnvironment.get_or_create( + project=project, release=release, environment=env, datetime=datetime_next + ) + for call in mock_filter.call_args_list: + assert "last_seen__lt" not in (call.kwargs or {}) + + re.refresh_from_db() + assert re.last_seen == datetime_now + assert result is not None + + def test_bump_survives_operational_error(self) -> None: + """OperationalError on the UPDATE doesn't prevent the instance from being returned.""" + project = self.create_project(name="foo") + datetime_now = timezone.now() + + release = Release.objects.create(organization_id=project.organization_id, version="abcdef") + release.add_project(project) + env = Environment.objects.create(organization_id=project.organization_id, name="prod") + + ReleaseEnvironment.get_or_create( + project=project, release=release, environment=env, datetime=datetime_now + ) + + datetime_next = datetime_now + timedelta(days=1) + + with patch("sentry.models.releaseenvironment.ReleaseEnvironment.objects") as mock_manager: + mock_qs = mock_manager.filter.return_value + mock_qs.update.side_effect = OperationalError("canceling statement due to user request") + + result = ReleaseEnvironment.get_or_create( + project=project, release=release, environment=env, datetime=datetime_next + ) + + assert result is not None + re = ReleaseEnvironment.objects.get( + organization_id=project.organization_id, release=release, environment=env + ) + assert re.last_seen == datetime_now diff --git a/tests/sentry/models/test_releaseprojectenvironment.py b/tests/sentry/models/test_releaseprojectenvironment.py index 375b01f04e7b..5f9b4ebdee9e 100644 --- a/tests/sentry/models/test_releaseprojectenvironment.py +++ b/tests/sentry/models/test_releaseprojectenvironment.py @@ -1,5 +1,7 @@ from datetime import timedelta +from unittest.mock import patch +from django.db.utils import OperationalError from django.utils import timezone from sentry.models.environment import Environment @@ -83,3 +85,70 @@ def test_no_update_too_close(self) -> None: ) assert release_project_env.first_seen == self.datetime_now assert release_project_env.last_seen == self.datetime_now + + def test_bump_skipped_when_cache_lock_held(self) -> None: + """Second worker skips the DB update when another worker already holds the bump lock.""" + ReleaseProjectEnvironment.get_or_create( + project=self.project, + release=self.release, + environment=self.environment, + datetime=self.datetime_now, + ) + + datetime_next = self.datetime_now + timedelta(days=1) + + from sentry.utils.cache import cache + + rpe = ReleaseProjectEnvironment.objects.get( + project=self.project, release=self.release, environment=self.environment + ) + bump_key = f"releaseprojectenv_bump:{rpe.id}" + cache.set(bump_key, "1", timeout=60) + + with patch.object( + ReleaseProjectEnvironment.objects, + "filter", + wraps=ReleaseProjectEnvironment.objects.filter, + ) as mock_filter: + result = ReleaseProjectEnvironment.get_or_create( + project=self.project, + release=self.release, + environment=self.environment, + datetime=datetime_next, + ) + for call in mock_filter.call_args_list: + assert "last_seen__lt" not in (call.kwargs or {}) + + rpe.refresh_from_db() + assert rpe.last_seen == self.datetime_now + assert result is not None + + def test_bump_survives_operational_error(self) -> None: + """OperationalError on the UPDATE doesn't prevent the instance from being returned.""" + ReleaseProjectEnvironment.get_or_create( + project=self.project, + release=self.release, + environment=self.environment, + datetime=self.datetime_now, + ) + + datetime_next = self.datetime_now + timedelta(days=1) + + with patch( + "sentry.models.releaseprojectenvironment.ReleaseProjectEnvironment.objects" + ) as mock_manager: + mock_qs = mock_manager.filter.return_value + mock_qs.update.side_effect = OperationalError("canceling statement due to user request") + + result = ReleaseProjectEnvironment.get_or_create( + project=self.project, + release=self.release, + environment=self.environment, + datetime=datetime_next, + ) + + assert result is not None + rpe = ReleaseProjectEnvironment.objects.get( + project=self.project, release=self.release, environment=self.environment + ) + assert rpe.last_seen == self.datetime_now From 5ef02bbb954c03cc356b9da86edd83e626d100f8 Mon Sep 17 00:00:00 2001 From: Yuval Mandelboum Date: Tue, 12 May 2026 15:20:53 -0700 Subject: [PATCH 2/6] fix(releases): Simplify SQL filter to last_seen__lt=datetime The cache lock handles the 60s throttle now, so the SQL filter only needs to prevent setting last_seen backwards. --- src/sentry/models/releaseenvironment.py | 6 +++--- src/sentry/models/releaseprojectenvironment.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sentry/models/releaseenvironment.py b/src/sentry/models/releaseenvironment.py index 29ea7b8cd663..2540aabec7e1 100644 --- a/src/sentry/models/releaseenvironment.py +++ b/src/sentry/models/releaseenvironment.py @@ -68,9 +68,9 @@ def _get_or_create_impl(cls, project, release, environment, datetime, metric_tag if not created and instance.last_seen < datetime - timedelta(seconds=60): if cache.add(bump_key, "1", timeout=60): try: - cls.objects.filter( - id=instance.id, last_seen__lt=datetime - timedelta(seconds=60) - ).update(last_seen=datetime) + cls.objects.filter(id=instance.id, last_seen__lt=datetime).update( + last_seen=datetime + ) except OperationalError: metric_tags["bumped"] = "error" return instance diff --git a/src/sentry/models/releaseprojectenvironment.py b/src/sentry/models/releaseprojectenvironment.py index 1f3dba2487be..4351b1be0e43 100644 --- a/src/sentry/models/releaseprojectenvironment.py +++ b/src/sentry/models/releaseprojectenvironment.py @@ -90,9 +90,9 @@ def _get_or_create_impl(cls, release, project, environment, datetime, metrics_ta if not created and instance.last_seen < datetime - timedelta(seconds=60): if cache.add(bump_key, "1", timeout=60): try: - cls.objects.filter( - id=instance.id, last_seen__lt=datetime - timedelta(seconds=60) - ).update(last_seen=datetime) + cls.objects.filter(id=instance.id, last_seen__lt=datetime).update( + last_seen=datetime + ) except OperationalError: metrics_tags["bumped"] = "error" return instance From 23a466222e619c0e18d4206ffd8eea2d8c15bcce Mon Sep 17 00:00:00 2001 From: Yuval Mandelboum Date: Tue, 12 May 2026 15:23:54 -0700 Subject: [PATCH 3/6] fix(releases): Move cache import to module level in tests --- tests/sentry/models/test_releaseenvironment.py | 3 +-- tests/sentry/models/test_releaseprojectenvironment.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/sentry/models/test_releaseenvironment.py b/tests/sentry/models/test_releaseenvironment.py index bd0a1ece54bc..209de37f910d 100644 --- a/tests/sentry/models/test_releaseenvironment.py +++ b/tests/sentry/models/test_releaseenvironment.py @@ -8,6 +8,7 @@ from sentry.models.release import Release from sentry.models.releaseenvironment import ReleaseEnvironment from sentry.testutils.cases import TestCase +from sentry.utils.cache import cache class GetOrCreateTest(TestCase): @@ -70,8 +71,6 @@ def test_bump_skipped_when_cache_lock_held(self) -> None: datetime_next = datetime_now + timedelta(days=1) - from sentry.utils.cache import cache - re = ReleaseEnvironment.objects.get( organization_id=project.organization_id, release=release, environment=env ) diff --git a/tests/sentry/models/test_releaseprojectenvironment.py b/tests/sentry/models/test_releaseprojectenvironment.py index 5f9b4ebdee9e..dd2dca76adef 100644 --- a/tests/sentry/models/test_releaseprojectenvironment.py +++ b/tests/sentry/models/test_releaseprojectenvironment.py @@ -8,6 +8,7 @@ from sentry.models.release import Release from sentry.models.releaseprojectenvironment import ReleaseProjectEnvironment from sentry.testutils.cases import TestCase +from sentry.utils.cache import cache class GetOrCreateTest(TestCase): @@ -97,8 +98,6 @@ def test_bump_skipped_when_cache_lock_held(self) -> None: datetime_next = self.datetime_now + timedelta(days=1) - from sentry.utils.cache import cache - rpe = ReleaseProjectEnvironment.objects.get( project=self.project, release=self.release, environment=self.environment ) From 7bdb919ba64ae8429c9219299bd45ddc09989db1 Mon Sep 17 00:00:00 2001 From: Yuval Mandelboum Date: Wed, 13 May 2026 10:36:55 -0700 Subject: [PATCH 4/6] ref(releases): Extract try_bump_last_seen into shared utility The throttled last_seen bump pattern was duplicated across ReleaseEnvironment and ReleaseProjectEnvironment. Extract it into sentry/utils/last_seen.py so both models share one implementation. --- src/sentry/models/releaseenvironment.py | 30 +++++--------- .../models/releaseprojectenvironment.py | 33 +++++---------- src/sentry/utils/last_seen.py | 40 +++++++++++++++++++ 3 files changed, 61 insertions(+), 42 deletions(-) create mode 100644 src/sentry/utils/last_seen.py diff --git a/src/sentry/models/releaseenvironment.py b/src/sentry/models/releaseenvironment.py index 2540aabec7e1..124a5e3835d2 100644 --- a/src/sentry/models/releaseenvironment.py +++ b/src/sentry/models/releaseenvironment.py @@ -1,7 +1,4 @@ -from datetime import timedelta - from django.db import models -from django.db.utils import OperationalError from django.utils import timezone from sentry.backup.scopes import RelocationScope @@ -14,6 +11,7 @@ ) from sentry.utils import metrics from sentry.utils.cache import cache +from sentry.utils.last_seen import try_bump_last_seen @cell_silo_model @@ -64,22 +62,14 @@ def _get_or_create_impl(cls, project, release, environment, datetime, metric_tag metric_tags["created"] = "true" if created else "false" - bump_key = f"releaseenv_bump:{instance.id}" - if not created and instance.last_seen < datetime - timedelta(seconds=60): - if cache.add(bump_key, "1", timeout=60): - try: - cls.objects.filter(id=instance.id, last_seen__lt=datetime).update( - last_seen=datetime - ) - except OperationalError: - metric_tags["bumped"] = "error" - return instance - instance.last_seen = datetime - cache.set(cache_key, instance, 3600) - metric_tags["bumped"] = "true" - else: - metric_tags["bumped"] = "skipped" - else: - metric_tags["bumped"] = "false" + if not created: + try_bump_last_seen( + model_class=cls, + instance=instance, + datetime=datetime, + bump_key=f"releaseenv_bump:{instance.id}", + cache_key=cache_key, + metrics_tags=metric_tags, + ) return instance diff --git a/src/sentry/models/releaseprojectenvironment.py b/src/sentry/models/releaseprojectenvironment.py index 4351b1be0e43..56edfceefee6 100644 --- a/src/sentry/models/releaseprojectenvironment.py +++ b/src/sentry/models/releaseprojectenvironment.py @@ -1,11 +1,10 @@ from __future__ import annotations -from datetime import datetime, timedelta +from datetime import datetime from enum import Enum from typing import TypedDict from django.db import models -from django.db.utils import OperationalError from django.utils import timezone from sentry.backup.scopes import RelocationScope @@ -18,6 +17,7 @@ ) from sentry.utils import metrics from sentry.utils.cache import cache +from sentry.utils.last_seen import try_bump_last_seen class ReleaseStages(str, Enum): @@ -83,26 +83,15 @@ def _get_or_create_impl(cls, release, project, environment, datetime, metrics_ta metrics_tags["created"] = "true" if created else "false" - # Same as releaseenvironment model. Minimizes last_seen updates to once a minute. - # The cache.add acts as a distributed lock so only one worker attempts the - # DB update per row per 60s, avoiding row-lock contention on hot releases. - bump_key = f"releaseprojectenv_bump:{instance.id}" - if not created and instance.last_seen < datetime - timedelta(seconds=60): - if cache.add(bump_key, "1", timeout=60): - try: - cls.objects.filter(id=instance.id, last_seen__lt=datetime).update( - last_seen=datetime - ) - except OperationalError: - metrics_tags["bumped"] = "error" - return instance - instance.last_seen = datetime - cache.set(cache_key, instance, 3600) - metrics_tags["bumped"] = "true" - else: - metrics_tags["bumped"] = "skipped" - else: - metrics_tags["bumped"] = "false" + if not created: + try_bump_last_seen( + model_class=cls, + instance=instance, + datetime=datetime, + bump_key=f"releaseprojectenv_bump:{instance.id}", + cache_key=cache_key, + metrics_tags=metrics_tags, + ) return instance diff --git a/src/sentry/utils/last_seen.py b/src/sentry/utils/last_seen.py new file mode 100644 index 000000000000..40c204d12df3 --- /dev/null +++ b/src/sentry/utils/last_seen.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from datetime import datetime, timedelta + +from django.core.cache import cache +from django.db.utils import OperationalError + +from sentry.db.models import Model + + +def try_bump_last_seen( + *, + model_class: type[Model], + instance: Model, + datetime: datetime, + bump_key: str, + cache_key: str, + metrics_tags: dict[str, str], +) -> bool: + """Throttled last_seen bump — at most once per 60s per row via a cache-based lock.""" + if instance.last_seen >= datetime - timedelta(seconds=60): + metrics_tags["bumped"] = "false" + return False + + if not cache.add(bump_key, "1", timeout=60): + metrics_tags["bumped"] = "skipped" + return False + + try: + model_class.objects.filter(id=instance.id, last_seen__lt=datetime).update( + last_seen=datetime + ) + except OperationalError: + metrics_tags["bumped"] = "error" + return False + + instance.last_seen = datetime + cache.set(cache_key, instance, 3600) + metrics_tags["bumped"] = "true" + return True From 30204dac5d67e3011c4aa12b6a04c07fd331770f Mon Sep 17 00:00:00 2001 From: Yuval Mandelboum Date: Wed, 13 May 2026 10:53:38 -0700 Subject: [PATCH 5/6] fix(typing): Use Any for try_bump_last_seen params to avoid mypy errors Model doesn't have a last_seen attribute as far as mypy knows. Using Any avoids the attr-defined errors without needing a Protocol. --- src/sentry/utils/last_seen.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sentry/utils/last_seen.py b/src/sentry/utils/last_seen.py index 40c204d12df3..c67896ed8932 100644 --- a/src/sentry/utils/last_seen.py +++ b/src/sentry/utils/last_seen.py @@ -1,17 +1,16 @@ from __future__ import annotations from datetime import datetime, timedelta +from typing import Any from django.core.cache import cache from django.db.utils import OperationalError -from sentry.db.models import Model - def try_bump_last_seen( *, - model_class: type[Model], - instance: Model, + model_class: Any, + instance: Any, datetime: datetime, bump_key: str, cache_key: str, From bdd1e90f15b277427bd1c848fe2b4dc875ac34d9 Mon Sep 17 00:00:00 2001 From: Yuval Mandelboum Date: Wed, 13 May 2026 10:59:53 -0700 Subject: [PATCH 6/6] fix(releases): Use Protocol for typing, restore bumped=false for created rows Use a HasLastSeen Protocol to type the instance parameter instead of Any. Keep model_class as Any since typing the Django manager chain is impractical. Also restore the bumped=false metric tag when a row is newly created, which was dropped during the extraction. --- src/sentry/models/releaseenvironment.py | 2 ++ src/sentry/models/releaseprojectenvironment.py | 2 ++ src/sentry/utils/last_seen.py | 14 ++++++++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/sentry/models/releaseenvironment.py b/src/sentry/models/releaseenvironment.py index 124a5e3835d2..e18a2da0a2ce 100644 --- a/src/sentry/models/releaseenvironment.py +++ b/src/sentry/models/releaseenvironment.py @@ -71,5 +71,7 @@ def _get_or_create_impl(cls, project, release, environment, datetime, metric_tag cache_key=cache_key, metrics_tags=metric_tags, ) + else: + metric_tags["bumped"] = "false" return instance diff --git a/src/sentry/models/releaseprojectenvironment.py b/src/sentry/models/releaseprojectenvironment.py index 56edfceefee6..309ffdd0e5b6 100644 --- a/src/sentry/models/releaseprojectenvironment.py +++ b/src/sentry/models/releaseprojectenvironment.py @@ -92,6 +92,8 @@ def _get_or_create_impl(cls, release, project, environment, datetime, metrics_ta cache_key=cache_key, metrics_tags=metrics_tags, ) + else: + metrics_tags["bumped"] = "false" return instance diff --git a/src/sentry/utils/last_seen.py b/src/sentry/utils/last_seen.py index c67896ed8932..62f29714b51a 100644 --- a/src/sentry/utils/last_seen.py +++ b/src/sentry/utils/last_seen.py @@ -1,17 +1,23 @@ from __future__ import annotations -from datetime import datetime, timedelta -from typing import Any +from datetime import datetime as dt +from datetime import timedelta +from typing import Any, Protocol from django.core.cache import cache from django.db.utils import OperationalError +class HasLastSeen(Protocol): + id: int + last_seen: dt + + def try_bump_last_seen( *, model_class: Any, - instance: Any, - datetime: datetime, + instance: HasLastSeen, + datetime: dt, bump_key: str, cache_key: str, metrics_tags: dict[str, str],