From 8e5d0ea0eae1a27db5768ace49fe5810dc11415a Mon Sep 17 00:00:00 2001
From: Kgraessle
Date: Thu, 18 Sep 2025 14:14:40 -0500
Subject: [PATCH 1/8] Backfill missing Wall Street Journal aggregate data

- Created a command to fix a given org's monthly aggregates using already existing archived data
- This fixes an issue where link events were archived before they were aggregated

Bug: T404879
Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c
---
 .../commands/fill_link_aggregates.py          |   4 +-
 ...x_aggregates_for_organisation_and_month.py |  89 ++++
 extlinks/aggregates/tests.py                  | 437 ++++++++++++++++++
 3 files changed, 529 insertions(+), 1 deletion(-)
 create mode 100644 extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py

diff --git a/extlinks/aggregates/management/commands/fill_link_aggregates.py b/extlinks/aggregates/management/commands/fill_link_aggregates.py
index 7b9faff3..710d1245 100644
--- a/extlinks/aggregates/management/commands/fill_link_aggregates.py
+++ b/extlinks/aggregates/management/commands/fill_link_aggregates.py
@@ -8,7 +8,7 @@ from django.db.models.fields import DateField
 from ...models import LinkAggregate
-from extlinks.links.models import LinkEvent
+from extlinks.links.models import LinkEvent, URLPattern
 from extlinks.organisations.models import Collection
@@ -117,6 +117,8 @@ def _process_single_collection(self, link_event_filter, collection):
             None
         """
         url_patterns = collection.get_url_patterns()
+        if len(url_patterns) == 0:
+            url_patterns = URLPattern.objects.filter(collection=collection).all()
         for url_pattern in url_patterns:
             link_events_with_annotated_timestamp = url_pattern.link_events.annotate(
                 timestamp_date=Cast("timestamp", DateField())
diff --git a/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py b/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
new file mode 100644
index 00000000..5752756f
--- /dev/null
+++ b/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
@@ -0,0 +1,89 @@
+import gzip
+import json
+import os
+import logging
+from datetime import datetime
+
+from django.core.management import call_command
+
+from extlinks.common.management.commands import BaseCommand
+from extlinks.links.models import URLPattern, LinkEvent
+from extlinks.organisations.models import Organisation
+
+logger = logging.getLogger("django")
+class Command(BaseCommand):
+    help = "Loads, parses, and fixes monthly link aggregates for a given organisation."
+ + def add_arguments(self, parser): + parser.add_argument( + "--month", + help="The date (YYYYMM) of the monthly archive to be fixed.", + type=str, + ) + parser.add_argument( + "--organisation", + help="The organisation id to fix link aggregates for.", + type=str, + ) + parser.add_argument( + "--dir", help="The directory from which to parse archives.", type=str + ) + parser.add_argument( + "--skip-monthly", help="Skip the monthly aggregation and only fix daily", type=bool, default=False + ) + + + def _handle(self, *args, **options): + directory = options["dir"] + month_to_fix = options["month"] + organisation = Organisation.objects.filter(id=options["organisation"]).first() + collections = organisation.collection_set.all() + skip_monthly = options["skip_monthly"] + if not month_to_fix or not organisation or not collections or not directory: + return + url_pattern_strings = [i.url for i in URLPattern.objects.filter(collection__in=collections)] + org_only_events = [] + for filename in os.listdir(directory): + if ( + filename.endswith(".json.gz") + and filename.startswith("links_linkevent_") + and month_to_fix in filename + ): + file_path = os.path.join(directory, filename) + with gzip.open(file_path, "rt", encoding="utf-8") as f: + data = json.load(f) + for event in data: + link = event["fields"]["link"] + for url_pattern in url_pattern_strings: + if url_pattern in link: + org_only_events.append(event) + filtered_file = f"link_events_filtered_{month_to_fix}_organisation_{organisation.id}.json.gz" + filtered_file_path = os.path.join( + directory, + filtered_file, + ) + if len(org_only_events) > 0: + try: + with gzip.open( + filtered_file_path, "wt", encoding="utf-8" + ) as new_archive: + json.dump(org_only_events, new_archive) + logger.info(f"Aggregating {len(org_only_events)} filtered events for {filtered_file}") + # load filtered records into the database + call_command("archive_link_aggregates", "load", filtered_file_path) + # run aggregate command + call_command( + "fill_link_aggregates", + collections=[i.id for i in collections.all()], + ) + # run monthly aggregate command if we're not skipping it + if not skip_monthly: + call_command( + "fill_monthly_link_aggregates", + collections=collections, + year_month=datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m"), + ) + # delete the records from the database, as we do not need to re-archive or re-upload them + LinkEvent.objects.filter(pk__in=[i['pk'] for i in org_only_events]).delete() + except Exception as e: + logger.error(e) diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 44ed323f..85c64011 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -11,6 +11,7 @@ from dateutil.relativedelta import relativedelta from unittest import mock +from django.contrib.contenttypes.models import ContentType from django.core.management import call_command, CommandError from django.test import TransactionTestCase @@ -33,6 +34,7 @@ UserFactory, ) from extlinks.organisations.models import Organisation +from ..links.models import URLPattern, LinkEvent class BaseTransactionTest(TransactionTestCase): @@ -2128,3 +2130,438 @@ def test_uploads_all_files_successfully(self, mock_swift_connection): for file in glob.glob(pattern): os.remove(file) + +class FixAggregatesForOrganisationAndMonthCommandTest(BaseTransactionTest): + + def setUp(self): + # Creating one Collection, Organisation, and URLPattern + self.organisation = OrganisationFactory(name="ACME Org") + self.collection = 
CollectionFactory(organisation=self.organisation) + self.user = UserFactory() + self.url = URLPatternFactory(url="www.test.com") + self.url.collection = self.collection + self.url.save() + + def test_fixes_monthly_aggregates_for_organisation(self): + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "fix_aggregates_for_organisation_and_month", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_aggregate = LinkAggregate.objects.filter(day=0).first() + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(2, monthly_aggregate.total_links_added) + self.assertEqual(1, monthly_aggregate.total_links_removed) + self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives(self): + temp_dir = tempfile.gettempdir() + archive_filename = "aggregates_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": 
"2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "fix_aggregates_for_organisation_and_month", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives_in_correct_zipped_format(self): + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + 
"user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "fix_aggregates_for_organisation_and_month", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + def test_fixes_aggregates_for_organisation_skips_monthly_aggregation_command(self): + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "fix_aggregates_for_organisation_and_month", + "--month", + "202412", + '--skip-monthly', + True, + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + link_aggregate = LinkAggregate.objects.filter(day=16).first() + link_aggregate2 = LinkAggregate.objects.filter(day=15).first() + self.assertEqual(2, LinkAggregate.objects.count()) + self.assertEqual(1, link_aggregate.total_links_added) + 
self.assertEqual(0, link_aggregate.total_links_removed)
+            self.assertEqual(1, link_aggregate2.total_links_added)
+            self.assertEqual(1, link_aggregate2.total_links_removed)
+            self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count())
+        finally:
+            for file in glob.glob(archive_path):
+                os.remove(file)
+

From 65f27bf88d247fe1b4db5197eb4287c861a7d4fb Mon Sep 17 00:00:00 2001
From: Kgraessle
Date: Thu, 18 Sep 2025 14:14:40 -0500
Subject: [PATCH 2/8] Backfill missing Wall Street Journal aggregate data

- Created a command to fix a given org's monthly aggregates using already existing archived data
- This fixes an issue where link events were archived before they were aggregated

Bug: T404879
Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c
---
 ...x_aggregates_for_organisation_and_month.py |  89 ----------
 ...k_aggregates_for_organisation_and_month.py | 163 ++++++++++++++++++
 extlinks/aggregates/tests.py                  |  16 +-
 3 files changed, 171 insertions(+), 97 deletions(-)
 delete mode 100644 extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
 create mode 100644 extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py

diff --git a/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py b/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
deleted file mode 100644
index 5752756f..00000000
--- a/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import gzip
-import json
-import os
-import logging
-from datetime import datetime
-
-from django.core.management import call_command
-
-from extlinks.common.management.commands import BaseCommand
-from extlinks.links.models import URLPattern, LinkEvent
-from extlinks.organisations.models import Organisation
-
-logger = logging.getLogger("django")
-class Command(BaseCommand):
-    help = "Loads, parses, and fixes monthly link aggregates for a given organisation."
- - def add_arguments(self, parser): - parser.add_argument( - "--month", - help="The date (YYYYMM) of the monthly archive to be fixed.", - type=str, - ) - parser.add_argument( - "--organisation", - help="The organisation id to fix link aggregates for.", - type=str, - ) - parser.add_argument( - "--dir", help="The directory from which to parse archives.", type=str - ) - parser.add_argument( - "--skip-monthly", help="Skip the monthly aggregation and only fix daily", type=bool, default=False - ) - - - def _handle(self, *args, **options): - directory = options["dir"] - month_to_fix = options["month"] - organisation = Organisation.objects.filter(id=options["organisation"]).first() - collections = organisation.collection_set.all() - skip_monthly = options["skip_monthly"] - if not month_to_fix or not organisation or not collections or not directory: - return - url_pattern_strings = [i.url for i in URLPattern.objects.filter(collection__in=collections)] - org_only_events = [] - for filename in os.listdir(directory): - if ( - filename.endswith(".json.gz") - and filename.startswith("links_linkevent_") - and month_to_fix in filename - ): - file_path = os.path.join(directory, filename) - with gzip.open(file_path, "rt", encoding="utf-8") as f: - data = json.load(f) - for event in data: - link = event["fields"]["link"] - for url_pattern in url_pattern_strings: - if url_pattern in link: - org_only_events.append(event) - filtered_file = f"link_events_filtered_{month_to_fix}_organisation_{organisation.id}.json.gz" - filtered_file_path = os.path.join( - directory, - filtered_file, - ) - if len(org_only_events) > 0: - try: - with gzip.open( - filtered_file_path, "wt", encoding="utf-8" - ) as new_archive: - json.dump(org_only_events, new_archive) - logger.info(f"Aggregating {len(org_only_events)} filtered events for {filtered_file}") - # load filtered records into the database - call_command("archive_link_aggregates", "load", filtered_file_path) - # run aggregate command - call_command( - "fill_link_aggregates", - collections=[i.id for i in collections.all()], - ) - # run monthly aggregate command if we're not skipping it - if not skip_monthly: - call_command( - "fill_monthly_link_aggregates", - collections=collections, - year_month=datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m"), - ) - # delete the records from the database, as we do not need to re-archive or re-upload them - LinkEvent.objects.filter(pk__in=[i['pk'] for i in org_only_events]).delete() - except Exception as e: - logger.error(e) diff --git a/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py b/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py new file mode 100644 index 00000000..263384f4 --- /dev/null +++ b/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py @@ -0,0 +1,163 @@ +import gzip +import json +import os +import logging +from datetime import datetime, timedelta, date + +from django.core.management import call_command +from django.db import transaction + +from extlinks.aggregates.models import LinkAggregate +from extlinks.common.management.commands import BaseCommand +from extlinks.links.models import URLPattern +from extlinks.organisations.models import Organisation + +logger = logging.getLogger("django") + + +class Command(BaseCommand): + help = ("Loads, parses, and fixes daily link aggregates for a given month and organisation. 
" + "Only run this command if the month's link events have not been already been aggregated.") + + def add_arguments(self, parser): + parser.add_argument( + "--month", + help="The date (YYYYMM) of the monthly archive to be fixed.", + type=str, + ) + parser.add_argument( + "--organisation", + help="The organisation id to fix link aggregates for.", + type=str, + ) + parser.add_argument( + "--dir", help="The directory from which to parse archives.", type=str + ) + parser.add_argument( + "--skip-monthly", help="Skip the monthly aggregation and only fix daily", type=bool, default=False + ) + + + def _handle(self, *args, **options): + directory = options["dir"] + month_to_fix = options["month"] + organisation = Organisation.objects.filter(id=options["organisation"]).first() + collections = organisation.collection_set.all() + skip_monthly = options["skip_monthly"] + if not month_to_fix or not organisation or not collections or not directory: + return + url_patterns = URLPattern.objects.filter(collection__in=collections) + events_split_by_url_pattern = self.load_events_from_archives( + directory, month_to_fix, [i.url for i in url_patterns] + ) + first_day_of_month = self.get_first_day_of_month(month_to_fix) + last_day_of_month = self.get_last_day_of_month(first_day_of_month) + try: + for i in range( + first_day_of_month.day, last_day_of_month.day+1 + ): + for collection in collections: + collection_url_pattern_strings = [ + i.url for i in url_patterns.filter(collection=collection) + ] + for collection_url_string in collection_url_pattern_strings: + for link_event in self.get_link_events_for_day( + collection_url_string, events_split_by_url_pattern, i + ): + self.fill_aggregate_from_archived_link_event(collection, link_event) + # run monthly aggregate command if we're not skipping it + if not skip_monthly: + # fill monthly aggregate for the subset of link events we just aggregated + call_command( + "fill_monthly_link_aggregates", + collections=collections, + year_month=datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m"), + ) + except Exception as e: + logger.info(f"Unexpected exception occurred: {e}") + + def fill_aggregate_from_archived_link_event(self, collection, link_event): + change_number = link_event["fields"]["change"] + existing_link_aggregate = ( + LinkAggregate.objects.filter( + organisation=collection.organisation.id, + collection=collection.id, + full_date=datetime.fromisoformat(link_event["fields"]["timestamp"]), + on_user_list=link_event["fields"]["on_user_list"], + ) + .exclude(day=0) + .first() + ) + if existing_link_aggregate is not None: + if change_number == 0: + existing_link_aggregate.total_links_removed += 1 + else: + existing_link_aggregate.total_links_added += 1 + existing_link_aggregate.save() + else: + # Create a new link aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + try: + with transaction.atomic(): + LinkAggregate.objects.create( + organisation=collection.organisation, + collection=collection, + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event["fields"]["on_user_list"], + ) + except Exception as e: + logger.info( + f"Unexpected exception occurred filling aggregate: {e}" + ) + + def get_link_events_for_day( + self, collection_url: str, events_split_by_url_pattern, i: int + ): + link_events_for_day = [ + j + for j in events_split_by_url_pattern[collection_url] + if 
datetime.fromisoformat(j["fields"]["timestamp"]).date().day == i + ] + return link_events_for_day + + def get_last_day_of_month(self, first_day_of_month: date) -> date: + if first_day_of_month.month == 12: + return first_day_of_month.replace(day=31) + replace = first_day_of_month.replace(month=first_day_of_month.month + 1) + return replace - timedelta(days=1) + + def get_first_day_of_month(self, month_to_fix: str) -> date: + return datetime.strptime(month_to_fix, "%Y%m").date().replace(day=1) + + def load_events_from_archives(self, directory: object, month_to_fix: str, url_pattern_strings) -> object: + events_split_by_url_pattern = dict.fromkeys(url_pattern_strings) + # initialize empty array for each url pattern in the org + for key, value in events_split_by_url_pattern.items(): + events_split_by_url_pattern[key] = [] + for filename in os.listdir(directory): + if ( + filename.endswith(".json.gz") + and filename.startswith("links_linkevent_") + and month_to_fix in filename + ): + try: + file_path = os.path.join(directory, filename) + with gzip.open(file_path, "rt", encoding="utf-8") as f: + data = json.load(f) + for event in data: + link = event["fields"]["link"] + for url_pattern in url_pattern_strings: + if url_pattern in link: + events_split_by_url_pattern[url_pattern].append( + event + ) + except Exception as e: + logger.info( + f"Unexpected exception occurred loading events from archive: {e}" + ) + return events_split_by_url_pattern diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 85c64011..7a296c22 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2142,7 +2142,7 @@ def setUp(self): self.url.collection = self.collection self.url.save() - def test_fixes_monthly_aggregates_for_organisation(self): + def test_fixes_link_aggregates_for_organisation_and_month(self): temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -2242,7 +2242,7 @@ def test_fixes_monthly_aggregates_for_organisation(self): try: call_command( - "fix_aggregates_for_organisation_and_month", + "fix_link_aggregates_for_organisation_and_month", "--month", "202412", "--organisation", @@ -2250,7 +2250,7 @@ def test_fixes_monthly_aggregates_for_organisation(self): "--dir", temp_dir, ) - monthly_aggregate = LinkAggregate.objects.filter(day=0).first() + monthly_aggregate = LinkAggregate.objects.all().first() self.assertEqual(1, LinkAggregate.objects.count()) self.assertEqual(2, monthly_aggregate.total_links_added) self.assertEqual(1, monthly_aggregate.total_links_removed) @@ -2259,7 +2259,7 @@ def test_fixes_monthly_aggregates_for_organisation(self): for file in glob.glob(archive_path): os.remove(file) - def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives(self): + def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archives(self): temp_dir = tempfile.gettempdir() archive_filename = "aggregates_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -2337,7 +2337,7 @@ def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives try: call_command( - "fix_aggregates_for_organisation_and_month", + "fix_link_aggregates_for_organisation_and_month", "--month", "202412", "--organisation", @@ -2351,7 +2351,7 @@ def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives for file in glob.glob(archive_path): os.remove(file) - def 
test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives_in_correct_zipped_format(self): + def test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in_correct_zipped_format(self): temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json" archive_path = os.path.join(temp_dir, archive_filename) @@ -2429,7 +2429,7 @@ def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives try: call_command( - "fix_aggregates_for_organisation_and_month", + "fix_link_aggregates_for_organisation_and_month", "--month", "202412", "--organisation", @@ -2543,7 +2543,7 @@ def test_fixes_aggregates_for_organisation_skips_monthly_aggregation_command(sel try: call_command( - "fix_aggregates_for_organisation_and_month", + "fix_link_aggregates_for_organisation_and_month", "--month", "202412", '--skip-monthly', From cba0539935628ff95d62f223ffb3bc81eb46d5f4 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 3/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../commands/reaggregate_link_archives.py | 479 ++++++++++++++++ extlinks/aggregates/tests.py | 519 +++++++++++++++++- 2 files changed, 972 insertions(+), 26 deletions(-) create mode 100644 extlinks/aggregates/management/commands/reaggregate_link_archives.py diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py new file mode 100644 index 00000000..c4af8dfc --- /dev/null +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -0,0 +1,479 @@ +import gzip +import json +import os +import logging +from datetime import datetime, timedelta, date + +from django.db import transaction + +from extlinks.aggregates.models import LinkAggregate +from extlinks.common import swift +from extlinks.common.management.commands import BaseCommand +from extlinks.links.models import URLPattern +from extlinks.organisations.models import Organisation + +logger = logging.getLogger("django") + + +class Command(BaseCommand): + help = ( + "Loads, parses, and fixes daily or monthly link aggregates for a given organisation. " + "WARNING: Only run this command if you are certain the link events have been archived before being aggregated." + ) + + def add_arguments(self, parser): + parser.add_argument( + "--month", + help="If provided, will fix a monthly aggregate. The date (YYYYMM) of the monthly archive to be fixed.", + type=str, + ) + parser.add_argument( + "--day", + help="If provided, will fix a daily aggregate. 
The date (YYYYMMDD) of the daily archive to be fixed.", + type=str, + ) + parser.add_argument( + "--organisation", + help="The organisation id to fix link aggregates for.", + type=str, + ) + parser.add_argument( + "--dir", help="The directory from which to parse archives.", type=str + ) + + def _handle(self, *args, **options): + directory = options["dir"] + month_to_fix = options["month"] + day_to_fix = options["day"] + organisation = Organisation.objects.filter(id=options["organisation"]).first() + collections = organisation.collection_set.all() + + if not month_to_fix and not day_to_fix: + logger.warning("Please provide a month or day to fix.") + return + if month_to_fix and day_to_fix: + logger.warning("Please only provide a month or a day to fix-not both.") + return + if not directory: + logger.warning("Please provide a directory from which to parse archives.") + return + if not organisation: + logger.warning( + "Please provide an organisation for which to parse archives." + ) + return + if not collections: + logger.warning( + "Please provide an organisation which has collections for which to fix archives." + ) + return + try: + conn = swift.swift_connection() + except RuntimeError: + logger.info("Swift credentials not provided. Skipping.") + return False + + # get existing archives to ensure we have not already aggregated + existing_link_aggregates_in_object_storage = self._get_existing_link_aggregates( + conn + ) + # get all URLPatterns for an organisation + url_patterns = URLPattern.objects.filter(collection__in=collections) + + if month_to_fix: + # if we already have aggregates for this month uploaded, don't try to re-aggregate + if self._has_aggregates_for_month( + existing_link_aggregates_in_object_storage, month_to_fix + ): + return + # otherwise, attempt re-aggregation + with transaction.atomic(): + self._process_monthly_aggregates( + directory, month_to_fix, organisation, url_patterns + ) + else: + # if we already have aggregates for this day uploaded, don't try to re-aggregate + if self._has_aggregates_for_day( + existing_link_aggregates_in_object_storage, day_to_fix + ): + return + # otherwise, attempt re-aggregation + with transaction.atomic(): + self._process_daily_aggregates( + collections, day_to_fix, directory, url_patterns + ) + + def _get_existing_link_aggregates(self, conn): + """ + This function gets existing link aggregates from object storage. + Parameters + ---------- + conn : swiftclient.Connection + A connection to the Swift object storage. + + Returns + ------- + An array of existing link aggregates from object storage. + """ + existing_link_aggregates_in_object_storage = [ + i["name"] + for i in swift.get_object_list( + conn, + os.environ.get("SWIFT_CONTAINER_AGGREGATES", "archive-aggregates"), + "aggregates_linkaggregate_", + ) + ] + return existing_link_aggregates_in_object_storage + + def _has_aggregates_for_month( + self, existing_link_aggregates_in_object_storage, month_to_fix + ): + """ + This function checks whether there are existing aggregates for the month to fix. + Parameters + ---------- + existing_link_aggregates_in_object_storage : An array of existing link aggregates from object storage. 
+ + month_to_fix : str + + Returns + ------- + bool: whether there are existing aggregates for a given month in object storage + """ + return ( + len( + [ + i + for i in existing_link_aggregates_in_object_storage + if self._get_first_day_of_month(month_to_fix).strftime("%Y-%m") in i + ] + ) + > 0 + ) + + def _has_aggregates_for_day( + self, existing_link_aggregates_in_object_storage, day_to_fix + ): + """ + This function checks whether there are existing aggregates for the day to fix. + Parameters + ---------- + existing_link_aggregates_in_object_storage : An array of existing link aggregates from object storage. + + day_to_fix : str + + Returns + ------- + bool: whether there are existing aggregates for a given day in object storage + """ + day_to_fix_formatted = ( + datetime.fromisoformat(day_to_fix).date().strftime("%Y-%m-%d") + ) + return ( + len( + [ + i + for i in existing_link_aggregates_in_object_storage + if day_to_fix_formatted in i + ] + ) + > 0 + ) + + def _process_daily_aggregates( + self, collections, day_to_fix, directory, url_patterns + ): + """ + This function loops through each url pattern and link event to fill the daily aggregates. + Parameters + ---------- + collections : An array of collections + + day_to_fix : str + + directory : str + + url_patterns : An array of url patterns + + Returns + ------- + None + """ + # pull month string from day input parameter + month_to_fix = day_to_fix[:-2] + # load and split link events by url pattern + events_split_by_url_pattern = self._load_events_from_archives( + directory, month_to_fix, [i.url for i in url_patterns] + ) + # loop through each collection + for collection in collections: + collection_url_pattern_strings = [ + i.url for i in url_patterns.filter(collection=collection) + ] + # loop through each collection's URLPatterns + for collection_url_string in collection_url_pattern_strings: + # get the link events for the collection and day + for link_event in self._get_link_events_for_day( + collection_url_string, + events_split_by_url_pattern, + int(day_to_fix[-2:]), + ): + # create daily aggregates + self._fill_daily_aggregate(collection, link_event) + + def _fill_daily_aggregate(self, collection, link_event): + """ + This function updates or creates a daily LinkAggregate for a collection and a parsed JSON object(LinkEvent). 
+ Parameters + ---------- + collection : Collection + + link_event : obj + + Returns + ------- + None + """ + change_number = link_event["fields"]["change"] + existing_link_aggregate = ( + LinkAggregate.objects.filter( + organisation=collection.organisation.id, + collection=collection.id, + full_date=datetime.fromisoformat(link_event["fields"]["timestamp"]), + on_user_list=link_event["fields"]["on_user_list"], + ) + .exclude(day=0) + .first() + ) + if existing_link_aggregate is None: + # Create a new link aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + LinkAggregate.objects.create( + organisation=collection.organisation, + collection=collection, + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event["fields"]["on_user_list"], + ) + else: + if change_number == 0: + existing_link_aggregate.total_links_removed += 1 + else: + existing_link_aggregate.total_links_added += 1 + existing_link_aggregate.save() + + def _process_monthly_aggregates( + self, directory, month_to_fix, organisation, url_patterns + ): + """ + This function loops through each url pattern and link events to fill the monthly aggregates. + Parameters + ---------- + directory : str + + month_to_fix : str + + organisation : Organisation + + url_patterns : An array of url patterns + + Returns + ------- + None + """ + # load and split link events by url pattern + events_split_by_url_pattern = self._load_events_from_archives( + directory, month_to_fix, [i.url for i in url_patterns] + ) + # get the first and last day of the month to fix + first_day_of_month = self._get_first_day_of_month(month_to_fix) + last_day_of_month = self._get_last_day_of_month(first_day_of_month) + for url_pattern, link_events in events_split_by_url_pattern.items(): + # create monthly aggregates + self._fill_monthly_aggregate( + url_pattern, last_day_of_month, organisation, url_patterns, link_events + ) + + def _fill_monthly_aggregate( + self, url_pattern, last_day_of_month, organisation, url_patterns, link_events + ): + """ + This function fills monthly LinkAggregates for an organisation and a parsed JSON object(LinkEvent). + Parameters + ---------- + url_pattern : str + + last_day_of_month : date + + organisation : Organisation + + url_patterns : An array of url patterns + + link_events : an array of link event JSON objects + + Returns + ------- + None + """ + # find the collection associated with this url + collection = url_patterns.filter(url=url_pattern).first().collection + self._process_monthly_events( + True, link_events, collection, organisation, last_day_of_month + ) + self._process_monthly_events( + False, link_events, collection, organisation, last_day_of_month + ) + + def _process_monthly_events( + self, + on_user_list_flag, + link_events, + collection, + organisation, + last_day_of_month, + ): + """ + This function updates or creates a monthly LinkAggregate for a collection and parsed JSON objects(LinkEvents). 
+ Parameters + ---------- + on_user_list_flag : bool, whether the aggregate should save with on_user_list flag or not + + link_events : an array of link event JSON objects + + collection: a Collection + + organisation: Organisation + + last_day_of_month: date + + Returns + ------- + None + """ + events = [ + i for i in link_events if i["fields"]["on_user_list"] is on_user_list_flag + ] + if not events: + return + + total_added = sum(1 for i in events if i["fields"]["change"] == 1) + total_removed = sum(1 for i in events if i["fields"]["change"] == 0) + + existing_aggregate = LinkAggregate.objects.filter( + organisation_id=organisation.id, + collection_id=collection.id, + on_user_list=on_user_list_flag, + full_date=last_day_of_month, + day=0, + ) + + if existing_aggregate.exists(): + existing_aggregate.update( + total_links_added=total_added, + total_links_removed=total_removed, + ) + else: + LinkAggregate.objects.create( + organisation_id=organisation.id, + collection_id=collection.id, + on_user_list=on_user_list_flag, + full_date=last_day_of_month, + day=0, + total_links_added=total_added, + total_links_removed=total_removed, + ) + + def _get_link_events_for_day( + self, collection_url: str, events_split_by_url_pattern, day: int + ): + """ + This function splits parsed JSON objects(LinkEvent) by collection url pattern. + Parameters + ---------- + collection_url : str + + events_split_by_url_pattern : an array of link event JSON objects + + Returns + ------- + link_events_for_day : an array of link event JSON objects filtered by day + """ + link_events_for_day = [ + j + for j in events_split_by_url_pattern[collection_url] + if datetime.fromisoformat(j["fields"]["timestamp"]).date().day == day + ] + return link_events_for_day + + def _get_last_day_of_month(self, first_day_of_month: date) -> date: + """ + This function gets the last day of the month from the first day of the input month + Parameters + ---------- + first_day_of_month : date + + Returns + ------- + date + """ + if first_day_of_month.month == 12: + return first_day_of_month.replace(day=31) + replace = first_day_of_month.replace(month=first_day_of_month.month + 1) + return replace - timedelta(days=1) + + def _get_first_day_of_month(self, month_to_fix: str) -> date: + """ + This function gets the first day of the month from the input month + Parameters + ---------- + month_to_fix : str + + Returns + ------- + date + """ + return datetime.strptime(month_to_fix, "%Y%m").date().replace(day=1) + + def _load_events_from_archives( + self, directory: object, month_to_fix: str, url_pattern_strings + ) -> object: + """Parse archived .json.gz files and split the link events by URL pattern. 
+ Parameters + ---------- + directory : str + + month_to_fix : str + + url_pattern_strings : an array of str + + Returns + ------- + parsed JSON link event objects + """ + events_split_by_url_pattern = {url: [] for url in url_pattern_strings} + for filename in os.listdir(directory): + if ( + filename.endswith(".json.gz") + and filename.startswith("links_linkevent_") + and month_to_fix in filename + ): + try: + file_path = os.path.join(directory, filename) + with gzip.open(file_path, "rt", encoding="utf-8") as f: + data = json.load(f) + for event in data: + link = event["fields"]["link"] + for url_pattern in url_pattern_strings: + if url_pattern in link: + events_split_by_url_pattern[url_pattern].append( + event + ) + except Exception as e: + logger.info( + f"Unexpected exception occurred loading events from archive: {e}" + ) + return events_split_by_url_pattern diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 7a296c22..afd3c953 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2131,6 +2131,7 @@ def test_uploads_all_files_successfully(self, mock_swift_connection): for file in glob.glob(pattern): os.remove(file) + class FixAggregatesForOrganisationAndMonthCommandTest(BaseTransactionTest): def setUp(self): @@ -2142,10 +2143,26 @@ def setUp(self): self.url.collection = self.collection self.url.save() - def test_fixes_link_aggregates_for_organisation_and_month(self): + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ { "model": "links.linkevent", @@ -2242,7 +2259,7 @@ def test_fixes_link_aggregates_for_organisation_and_month(self): try: call_command( - "fix_link_aggregates_for_organisation_and_month", + "reaggregate_link_archives", "--month", "202412", "--organisation", @@ -2251,17 +2268,38 @@ def test_fixes_link_aggregates_for_organisation_and_month(self): temp_dir, ) monthly_aggregate = LinkAggregate.objects.all().first() + # assert only one monthly aggregates created for on_user_list=True self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=0).count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + # assert totals match expected totals self.assertEqual(2, monthly_aggregate.total_links_added) self.assertEqual(1, monthly_aggregate.total_links_removed) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) finally: for file in glob.glob(archive_path): os.remove(file) - def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archives(self): + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def 
test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-22"}]) temp_dir = tempfile.gettempdir() - archive_filename = "aggregates_20241222_0.json.gz" + archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) json_data = [ { @@ -2313,7 +2351,7 @@ def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archiv "pk": 3, "fields": { "link": "https://www.test.com/3", - "timestamp": "2024-01-15T09:15:27.363Z", + "timestamp": "2024-12-15T09:15:27.363Z", "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, @@ -2330,6 +2368,28 @@ def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archiv "url": [] } }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, ] with gzip.open(archive_path, "wt", encoding="utf-8") as f: @@ -2337,7 +2397,7 @@ def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archiv try: call_command( - "fix_link_aggregates_for_organisation_and_month", + "reaggregate_link_archives", "--month", "202412", "--organisation", @@ -2345,15 +2405,32 @@ def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archiv "--dir", temp_dir, ) + # assert no daily or monthly aggregates created self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) finally: for file in glob.glob(archive_path): os.remove(file) - def test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in_correct_zipped_format(self): + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() - archive_filename = "links_linkevent_20241222_0.json" + archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) json_data = [ { @@ -2405,7 +2482,7 @@ def test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in "pk": 3, "fields": { "link": "https://www.test.com/3", - "timestamp": "2024-01-15T09:15:27.363Z", + "timestamp": "2024-12-15T09:15:27.363Z", "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, @@ -2422,6 +2499,28 @@ def 
test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in "url": [] } }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, ] with gzip.open(archive_path, "wt", encoding="utf-8") as f: @@ -2429,21 +2528,44 @@ def test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in try: call_command( - "fix_link_aggregates_for_organisation_and_month", - "--month", - "202412", + "reaggregate_link_archives", + "--day", + "20241215", "--organisation", self.organisation.id, "--dir", temp_dir, ) - self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + daily_aggregate = LinkAggregate.objects.all().first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_aggregate.total_links_added) + self.assertEqual(1, daily_aggregate.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) - def test_fixes_aggregates_for_organisation_skips_monthly_aggregation_command(self): + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-15"}]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -2543,25 +2665,370 @@ def test_fixes_aggregates_for_organisation_skips_monthly_aggregation_command(sel try: call_command( - "fix_link_aggregates_for_organisation_and_month", + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_on_and_off_user_list(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = 
os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": False, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", "--month", "202412", - '--skip-monthly', - True, "--organisation", self.organisation.id, "--dir", temp_dir, ) - link_aggregate = LinkAggregate.objects.filter(day=16).first() - link_aggregate2 = LinkAggregate.objects.filter(day=15).first() + monthly_aggregate_on_user_list = LinkAggregate.objects.filter(on_user_list=True).first() + monthly_aggregate_not_on_user_list = LinkAggregate.objects.filter(on_user_list=False).first() + # assert two monthly aggregates were created for on_user_list=True and on_user_list=False self.assertEqual(2, LinkAggregate.objects.count()) - self.assertEqual(1, link_aggregate.total_links_added) - self.assertEqual(0, link_aggregate.total_links_removed) - self.assertEqual(1, link_aggregate2.total_links_added) - self.assertEqual(1, link_aggregate2.total_links_removed) + self.assertEqual(2, LinkAggregate.objects.filter(day=0).count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_aggregate_on_user_list.total_links_added) + self.assertEqual(0, monthly_aggregate_on_user_list.total_links_removed) + self.assertEqual(0, 
monthly_aggregate_not_on_user_list.total_links_added) + self.assertEqual(1, monthly_aggregate_not_on_user_list.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_only_link_event_archives(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "aggregates_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) finally: for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_only_in_correct_zipped_format(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + 
"fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) From e6e25cbb75295259a82362e9530f13fc09534fa7 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 4/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- ...k_aggregates_for_organisation_and_month.py | 163 ------------------ 1 file changed, 163 deletions(-) delete mode 100644 extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py diff --git a/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py b/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py deleted file mode 100644 index 263384f4..00000000 --- a/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py +++ /dev/null @@ -1,163 +0,0 @@ -import gzip -import json -import os -import logging -from datetime import datetime, timedelta, date - -from django.core.management import call_command -from django.db import transaction - -from extlinks.aggregates.models import LinkAggregate -from extlinks.common.management.commands import BaseCommand -from extlinks.links.models import URLPattern -from extlinks.organisations.models import Organisation - -logger = logging.getLogger("django") - - -class Command(BaseCommand): - help = ("Loads, parses, and fixes daily link aggregates for a 
given month and organisation. " - "Only run this command if the month's link events have not been already been aggregated.") - - def add_arguments(self, parser): - parser.add_argument( - "--month", - help="The date (YYYYMM) of the monthly archive to be fixed.", - type=str, - ) - parser.add_argument( - "--organisation", - help="The organisation id to fix link aggregates for.", - type=str, - ) - parser.add_argument( - "--dir", help="The directory from which to parse archives.", type=str - ) - parser.add_argument( - "--skip-monthly", help="Skip the monthly aggregation and only fix daily", type=bool, default=False - ) - - - def _handle(self, *args, **options): - directory = options["dir"] - month_to_fix = options["month"] - organisation = Organisation.objects.filter(id=options["organisation"]).first() - collections = organisation.collection_set.all() - skip_monthly = options["skip_monthly"] - if not month_to_fix or not organisation or not collections or not directory: - return - url_patterns = URLPattern.objects.filter(collection__in=collections) - events_split_by_url_pattern = self.load_events_from_archives( - directory, month_to_fix, [i.url for i in url_patterns] - ) - first_day_of_month = self.get_first_day_of_month(month_to_fix) - last_day_of_month = self.get_last_day_of_month(first_day_of_month) - try: - for i in range( - first_day_of_month.day, last_day_of_month.day+1 - ): - for collection in collections: - collection_url_pattern_strings = [ - i.url for i in url_patterns.filter(collection=collection) - ] - for collection_url_string in collection_url_pattern_strings: - for link_event in self.get_link_events_for_day( - collection_url_string, events_split_by_url_pattern, i - ): - self.fill_aggregate_from_archived_link_event(collection, link_event) - # run monthly aggregate command if we're not skipping it - if not skip_monthly: - # fill monthly aggregate for the subset of link events we just aggregated - call_command( - "fill_monthly_link_aggregates", - collections=collections, - year_month=datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m"), - ) - except Exception as e: - logger.info(f"Unexpected exception occurred: {e}") - - def fill_aggregate_from_archived_link_event(self, collection, link_event): - change_number = link_event["fields"]["change"] - existing_link_aggregate = ( - LinkAggregate.objects.filter( - organisation=collection.organisation.id, - collection=collection.id, - full_date=datetime.fromisoformat(link_event["fields"]["timestamp"]), - on_user_list=link_event["fields"]["on_user_list"], - ) - .exclude(day=0) - .first() - ) - if existing_link_aggregate is not None: - if change_number == 0: - existing_link_aggregate.total_links_removed += 1 - else: - existing_link_aggregate.total_links_added += 1 - existing_link_aggregate.save() - else: - # Create a new link aggregate - links_added = change_number if change_number > 0 else 0 - links_removed = 1 if change_number == 0 else 0 - try: - with transaction.atomic(): - LinkAggregate.objects.create( - organisation=collection.organisation, - collection=collection, - full_date=datetime.fromisoformat( - link_event["fields"]["timestamp"] - ).date(), - total_links_added=links_added, - total_links_removed=links_removed, - on_user_list=link_event["fields"]["on_user_list"], - ) - except Exception as e: - logger.info( - f"Unexpected exception occurred filling aggregate: {e}" - ) - - def get_link_events_for_day( - self, collection_url: str, events_split_by_url_pattern, i: int - ): - link_events_for_day = [ - j - for j in 
events_split_by_url_pattern[collection_url] - if datetime.fromisoformat(j["fields"]["timestamp"]).date().day == i - ] - return link_events_for_day - - def get_last_day_of_month(self, first_day_of_month: date) -> date: - if first_day_of_month.month == 12: - return first_day_of_month.replace(day=31) - replace = first_day_of_month.replace(month=first_day_of_month.month + 1) - return replace - timedelta(days=1) - - def get_first_day_of_month(self, month_to_fix: str) -> date: - return datetime.strptime(month_to_fix, "%Y%m").date().replace(day=1) - - def load_events_from_archives(self, directory: object, month_to_fix: str, url_pattern_strings) -> object: - events_split_by_url_pattern = dict.fromkeys(url_pattern_strings) - # initialize empty array for each url pattern in the org - for key, value in events_split_by_url_pattern.items(): - events_split_by_url_pattern[key] = [] - for filename in os.listdir(directory): - if ( - filename.endswith(".json.gz") - and filename.startswith("links_linkevent_") - and month_to_fix in filename - ): - try: - file_path = os.path.join(directory, filename) - with gzip.open(file_path, "rt", encoding="utf-8") as f: - data = json.load(f) - for event in data: - link = event["fields"]["link"] - for url_pattern in url_pattern_strings: - if url_pattern in link: - events_split_by_url_pattern[url_pattern].append( - event - ) - except Exception as e: - logger.info( - f"Unexpected exception occurred loading events from archive: {e}" - ) - return events_split_by_url_pattern From 8ac1a2d737ae8298099a8fb90412e7fa98173ae9 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 5/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../commands/reaggregate_link_archives.py | 25 +- extlinks/aggregates/tests.py | 265 ++++++++++++++++++ 2 files changed, 282 insertions(+), 8 deletions(-) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py index c4af8dfc..592a5944 100644 --- a/extlinks/aggregates/management/commands/reaggregate_link_archives.py +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -9,7 +9,7 @@ from extlinks.aggregates.models import LinkAggregate from extlinks.common import swift from extlinks.common.management.commands import BaseCommand -from extlinks.links.models import URLPattern +from extlinks.links.models import URLPattern, LinkEvent from extlinks.organisations.models import Organisation logger = logging.getLogger("django") @@ -18,7 +18,6 @@ class Command(BaseCommand): help = ( "Loads, parses, and fixes daily or monthly link aggregates for a given organisation. " - "WARNING: Only run this command if you are certain the link events have been archived before being aggregated." 
) def add_arguments(self, parser): @@ -81,21 +80,25 @@ def _handle(self, *args, **options): url_patterns = URLPattern.objects.filter(collection__in=collections) if month_to_fix: + first_day_of_month = self._get_first_day_of_month(month_to_fix) + last_day_of_month = self._get_last_day_of_month(first_day_of_month) # if we already have aggregates for this month uploaded, don't try to re-aggregate + # or if we have not archived all events for the given timeframe, don't try to re-aggregate if self._has_aggregates_for_month( existing_link_aggregates_in_object_storage, month_to_fix - ): + ) or self._has_link_events_for_month(first_day_of_month, last_day_of_month): return # otherwise, attempt re-aggregation with transaction.atomic(): self._process_monthly_aggregates( - directory, month_to_fix, organisation, url_patterns + directory, month_to_fix, organisation, url_patterns, last_day_of_month ) else: # if we already have aggregates for this day uploaded, don't try to re-aggregate + # or if we have not archived all events for the given timeframe, don't try to re-aggregate if self._has_aggregates_for_day( existing_link_aggregates_in_object_storage, day_to_fix - ): + ) or self._has_link_events_for_day(day_to_fix): return # otherwise, attempt re-aggregation with transaction.atomic(): @@ -180,6 +183,14 @@ def _has_aggregates_for_day( > 0 ) + def _has_link_events_for_month(self, first_day_of_month, last_day_of_month): + return LinkEvent.objects.filter(timestamp__gte=first_day_of_month, timestamp__lte=last_day_of_month).count() > 0 + + def _has_link_events_for_day(self, day_to_fix): + day = datetime.fromisoformat(day_to_fix) + return LinkEvent.objects.filter(timestamp__gte=day, timestamp__lte=day + timedelta(days=1)).count() > 0 + + def _process_daily_aggregates( self, collections, day_to_fix, directory, url_patterns ): @@ -267,7 +278,7 @@ def _fill_daily_aggregate(self, collection, link_event): existing_link_aggregate.save() def _process_monthly_aggregates( - self, directory, month_to_fix, organisation, url_patterns + self, directory, month_to_fix, organisation, url_patterns, last_day_of_month ): """ This function loops through each url pattern and link events to fill the monthly aggregates. 
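For context, a minimal usage sketch of the reaggregate_link_archives command once the guards above are in place. The invocation mirrors the call_command calls in the test suite further down; the organisation id and archive directory shown here are placeholders, not values taken from this change:

    from django.core.management import call_command

    # Re-aggregate December 2024 for one organisation from already-archived link
    # event dumps (links_linkevent_*.json.gz in --dir). The command is a no-op when
    # monthly aggregates were already uploaded to object storage, or when
    # un-archived LinkEvent rows still exist for that month.
    call_command(
        "reaggregate_link_archives",
        "--month", "202412",        # use "--day", "20241215" to fix a single day instead
        "--organisation", "42",     # placeholder organisation id
        "--dir", "/srv/extlinks/archives",  # placeholder archive directory
    )
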
@@ -290,8 +301,6 @@ def _process_monthly_aggregates( directory, month_to_fix, [i.url for i in url_patterns] ) # get the first and last day of the month to fix - first_day_of_month = self._get_first_day_of_month(month_to_fix) - last_day_of_month = self._get_last_day_of_month(first_day_of_month) for url_pattern, link_events in events_split_by_url_pattern.items(): # create monthly aggregates self._fill_monthly_aggregate( diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index afd3c953..e40474a0 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2413,6 +2413,141 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co os.remove(file) + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, 
"wt", encoding="utf-8") as f: + json.dump(json_data, f) + + + # create link events + call_command("loaddata", archive_path) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( os.environ, { @@ -2678,6 +2813,136 @@ def test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_conn for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_linkevents_for_day(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + # 
create link events + call_command("loaddata", archive_path) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) @mock.patch.dict( os.environ, From c206eb22272730030fe34aa84aeeec4962220ed6 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 6/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../management/commands/reaggregate_link_archives.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py index 592a5944..98fa5549 100644 --- a/extlinks/aggregates/management/commands/reaggregate_link_archives.py +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -48,10 +48,10 @@ def _handle(self, *args, **options): collections = organisation.collection_set.all() if not month_to_fix and not day_to_fix: - logger.warning("Please provide a month or day to fix.") + logger.warning("Please provide a month (e.g. 202509) or day (e.g. 20250920 ) to fix.") return if month_to_fix and day_to_fix: - logger.warning("Please only provide a month or a day to fix-not both.") + logger.warning("Please only provide a month (e.g. 202509) or a day (e.g. 20250920 ) to fix-not both.") return if not directory: logger.warning("Please provide a directory from which to parse archives.") @@ -87,6 +87,9 @@ def _handle(self, *args, **options): if self._has_aggregates_for_month( existing_link_aggregates_in_object_storage, month_to_fix ) or self._has_link_events_for_month(first_day_of_month, last_day_of_month): + logger.warning( + "Organisation already has aggregates or link events for month." + ) return # otherwise, attempt re-aggregation with transaction.atomic(): @@ -99,6 +102,9 @@ def _handle(self, *args, **options): if self._has_aggregates_for_day( existing_link_aggregates_in_object_storage, day_to_fix ) or self._has_link_events_for_day(day_to_fix): + logger.warning( + "Organisation already has aggregates or link events for day." 
+ ) return # otherwise, attempt re-aggregation with transaction.atomic(): From c5c458ada8a73c028e4ec44e623412453c48b439 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 7/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../commands/fill_pageproject_aggregates.py | 4 +- .../commands/fill_user_aggregates.py | 4 +- .../commands/reaggregate_link_archives.py | 321 +++++- extlinks/aggregates/tests.py | 1007 ++++++++++++++++- 4 files changed, 1291 insertions(+), 45 deletions(-) diff --git a/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py b/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py index 9014d14a..a05f8cee 100644 --- a/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py +++ b/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py @@ -9,7 +9,7 @@ from django.db.models.fields import DateField from ...models import PageProjectAggregate -from extlinks.links.models import LinkEvent +from extlinks.links.models import LinkEvent, URLPattern from extlinks.organisations.models import Collection logger = logging.getLogger("django") @@ -118,6 +118,8 @@ def _process_single_collection(self, link_event_filter, collection): None """ url_patterns = collection.get_url_patterns() + if len(url_patterns) == 0: + url_patterns = URLPattern.objects.filter(collection=collection).all() for url_pattern in url_patterns: link_events_with_annotated_timestamp = url_pattern.link_events.annotate( timestamp_date=Cast("timestamp", DateField()) diff --git a/extlinks/aggregates/management/commands/fill_user_aggregates.py b/extlinks/aggregates/management/commands/fill_user_aggregates.py index e2b99a9e..d4c324e6 100644 --- a/extlinks/aggregates/management/commands/fill_user_aggregates.py +++ b/extlinks/aggregates/management/commands/fill_user_aggregates.py @@ -8,7 +8,7 @@ from django.db.models.fields import DateField from ...models import UserAggregate -from extlinks.links.models import LinkEvent +from extlinks.links.models import LinkEvent, URLPattern from extlinks.organisations.models import Collection @@ -117,6 +117,8 @@ def _process_single_collection(self, link_event_filter, collection): None """ url_patterns = collection.get_url_patterns() + if len(url_patterns) == 0: + url_patterns = URLPattern.objects.filter(collection=collection).all() for url_pattern in url_patterns: link_events_with_annotated_timestamp = url_pattern.link_events.annotate( timestamp_date=Cast("timestamp", DateField()) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py index 98fa5549..63b64a60 100644 --- a/extlinks/aggregates/management/commands/reaggregate_link_archives.py +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -6,19 +6,21 @@ from django.db import transaction -from extlinks.aggregates.models import LinkAggregate +from extlinks.aggregates.models import ( + LinkAggregate, + PageProjectAggregate, + UserAggregate, +) from extlinks.common import swift from extlinks.common.management.commands import BaseCommand from extlinks.links.models import URLPattern, LinkEvent -from extlinks.organisations.models import Organisation +from 
extlinks.organisations.models import Organisation, User logger = logging.getLogger("django") class Command(BaseCommand): - help = ( - "Loads, parses, and fixes daily or monthly link aggregates for a given organisation. " - ) + help = "Loads, parses, and fixes daily or monthly aggregates for a given organisation. " def add_arguments(self, parser): parser.add_argument( @@ -33,7 +35,7 @@ def add_arguments(self, parser): ) parser.add_argument( "--organisation", - help="The organisation id to fix link aggregates for.", + help="The organisation id to fix aggregates for.", type=str, ) parser.add_argument( @@ -48,10 +50,14 @@ def _handle(self, *args, **options): collections = organisation.collection_set.all() if not month_to_fix and not day_to_fix: - logger.warning("Please provide a month (e.g. 202509) or day (e.g. 20250920 ) to fix.") + logger.warning( + "Please provide a month (e.g. 202509) or day (e.g. 20250920 ) to fix." + ) return if month_to_fix and day_to_fix: - logger.warning("Please only provide a month (e.g. 202509) or a day (e.g. 20250920 ) to fix-not both.") + logger.warning( + "Please only provide a month (e.g. 202509) or a day (e.g. 20250920 ) to fix-not both." + ) return if not directory: logger.warning("Please provide a directory from which to parse archives.") @@ -252,6 +258,130 @@ def _fill_daily_aggregate(self, collection, link_event): None """ change_number = link_event["fields"]["change"] + self._fill_daily_pageproject_aggregates(change_number, collection, link_event) + self._fill_daily_user_aggregate(change_number, collection, link_event) + self._fill_daily_link_aggregate(change_number, collection, link_event) + + def _fill_daily_pageproject_aggregates(self, change_number, collection, link_event): + """ + This function updates or creates a daily PageProjectAggregate for a collection and a parsed JSON (LinkEvent). + Parameters + ---------- + change_number : int + + collection : Collection + + link_event : obj + + Returns + ------- + None + """ + existing_pageproject_aggregate = PageProjectAggregate.objects.filter( + organisation=collection.organisation, + collection=collection, + page_name=link_event['fields']["page_title"], + project_name=link_event['fields']["domain"], + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + on_user_list=link_event['fields']["on_user_list"], + ).exclude(day=0)[:1].all() + existing_pageproject_aggregate = ( + existing_pageproject_aggregate[0] if len(existing_pageproject_aggregate) > 0 else None + ) + if existing_pageproject_aggregate: + if change_number == 0: + existing_pageproject_aggregate.total_links_removed += 1 + else: + existing_pageproject_aggregate.total_links_added += 1 + existing_pageproject_aggregate.save() + else: + # Create a new page project aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + PageProjectAggregate.objects.get_or_create( + organisation=collection.organisation, + collection=collection, + page_name=link_event['fields']["page_title"], + project_name=link_event['fields']["domain"], + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event['fields']["on_user_list"], + ) + + def _fill_daily_user_aggregate(self, change_number, collection, link_event): + """ + This function updates or creates a daily UserAggregate for a collection and a parsed JSON (LinkEvent). 
+ Parameters + ---------- + change_number : int + + collection : Collection + + link_event : obj + + Returns + ------- + None + """ + try: + user_retrieved = User.objects.get(pk=link_event["fields"]["user_id"]) + except User.DoesNotExist: + return + exisiting_user_aggregate = ( + UserAggregate.objects.filter( + organisation=collection.organisation, + collection=collection, + username=user_retrieved.username, + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + on_user_list=link_event["fields"]["on_user_list"], + ) + .exclude(day=0) + .first() + ) + if exisiting_user_aggregate: + if change_number == 0: + exisiting_user_aggregate.total_links_removed += 1 + else: + exisiting_user_aggregate.total_links_added += 1 + exisiting_user_aggregate.save() + else: + # Create a new link aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + UserAggregate.objects.create( + organisation=collection.organisation, + collection=collection, + username=user_retrieved.username, + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event["fields"]["on_user_list"], + ) + + def _fill_daily_link_aggregate(self, change_number, collection, link_event): + """ + This function updates or creates a daily LinkAggregate for a collection and a parsed JSON (LinkEvent). + Parameters + ---------- + change_number : int + + collection : Collection + + link_event : obj + + Returns + ------- + None + """ existing_link_aggregate = ( LinkAggregate.objects.filter( organisation=collection.organisation.id, @@ -378,16 +508,124 @@ def _process_monthly_events( total_added = sum(1 for i in events if i["fields"]["change"] == 1) total_removed = sum(1 for i in events if i["fields"]["change"] == 0) - existing_aggregate = LinkAggregate.objects.filter( + # set of tuples that consist of (page_title, domain) for a group of link events + page_projects = list( + set([(i["fields"]["page_title"], i["fields"]["domain"]) for i in events]) + ) + # set of user ids to fill user aggregates for + users = list(set([i["fields"]["user_id"] for i in events])) + try: + for page_project in page_projects: + self._fill_monthly_page_project_aggregates(collection, events, last_day_of_month, on_user_list_flag, + page_project) + for user in users: + self._fill_monthly_user_aggregates( + collection, last_day_of_month, link_events, on_user_list_flag, user + ) + self._fill_monthly_link_aggregates( + collection, + last_day_of_month, + on_user_list_flag, + organisation, + total_added, + total_removed, + ) + except Exception as e: + print(e) + + + def _fill_monthly_page_project_aggregates(self, collection, events, last_day_of_month, on_user_list_flag, + page_project): + """ + This function updates or creates monthly PageProjectAggregate for collection and a parsed array of JSON(LinkEvents). 
+ Parameters + ---------- + collection : Collection + + events : an array of JSON link_events parsed from archives + + last_day_of_month: date + + on_user_list_flag: bool + + page_project: tuple(str, str) + + Returns + ------- + None + """ + events_for_page_project = [ + i + for i in events + if i["fields"]["page_title"] == page_project[0] + and i["fields"]["domain"] == page_project[1] + ] + total_added_page_project = sum(1 for i in events_for_page_project if i["fields"]["change"] == 1) + total_removed_page_project = sum(1 for i in events_for_page_project if i["fields"]["change"] == 0) + existing_page_project_aggregate = PageProjectAggregate.objects.filter( + organisation=collection.organisation, + collection=collection, + page_name=page_project[0], + project_name=page_project[1], + day=0, + full_date=last_day_of_month, + on_user_list=on_user_list_flag, + )[:1].all() + if existing_page_project_aggregate: + existing_page_project_aggregate.total_links_added = total_added_page_project + existing_page_project_aggregate.total_links_removed = total_removed_page_project + existing_page_project_aggregate.save() + else: + PageProjectAggregate.objects.get_or_create( + organisation=collection.organisation, + collection=collection, + page_name=page_project[0], + project_name=page_project[1], + full_date=last_day_of_month, + day=0, + total_links_added=total_added_page_project, + total_links_removed=total_removed_page_project, + on_user_list=on_user_list_flag, + ) + + def _fill_monthly_link_aggregates( + self, + collection, + last_day_of_month, + on_user_list_flag, + organisation, + total_added, + total_removed, + ): + """ + This function updates or creates monthly LinkAggregate for collection. + Parameters + ---------- + collection : Collection + + last_day_of_month: date + + on_user_list_flag: bool + + organisation: Organisation + + total_added: int + + total_removed: int + + Returns + ------- + None + """ + existing_link_aggregate = LinkAggregate.objects.filter( organisation_id=organisation.id, collection_id=collection.id, on_user_list=on_user_list_flag, full_date=last_day_of_month, day=0, ) - - if existing_aggregate.exists(): - existing_aggregate.update( + if existing_link_aggregate.exists(): + existing_link_aggregate.update( total_links_added=total_added, total_links_removed=total_removed, ) @@ -401,6 +639,65 @@ def _process_monthly_events( total_links_added=total_added, total_links_removed=total_removed, ) + def _fill_monthly_user_aggregates( + self, collection, last_day_of_month, link_events, on_user_list_flag, user + ): + """ + This function updates or creates monthly UserAggregate for user and collection. 
+ Parameters + ---------- + collection : Collection + + last_day_of_month: date + + link_events: an array of JSON link_events parsed from archives + + on_user_list_flag: bool + + user : User + + Returns + ------- + None + """ + try: + user_retrieved = User.objects.get(pk=user) + except User.DoesNotExist: + return + events_for_user = [i for i in link_events if i["fields"]["user_id"] is user] + total_added_by_user = sum( + 1 for i in events_for_user if i["fields"]["change"] == 1 + ) + total_removed_by_user = sum( + 1 for i in events_for_user if i["fields"]["change"] == 0 + ) + exisiting_user_aggregate = ( + UserAggregate.objects.filter( + organisation_id=collection.organisation.id, + collection_id=collection.id, + username=user_retrieved.username, + full_date=last_day_of_month, + day=0, + on_user_list=on_user_list_flag, + ) + .first() + ) + if exisiting_user_aggregate: + exisiting_user_aggregate.total_links_added = total_added_by_user + exisiting_user_aggregate.total_links_removed = total_removed_by_user + exisiting_user_aggregate.save() + else: + # Create a new link aggregate + UserAggregate.objects.create( + organisation_id=collection.organisation.id, + collection_id=collection.id, + username=user_retrieved.username, + full_date=last_day_of_month, + day=0, + total_links_added=total_added_by_user, + total_links_removed=total_removed_by_user, + on_user_list=on_user_list_flag, + ) def _get_link_events_for_day( self, collection_url: str, events_split_by_url_pattern, day: int diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index e40474a0..798828cf 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2139,6 +2139,7 @@ def setUp(self): self.organisation = OrganisationFactory(name="ACME Org") self.collection = CollectionFactory(organisation=self.organisation) self.user = UserFactory() + self.user2 = UserFactory() self.url = URLPatternFactory(url="www.test.com") self.url.collection = self.collection self.url.save() @@ -2267,16 +2268,27 @@ def test_reaggregate_link_archives_monthly(self, mock_swift_connection): "--dir", temp_dir, ) - monthly_aggregate = LinkAggregate.objects.all().first() - # assert only one monthly aggregates created for on_user_list=True + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates = PageProjectAggregate.objects.all().first() + # assert only one monthly aggregate created for on_user_list=True self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(1, LinkAggregate.objects.filter(day=0).count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) # assert daily aggregates were not created self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) # assert totals match expected totals - self.assertEqual(2, monthly_aggregate.total_links_added) - self.assertEqual(1, monthly_aggregate.total_links_removed) + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, 
monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, monthly_page_project_aggregates.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) @@ -2291,16 +2303,17 @@ def test_reaggregate_link_archives_monthly(self, mock_swift_connection): }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_multiple_projects(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-22"}]) + mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ { "model": "links.linkevent", @@ -2330,7 +2343,7 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co "fields": { "link": "https://www.test.com/", "timestamp": "2024-12-16T09:15:27.363Z", - "domain": "en.wikipedia.org", + "domain": "cy.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, "username": self.user.id, @@ -2374,7 +2387,7 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co "fields": { "link": "https://www.test.com/4", "timestamp": "2024-12-15T09:15:27.363Z", - "domain": "en.wikipedia.org", + "domain": "de.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, "username": self.user.id, @@ -2405,14 +2418,39 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co "--dir", temp_dir, ) - # assert no daily or monthly aggregates created - self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates_en = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() + monthly_page_project_aggregates_de = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() + monthly_page_project_aggregates_cy = PageProjectAggregate.objects.filter(project_name="cy.wikipedia.org").first() + + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(3, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, 
monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_de.total_links_removed) + self.assertEqual(0, monthly_page_project_aggregates_de.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_en.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_en.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_cy.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_cy.total_links_added) + finally: for file in glob.glob(archive_path): os.remove(file) - @mock.patch.dict( os.environ, { @@ -2422,16 +2460,17 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_multiple_pages(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[]) + mock_conn.get_container.return_value = ({}, []) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ { "model": "links.linkevent", @@ -2489,7 +2528,7 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, - "page_title": "test", + "page_title": "test2", "page_namespace": 0, "event_id": "", "user_is_bot": False, @@ -2526,10 +2565,6 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m with gzip.open(archive_path, "wt", encoding="utf-8") as f: json.dump(json_data, f) - - # create link events - call_command("loaddata", archive_path) - try: call_command( "reaggregate_link_archives", @@ -2540,9 +2575,31 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m "--dir", temp_dir, ) - # assert no daily or monthly aggregates created - self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() + monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, 
monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) + finally: for file in glob.glob(archive_path): os.remove(file) @@ -2557,16 +2614,168 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_multiple_users(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, - [], + [{"name": "archive-aggregates-backup-2024-12-22"}], ) mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) + + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_link_aggregate = 
LinkAggregate.objects.all().first() + monthly_user_aggregates_1 = UserAggregate.objects.filter(username=self.user.username).first() + monthly_user_aggregates_2 = UserAggregate.objects.filter(username=self.user2.username).first() + monthly_page_project_aggregates = PageProjectAggregate.objects.all().first() + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + self.assertEqual(2, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_user_aggregates_1.total_links_added) + self.assertEqual(0, monthly_user_aggregates_1.total_links_removed) + self.assertEqual(1, monthly_user_aggregates_2.total_links_added) + self.assertEqual(1, monthly_user_aggregates_2.total_links_removed) + self.assertEqual(2, monthly_page_project_aggregates.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-22"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) json_data = [ { "model": "links.linkevent", @@ -2658,6 +2867,725 @@ def test_reaggregate_link_archives_daily(self, mock_swift_connection): }, ] + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], 
+ ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + + # create link events + call_command("loaddata", archive_path) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": 
"links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + daily_pageproject_aggregate = PageProjectAggregate.objects.all().first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(1, 
daily_pageproject_aggregate.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "cy.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "de.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + daily_pageproject_aggregate1 = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() + daily_pageproject_aggregate2 = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() + # assert no monthly aggregates created + 
self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_pageproject_aggregate1.total_links_added) + self.assertEqual(0, daily_pageproject_aggregate1.total_links_removed) + self.assertEqual(0, daily_pageproject_aggregate2.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate2.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test2", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + 
"fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() + monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(0, monthly_page_project_aggregates_page_1.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.username, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + 
"change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.username, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + with gzip.open(archive_path, "wt", encoding="utf-8") as f: json.dump(json_data, f) @@ -2671,16 +3599,27 @@ def test_reaggregate_link_archives_daily(self, mock_swift_connection): "--dir", temp_dir, ) - daily_aggregate = LinkAggregate.objects.all().first() + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.filter(username=self.user.username).first() + daily_user_aggregate2 = UserAggregate.objects.filter(username=self.user2.username).first() # assert no monthly aggregates created self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + self.assertEqual(2, UserAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) # assert daily aggregates were created for the correct day self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(2, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) # assert totals match expected totals - self.assertEqual(1, daily_aggregate.total_links_added) - self.assertEqual(1, daily_aggregate.total_links_removed) + self.assertEqual(2, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(0, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate2.total_links_added) + self.assertEqual(1, daily_user_aggregate2.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) @@ -2809,6 +3748,8 @@ def 
test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_conn temp_dir, ) self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) @@ -2940,6 +3881,8 @@ def test_reaggregate_link_archives_daily_skips_if_linkevents_for_day(self, mock_ temp_dir, ) self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) @@ -3186,7 +4129,8 @@ def test_reaggregate_link_archives_only_link_event_archives(self, mock_swift_con temp_dir, ) self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) @@ -3293,7 +4237,8 @@ def test_reaggregate_link_archives_only_in_correct_zipped_format(self, mock_swif temp_dir, ) self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) From 606cb601b898b9806a41e6a6c357555e74f8e845 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 8/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../commands/reaggregate_link_archives.py | 12 +- extlinks/aggregates/tests.py | 726 +++++++++++++++--- 2 files changed, 632 insertions(+), 106 deletions(-) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py index 63b64a60..736ee664 100644 --- a/extlinks/aggregates/management/commands/reaggregate_link_archives.py +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -78,8 +78,8 @@ def _handle(self, *args, **options): logger.info("Swift credentials not provided. Skipping.") return False - # get existing archives to ensure we have not already aggregated - existing_link_aggregates_in_object_storage = self._get_existing_link_aggregates( + # get existing aggregates to ensure we have not already aggregated for the given timeframe + existing_aggregates = self._get_existing_aggregates( conn ) # get all URLPatterns for an organisation @@ -91,7 +91,7 @@ def _handle(self, *args, **options): # if we already have aggregates for this month uploaded, don't try to re-aggregate # or if we have not archived all events for the given timeframe, don't try to re-aggregate if self._has_aggregates_for_month( - existing_link_aggregates_in_object_storage, month_to_fix + existing_aggregates, month_to_fix ) or self._has_link_events_for_month(first_day_of_month, last_day_of_month): logger.warning( "Organisation already has aggregates or link events for month." 
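The hunks above and below widen the "already aggregated" skip check from link aggregates alone to every aggregate dump, by listing Swift objects under the shorter "aggregates_" prefix instead of "aggregates_linkaggregate_". As a minimal sketch only (the command's real month check is not shown in this patch), the helper below illustrates how such a check could be written against the object names used in the tests that follow, e.g. "aggregates_useraggregate_100_10__2024-12-22"; the function name and the date-parsing approach are assumptions for illustration, not the actual implementation.

from datetime import datetime

def _has_aggregates_for_month_sketch(existing_aggregates, month_to_fix):
    # Hypothetical helper, for illustration only. Object names are assumed to
    # end in a YYYY-MM-DD date, as in the test fixtures below
    # (e.g. "aggregates_linkaggregate_100_10_2024-12-22").
    month_prefix = datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m")
    return any(
        name.rsplit("_", 1)[-1].startswith(month_prefix)
        for name in existing_aggregates
    )

# e.g. _has_aggregates_for_month_sketch(
#     ["aggregates_pageprojectaggregate_2024-12-22"], "202412")  -> True

Under this assumption, any uploaded link, user, or page-project dump dated within the requested month causes the re-aggregation to be skipped, which is the behaviour the new "skips_if_uploaded_*" tests below exercise.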
@@ -106,7 +106,7 @@ def _handle(self, *args, **options): # if we already have aggregates for this day uploaded, don't try to re-aggregate # or if we have not archived all events for the given timeframe, don't try to re-aggregate if self._has_aggregates_for_day( - existing_link_aggregates_in_object_storage, day_to_fix + existing_aggregates, day_to_fix ) or self._has_link_events_for_day(day_to_fix): logger.warning( "Organisation already has aggregates or link events for day." @@ -118,7 +118,7 @@ def _handle(self, *args, **options): collections, day_to_fix, directory, url_patterns ) - def _get_existing_link_aggregates(self, conn): + def _get_existing_aggregates(self, conn): """ This function gets existing link aggregates from object storage. Parameters @@ -135,7 +135,7 @@ def _get_existing_link_aggregates(self, conn): for i in swift.get_object_list( conn, os.environ.get("SWIFT_CONTAINER_AGGREGATES", "archive-aggregates"), - "aggregates_linkaggregate_", + "aggregates_", ) ] return existing_link_aggregates_in_object_storage diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 798828cf..00d7d4d2 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2766,13 +2766,13 @@ def test_reaggregate_link_archives_monthly_multiple_users(self, mock_swift_conne }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_skips_if_uploaded_link_aggregates(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-22"}]) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_linkaggregate_100_10_2024-12-22"}]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -2898,13 +2898,13 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_skips_if_uploaded_user_aggregates(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[]) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_useraggregate_100_10__2024-12-22"}]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -3002,10 +3002,6 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m with gzip.open(archive_path, "wt", encoding="utf-8") as f: json.dump(json_data, f) - - # create link events - call_command("loaddata", archive_path) - try: call_command( "reaggregate_link_archives", @@ -3034,13 +3030,13 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_skips_if_uploaded_pageproject_aggregates(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value 
mock_conn.get_account.return_value = ( {}, - [], + [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[]) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_pageprojectaggregate_2024-12-22"}]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -3141,34 +3137,17 @@ def test_reaggregate_link_archives_daily(self, mock_swift_connection): try: call_command( "reaggregate_link_archives", - "--day", - "20241215", + "--month", + "202412", "--organisation", self.organisation.id, "--dir", temp_dir, ) - daily_link_aggregate = LinkAggregate.objects.all().first() - daily_user_aggregate = UserAggregate.objects.all().first() - daily_pageproject_aggregate = PageProjectAggregate.objects.all().first() - # assert no monthly aggregates created - self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(1, UserAggregate.objects.count()) - self.assertEqual(1, PageProjectAggregate.objects.count()) - # assert daily aggregates were created for the correct day - self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) - self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) - self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) - self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) - self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) - self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) - # assert totals match expected totals - self.assertEqual(1, daily_link_aggregate.total_links_added) - self.assertEqual(1, daily_link_aggregate.total_links_removed) - self.assertEqual(1, daily_user_aggregate.total_links_added) - self.assertEqual(1, daily_user_aggregate.total_links_removed) - self.assertEqual(1, daily_pageproject_aggregate.total_links_added) - self.assertEqual(1, daily_pageproject_aggregate.total_links_removed) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) @@ -3183,11 +3162,11 @@ def test_reaggregate_link_archives_daily(self, mock_swift_connection): }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, - [], + [{"name": "archive-aggregates-backup-2024-12-22"}], ) mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() @@ -3222,7 +3201,7 @@ def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_conn "fields": { "link": "https://www.test.com/", "timestamp": "2024-12-16T09:15:27.363Z", - "domain": "cy.wikipedia.org", + "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, "username": self.user.id, @@ -3266,7 +3245,7 @@ def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_conn "fields": { "link": "https://www.test.com/4", "timestamp": "2024-12-15T09:15:27.363Z", - "domain": "de.wikipedia.org", + "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, "username": 
self.user.id, @@ -3287,44 +3266,29 @@ def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_conn with gzip.open(archive_path, "wt", encoding="utf-8") as f: json.dump(json_data, f) + + # create link events + call_command("loaddata", archive_path) + try: call_command( "reaggregate_link_archives", - "--day", - "20241215", + "--month", + "202412", "--organisation", self.organisation.id, "--dir", temp_dir, ) - daily_link_aggregate = LinkAggregate.objects.all().first() - daily_user_aggregate = UserAggregate.objects.all().first() - daily_pageproject_aggregate1 = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() - daily_pageproject_aggregate2 = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() - # assert no monthly aggregates created - self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(1, UserAggregate.objects.count()) - self.assertEqual(2, PageProjectAggregate.objects.count()) - # assert daily aggregates were created for the correct day - self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) - self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) - self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) - self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) - self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) - self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) - # assert totals match expected totals - self.assertEqual(1, daily_link_aggregate.total_links_added) - self.assertEqual(1, daily_link_aggregate.total_links_removed) - self.assertEqual(1, daily_user_aggregate.total_links_added) - self.assertEqual(1, daily_user_aggregate.total_links_removed) - self.assertEqual(1, daily_pageproject_aggregate1.total_links_added) - self.assertEqual(0, daily_pageproject_aggregate1.total_links_removed) - self.assertEqual(0, daily_pageproject_aggregate2.total_links_added) - self.assertEqual(1, daily_pageproject_aggregate2.total_links_removed) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( os.environ, { @@ -3334,7 +3298,7 @@ def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_conn }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connection): + def test_reaggregate_link_archives_daily(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, @@ -3401,7 +3365,7 @@ def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connect "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, - "page_title": "test2", + "page_title": "test", "page_namespace": 0, "event_id": "", "user_is_bot": False, @@ -3450,32 +3414,30 @@ def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connect ) daily_link_aggregate = LinkAggregate.objects.all().first() daily_user_aggregate = UserAggregate.objects.all().first() - monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() - monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + daily_pageproject_aggregate = PageProjectAggregate.objects.all().first() # assert no 
monthly aggregates created self.assertEqual(1, LinkAggregate.objects.count()) self.assertEqual(1, UserAggregate.objects.count()) - self.assertEqual(2, PageProjectAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) # assert daily aggregates were created for the correct day self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) - self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) # assert totals match expected totals self.assertEqual(1, daily_link_aggregate.total_links_added) self.assertEqual(1, daily_link_aggregate.total_links_removed) self.assertEqual(1, daily_user_aggregate.total_links_added) self.assertEqual(1, daily_user_aggregate.total_links_removed) - self.assertEqual(0, monthly_page_project_aggregates_page_1.total_links_added) - self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) - self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) - self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) + self.assertEqual(1, daily_pageproject_aggregate.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( os.environ, { @@ -3485,7 +3447,7 @@ def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connect }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connection): + def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, @@ -3505,7 +3467,7 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, - "username": self.user.username, + "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, "page_title": "test", @@ -3523,11 +3485,11 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect "pk": 2, "fields": { "link": "https://www.test.com/", - "timestamp": "2024-12-15T09:15:27.363Z", - "domain": "en.wikipedia.org", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "cy.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, - "username": self.user.username, + "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, "page_title": "test", @@ -3549,9 +3511,9 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, - "username": self.user2.username, + "username": self.user.id, "rev_id": 485489, - "user_id": self.user2.id, + "user_id": self.user.id, "page_title": "test", "page_namespace": 0, "event_id": "", @@ -3568,12 +3530,12 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect "fields": { "link": "https://www.test.com/4", "timestamp": "2024-12-15T09:15:27.363Z", - 
"domain": "en.wikipedia.org", + "domain": "de.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, - "username": self.user2.username, + "username": self.user.id, "rev_id": 485489, - "user_id": self.user2.id, + "user_id": self.user.id, "page_title": "test", "page_namespace": 0, "event_id": "", @@ -3600,26 +3562,29 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect temp_dir, ) daily_link_aggregate = LinkAggregate.objects.all().first() - daily_user_aggregate = UserAggregate.objects.filter(username=self.user.username).first() - daily_user_aggregate2 = UserAggregate.objects.filter(username=self.user2.username).first() + daily_user_aggregate = UserAggregate.objects.all().first() + daily_pageproject_aggregate1 = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() + daily_pageproject_aggregate2 = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() # assert no monthly aggregates created self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(2, UserAggregate.objects.count()) - self.assertEqual(1, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) # assert daily aggregates were created for the correct day self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) - self.assertEqual(2, UserAggregate.objects.filter(day=15).count()) - self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) # assert totals match expected totals - self.assertEqual(2, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_added) self.assertEqual(1, daily_link_aggregate.total_links_removed) self.assertEqual(1, daily_user_aggregate.total_links_added) - self.assertEqual(0, daily_user_aggregate.total_links_removed) - self.assertEqual(1, daily_user_aggregate2.total_links_added) - self.assertEqual(1, daily_user_aggregate2.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_pageproject_aggregate1.total_links_added) + self.assertEqual(0, daily_pageproject_aggregate1.total_links_removed) + self.assertEqual(0, daily_pageproject_aggregate2.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate2.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) @@ -3633,13 +3598,13 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_connection): + def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [], ) - mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-15"}]) + mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -3700,7 +3665,7 @@ def 
test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_conn "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, - "page_title": "test", + "page_title": "test2", "page_namespace": 0, "event_id": "", "user_is_bot": False, @@ -3747,13 +3712,574 @@ def test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_conn "--dir", temp_dir, ) - self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, UserAggregate.objects.count()) - self.assertEqual(0, PageProjectAggregate.objects.count()) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() + monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(0, monthly_page_project_aggregates_page_1.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.username, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": 
"2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.username, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.filter(username=self.user.username).first() + daily_user_aggregate2 = UserAggregate.objects.filter(username=self.user2.username).first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(2, UserAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(2, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(0, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate2.total_links_added) + self.assertEqual(1, daily_user_aggregate2.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded_link_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + 
mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_linkaggregate_100_10_2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded_user_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_useraggregate_100_10__2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path 
= os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded_pageproject_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_pageprojectaggregate_100_10__2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + 
"content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( os.environ, {