From 8e5d0ea0eae1a27db5768ace49fe5810dc11415a Mon Sep 17 00:00:00 2001
From: Kgraessle
Date: Thu, 18 Sep 2025 14:14:40 -0500
Subject: [PATCH 1/8] Backfill missing Wall Street Journal aggregate data

- Created a command to fix a given org's monthly aggregates using already existing archived data
- This fixes an issue where link events were archived before they were aggregated

Bug: T404879
Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c
---
 .../commands/fill_link_aggregates.py          |   4 +-
 ...x_aggregates_for_organisation_and_month.py |  89 ++++
 extlinks/aggregates/tests.py                  | 437 ++++++++++++++++++
 3 files changed, 529 insertions(+), 1 deletion(-)
 create mode 100644 extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py

diff --git a/extlinks/aggregates/management/commands/fill_link_aggregates.py b/extlinks/aggregates/management/commands/fill_link_aggregates.py
index 7b9faff3..710d1245 100644
--- a/extlinks/aggregates/management/commands/fill_link_aggregates.py
+++ b/extlinks/aggregates/management/commands/fill_link_aggregates.py
@@ -8,7 +8,7 @@ from django.db.models.fields import DateField
 from ...models import LinkAggregate
-from extlinks.links.models import LinkEvent
+from extlinks.links.models import LinkEvent, URLPattern
 from extlinks.organisations.models import Collection
@@ -117,6 +117,8 @@ def _process_single_collection(self, link_event_filter, collection):
             None
         """
         url_patterns = collection.get_url_patterns()
+        if len(url_patterns) == 0:
+            url_patterns = URLPattern.objects.filter(collection=collection).all()
         for url_pattern in url_patterns:
             link_events_with_annotated_timestamp = url_pattern.link_events.annotate(
                 timestamp_date=Cast("timestamp", DateField())
diff --git a/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py b/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
new file mode 100644
index 00000000..5752756f
--- /dev/null
+++ b/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
@@ -0,0 +1,89 @@
+import gzip
+import json
+import os
+import logging
+from datetime import datetime
+
+from django.core.management import call_command
+
+from extlinks.common.management.commands import BaseCommand
+from extlinks.links.models import URLPattern, LinkEvent
+from extlinks.organisations.models import Organisation
+
+logger = logging.getLogger("django")
+class Command(BaseCommand):
+    help = "Loads, parses, and fixes monthly link aggregates for a given organisation."
+ + def add_arguments(self, parser): + parser.add_argument( + "--month", + help="The date (YYYYMM) of the monthly archive to be fixed.", + type=str, + ) + parser.add_argument( + "--organisation", + help="The organisation id to fix link aggregates for.", + type=str, + ) + parser.add_argument( + "--dir", help="The directory from which to parse archives.", type=str + ) + parser.add_argument( + "--skip-monthly", help="Skip the monthly aggregation and only fix daily", type=bool, default=False + ) + + + def _handle(self, *args, **options): + directory = options["dir"] + month_to_fix = options["month"] + organisation = Organisation.objects.filter(id=options["organisation"]).first() + collections = organisation.collection_set.all() + skip_monthly = options["skip_monthly"] + if not month_to_fix or not organisation or not collections or not directory: + return + url_pattern_strings = [i.url for i in URLPattern.objects.filter(collection__in=collections)] + org_only_events = [] + for filename in os.listdir(directory): + if ( + filename.endswith(".json.gz") + and filename.startswith("links_linkevent_") + and month_to_fix in filename + ): + file_path = os.path.join(directory, filename) + with gzip.open(file_path, "rt", encoding="utf-8") as f: + data = json.load(f) + for event in data: + link = event["fields"]["link"] + for url_pattern in url_pattern_strings: + if url_pattern in link: + org_only_events.append(event) + filtered_file = f"link_events_filtered_{month_to_fix}_organisation_{organisation.id}.json.gz" + filtered_file_path = os.path.join( + directory, + filtered_file, + ) + if len(org_only_events) > 0: + try: + with gzip.open( + filtered_file_path, "wt", encoding="utf-8" + ) as new_archive: + json.dump(org_only_events, new_archive) + logger.info(f"Aggregating {len(org_only_events)} filtered events for {filtered_file}") + # load filtered records into the database + call_command("archive_link_aggregates", "load", filtered_file_path) + # run aggregate command + call_command( + "fill_link_aggregates", + collections=[i.id for i in collections.all()], + ) + # run monthly aggregate command if we're not skipping it + if not skip_monthly: + call_command( + "fill_monthly_link_aggregates", + collections=collections, + year_month=datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m"), + ) + # delete the records from the database, as we do not need to re-archive or re-upload them + LinkEvent.objects.filter(pk__in=[i['pk'] for i in org_only_events]).delete() + except Exception as e: + logger.error(e) diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 44ed323f..85c64011 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -11,6 +11,7 @@ from dateutil.relativedelta import relativedelta from unittest import mock +from django.contrib.contenttypes.models import ContentType from django.core.management import call_command, CommandError from django.test import TransactionTestCase @@ -33,6 +34,7 @@ UserFactory, ) from extlinks.organisations.models import Organisation +from ..links.models import URLPattern, LinkEvent class BaseTransactionTest(TransactionTestCase): @@ -2128,3 +2130,438 @@ def test_uploads_all_files_successfully(self, mock_swift_connection): for file in glob.glob(pattern): os.remove(file) + +class FixAggregatesForOrganisationAndMonthCommandTest(BaseTransactionTest): + + def setUp(self): + # Creating one Collection, Organisation, and URLPattern + self.organisation = OrganisationFactory(name="ACME Org") + self.collection = 
CollectionFactory(organisation=self.organisation) + self.user = UserFactory() + self.url = URLPatternFactory(url="www.test.com") + self.url.collection = self.collection + self.url.save() + + def test_fixes_monthly_aggregates_for_organisation(self): + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "fix_aggregates_for_organisation_and_month", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_aggregate = LinkAggregate.objects.filter(day=0).first() + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(2, monthly_aggregate.total_links_added) + self.assertEqual(1, monthly_aggregate.total_links_removed) + self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives(self): + temp_dir = tempfile.gettempdir() + archive_filename = "aggregates_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": 
"2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "fix_aggregates_for_organisation_and_month", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives_in_correct_zipped_format(self): + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + 
"user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "fix_aggregates_for_organisation_and_month", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + def test_fixes_aggregates_for_organisation_skips_monthly_aggregation_command(self): + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "fix_aggregates_for_organisation_and_month", + "--month", + "202412", + '--skip-monthly', + True, + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + link_aggregate = LinkAggregate.objects.filter(day=16).first() + link_aggregate2 = LinkAggregate.objects.filter(day=15).first() + self.assertEqual(2, LinkAggregate.objects.count()) + self.assertEqual(1, link_aggregate.total_links_added) + 
self.assertEqual(0, link_aggregate.total_links_removed)
+            self.assertEqual(1, link_aggregate2.total_links_added)
+            self.assertEqual(1, link_aggregate2.total_links_removed)
+            self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count())
+        finally:
+            for file in glob.glob(archive_path):
+                os.remove(file)
+

From 65f27bf88d247fe1b4db5197eb4287c861a7d4fb Mon Sep 17 00:00:00 2001
From: Kgraessle
Date: Thu, 18 Sep 2025 14:14:40 -0500
Subject: [PATCH 2/8] Backfill missing Wall Street Journal aggregate data

- Created a command to fix a given org's monthly aggregates using already existing archived data
- This fixes an issue where link events were archived before they were aggregated

Bug: T404879
Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c
---
 ...x_aggregates_for_organisation_and_month.py |  89 ----------
 ...k_aggregates_for_organisation_and_month.py | 163 ++++++++++++++++++
 extlinks/aggregates/tests.py                  |  16 +-
 3 files changed, 171 insertions(+), 97 deletions(-)
 delete mode 100644 extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
 create mode 100644 extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py

diff --git a/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py b/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
deleted file mode 100644
index 5752756f..00000000
--- a/extlinks/aggregates/management/commands/fix_aggregates_for_organisation_and_month.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import gzip
-import json
-import os
-import logging
-from datetime import datetime
-
-from django.core.management import call_command
-
-from extlinks.common.management.commands import BaseCommand
-from extlinks.links.models import URLPattern, LinkEvent
-from extlinks.organisations.models import Organisation
-
-logger = logging.getLogger("django")
-class Command(BaseCommand):
-    help = "Loads, parses, and fixes monthly link aggregates for a given organisation."
- - def add_arguments(self, parser): - parser.add_argument( - "--month", - help="The date (YYYYMM) of the monthly archive to be fixed.", - type=str, - ) - parser.add_argument( - "--organisation", - help="The organisation id to fix link aggregates for.", - type=str, - ) - parser.add_argument( - "--dir", help="The directory from which to parse archives.", type=str - ) - parser.add_argument( - "--skip-monthly", help="Skip the monthly aggregation and only fix daily", type=bool, default=False - ) - - - def _handle(self, *args, **options): - directory = options["dir"] - month_to_fix = options["month"] - organisation = Organisation.objects.filter(id=options["organisation"]).first() - collections = organisation.collection_set.all() - skip_monthly = options["skip_monthly"] - if not month_to_fix or not organisation or not collections or not directory: - return - url_pattern_strings = [i.url for i in URLPattern.objects.filter(collection__in=collections)] - org_only_events = [] - for filename in os.listdir(directory): - if ( - filename.endswith(".json.gz") - and filename.startswith("links_linkevent_") - and month_to_fix in filename - ): - file_path = os.path.join(directory, filename) - with gzip.open(file_path, "rt", encoding="utf-8") as f: - data = json.load(f) - for event in data: - link = event["fields"]["link"] - for url_pattern in url_pattern_strings: - if url_pattern in link: - org_only_events.append(event) - filtered_file = f"link_events_filtered_{month_to_fix}_organisation_{organisation.id}.json.gz" - filtered_file_path = os.path.join( - directory, - filtered_file, - ) - if len(org_only_events) > 0: - try: - with gzip.open( - filtered_file_path, "wt", encoding="utf-8" - ) as new_archive: - json.dump(org_only_events, new_archive) - logger.info(f"Aggregating {len(org_only_events)} filtered events for {filtered_file}") - # load filtered records into the database - call_command("archive_link_aggregates", "load", filtered_file_path) - # run aggregate command - call_command( - "fill_link_aggregates", - collections=[i.id for i in collections.all()], - ) - # run monthly aggregate command if we're not skipping it - if not skip_monthly: - call_command( - "fill_monthly_link_aggregates", - collections=collections, - year_month=datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m"), - ) - # delete the records from the database, as we do not need to re-archive or re-upload them - LinkEvent.objects.filter(pk__in=[i['pk'] for i in org_only_events]).delete() - except Exception as e: - logger.error(e) diff --git a/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py b/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py new file mode 100644 index 00000000..263384f4 --- /dev/null +++ b/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py @@ -0,0 +1,163 @@ +import gzip +import json +import os +import logging +from datetime import datetime, timedelta, date + +from django.core.management import call_command +from django.db import transaction + +from extlinks.aggregates.models import LinkAggregate +from extlinks.common.management.commands import BaseCommand +from extlinks.links.models import URLPattern +from extlinks.organisations.models import Organisation + +logger = logging.getLogger("django") + + +class Command(BaseCommand): + help = ("Loads, parses, and fixes daily link aggregates for a given month and organisation. 
" + "Only run this command if the month's link events have not been already been aggregated.") + + def add_arguments(self, parser): + parser.add_argument( + "--month", + help="The date (YYYYMM) of the monthly archive to be fixed.", + type=str, + ) + parser.add_argument( + "--organisation", + help="The organisation id to fix link aggregates for.", + type=str, + ) + parser.add_argument( + "--dir", help="The directory from which to parse archives.", type=str + ) + parser.add_argument( + "--skip-monthly", help="Skip the monthly aggregation and only fix daily", type=bool, default=False + ) + + + def _handle(self, *args, **options): + directory = options["dir"] + month_to_fix = options["month"] + organisation = Organisation.objects.filter(id=options["organisation"]).first() + collections = organisation.collection_set.all() + skip_monthly = options["skip_monthly"] + if not month_to_fix or not organisation or not collections or not directory: + return + url_patterns = URLPattern.objects.filter(collection__in=collections) + events_split_by_url_pattern = self.load_events_from_archives( + directory, month_to_fix, [i.url for i in url_patterns] + ) + first_day_of_month = self.get_first_day_of_month(month_to_fix) + last_day_of_month = self.get_last_day_of_month(first_day_of_month) + try: + for i in range( + first_day_of_month.day, last_day_of_month.day+1 + ): + for collection in collections: + collection_url_pattern_strings = [ + i.url for i in url_patterns.filter(collection=collection) + ] + for collection_url_string in collection_url_pattern_strings: + for link_event in self.get_link_events_for_day( + collection_url_string, events_split_by_url_pattern, i + ): + self.fill_aggregate_from_archived_link_event(collection, link_event) + # run monthly aggregate command if we're not skipping it + if not skip_monthly: + # fill monthly aggregate for the subset of link events we just aggregated + call_command( + "fill_monthly_link_aggregates", + collections=collections, + year_month=datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m"), + ) + except Exception as e: + logger.info(f"Unexpected exception occurred: {e}") + + def fill_aggregate_from_archived_link_event(self, collection, link_event): + change_number = link_event["fields"]["change"] + existing_link_aggregate = ( + LinkAggregate.objects.filter( + organisation=collection.organisation.id, + collection=collection.id, + full_date=datetime.fromisoformat(link_event["fields"]["timestamp"]), + on_user_list=link_event["fields"]["on_user_list"], + ) + .exclude(day=0) + .first() + ) + if existing_link_aggregate is not None: + if change_number == 0: + existing_link_aggregate.total_links_removed += 1 + else: + existing_link_aggregate.total_links_added += 1 + existing_link_aggregate.save() + else: + # Create a new link aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + try: + with transaction.atomic(): + LinkAggregate.objects.create( + organisation=collection.organisation, + collection=collection, + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event["fields"]["on_user_list"], + ) + except Exception as e: + logger.info( + f"Unexpected exception occurred filling aggregate: {e}" + ) + + def get_link_events_for_day( + self, collection_url: str, events_split_by_url_pattern, i: int + ): + link_events_for_day = [ + j + for j in events_split_by_url_pattern[collection_url] + if 
datetime.fromisoformat(j["fields"]["timestamp"]).date().day == i + ] + return link_events_for_day + + def get_last_day_of_month(self, first_day_of_month: date) -> date: + if first_day_of_month.month == 12: + return first_day_of_month.replace(day=31) + replace = first_day_of_month.replace(month=first_day_of_month.month + 1) + return replace - timedelta(days=1) + + def get_first_day_of_month(self, month_to_fix: str) -> date: + return datetime.strptime(month_to_fix, "%Y%m").date().replace(day=1) + + def load_events_from_archives(self, directory: object, month_to_fix: str, url_pattern_strings) -> object: + events_split_by_url_pattern = dict.fromkeys(url_pattern_strings) + # initialize empty array for each url pattern in the org + for key, value in events_split_by_url_pattern.items(): + events_split_by_url_pattern[key] = [] + for filename in os.listdir(directory): + if ( + filename.endswith(".json.gz") + and filename.startswith("links_linkevent_") + and month_to_fix in filename + ): + try: + file_path = os.path.join(directory, filename) + with gzip.open(file_path, "rt", encoding="utf-8") as f: + data = json.load(f) + for event in data: + link = event["fields"]["link"] + for url_pattern in url_pattern_strings: + if url_pattern in link: + events_split_by_url_pattern[url_pattern].append( + event + ) + except Exception as e: + logger.info( + f"Unexpected exception occurred loading events from archive: {e}" + ) + return events_split_by_url_pattern diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 85c64011..7a296c22 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2142,7 +2142,7 @@ def setUp(self): self.url.collection = self.collection self.url.save() - def test_fixes_monthly_aggregates_for_organisation(self): + def test_fixes_link_aggregates_for_organisation_and_month(self): temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -2242,7 +2242,7 @@ def test_fixes_monthly_aggregates_for_organisation(self): try: call_command( - "fix_aggregates_for_organisation_and_month", + "fix_link_aggregates_for_organisation_and_month", "--month", "202412", "--organisation", @@ -2250,7 +2250,7 @@ def test_fixes_monthly_aggregates_for_organisation(self): "--dir", temp_dir, ) - monthly_aggregate = LinkAggregate.objects.filter(day=0).first() + monthly_aggregate = LinkAggregate.objects.all().first() self.assertEqual(1, LinkAggregate.objects.count()) self.assertEqual(2, monthly_aggregate.total_links_added) self.assertEqual(1, monthly_aggregate.total_links_removed) @@ -2259,7 +2259,7 @@ def test_fixes_monthly_aggregates_for_organisation(self): for file in glob.glob(archive_path): os.remove(file) - def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives(self): + def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archives(self): temp_dir = tempfile.gettempdir() archive_filename = "aggregates_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -2337,7 +2337,7 @@ def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives try: call_command( - "fix_aggregates_for_organisation_and_month", + "fix_link_aggregates_for_organisation_and_month", "--month", "202412", "--organisation", @@ -2351,7 +2351,7 @@ def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives for file in glob.glob(archive_path): os.remove(file) - def 
test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives_in_correct_zipped_format(self): + def test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in_correct_zipped_format(self): temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json" archive_path = os.path.join(temp_dir, archive_filename) @@ -2429,7 +2429,7 @@ def test_fixes_monthly_aggregates_for_organisation_scopes_to_link_event_archives try: call_command( - "fix_aggregates_for_organisation_and_month", + "fix_link_aggregates_for_organisation_and_month", "--month", "202412", "--organisation", @@ -2543,7 +2543,7 @@ def test_fixes_aggregates_for_organisation_skips_monthly_aggregation_command(sel try: call_command( - "fix_aggregates_for_organisation_and_month", + "fix_link_aggregates_for_organisation_and_month", "--month", "202412", '--skip-monthly', From cba0539935628ff95d62f223ffb3bc81eb46d5f4 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 3/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../commands/reaggregate_link_archives.py | 479 ++++++++++++++++ extlinks/aggregates/tests.py | 519 +++++++++++++++++- 2 files changed, 972 insertions(+), 26 deletions(-) create mode 100644 extlinks/aggregates/management/commands/reaggregate_link_archives.py diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py new file mode 100644 index 00000000..c4af8dfc --- /dev/null +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -0,0 +1,479 @@ +import gzip +import json +import os +import logging +from datetime import datetime, timedelta, date + +from django.db import transaction + +from extlinks.aggregates.models import LinkAggregate +from extlinks.common import swift +from extlinks.common.management.commands import BaseCommand +from extlinks.links.models import URLPattern +from extlinks.organisations.models import Organisation + +logger = logging.getLogger("django") + + +class Command(BaseCommand): + help = ( + "Loads, parses, and fixes daily or monthly link aggregates for a given organisation. " + "WARNING: Only run this command if you are certain the link events have been archived before being aggregated." + ) + + def add_arguments(self, parser): + parser.add_argument( + "--month", + help="If provided, will fix a monthly aggregate. The date (YYYYMM) of the monthly archive to be fixed.", + type=str, + ) + parser.add_argument( + "--day", + help="If provided, will fix a daily aggregate. 
The date (YYYYMMDD) of the daily archive to be fixed.", + type=str, + ) + parser.add_argument( + "--organisation", + help="The organisation id to fix link aggregates for.", + type=str, + ) + parser.add_argument( + "--dir", help="The directory from which to parse archives.", type=str + ) + + def _handle(self, *args, **options): + directory = options["dir"] + month_to_fix = options["month"] + day_to_fix = options["day"] + organisation = Organisation.objects.filter(id=options["organisation"]).first() + collections = organisation.collection_set.all() + + if not month_to_fix and not day_to_fix: + logger.warning("Please provide a month or day to fix.") + return + if month_to_fix and day_to_fix: + logger.warning("Please only provide a month or a day to fix-not both.") + return + if not directory: + logger.warning("Please provide a directory from which to parse archives.") + return + if not organisation: + logger.warning( + "Please provide an organisation for which to parse archives." + ) + return + if not collections: + logger.warning( + "Please provide an organisation which has collections for which to fix archives." + ) + return + try: + conn = swift.swift_connection() + except RuntimeError: + logger.info("Swift credentials not provided. Skipping.") + return False + + # get existing archives to ensure we have not already aggregated + existing_link_aggregates_in_object_storage = self._get_existing_link_aggregates( + conn + ) + # get all URLPatterns for an organisation + url_patterns = URLPattern.objects.filter(collection__in=collections) + + if month_to_fix: + # if we already have aggregates for this month uploaded, don't try to re-aggregate + if self._has_aggregates_for_month( + existing_link_aggregates_in_object_storage, month_to_fix + ): + return + # otherwise, attempt re-aggregation + with transaction.atomic(): + self._process_monthly_aggregates( + directory, month_to_fix, organisation, url_patterns + ) + else: + # if we already have aggregates for this day uploaded, don't try to re-aggregate + if self._has_aggregates_for_day( + existing_link_aggregates_in_object_storage, day_to_fix + ): + return + # otherwise, attempt re-aggregation + with transaction.atomic(): + self._process_daily_aggregates( + collections, day_to_fix, directory, url_patterns + ) + + def _get_existing_link_aggregates(self, conn): + """ + This function gets existing link aggregates from object storage. + Parameters + ---------- + conn : swiftclient.Connection + A connection to the Swift object storage. + + Returns + ------- + An array of existing link aggregates from object storage. + """ + existing_link_aggregates_in_object_storage = [ + i["name"] + for i in swift.get_object_list( + conn, + os.environ.get("SWIFT_CONTAINER_AGGREGATES", "archive-aggregates"), + "aggregates_linkaggregate_", + ) + ] + return existing_link_aggregates_in_object_storage + + def _has_aggregates_for_month( + self, existing_link_aggregates_in_object_storage, month_to_fix + ): + """ + This function checks whether there are existing aggregates for the month to fix. + Parameters + ---------- + existing_link_aggregates_in_object_storage : An array of existing link aggregates from object storage. 
+ + month_to_fix : str + + Returns + ------- + bool: whether there are existing aggregates for a given month in object storage + """ + return ( + len( + [ + i + for i in existing_link_aggregates_in_object_storage + if self._get_first_day_of_month(month_to_fix).strftime("%Y-%m") in i + ] + ) + > 0 + ) + + def _has_aggregates_for_day( + self, existing_link_aggregates_in_object_storage, day_to_fix + ): + """ + This function checks whether there are existing aggregates for the day to fix. + Parameters + ---------- + existing_link_aggregates_in_object_storage : An array of existing link aggregates from object storage. + + day_to_fix : str + + Returns + ------- + bool: whether there are existing aggregates for a given day in object storage + """ + day_to_fix_formatted = ( + datetime.fromisoformat(day_to_fix).date().strftime("%Y-%m-%d") + ) + return ( + len( + [ + i + for i in existing_link_aggregates_in_object_storage + if day_to_fix_formatted in i + ] + ) + > 0 + ) + + def _process_daily_aggregates( + self, collections, day_to_fix, directory, url_patterns + ): + """ + This function loops through each url pattern and link event to fill the daily aggregates. + Parameters + ---------- + collections : An array of collections + + day_to_fix : str + + directory : str + + url_patterns : An array of url patterns + + Returns + ------- + None + """ + # pull month string from day input parameter + month_to_fix = day_to_fix[:-2] + # load and split link events by url pattern + events_split_by_url_pattern = self._load_events_from_archives( + directory, month_to_fix, [i.url for i in url_patterns] + ) + # loop through each collection + for collection in collections: + collection_url_pattern_strings = [ + i.url for i in url_patterns.filter(collection=collection) + ] + # loop through each collection's URLPatterns + for collection_url_string in collection_url_pattern_strings: + # get the link events for the collection and day + for link_event in self._get_link_events_for_day( + collection_url_string, + events_split_by_url_pattern, + int(day_to_fix[-2:]), + ): + # create daily aggregates + self._fill_daily_aggregate(collection, link_event) + + def _fill_daily_aggregate(self, collection, link_event): + """ + This function updates or creates a daily LinkAggregate for a collection and a parsed JSON object(LinkEvent). 
+ Parameters + ---------- + collection : Collection + + link_event : obj + + Returns + ------- + None + """ + change_number = link_event["fields"]["change"] + existing_link_aggregate = ( + LinkAggregate.objects.filter( + organisation=collection.organisation.id, + collection=collection.id, + full_date=datetime.fromisoformat(link_event["fields"]["timestamp"]), + on_user_list=link_event["fields"]["on_user_list"], + ) + .exclude(day=0) + .first() + ) + if existing_link_aggregate is None: + # Create a new link aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + LinkAggregate.objects.create( + organisation=collection.organisation, + collection=collection, + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event["fields"]["on_user_list"], + ) + else: + if change_number == 0: + existing_link_aggregate.total_links_removed += 1 + else: + existing_link_aggregate.total_links_added += 1 + existing_link_aggregate.save() + + def _process_monthly_aggregates( + self, directory, month_to_fix, organisation, url_patterns + ): + """ + This function loops through each url pattern and link events to fill the monthly aggregates. + Parameters + ---------- + directory : str + + month_to_fix : str + + organisation : Organisation + + url_patterns : An array of url patterns + + Returns + ------- + None + """ + # load and split link events by url pattern + events_split_by_url_pattern = self._load_events_from_archives( + directory, month_to_fix, [i.url for i in url_patterns] + ) + # get the first and last day of the month to fix + first_day_of_month = self._get_first_day_of_month(month_to_fix) + last_day_of_month = self._get_last_day_of_month(first_day_of_month) + for url_pattern, link_events in events_split_by_url_pattern.items(): + # create monthly aggregates + self._fill_monthly_aggregate( + url_pattern, last_day_of_month, organisation, url_patterns, link_events + ) + + def _fill_monthly_aggregate( + self, url_pattern, last_day_of_month, organisation, url_patterns, link_events + ): + """ + This function fills monthly LinkAggregates for an organisation and a parsed JSON object(LinkEvent). + Parameters + ---------- + url_pattern : str + + last_day_of_month : date + + organisation : Organisation + + url_patterns : An array of url patterns + + link_events : an array of link event JSON objects + + Returns + ------- + None + """ + # find the collection associated with this url + collection = url_patterns.filter(url=url_pattern).first().collection + self._process_monthly_events( + True, link_events, collection, organisation, last_day_of_month + ) + self._process_monthly_events( + False, link_events, collection, organisation, last_day_of_month + ) + + def _process_monthly_events( + self, + on_user_list_flag, + link_events, + collection, + organisation, + last_day_of_month, + ): + """ + This function updates or creates a monthly LinkAggregate for a collection and parsed JSON objects(LinkEvents). 
+ Parameters + ---------- + on_user_list_flag : bool, whether the aggregate should save with on_user_list flag or not + + link_events : an array of link event JSON objects + + collection: a Collection + + organisation: Organisation + + last_day_of_month: date + + Returns + ------- + None + """ + events = [ + i for i in link_events if i["fields"]["on_user_list"] is on_user_list_flag + ] + if not events: + return + + total_added = sum(1 for i in events if i["fields"]["change"] == 1) + total_removed = sum(1 for i in events if i["fields"]["change"] == 0) + + existing_aggregate = LinkAggregate.objects.filter( + organisation_id=organisation.id, + collection_id=collection.id, + on_user_list=on_user_list_flag, + full_date=last_day_of_month, + day=0, + ) + + if existing_aggregate.exists(): + existing_aggregate.update( + total_links_added=total_added, + total_links_removed=total_removed, + ) + else: + LinkAggregate.objects.create( + organisation_id=organisation.id, + collection_id=collection.id, + on_user_list=on_user_list_flag, + full_date=last_day_of_month, + day=0, + total_links_added=total_added, + total_links_removed=total_removed, + ) + + def _get_link_events_for_day( + self, collection_url: str, events_split_by_url_pattern, day: int + ): + """ + This function splits parsed JSON objects(LinkEvent) by collection url pattern. + Parameters + ---------- + collection_url : str + + events_split_by_url_pattern : an array of link event JSON objects + + Returns + ------- + link_events_for_day : an array of link event JSON objects filtered by day + """ + link_events_for_day = [ + j + for j in events_split_by_url_pattern[collection_url] + if datetime.fromisoformat(j["fields"]["timestamp"]).date().day == day + ] + return link_events_for_day + + def _get_last_day_of_month(self, first_day_of_month: date) -> date: + """ + This function gets the last day of the month from the first day of the input month + Parameters + ---------- + first_day_of_month : date + + Returns + ------- + date + """ + if first_day_of_month.month == 12: + return first_day_of_month.replace(day=31) + replace = first_day_of_month.replace(month=first_day_of_month.month + 1) + return replace - timedelta(days=1) + + def _get_first_day_of_month(self, month_to_fix: str) -> date: + """ + This function gets the first day of the month from the input month + Parameters + ---------- + month_to_fix : str + + Returns + ------- + date + """ + return datetime.strptime(month_to_fix, "%Y%m").date().replace(day=1) + + def _load_events_from_archives( + self, directory: object, month_to_fix: str, url_pattern_strings + ) -> object: + """Parse archived .json.gz files and split the link events by URL pattern. 
+ Parameters + ---------- + directory : str + + month_to_fix : str + + url_pattern_strings : an array of str + + Returns + ------- + parsed JSON link event objects + """ + events_split_by_url_pattern = {url: [] for url in url_pattern_strings} + for filename in os.listdir(directory): + if ( + filename.endswith(".json.gz") + and filename.startswith("links_linkevent_") + and month_to_fix in filename + ): + try: + file_path = os.path.join(directory, filename) + with gzip.open(file_path, "rt", encoding="utf-8") as f: + data = json.load(f) + for event in data: + link = event["fields"]["link"] + for url_pattern in url_pattern_strings: + if url_pattern in link: + events_split_by_url_pattern[url_pattern].append( + event + ) + except Exception as e: + logger.info( + f"Unexpected exception occurred loading events from archive: {e}" + ) + return events_split_by_url_pattern diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 7a296c22..afd3c953 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2131,6 +2131,7 @@ def test_uploads_all_files_successfully(self, mock_swift_connection): for file in glob.glob(pattern): os.remove(file) + class FixAggregatesForOrganisationAndMonthCommandTest(BaseTransactionTest): def setUp(self): @@ -2142,10 +2143,26 @@ def setUp(self): self.url.collection = self.collection self.url.save() - def test_fixes_link_aggregates_for_organisation_and_month(self): + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ { "model": "links.linkevent", @@ -2242,7 +2259,7 @@ def test_fixes_link_aggregates_for_organisation_and_month(self): try: call_command( - "fix_link_aggregates_for_organisation_and_month", + "reaggregate_link_archives", "--month", "202412", "--organisation", @@ -2251,17 +2268,38 @@ def test_fixes_link_aggregates_for_organisation_and_month(self): temp_dir, ) monthly_aggregate = LinkAggregate.objects.all().first() + # assert only one monthly aggregates created for on_user_list=True self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=0).count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + # assert totals match expected totals self.assertEqual(2, monthly_aggregate.total_links_added) self.assertEqual(1, monthly_aggregate.total_links_removed) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) finally: for file in glob.glob(archive_path): os.remove(file) - def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archives(self): + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def 
test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-22"}]) temp_dir = tempfile.gettempdir() - archive_filename = "aggregates_20241222_0.json.gz" + archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) json_data = [ { @@ -2313,7 +2351,7 @@ def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archiv "pk": 3, "fields": { "link": "https://www.test.com/3", - "timestamp": "2024-01-15T09:15:27.363Z", + "timestamp": "2024-12-15T09:15:27.363Z", "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, @@ -2330,6 +2368,28 @@ def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archiv "url": [] } }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, ] with gzip.open(archive_path, "wt", encoding="utf-8") as f: @@ -2337,7 +2397,7 @@ def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archiv try: call_command( - "fix_link_aggregates_for_organisation_and_month", + "reaggregate_link_archives", "--month", "202412", "--organisation", @@ -2345,15 +2405,32 @@ def test_fixes_link_aggregates_for_organisation_and_month_only_link_event_archiv "--dir", temp_dir, ) + # assert no daily or monthly aggregates created self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) finally: for file in glob.glob(archive_path): os.remove(file) - def test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in_correct_zipped_format(self): + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() - archive_filename = "links_linkevent_20241222_0.json" + archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) json_data = [ { @@ -2405,7 +2482,7 @@ def test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in "pk": 3, "fields": { "link": "https://www.test.com/3", - "timestamp": "2024-01-15T09:15:27.363Z", + "timestamp": "2024-12-15T09:15:27.363Z", "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, @@ -2422,6 +2499,28 @@ def 
test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in "url": [] } }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, ] with gzip.open(archive_path, "wt", encoding="utf-8") as f: @@ -2429,21 +2528,44 @@ def test_fixes_link_aggregates_for_organisation_and_month_link_event_archives_in try: call_command( - "fix_link_aggregates_for_organisation_and_month", - "--month", - "202412", + "reaggregate_link_archives", + "--day", + "20241215", "--organisation", self.organisation.id, "--dir", temp_dir, ) - self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + daily_aggregate = LinkAggregate.objects.all().first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_aggregate.total_links_added) + self.assertEqual(1, daily_aggregate.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) - def test_fixes_aggregates_for_organisation_skips_monthly_aggregation_command(self): + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-15"}]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -2543,25 +2665,370 @@ def test_fixes_aggregates_for_organisation_skips_monthly_aggregation_command(sel try: call_command( - "fix_link_aggregates_for_organisation_and_month", + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_on_and_off_user_list(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = 
os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": False, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", "--month", "202412", - '--skip-monthly', - True, "--organisation", self.organisation.id, "--dir", temp_dir, ) - link_aggregate = LinkAggregate.objects.filter(day=16).first() - link_aggregate2 = LinkAggregate.objects.filter(day=15).first() + monthly_aggregate_on_user_list = LinkAggregate.objects.filter(on_user_list=True).first() + monthly_aggregate_not_on_user_list = LinkAggregate.objects.filter(on_user_list=False).first() + # assert two monthly aggregates were created for on_user_list=True and on_user_list=False self.assertEqual(2, LinkAggregate.objects.count()) - self.assertEqual(1, link_aggregate.total_links_added) - self.assertEqual(0, link_aggregate.total_links_removed) - self.assertEqual(1, link_aggregate2.total_links_added) - self.assertEqual(1, link_aggregate2.total_links_removed) + self.assertEqual(2, LinkAggregate.objects.filter(day=0).count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_aggregate_on_user_list.total_links_added) + self.assertEqual(0, monthly_aggregate_on_user_list.total_links_removed) + self.assertEqual(0, 
monthly_aggregate_not_on_user_list.total_links_added) + self.assertEqual(1, monthly_aggregate_not_on_user_list.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_only_link_event_archives(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "aggregates_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) finally: for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_only_in_correct_zipped_format(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + 
"fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) From e6e25cbb75295259a82362e9530f13fc09534fa7 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 4/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- ...k_aggregates_for_organisation_and_month.py | 163 ------------------ 1 file changed, 163 deletions(-) delete mode 100644 extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py diff --git a/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py b/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py deleted file mode 100644 index 263384f4..00000000 --- a/extlinks/aggregates/management/commands/fix_link_aggregates_for_organisation_and_month.py +++ /dev/null @@ -1,163 +0,0 @@ -import gzip -import json -import os -import logging -from datetime import datetime, timedelta, date - -from django.core.management import call_command -from django.db import transaction - -from extlinks.aggregates.models import LinkAggregate -from extlinks.common.management.commands import BaseCommand -from extlinks.links.models import URLPattern -from extlinks.organisations.models import Organisation - -logger = logging.getLogger("django") - - -class Command(BaseCommand): - help = ("Loads, parses, and fixes daily link aggregates for a 
given month and organisation. " - "Only run this command if the month's link events have not been already been aggregated.") - - def add_arguments(self, parser): - parser.add_argument( - "--month", - help="The date (YYYYMM) of the monthly archive to be fixed.", - type=str, - ) - parser.add_argument( - "--organisation", - help="The organisation id to fix link aggregates for.", - type=str, - ) - parser.add_argument( - "--dir", help="The directory from which to parse archives.", type=str - ) - parser.add_argument( - "--skip-monthly", help="Skip the monthly aggregation and only fix daily", type=bool, default=False - ) - - - def _handle(self, *args, **options): - directory = options["dir"] - month_to_fix = options["month"] - organisation = Organisation.objects.filter(id=options["organisation"]).first() - collections = organisation.collection_set.all() - skip_monthly = options["skip_monthly"] - if not month_to_fix or not organisation or not collections or not directory: - return - url_patterns = URLPattern.objects.filter(collection__in=collections) - events_split_by_url_pattern = self.load_events_from_archives( - directory, month_to_fix, [i.url for i in url_patterns] - ) - first_day_of_month = self.get_first_day_of_month(month_to_fix) - last_day_of_month = self.get_last_day_of_month(first_day_of_month) - try: - for i in range( - first_day_of_month.day, last_day_of_month.day+1 - ): - for collection in collections: - collection_url_pattern_strings = [ - i.url for i in url_patterns.filter(collection=collection) - ] - for collection_url_string in collection_url_pattern_strings: - for link_event in self.get_link_events_for_day( - collection_url_string, events_split_by_url_pattern, i - ): - self.fill_aggregate_from_archived_link_event(collection, link_event) - # run monthly aggregate command if we're not skipping it - if not skip_monthly: - # fill monthly aggregate for the subset of link events we just aggregated - call_command( - "fill_monthly_link_aggregates", - collections=collections, - year_month=datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m"), - ) - except Exception as e: - logger.info(f"Unexpected exception occurred: {e}") - - def fill_aggregate_from_archived_link_event(self, collection, link_event): - change_number = link_event["fields"]["change"] - existing_link_aggregate = ( - LinkAggregate.objects.filter( - organisation=collection.organisation.id, - collection=collection.id, - full_date=datetime.fromisoformat(link_event["fields"]["timestamp"]), - on_user_list=link_event["fields"]["on_user_list"], - ) - .exclude(day=0) - .first() - ) - if existing_link_aggregate is not None: - if change_number == 0: - existing_link_aggregate.total_links_removed += 1 - else: - existing_link_aggregate.total_links_added += 1 - existing_link_aggregate.save() - else: - # Create a new link aggregate - links_added = change_number if change_number > 0 else 0 - links_removed = 1 if change_number == 0 else 0 - try: - with transaction.atomic(): - LinkAggregate.objects.create( - organisation=collection.organisation, - collection=collection, - full_date=datetime.fromisoformat( - link_event["fields"]["timestamp"] - ).date(), - total_links_added=links_added, - total_links_removed=links_removed, - on_user_list=link_event["fields"]["on_user_list"], - ) - except Exception as e: - logger.info( - f"Unexpected exception occurred filling aggregate: {e}" - ) - - def get_link_events_for_day( - self, collection_url: str, events_split_by_url_pattern, i: int - ): - link_events_for_day = [ - j - for j in 
events_split_by_url_pattern[collection_url] - if datetime.fromisoformat(j["fields"]["timestamp"]).date().day == i - ] - return link_events_for_day - - def get_last_day_of_month(self, first_day_of_month: date) -> date: - if first_day_of_month.month == 12: - return first_day_of_month.replace(day=31) - replace = first_day_of_month.replace(month=first_day_of_month.month + 1) - return replace - timedelta(days=1) - - def get_first_day_of_month(self, month_to_fix: str) -> date: - return datetime.strptime(month_to_fix, "%Y%m").date().replace(day=1) - - def load_events_from_archives(self, directory: object, month_to_fix: str, url_pattern_strings) -> object: - events_split_by_url_pattern = dict.fromkeys(url_pattern_strings) - # initialize empty array for each url pattern in the org - for key, value in events_split_by_url_pattern.items(): - events_split_by_url_pattern[key] = [] - for filename in os.listdir(directory): - if ( - filename.endswith(".json.gz") - and filename.startswith("links_linkevent_") - and month_to_fix in filename - ): - try: - file_path = os.path.join(directory, filename) - with gzip.open(file_path, "rt", encoding="utf-8") as f: - data = json.load(f) - for event in data: - link = event["fields"]["link"] - for url_pattern in url_pattern_strings: - if url_pattern in link: - events_split_by_url_pattern[url_pattern].append( - event - ) - except Exception as e: - logger.info( - f"Unexpected exception occurred loading events from archive: {e}" - ) - return events_split_by_url_pattern From 8ac1a2d737ae8298099a8fb90412e7fa98173ae9 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 5/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../commands/reaggregate_link_archives.py | 25 +- extlinks/aggregates/tests.py | 265 ++++++++++++++++++ 2 files changed, 282 insertions(+), 8 deletions(-) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py index c4af8dfc..592a5944 100644 --- a/extlinks/aggregates/management/commands/reaggregate_link_archives.py +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -9,7 +9,7 @@ from extlinks.aggregates.models import LinkAggregate from extlinks.common import swift from extlinks.common.management.commands import BaseCommand -from extlinks.links.models import URLPattern +from extlinks.links.models import URLPattern, LinkEvent from extlinks.organisations.models import Organisation logger = logging.getLogger("django") @@ -18,7 +18,6 @@ class Command(BaseCommand): help = ( "Loads, parses, and fixes daily or monthly link aggregates for a given organisation. " - "WARNING: Only run this command if you are certain the link events have been archived before being aggregated." 
) def add_arguments(self, parser): @@ -81,21 +80,25 @@ def _handle(self, *args, **options): url_patterns = URLPattern.objects.filter(collection__in=collections) if month_to_fix: + first_day_of_month = self._get_first_day_of_month(month_to_fix) + last_day_of_month = self._get_last_day_of_month(first_day_of_month) # if we already have aggregates for this month uploaded, don't try to re-aggregate + # or if we have not archived all events for the given timeframe, don't try to re-aggregate if self._has_aggregates_for_month( existing_link_aggregates_in_object_storage, month_to_fix - ): + ) or self._has_link_events_for_month(first_day_of_month, last_day_of_month): return # otherwise, attempt re-aggregation with transaction.atomic(): self._process_monthly_aggregates( - directory, month_to_fix, organisation, url_patterns + directory, month_to_fix, organisation, url_patterns, last_day_of_month ) else: # if we already have aggregates for this day uploaded, don't try to re-aggregate + # or if we have not archived all events for the given timeframe, don't try to re-aggregate if self._has_aggregates_for_day( existing_link_aggregates_in_object_storage, day_to_fix - ): + ) or self._has_link_events_for_day(day_to_fix): return # otherwise, attempt re-aggregation with transaction.atomic(): @@ -180,6 +183,14 @@ def _has_aggregates_for_day( > 0 ) + def _has_link_events_for_month(self, first_day_of_month, last_day_of_month): + return LinkEvent.objects.filter(timestamp__gte=first_day_of_month, timestamp__lte=last_day_of_month).count() > 0 + + def _has_link_events_for_day(self, day_to_fix): + day = datetime.fromisoformat(day_to_fix) + return LinkEvent.objects.filter(timestamp__gte=day, timestamp__lte=day + timedelta(days=1)).count() > 0 + + def _process_daily_aggregates( self, collections, day_to_fix, directory, url_patterns ): @@ -267,7 +278,7 @@ def _fill_daily_aggregate(self, collection, link_event): existing_link_aggregate.save() def _process_monthly_aggregates( - self, directory, month_to_fix, organisation, url_patterns + self, directory, month_to_fix, organisation, url_patterns, last_day_of_month ): """ This function loops through each url pattern and link events to fill the monthly aggregates. 
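For context, a minimal usage sketch of the reaggregate_link_archives command once the guards above are in place. The invocation mirrors the call_command calls in the test suite further down; the organisation id and archive directory shown here are placeholders, not values taken from this change:

    from django.core.management import call_command

    # Re-aggregate December 2024 for one organisation from already-archived link
    # event dumps (links_linkevent_*.json.gz in --dir). The command is a no-op when
    # monthly aggregates were already uploaded to object storage, or when
    # un-archived LinkEvent rows still exist for that month.
    call_command(
        "reaggregate_link_archives",
        "--month", "202412",        # use "--day", "20241215" to fix a single day instead
        "--organisation", "42",     # placeholder organisation id
        "--dir", "/srv/extlinks/archives",  # placeholder archive directory
    )
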
@@ -290,8 +301,6 @@ def _process_monthly_aggregates( directory, month_to_fix, [i.url for i in url_patterns] ) # get the first and last day of the month to fix - first_day_of_month = self._get_first_day_of_month(month_to_fix) - last_day_of_month = self._get_last_day_of_month(first_day_of_month) for url_pattern, link_events in events_split_by_url_pattern.items(): # create monthly aggregates self._fill_monthly_aggregate( diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index afd3c953..e40474a0 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2413,6 +2413,141 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co os.remove(file) + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, 
"wt", encoding="utf-8") as f: + json.dump(json_data, f) + + + # create link events + call_command("loaddata", archive_path) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( os.environ, { @@ -2678,6 +2813,136 @@ def test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_conn for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_linkevents_for_day(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + # 
create link events + call_command("loaddata", archive_path) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) @mock.patch.dict( os.environ, From c206eb22272730030fe34aa84aeeec4962220ed6 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 6/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../management/commands/reaggregate_link_archives.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py index 592a5944..98fa5549 100644 --- a/extlinks/aggregates/management/commands/reaggregate_link_archives.py +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -48,10 +48,10 @@ def _handle(self, *args, **options): collections = organisation.collection_set.all() if not month_to_fix and not day_to_fix: - logger.warning("Please provide a month or day to fix.") + logger.warning("Please provide a month (e.g. 202509) or day (e.g. 20250920 ) to fix.") return if month_to_fix and day_to_fix: - logger.warning("Please only provide a month or a day to fix-not both.") + logger.warning("Please only provide a month (e.g. 202509) or a day (e.g. 20250920 ) to fix-not both.") return if not directory: logger.warning("Please provide a directory from which to parse archives.") @@ -87,6 +87,9 @@ def _handle(self, *args, **options): if self._has_aggregates_for_month( existing_link_aggregates_in_object_storage, month_to_fix ) or self._has_link_events_for_month(first_day_of_month, last_day_of_month): + logger.warning( + "Organisation already has aggregates or link events for month." + ) return # otherwise, attempt re-aggregation with transaction.atomic(): @@ -99,6 +102,9 @@ def _handle(self, *args, **options): if self._has_aggregates_for_day( existing_link_aggregates_in_object_storage, day_to_fix ) or self._has_link_events_for_day(day_to_fix): + logger.warning( + "Organisation already has aggregates or link events for day." 
+ ) return # otherwise, attempt re-aggregation with transaction.atomic(): From c5c458ada8a73c028e4ec44e623412453c48b439 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 7/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../commands/fill_pageproject_aggregates.py | 4 +- .../commands/fill_user_aggregates.py | 4 +- .../commands/reaggregate_link_archives.py | 321 +++++- extlinks/aggregates/tests.py | 1007 ++++++++++++++++- 4 files changed, 1291 insertions(+), 45 deletions(-) diff --git a/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py b/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py index 9014d14a..a05f8cee 100644 --- a/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py +++ b/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py @@ -9,7 +9,7 @@ from django.db.models.fields import DateField from ...models import PageProjectAggregate -from extlinks.links.models import LinkEvent +from extlinks.links.models import LinkEvent, URLPattern from extlinks.organisations.models import Collection logger = logging.getLogger("django") @@ -118,6 +118,8 @@ def _process_single_collection(self, link_event_filter, collection): None """ url_patterns = collection.get_url_patterns() + if len(url_patterns) == 0: + url_patterns = URLPattern.objects.filter(collection=collection).all() for url_pattern in url_patterns: link_events_with_annotated_timestamp = url_pattern.link_events.annotate( timestamp_date=Cast("timestamp", DateField()) diff --git a/extlinks/aggregates/management/commands/fill_user_aggregates.py b/extlinks/aggregates/management/commands/fill_user_aggregates.py index e2b99a9e..d4c324e6 100644 --- a/extlinks/aggregates/management/commands/fill_user_aggregates.py +++ b/extlinks/aggregates/management/commands/fill_user_aggregates.py @@ -8,7 +8,7 @@ from django.db.models.fields import DateField from ...models import UserAggregate -from extlinks.links.models import LinkEvent +from extlinks.links.models import LinkEvent, URLPattern from extlinks.organisations.models import Collection @@ -117,6 +117,8 @@ def _process_single_collection(self, link_event_filter, collection): None """ url_patterns = collection.get_url_patterns() + if len(url_patterns) == 0: + url_patterns = URLPattern.objects.filter(collection=collection).all() for url_pattern in url_patterns: link_events_with_annotated_timestamp = url_pattern.link_events.annotate( timestamp_date=Cast("timestamp", DateField()) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py index 98fa5549..63b64a60 100644 --- a/extlinks/aggregates/management/commands/reaggregate_link_archives.py +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -6,19 +6,21 @@ from django.db import transaction -from extlinks.aggregates.models import LinkAggregate +from extlinks.aggregates.models import ( + LinkAggregate, + PageProjectAggregate, + UserAggregate, +) from extlinks.common import swift from extlinks.common.management.commands import BaseCommand from extlinks.links.models import URLPattern, LinkEvent -from extlinks.organisations.models import Organisation +from 
extlinks.organisations.models import Organisation, User logger = logging.getLogger("django") class Command(BaseCommand): - help = ( - "Loads, parses, and fixes daily or monthly link aggregates for a given organisation. " - ) + help = "Loads, parses, and fixes daily or monthly aggregates for a given organisation. " def add_arguments(self, parser): parser.add_argument( @@ -33,7 +35,7 @@ def add_arguments(self, parser): ) parser.add_argument( "--organisation", - help="The organisation id to fix link aggregates for.", + help="The organisation id to fix aggregates for.", type=str, ) parser.add_argument( @@ -48,10 +50,14 @@ def _handle(self, *args, **options): collections = organisation.collection_set.all() if not month_to_fix and not day_to_fix: - logger.warning("Please provide a month (e.g. 202509) or day (e.g. 20250920 ) to fix.") + logger.warning( + "Please provide a month (e.g. 202509) or day (e.g. 20250920 ) to fix." + ) return if month_to_fix and day_to_fix: - logger.warning("Please only provide a month (e.g. 202509) or a day (e.g. 20250920 ) to fix-not both.") + logger.warning( + "Please only provide a month (e.g. 202509) or a day (e.g. 20250920 ) to fix-not both." + ) return if not directory: logger.warning("Please provide a directory from which to parse archives.") @@ -252,6 +258,130 @@ def _fill_daily_aggregate(self, collection, link_event): None """ change_number = link_event["fields"]["change"] + self._fill_daily_pageproject_aggregates(change_number, collection, link_event) + self._fill_daily_user_aggregate(change_number, collection, link_event) + self._fill_daily_link_aggregate(change_number, collection, link_event) + + def _fill_daily_pageproject_aggregates(self, change_number, collection, link_event): + """ + This function updates or creates a daily PageProjectAggregate for a collection and a parsed JSON (LinkEvent). + Parameters + ---------- + change_number : int + + collection : Collection + + link_event : obj + + Returns + ------- + None + """ + existing_pageproject_aggregate = PageProjectAggregate.objects.filter( + organisation=collection.organisation, + collection=collection, + page_name=link_event['fields']["page_title"], + project_name=link_event['fields']["domain"], + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + on_user_list=link_event['fields']["on_user_list"], + ).exclude(day=0)[:1].all() + existing_pageproject_aggregate = ( + existing_pageproject_aggregate[0] if len(existing_pageproject_aggregate) > 0 else None + ) + if existing_pageproject_aggregate: + if change_number == 0: + existing_pageproject_aggregate.total_links_removed += 1 + else: + existing_pageproject_aggregate.total_links_added += 1 + existing_pageproject_aggregate.save() + else: + # Create a new page project aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + PageProjectAggregate.objects.get_or_create( + organisation=collection.organisation, + collection=collection, + page_name=link_event['fields']["page_title"], + project_name=link_event['fields']["domain"], + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event['fields']["on_user_list"], + ) + + def _fill_daily_user_aggregate(self, change_number, collection, link_event): + """ + This function updates or creates a daily UserAggregate for a collection and a parsed JSON (LinkEvent). 
+ Parameters + ---------- + change_number : int + + collection : Collection + + link_event : obj + + Returns + ------- + None + """ + try: + user_retrieved = User.objects.get(pk=link_event["fields"]["user_id"]) + except User.DoesNotExist: + return + exisiting_user_aggregate = ( + UserAggregate.objects.filter( + organisation=collection.organisation, + collection=collection, + username=user_retrieved.username, + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + on_user_list=link_event["fields"]["on_user_list"], + ) + .exclude(day=0) + .first() + ) + if exisiting_user_aggregate: + if change_number == 0: + exisiting_user_aggregate.total_links_removed += 1 + else: + exisiting_user_aggregate.total_links_added += 1 + exisiting_user_aggregate.save() + else: + # Create a new link aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + UserAggregate.objects.create( + organisation=collection.organisation, + collection=collection, + username=user_retrieved.username, + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event["fields"]["on_user_list"], + ) + + def _fill_daily_link_aggregate(self, change_number, collection, link_event): + """ + This function updates or creates a daily LinkAggregate for a collection and a parsed JSON (LinkEvent). + Parameters + ---------- + change_number : int + + collection : Collection + + link_event : obj + + Returns + ------- + None + """ existing_link_aggregate = ( LinkAggregate.objects.filter( organisation=collection.organisation.id, @@ -378,16 +508,124 @@ def _process_monthly_events( total_added = sum(1 for i in events if i["fields"]["change"] == 1) total_removed = sum(1 for i in events if i["fields"]["change"] == 0) - existing_aggregate = LinkAggregate.objects.filter( + # set of tuples that consist of (page_title, domain) for a group of link events + page_projects = list( + set([(i["fields"]["page_title"], i["fields"]["domain"]) for i in events]) + ) + # set of user ids to fill user aggregates for + users = list(set([i["fields"]["user_id"] for i in events])) + try: + for page_project in page_projects: + self._fill_monthly_page_project_aggregates(collection, events, last_day_of_month, on_user_list_flag, + page_project) + for user in users: + self._fill_monthly_user_aggregates( + collection, last_day_of_month, link_events, on_user_list_flag, user + ) + self._fill_monthly_link_aggregates( + collection, + last_day_of_month, + on_user_list_flag, + organisation, + total_added, + total_removed, + ) + except Exception as e: + print(e) + + + def _fill_monthly_page_project_aggregates(self, collection, events, last_day_of_month, on_user_list_flag, + page_project): + """ + This function updates or creates monthly PageProjectAggregate for collection and a parsed array of JSON(LinkEvents). 
+ Parameters + ---------- + collection : Collection + + events : an array of JSON link_events parsed from archives + + last_day_of_month: date + + on_user_list_flag: bool + + page_project: tuple(str, str) + + Returns + ------- + None + """ + events_for_page_project = [ + i + for i in events + if i["fields"]["page_title"] == page_project[0] + and i["fields"]["domain"] == page_project[1] + ] + total_added_page_project = sum(1 for i in events_for_page_project if i["fields"]["change"] == 1) + total_removed_page_project = sum(1 for i in events_for_page_project if i["fields"]["change"] == 0) + existing_page_project_aggregate = PageProjectAggregate.objects.filter( + organisation=collection.organisation, + collection=collection, + page_name=page_project[0], + project_name=page_project[1], + day=0, + full_date=last_day_of_month, + on_user_list=on_user_list_flag, + )[:1].all() + if existing_page_project_aggregate: + existing_page_project_aggregate.total_links_added = total_added_page_project + existing_page_project_aggregate.total_links_removed = total_removed_page_project + existing_page_project_aggregate.save() + else: + PageProjectAggregate.objects.get_or_create( + organisation=collection.organisation, + collection=collection, + page_name=page_project[0], + project_name=page_project[1], + full_date=last_day_of_month, + day=0, + total_links_added=total_added_page_project, + total_links_removed=total_removed_page_project, + on_user_list=on_user_list_flag, + ) + + def _fill_monthly_link_aggregates( + self, + collection, + last_day_of_month, + on_user_list_flag, + organisation, + total_added, + total_removed, + ): + """ + This function updates or creates monthly LinkAggregate for collection. + Parameters + ---------- + collection : Collection + + last_day_of_month: date + + on_user_list_flag: bool + + organisation: Organisation + + total_added: int + + total_removed: int + + Returns + ------- + None + """ + existing_link_aggregate = LinkAggregate.objects.filter( organisation_id=organisation.id, collection_id=collection.id, on_user_list=on_user_list_flag, full_date=last_day_of_month, day=0, ) - - if existing_aggregate.exists(): - existing_aggregate.update( + if existing_link_aggregate.exists(): + existing_link_aggregate.update( total_links_added=total_added, total_links_removed=total_removed, ) @@ -401,6 +639,65 @@ def _process_monthly_events( total_links_added=total_added, total_links_removed=total_removed, ) + def _fill_monthly_user_aggregates( + self, collection, last_day_of_month, link_events, on_user_list_flag, user + ): + """ + This function updates or creates monthly UserAggregate for user and collection. 
+ Parameters + ---------- + collection : Collection + + last_day_of_month: date + + link_events: an array of JSON link_events parsed from archives + + on_user_list_flag: bool + + user : User + + Returns + ------- + None + """ + try: + user_retrieved = User.objects.get(pk=user) + except User.DoesNotExist: + return + events_for_user = [i for i in link_events if i["fields"]["user_id"] is user] + total_added_by_user = sum( + 1 for i in events_for_user if i["fields"]["change"] == 1 + ) + total_removed_by_user = sum( + 1 for i in events_for_user if i["fields"]["change"] == 0 + ) + exisiting_user_aggregate = ( + UserAggregate.objects.filter( + organisation_id=collection.organisation.id, + collection_id=collection.id, + username=user_retrieved.username, + full_date=last_day_of_month, + day=0, + on_user_list=on_user_list_flag, + ) + .first() + ) + if exisiting_user_aggregate: + exisiting_user_aggregate.total_links_added = total_added_by_user + exisiting_user_aggregate.total_links_removed = total_removed_by_user + exisiting_user_aggregate.save() + else: + # Create a new link aggregate + UserAggregate.objects.create( + organisation_id=collection.organisation.id, + collection_id=collection.id, + username=user_retrieved.username, + full_date=last_day_of_month, + day=0, + total_links_added=total_added_by_user, + total_links_removed=total_removed_by_user, + on_user_list=on_user_list_flag, + ) def _get_link_events_for_day( self, collection_url: str, events_split_by_url_pattern, day: int diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index e40474a0..798828cf 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2139,6 +2139,7 @@ def setUp(self): self.organisation = OrganisationFactory(name="ACME Org") self.collection = CollectionFactory(organisation=self.organisation) self.user = UserFactory() + self.user2 = UserFactory() self.url = URLPatternFactory(url="www.test.com") self.url.collection = self.collection self.url.save() @@ -2267,16 +2268,27 @@ def test_reaggregate_link_archives_monthly(self, mock_swift_connection): "--dir", temp_dir, ) - monthly_aggregate = LinkAggregate.objects.all().first() - # assert only one monthly aggregates created for on_user_list=True + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates = PageProjectAggregate.objects.all().first() + # assert only one monthly aggregate created for on_user_list=True self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(1, LinkAggregate.objects.filter(day=0).count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) # assert daily aggregates were not created self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) # assert totals match expected totals - self.assertEqual(2, monthly_aggregate.total_links_added) - self.assertEqual(1, monthly_aggregate.total_links_removed) + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, 
monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, monthly_page_project_aggregates.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) @@ -2291,16 +2303,17 @@ def test_reaggregate_link_archives_monthly(self, mock_swift_connection): }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_multiple_projects(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-22"}]) + mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ { "model": "links.linkevent", @@ -2330,7 +2343,7 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co "fields": { "link": "https://www.test.com/", "timestamp": "2024-12-16T09:15:27.363Z", - "domain": "en.wikipedia.org", + "domain": "cy.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, "username": self.user.id, @@ -2374,7 +2387,7 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co "fields": { "link": "https://www.test.com/4", "timestamp": "2024-12-15T09:15:27.363Z", - "domain": "en.wikipedia.org", + "domain": "de.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, "username": self.user.id, @@ -2405,14 +2418,39 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co "--dir", temp_dir, ) - # assert no daily or monthly aggregates created - self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates_en = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() + monthly_page_project_aggregates_de = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() + monthly_page_project_aggregates_cy = PageProjectAggregate.objects.filter(project_name="cy.wikipedia.org").first() + + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(3, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, 
monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_de.total_links_removed) + self.assertEqual(0, monthly_page_project_aggregates_de.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_en.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_en.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_cy.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_cy.total_links_added) + finally: for file in glob.glob(archive_path): os.remove(file) - @mock.patch.dict( os.environ, { @@ -2422,16 +2460,17 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_multiple_pages(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[]) + mock_conn.get_container.return_value = ({}, []) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ { "model": "links.linkevent", @@ -2489,7 +2528,7 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, - "page_title": "test", + "page_title": "test2", "page_namespace": 0, "event_id": "", "user_is_bot": False, @@ -2526,10 +2565,6 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m with gzip.open(archive_path, "wt", encoding="utf-8") as f: json.dump(json_data, f) - - # create link events - call_command("loaddata", archive_path) - try: call_command( "reaggregate_link_archives", @@ -2540,9 +2575,31 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m "--dir", temp_dir, ) - # assert no daily or monthly aggregates created - self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() + monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, 
monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) + finally: for file in glob.glob(archive_path): os.remove(file) @@ -2557,16 +2614,168 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_multiple_users(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, - [], + [{"name": "archive-aggregates-backup-2024-12-22"}], ) mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) + + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_link_aggregate = 
LinkAggregate.objects.all().first() + monthly_user_aggregates_1 = UserAggregate.objects.filter(username=self.user.username).first() + monthly_user_aggregates_2 = UserAggregate.objects.filter(username=self.user2.username).first() + monthly_page_project_aggregates = PageProjectAggregate.objects.all().first() + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + self.assertEqual(2, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_user_aggregates_1.total_links_added) + self.assertEqual(0, monthly_user_aggregates_1.total_links_removed) + self.assertEqual(1, monthly_user_aggregates_2.total_links_added) + self.assertEqual(1, monthly_user_aggregates_2.total_links_removed) + self.assertEqual(2, monthly_page_project_aggregates.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-22"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) json_data = [ { "model": "links.linkevent", @@ -2658,6 +2867,725 @@ def test_reaggregate_link_archives_daily(self, mock_swift_connection): }, ] + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], 
+ ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + + # create link events + call_command("loaddata", archive_path) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": 
"links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + daily_pageproject_aggregate = PageProjectAggregate.objects.all().first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(1, 
daily_pageproject_aggregate.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "cy.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "de.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + daily_pageproject_aggregate1 = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() + daily_pageproject_aggregate2 = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() + # assert no monthly aggregates created + 
self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_pageproject_aggregate1.total_links_added) + self.assertEqual(0, daily_pageproject_aggregate1.total_links_removed) + self.assertEqual(0, daily_pageproject_aggregate2.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate2.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test2", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + 
"fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() + monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(0, monthly_page_project_aggregates_page_1.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.username, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + 
"change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.username, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + with gzip.open(archive_path, "wt", encoding="utf-8") as f: json.dump(json_data, f) @@ -2671,16 +3599,27 @@ def test_reaggregate_link_archives_daily(self, mock_swift_connection): "--dir", temp_dir, ) - daily_aggregate = LinkAggregate.objects.all().first() + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.filter(username=self.user.username).first() + daily_user_aggregate2 = UserAggregate.objects.filter(username=self.user2.username).first() # assert no monthly aggregates created self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(0, LinkAggregate.objects.filter(day=0).count()) + self.assertEqual(2, UserAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) # assert daily aggregates were created for the correct day self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(2, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) # assert totals match expected totals - self.assertEqual(1, daily_aggregate.total_links_added) - self.assertEqual(1, daily_aggregate.total_links_removed) + self.assertEqual(2, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(0, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate2.total_links_added) + self.assertEqual(1, daily_user_aggregate2.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) @@ -2809,6 +3748,8 @@ def 
test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_conn temp_dir, ) self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) @@ -2940,6 +3881,8 @@ def test_reaggregate_link_archives_daily_skips_if_linkevents_for_day(self, mock_ temp_dir, ) self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) @@ -3186,7 +4129,8 @@ def test_reaggregate_link_archives_only_link_event_archives(self, mock_swift_con temp_dir, ) self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) @@ -3293,7 +4237,8 @@ def test_reaggregate_link_archives_only_in_correct_zipped_format(self, mock_swif temp_dir, ) self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, LinkEvent.objects.filter(object_id=self.url.id).count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) From 606cb601b898b9806a41e6a6c357555e74f8e845 Mon Sep 17 00:00:00 2001 From: Kgraessle Date: Thu, 18 Sep 2025 14:14:40 -0500 Subject: [PATCH 8/8] Backfill missing Wall Street Journal aggregate data - Created a command to fix a given orgs monthly/daily aggregates using already existing archived data - This fixes an issue where we have archived link events before they were aggregated Bug: T404879 Change-Id: Ia9805aee9f7dc9a707df3b50847f34bd07401b6c --- .../commands/reaggregate_link_archives.py | 12 +- extlinks/aggregates/tests.py | 726 +++++++++++++++--- 2 files changed, 632 insertions(+), 106 deletions(-) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py index 63b64a60..736ee664 100644 --- a/extlinks/aggregates/management/commands/reaggregate_link_archives.py +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -78,8 +78,8 @@ def _handle(self, *args, **options): logger.info("Swift credentials not provided. Skipping.") return False - # get existing archives to ensure we have not already aggregated - existing_link_aggregates_in_object_storage = self._get_existing_link_aggregates( + # get existing aggregates to ensure we have not already aggregated for the given timeframe + existing_aggregates = self._get_existing_aggregates( conn ) # get all URLPatterns for an organisation @@ -91,7 +91,7 @@ def _handle(self, *args, **options): # if we already have aggregates for this month uploaded, don't try to re-aggregate # or if we have not archived all events for the given timeframe, don't try to re-aggregate if self._has_aggregates_for_month( - existing_link_aggregates_in_object_storage, month_to_fix + existing_aggregates, month_to_fix ) or self._has_link_events_for_month(first_day_of_month, last_day_of_month): logger.warning( "Organisation already has aggregates or link events for month." 
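The hunks above and below widen the "already aggregated" skip check from link aggregates alone to every aggregate dump, by listing Swift objects under the shorter "aggregates_" prefix instead of "aggregates_linkaggregate_". As a minimal sketch only (the command's real month check is not shown in this patch), the helper below illustrates how such a check could be written against the object names used in the tests that follow, e.g. "aggregates_useraggregate_100_10__2024-12-22"; the function name and the date-parsing approach are assumptions for illustration, not the actual implementation.

from datetime import datetime

def _has_aggregates_for_month_sketch(existing_aggregates, month_to_fix):
    # Hypothetical helper, for illustration only. Object names are assumed to
    # end in a YYYY-MM-DD date, as in the test fixtures below
    # (e.g. "aggregates_linkaggregate_100_10_2024-12-22").
    month_prefix = datetime.strptime(month_to_fix, "%Y%m").strftime("%Y-%m")
    return any(
        name.rsplit("_", 1)[-1].startswith(month_prefix)
        for name in existing_aggregates
    )

# e.g. _has_aggregates_for_month_sketch(
#     ["aggregates_pageprojectaggregate_2024-12-22"], "202412")  -> True

Under this assumption, any uploaded link, user, or page-project dump dated within the requested month causes the re-aggregation to be skipped, which is the behaviour the new "skips_if_uploaded_*" tests below exercise.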
@@ -106,7 +106,7 @@ def _handle(self, *args, **options): # if we already have aggregates for this day uploaded, don't try to re-aggregate # or if we have not archived all events for the given timeframe, don't try to re-aggregate if self._has_aggregates_for_day( - existing_link_aggregates_in_object_storage, day_to_fix + existing_aggregates, day_to_fix ) or self._has_link_events_for_day(day_to_fix): logger.warning( "Organisation already has aggregates or link events for day." @@ -118,7 +118,7 @@ def _handle(self, *args, **options): collections, day_to_fix, directory, url_patterns ) - def _get_existing_link_aggregates(self, conn): + def _get_existing_aggregates(self, conn): """ This function gets existing link aggregates from object storage. Parameters @@ -135,7 +135,7 @@ def _get_existing_link_aggregates(self, conn): for i in swift.get_object_list( conn, os.environ.get("SWIFT_CONTAINER_AGGREGATES", "archive-aggregates"), - "aggregates_linkaggregate_", + "aggregates_", ) ] return existing_link_aggregates_in_object_storage diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 798828cf..00d7d4d2 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -2766,13 +2766,13 @@ def test_reaggregate_link_archives_monthly_multiple_users(self, mock_swift_conne }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_skips_if_uploaded_link_aggregates(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-22"}]) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_linkaggregate_100_10_2024-12-22"}]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -2898,13 +2898,13 @@ def test_reaggregate_link_archives_monthly_skips_if_uploaded(self, mock_swift_co }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_skips_if_uploaded_user_aggregates(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[]) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_useraggregate_100_10__2024-12-22"}]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -3002,10 +3002,6 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m with gzip.open(archive_path, "wt", encoding="utf-8") as f: json.dump(json_data, f) - - # create link events - call_command("loaddata", archive_path) - try: call_command( "reaggregate_link_archives", @@ -3034,13 +3030,13 @@ def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, m }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_skips_if_uploaded_pageproject_aggregates(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value 
mock_conn.get_account.return_value = ( {}, - [], + [{"name": "archive-aggregates-backup-2024-12-22"}], ) - mock_conn.get_container.return_value = ({},[]) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_pageprojectaggregate_2024-12-22"}]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -3141,34 +3137,17 @@ def test_reaggregate_link_archives_daily(self, mock_swift_connection): try: call_command( "reaggregate_link_archives", - "--day", - "20241215", + "--month", + "202412", "--organisation", self.organisation.id, "--dir", temp_dir, ) - daily_link_aggregate = LinkAggregate.objects.all().first() - daily_user_aggregate = UserAggregate.objects.all().first() - daily_pageproject_aggregate = PageProjectAggregate.objects.all().first() - # assert no monthly aggregates created - self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(1, UserAggregate.objects.count()) - self.assertEqual(1, PageProjectAggregate.objects.count()) - # assert daily aggregates were created for the correct day - self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) - self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) - self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) - self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) - self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) - self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) - # assert totals match expected totals - self.assertEqual(1, daily_link_aggregate.total_links_added) - self.assertEqual(1, daily_link_aggregate.total_links_removed) - self.assertEqual(1, daily_user_aggregate.total_links_added) - self.assertEqual(1, daily_user_aggregate.total_links_removed) - self.assertEqual(1, daily_pageproject_aggregate.total_links_added) - self.assertEqual(1, daily_pageproject_aggregate.total_links_removed) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) @@ -3183,11 +3162,11 @@ def test_reaggregate_link_archives_daily(self, mock_swift_connection): }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_connection): + def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, - [], + [{"name": "archive-aggregates-backup-2024-12-22"}], ) mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() @@ -3222,7 +3201,7 @@ def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_conn "fields": { "link": "https://www.test.com/", "timestamp": "2024-12-16T09:15:27.363Z", - "domain": "cy.wikipedia.org", + "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, "username": self.user.id, @@ -3266,7 +3245,7 @@ def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_conn "fields": { "link": "https://www.test.com/4", "timestamp": "2024-12-15T09:15:27.363Z", - "domain": "de.wikipedia.org", + "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, "username": 
self.user.id, @@ -3287,44 +3266,29 @@ def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_conn with gzip.open(archive_path, "wt", encoding="utf-8") as f: json.dump(json_data, f) + + # create link events + call_command("loaddata", archive_path) + try: call_command( "reaggregate_link_archives", - "--day", - "20241215", + "--month", + "202412", "--organisation", self.organisation.id, "--dir", temp_dir, ) - daily_link_aggregate = LinkAggregate.objects.all().first() - daily_user_aggregate = UserAggregate.objects.all().first() - daily_pageproject_aggregate1 = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() - daily_pageproject_aggregate2 = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() - # assert no monthly aggregates created - self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(1, UserAggregate.objects.count()) - self.assertEqual(2, PageProjectAggregate.objects.count()) - # assert daily aggregates were created for the correct day - self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) - self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) - self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) - self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) - self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) - self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) - # assert totals match expected totals - self.assertEqual(1, daily_link_aggregate.total_links_added) - self.assertEqual(1, daily_link_aggregate.total_links_removed) - self.assertEqual(1, daily_user_aggregate.total_links_added) - self.assertEqual(1, daily_user_aggregate.total_links_removed) - self.assertEqual(1, daily_pageproject_aggregate1.total_links_added) - self.assertEqual(0, daily_pageproject_aggregate1.total_links_removed) - self.assertEqual(0, daily_pageproject_aggregate2.total_links_added) - self.assertEqual(1, daily_pageproject_aggregate2.total_links_removed) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) finally: for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( os.environ, { @@ -3334,7 +3298,7 @@ def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_conn }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connection): + def test_reaggregate_link_archives_daily(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, @@ -3401,7 +3365,7 @@ def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connect "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, - "page_title": "test2", + "page_title": "test", "page_namespace": 0, "event_id": "", "user_is_bot": False, @@ -3450,32 +3414,30 @@ def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connect ) daily_link_aggregate = LinkAggregate.objects.all().first() daily_user_aggregate = UserAggregate.objects.all().first() - monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() - monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + daily_pageproject_aggregate = PageProjectAggregate.objects.all().first() # assert no 
monthly aggregates created self.assertEqual(1, LinkAggregate.objects.count()) self.assertEqual(1, UserAggregate.objects.count()) - self.assertEqual(2, PageProjectAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) # assert daily aggregates were created for the correct day self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) - self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) # assert totals match expected totals self.assertEqual(1, daily_link_aggregate.total_links_added) self.assertEqual(1, daily_link_aggregate.total_links_removed) self.assertEqual(1, daily_user_aggregate.total_links_added) self.assertEqual(1, daily_user_aggregate.total_links_removed) - self.assertEqual(0, monthly_page_project_aggregates_page_1.total_links_added) - self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) - self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) - self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) + self.assertEqual(1, daily_pageproject_aggregate.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( os.environ, { @@ -3485,7 +3447,7 @@ def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connect }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connection): + def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, @@ -3505,7 +3467,7 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, - "username": self.user.username, + "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, "page_title": "test", @@ -3523,11 +3485,11 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect "pk": 2, "fields": { "link": "https://www.test.com/", - "timestamp": "2024-12-15T09:15:27.363Z", - "domain": "en.wikipedia.org", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "cy.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, - "username": self.user.username, + "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, "page_title": "test", @@ -3549,9 +3511,9 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect "domain": "en.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, - "username": self.user2.username, + "username": self.user.id, "rev_id": 485489, - "user_id": self.user2.id, + "user_id": self.user.id, "page_title": "test", "page_namespace": 0, "event_id": "", @@ -3568,12 +3530,12 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect "fields": { "link": "https://www.test.com/4", "timestamp": "2024-12-15T09:15:27.363Z", - 
"domain": "en.wikipedia.org", + "domain": "de.wikipedia.org", "content_type": ContentType.objects.get_for_model(URLPattern).id, "object_id": self.url.id, - "username": self.user2.username, + "username": self.user.id, "rev_id": 485489, - "user_id": self.user2.id, + "user_id": self.user.id, "page_title": "test", "page_namespace": 0, "event_id": "", @@ -3600,26 +3562,29 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect temp_dir, ) daily_link_aggregate = LinkAggregate.objects.all().first() - daily_user_aggregate = UserAggregate.objects.filter(username=self.user.username).first() - daily_user_aggregate2 = UserAggregate.objects.filter(username=self.user2.username).first() + daily_user_aggregate = UserAggregate.objects.all().first() + daily_pageproject_aggregate1 = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() + daily_pageproject_aggregate2 = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() # assert no monthly aggregates created self.assertEqual(1, LinkAggregate.objects.count()) - self.assertEqual(2, UserAggregate.objects.count()) - self.assertEqual(1, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) # assert daily aggregates were created for the correct day self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) - self.assertEqual(2, UserAggregate.objects.filter(day=15).count()) - self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) # assert totals match expected totals - self.assertEqual(2, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_added) self.assertEqual(1, daily_link_aggregate.total_links_removed) self.assertEqual(1, daily_user_aggregate.total_links_added) - self.assertEqual(0, daily_user_aggregate.total_links_removed) - self.assertEqual(1, daily_user_aggregate2.total_links_added) - self.assertEqual(1, daily_user_aggregate2.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_pageproject_aggregate1.total_links_added) + self.assertEqual(0, daily_pageproject_aggregate1.total_links_removed) + self.assertEqual(0, daily_pageproject_aggregate2.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate2.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) @@ -3633,13 +3598,13 @@ def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connect }, ) @mock.patch("swiftclient.Connection") - def test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_connection): + def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connection): mock_conn = mock_swift_connection.return_value mock_conn.get_account.return_value = ( {}, [], ) - mock_conn.get_container.return_value = ({},[{"name": "archive-aggregates-backup-2024-12-15"}]) + mock_conn.get_container.return_value = ({},[]) temp_dir = tempfile.gettempdir() archive_filename = "links_linkevent_20241222_0.json.gz" archive_path = os.path.join(temp_dir, archive_filename) @@ -3700,7 +3665,7 @@ def 
test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_conn "username": self.user.id, "rev_id": 485489, "user_id": self.user.id, - "page_title": "test", + "page_title": "test2", "page_namespace": 0, "event_id": "", "user_is_bot": False, @@ -3747,13 +3712,574 @@ def test_reaggregate_link_archives_daily_skips_if_uploaded(self, mock_swift_conn "--dir", temp_dir, ) - self.assertEqual(0, LinkAggregate.objects.count()) - self.assertEqual(0, UserAggregate.objects.count()) - self.assertEqual(0, PageProjectAggregate.objects.count()) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() + monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(0, monthly_page_project_aggregates_page_1.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) finally: for file in glob.glob(archive_path): os.remove(file) + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.username, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": 
"2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.username, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.filter(username=self.user.username).first() + daily_user_aggregate2 = UserAggregate.objects.filter(username=self.user2.username).first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(2, UserAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(2, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(0, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate2.total_links_added) + self.assertEqual(1, daily_user_aggregate2.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded_link_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + 
mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_linkaggregate_100_10_2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded_user_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_useraggregate_100_10__2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path 
= os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded_pageproject_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_pageprojectaggregate_100_10__2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + 
"content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( os.environ, {