diff --git a/extlinks/aggregates/management/commands/fill_link_aggregates.py b/extlinks/aggregates/management/commands/fill_link_aggregates.py index 7b9faff3..710d1245 100644 --- a/extlinks/aggregates/management/commands/fill_link_aggregates.py +++ b/extlinks/aggregates/management/commands/fill_link_aggregates.py @@ -8,7 +8,7 @@ from django.db.models.fields import DateField from ...models import LinkAggregate -from extlinks.links.models import LinkEvent +from extlinks.links.models import LinkEvent, URLPattern from extlinks.organisations.models import Collection @@ -117,6 +117,8 @@ def _process_single_collection(self, link_event_filter, collection): None """ url_patterns = collection.get_url_patterns() + if len(url_patterns) == 0: + url_patterns = URLPattern.objects.filter(collection=collection).all() for url_pattern in url_patterns: link_events_with_annotated_timestamp = url_pattern.link_events.annotate( timestamp_date=Cast("timestamp", DateField()) diff --git a/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py b/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py index 9014d14a..a05f8cee 100644 --- a/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py +++ b/extlinks/aggregates/management/commands/fill_pageproject_aggregates.py @@ -9,7 +9,7 @@ from django.db.models.fields import DateField from ...models import PageProjectAggregate -from extlinks.links.models import LinkEvent +from extlinks.links.models import LinkEvent, URLPattern from extlinks.organisations.models import Collection logger = logging.getLogger("django") @@ -118,6 +118,8 @@ def _process_single_collection(self, link_event_filter, collection): None """ url_patterns = collection.get_url_patterns() + if len(url_patterns) == 0: + url_patterns = URLPattern.objects.filter(collection=collection).all() for url_pattern in url_patterns: link_events_with_annotated_timestamp = url_pattern.link_events.annotate( timestamp_date=Cast("timestamp", DateField()) diff --git a/extlinks/aggregates/management/commands/fill_user_aggregates.py b/extlinks/aggregates/management/commands/fill_user_aggregates.py index e2b99a9e..d4c324e6 100644 --- a/extlinks/aggregates/management/commands/fill_user_aggregates.py +++ b/extlinks/aggregates/management/commands/fill_user_aggregates.py @@ -8,7 +8,7 @@ from django.db.models.fields import DateField from ...models import UserAggregate -from extlinks.links.models import LinkEvent +from extlinks.links.models import LinkEvent, URLPattern from extlinks.organisations.models import Collection @@ -117,6 +117,8 @@ def _process_single_collection(self, link_event_filter, collection): None """ url_patterns = collection.get_url_patterns() + if len(url_patterns) == 0: + url_patterns = URLPattern.objects.filter(collection=collection).all() for url_pattern in url_patterns: link_events_with_annotated_timestamp = url_pattern.link_events.annotate( timestamp_date=Cast("timestamp", DateField()) diff --git a/extlinks/aggregates/management/commands/reaggregate_link_archives.py b/extlinks/aggregates/management/commands/reaggregate_link_archives.py new file mode 100644 index 00000000..736ee664 --- /dev/null +++ b/extlinks/aggregates/management/commands/reaggregate_link_archives.py @@ -0,0 +1,791 @@ +import gzip +import json +import os +import logging +from datetime import datetime, timedelta, date + +from django.db import transaction + +from extlinks.aggregates.models import ( + LinkAggregate, + PageProjectAggregate, + UserAggregate, +) +from extlinks.common import swift +from 
extlinks.common.management.commands import BaseCommand
+from extlinks.links.models import URLPattern, LinkEvent
+from extlinks.organisations.models import Organisation, User
+
+logger = logging.getLogger("django")
+
+
+class Command(BaseCommand):
+    help = "Loads, parses, and fixes daily or monthly aggregates for a given organisation."
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--month",
+            help="If provided, will fix a monthly aggregate. The date (YYYYMM) of the monthly archive to be fixed.",
+            type=str,
+        )
+        parser.add_argument(
+            "--day",
+            help="If provided, will fix a daily aggregate. The date (YYYYMMDD) of the daily archive to be fixed.",
+            type=str,
+        )
+        parser.add_argument(
+            "--organisation",
+            help="The organisation id to fix aggregates for.",
+            type=str,
+        )
+        parser.add_argument(
+            "--dir", help="The directory from which to parse archives.", type=str
+        )
+
+    def _handle(self, *args, **options):
+        directory = options["dir"]
+        month_to_fix = options["month"]
+        day_to_fix = options["day"]
+        organisation = Organisation.objects.filter(id=options["organisation"]).first()
+
+        if not month_to_fix and not day_to_fix:
+            logger.warning(
+                "Please provide a month (e.g. 202509) or day (e.g. 20250920) to fix."
+            )
+            return
+        if month_to_fix and day_to_fix:
+            logger.warning(
+                "Please only provide a month (e.g. 202509) or a day (e.g. 20250920) to fix, not both."
+            )
+            return
+        if not directory:
+            logger.warning("Please provide a directory from which to parse archives.")
+            return
+        if not organisation:
+            logger.warning(
+                "Please provide an organisation for which to parse archives."
+            )
+            return
+        collections = organisation.collection_set.all()
+        if not collections:
+            logger.warning(
+                "Please provide an organisation with collections for which to fix archives."
+            )
+            return
+        try:
+            conn = swift.swift_connection()
+        except RuntimeError:
+            logger.info("Swift credentials not provided. Skipping.")
+            return False
+
+        # get existing aggregates to ensure we have not already aggregated for the given timeframe
+        existing_aggregates = self._get_existing_aggregates(conn)
+        # get all URLPatterns for an organisation
+        url_patterns = URLPattern.objects.filter(collection__in=collections)
+
+        if month_to_fix:
+            first_day_of_month = self._get_first_day_of_month(month_to_fix)
+            last_day_of_month = self._get_last_day_of_month(first_day_of_month)
+            # if we already have aggregates for this month uploaded, don't try to re-aggregate
+            # or if we have not archived all events for the given timeframe, don't try to re-aggregate
+            if self._has_aggregates_for_month(
+                existing_aggregates, month_to_fix
+            ) or self._has_link_events_for_month(first_day_of_month, last_day_of_month):
+                logger.warning(
+                    "Organisation already has aggregates or link events for month."
+                )
+                return
+            # otherwise, attempt re-aggregation
+            with transaction.atomic():
+                self._process_monthly_aggregates(
+                    directory, month_to_fix, organisation, url_patterns, last_day_of_month
+                )
+        else:
+            # if we already have aggregates for this day uploaded, don't try to re-aggregate
+            # or if we have not archived all events for the given timeframe, don't try to re-aggregate
+            if self._has_aggregates_for_day(
+                existing_aggregates, day_to_fix
+            ) or self._has_link_events_for_day(day_to_fix):
+                logger.warning(
+                    "Organisation already has aggregates or link events for day."
+ ) + return + # otherwise, attempt re-aggregation + with transaction.atomic(): + self._process_daily_aggregates( + collections, day_to_fix, directory, url_patterns + ) + + def _get_existing_aggregates(self, conn): + """ + This function gets existing link aggregates from object storage. + Parameters + ---------- + conn : swiftclient.Connection + A connection to the Swift object storage. + + Returns + ------- + An array of existing link aggregates from object storage. + """ + existing_link_aggregates_in_object_storage = [ + i["name"] + for i in swift.get_object_list( + conn, + os.environ.get("SWIFT_CONTAINER_AGGREGATES", "archive-aggregates"), + "aggregates_", + ) + ] + return existing_link_aggregates_in_object_storage + + def _has_aggregates_for_month( + self, existing_link_aggregates_in_object_storage, month_to_fix + ): + """ + This function checks whether there are existing aggregates for the month to fix. + Parameters + ---------- + existing_link_aggregates_in_object_storage : An array of existing link aggregates from object storage. + + month_to_fix : str + + Returns + ------- + bool: whether there are existing aggregates for a given month in object storage + """ + return ( + len( + [ + i + for i in existing_link_aggregates_in_object_storage + if self._get_first_day_of_month(month_to_fix).strftime("%Y-%m") in i + ] + ) + > 0 + ) + + def _has_aggregates_for_day( + self, existing_link_aggregates_in_object_storage, day_to_fix + ): + """ + This function checks whether there are existing aggregates for the day to fix. + Parameters + ---------- + existing_link_aggregates_in_object_storage : An array of existing link aggregates from object storage. + + day_to_fix : str + + Returns + ------- + bool: whether there are existing aggregates for a given day in object storage + """ + day_to_fix_formatted = ( + datetime.fromisoformat(day_to_fix).date().strftime("%Y-%m-%d") + ) + return ( + len( + [ + i + for i in existing_link_aggregates_in_object_storage + if day_to_fix_formatted in i + ] + ) + > 0 + ) + + def _has_link_events_for_month(self, first_day_of_month, last_day_of_month): + return LinkEvent.objects.filter(timestamp__gte=first_day_of_month, timestamp__lte=last_day_of_month).count() > 0 + + def _has_link_events_for_day(self, day_to_fix): + day = datetime.fromisoformat(day_to_fix) + return LinkEvent.objects.filter(timestamp__gte=day, timestamp__lte=day + timedelta(days=1)).count() > 0 + + + def _process_daily_aggregates( + self, collections, day_to_fix, directory, url_patterns + ): + """ + This function loops through each url pattern and link event to fill the daily aggregates. 
+ Parameters + ---------- + collections : An array of collections + + day_to_fix : str + + directory : str + + url_patterns : An array of url patterns + + Returns + ------- + None + """ + # pull month string from day input parameter + month_to_fix = day_to_fix[:-2] + # load and split link events by url pattern + events_split_by_url_pattern = self._load_events_from_archives( + directory, month_to_fix, [i.url for i in url_patterns] + ) + # loop through each collection + for collection in collections: + collection_url_pattern_strings = [ + i.url for i in url_patterns.filter(collection=collection) + ] + # loop through each collection's URLPatterns + for collection_url_string in collection_url_pattern_strings: + # get the link events for the collection and day + for link_event in self._get_link_events_for_day( + collection_url_string, + events_split_by_url_pattern, + int(day_to_fix[-2:]), + ): + # create daily aggregates + self._fill_daily_aggregate(collection, link_event) + + def _fill_daily_aggregate(self, collection, link_event): + """ + This function updates or creates a daily LinkAggregate for a collection and a parsed JSON object(LinkEvent). + Parameters + ---------- + collection : Collection + + link_event : obj + + Returns + ------- + None + """ + change_number = link_event["fields"]["change"] + self._fill_daily_pageproject_aggregates(change_number, collection, link_event) + self._fill_daily_user_aggregate(change_number, collection, link_event) + self._fill_daily_link_aggregate(change_number, collection, link_event) + + def _fill_daily_pageproject_aggregates(self, change_number, collection, link_event): + """ + This function updates or creates a daily PageProjectAggregate for a collection and a parsed JSON (LinkEvent). + Parameters + ---------- + change_number : int + + collection : Collection + + link_event : obj + + Returns + ------- + None + """ + existing_pageproject_aggregate = PageProjectAggregate.objects.filter( + organisation=collection.organisation, + collection=collection, + page_name=link_event['fields']["page_title"], + project_name=link_event['fields']["domain"], + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + on_user_list=link_event['fields']["on_user_list"], + ).exclude(day=0)[:1].all() + existing_pageproject_aggregate = ( + existing_pageproject_aggregate[0] if len(existing_pageproject_aggregate) > 0 else None + ) + if existing_pageproject_aggregate: + if change_number == 0: + existing_pageproject_aggregate.total_links_removed += 1 + else: + existing_pageproject_aggregate.total_links_added += 1 + existing_pageproject_aggregate.save() + else: + # Create a new page project aggregate + links_added = change_number if change_number > 0 else 0 + links_removed = 1 if change_number == 0 else 0 + PageProjectAggregate.objects.get_or_create( + organisation=collection.organisation, + collection=collection, + page_name=link_event['fields']["page_title"], + project_name=link_event['fields']["domain"], + full_date=datetime.fromisoformat( + link_event["fields"]["timestamp"] + ).date(), + total_links_added=links_added, + total_links_removed=links_removed, + on_user_list=link_event['fields']["on_user_list"], + ) + + def _fill_daily_user_aggregate(self, change_number, collection, link_event): + """ + This function updates or creates a daily UserAggregate for a collection and a parsed JSON (LinkEvent). 
+        Parameters
+        ----------
+        change_number : int
+
+        collection : Collection
+
+        link_event : obj
+
+        Returns
+        -------
+        None
+        """
+        try:
+            user_retrieved = User.objects.get(pk=link_event["fields"]["user_id"])
+        except User.DoesNotExist:
+            return
+        existing_user_aggregate = (
+            UserAggregate.objects.filter(
+                organisation=collection.organisation,
+                collection=collection,
+                username=user_retrieved.username,
+                full_date=datetime.fromisoformat(
+                    link_event["fields"]["timestamp"]
+                ).date(),
+                on_user_list=link_event["fields"]["on_user_list"],
+            )
+            .exclude(day=0)
+            .first()
+        )
+        if existing_user_aggregate:
+            if change_number == 0:
+                existing_user_aggregate.total_links_removed += 1
+            else:
+                existing_user_aggregate.total_links_added += 1
+            existing_user_aggregate.save()
+        else:
+            # Create a new user aggregate
+            links_added = change_number if change_number > 0 else 0
+            links_removed = 1 if change_number == 0 else 0
+            UserAggregate.objects.create(
+                organisation=collection.organisation,
+                collection=collection,
+                username=user_retrieved.username,
+                full_date=datetime.fromisoformat(
+                    link_event["fields"]["timestamp"]
+                ).date(),
+                total_links_added=links_added,
+                total_links_removed=links_removed,
+                on_user_list=link_event["fields"]["on_user_list"],
+            )
+
+    def _fill_daily_link_aggregate(self, change_number, collection, link_event):
+        """
+        This function updates or creates a daily LinkAggregate for a collection and a parsed JSON (LinkEvent).
+        Parameters
+        ----------
+        change_number : int
+
+        collection : Collection
+
+        link_event : obj
+
+        Returns
+        -------
+        None
+        """
+        existing_link_aggregate = (
+            LinkAggregate.objects.filter(
+                organisation=collection.organisation.id,
+                collection=collection.id,
+                full_date=datetime.fromisoformat(
+                    link_event["fields"]["timestamp"]
+                ).date(),
+                on_user_list=link_event["fields"]["on_user_list"],
+            )
+            .exclude(day=0)
+            .first()
+        )
+        if existing_link_aggregate is None:
+            # Create a new link aggregate
+            links_added = change_number if change_number > 0 else 0
+            links_removed = 1 if change_number == 0 else 0
+            LinkAggregate.objects.create(
+                organisation=collection.organisation,
+                collection=collection,
+                full_date=datetime.fromisoformat(
+                    link_event["fields"]["timestamp"]
+                ).date(),
+                total_links_added=links_added,
+                total_links_removed=links_removed,
+                on_user_list=link_event["fields"]["on_user_list"],
+            )
+        else:
+            if change_number == 0:
+                existing_link_aggregate.total_links_removed += 1
+            else:
+                existing_link_aggregate.total_links_added += 1
+            existing_link_aggregate.save()
+
+    def _process_monthly_aggregates(
+        self, directory, month_to_fix, organisation, url_patterns, last_day_of_month
+    ):
+        """
+        This function loops through each URL pattern and its link events to fill the monthly aggregates.
+ Parameters + ---------- + directory : str + + month_to_fix : str + + organisation : Organisation + + url_patterns : An array of url patterns + + Returns + ------- + None + """ + # load and split link events by url pattern + events_split_by_url_pattern = self._load_events_from_archives( + directory, month_to_fix, [i.url for i in url_patterns] + ) + # get the first and last day of the month to fix + for url_pattern, link_events in events_split_by_url_pattern.items(): + # create monthly aggregates + self._fill_monthly_aggregate( + url_pattern, last_day_of_month, organisation, url_patterns, link_events + ) + + def _fill_monthly_aggregate( + self, url_pattern, last_day_of_month, organisation, url_patterns, link_events + ): + """ + This function fills monthly LinkAggregates for an organisation and a parsed JSON object(LinkEvent). + Parameters + ---------- + url_pattern : str + + last_day_of_month : date + + organisation : Organisation + + url_patterns : An array of url patterns + + link_events : an array of link event JSON objects + + Returns + ------- + None + """ + # find the collection associated with this url + collection = url_patterns.filter(url=url_pattern).first().collection + self._process_monthly_events( + True, link_events, collection, organisation, last_day_of_month + ) + self._process_monthly_events( + False, link_events, collection, organisation, last_day_of_month + ) + + def _process_monthly_events( + self, + on_user_list_flag, + link_events, + collection, + organisation, + last_day_of_month, + ): + """ + This function updates or creates a monthly LinkAggregate for a collection and parsed JSON objects(LinkEvents). + Parameters + ---------- + on_user_list_flag : bool, whether the aggregate should save with on_user_list flag or not + + link_events : an array of link event JSON objects + + collection: a Collection + + organisation: Organisation + + last_day_of_month: date + + Returns + ------- + None + """ + events = [ + i for i in link_events if i["fields"]["on_user_list"] is on_user_list_flag + ] + if not events: + return + + total_added = sum(1 for i in events if i["fields"]["change"] == 1) + total_removed = sum(1 for i in events if i["fields"]["change"] == 0) + + # set of tuples that consist of (page_title, domain) for a group of link events + page_projects = list( + set([(i["fields"]["page_title"], i["fields"]["domain"]) for i in events]) + ) + # set of user ids to fill user aggregates for + users = list(set([i["fields"]["user_id"] for i in events])) + try: + for page_project in page_projects: + self._fill_monthly_page_project_aggregates(collection, events, last_day_of_month, on_user_list_flag, + page_project) + for user in users: + self._fill_monthly_user_aggregates( + collection, last_day_of_month, link_events, on_user_list_flag, user + ) + self._fill_monthly_link_aggregates( + collection, + last_day_of_month, + on_user_list_flag, + organisation, + total_added, + total_removed, + ) + except Exception as e: + print(e) + + + def _fill_monthly_page_project_aggregates(self, collection, events, last_day_of_month, on_user_list_flag, + page_project): + """ + This function updates or creates monthly PageProjectAggregate for collection and a parsed array of JSON(LinkEvents). 
+        Parameters
+        ----------
+        collection : Collection
+
+        events : an array of JSON link_events parsed from archives
+
+        last_day_of_month: date
+
+        on_user_list_flag: bool
+
+        page_project: tuple(str, str)
+
+        Returns
+        -------
+        None
+        """
+        events_for_page_project = [
+            i
+            for i in events
+            if i["fields"]["page_title"] == page_project[0]
+            and i["fields"]["domain"] == page_project[1]
+        ]
+        total_added_page_project = sum(
+            1 for i in events_for_page_project if i["fields"]["change"] == 1
+        )
+        total_removed_page_project = sum(
+            1 for i in events_for_page_project if i["fields"]["change"] == 0
+        )
+        existing_page_project_aggregate = PageProjectAggregate.objects.filter(
+            organisation=collection.organisation,
+            collection=collection,
+            page_name=page_project[0],
+            project_name=page_project[1],
+            day=0,
+            full_date=last_day_of_month,
+            on_user_list=on_user_list_flag,
+        ).first()
+        if existing_page_project_aggregate:
+            existing_page_project_aggregate.total_links_added = total_added_page_project
+            existing_page_project_aggregate.total_links_removed = total_removed_page_project
+            existing_page_project_aggregate.save()
+        else:
+            PageProjectAggregate.objects.get_or_create(
+                organisation=collection.organisation,
+                collection=collection,
+                page_name=page_project[0],
+                project_name=page_project[1],
+                full_date=last_day_of_month,
+                day=0,
+                total_links_added=total_added_page_project,
+                total_links_removed=total_removed_page_project,
+                on_user_list=on_user_list_flag,
+            )
+
+    def _fill_monthly_link_aggregates(
+        self,
+        collection,
+        last_day_of_month,
+        on_user_list_flag,
+        organisation,
+        total_added,
+        total_removed,
+    ):
+        """
+        This function updates or creates a monthly LinkAggregate for a collection.
+        Parameters
+        ----------
+        collection : Collection
+
+        last_day_of_month: date
+
+        on_user_list_flag: bool
+
+        organisation: Organisation
+
+        total_added: int
+
+        total_removed: int
+
+        Returns
+        -------
+        None
+        """
+        existing_link_aggregate = LinkAggregate.objects.filter(
+            organisation_id=organisation.id,
+            collection_id=collection.id,
+            on_user_list=on_user_list_flag,
+            full_date=last_day_of_month,
+            day=0,
+        )
+        if existing_link_aggregate.exists():
+            existing_link_aggregate.update(
+                total_links_added=total_added,
+                total_links_removed=total_removed,
+            )
+        else:
+            LinkAggregate.objects.create(
+                organisation_id=organisation.id,
+                collection_id=collection.id,
+                on_user_list=on_user_list_flag,
+                full_date=last_day_of_month,
+                day=0,
+                total_links_added=total_added,
+                total_links_removed=total_removed,
+            )
+
+    def _fill_monthly_user_aggregates(
+        self, collection, last_day_of_month, link_events, on_user_list_flag, user
+    ):
+        """
+        This function updates or creates a monthly UserAggregate for a user and collection.
+        Parameters
+        ----------
+        collection : Collection
+
+        last_day_of_month: date
+
+        link_events: an array of JSON link_events parsed from archives
+
+        on_user_list_flag: bool
+
+        user : User
+
+        Returns
+        -------
+        None
+        """
+        try:
+            user_retrieved = User.objects.get(pk=user)
+        except User.DoesNotExist:
+            return
+        events_for_user = [i for i in link_events if i["fields"]["user_id"] == user]
+        total_added_by_user = sum(
+            1 for i in events_for_user if i["fields"]["change"] == 1
+        )
+        total_removed_by_user = sum(
+            1 for i in events_for_user if i["fields"]["change"] == 0
+        )
+        existing_user_aggregate = UserAggregate.objects.filter(
+            organisation_id=collection.organisation.id,
+            collection_id=collection.id,
+            username=user_retrieved.username,
+            full_date=last_day_of_month,
+            day=0,
+            on_user_list=on_user_list_flag,
+        ).first()
+        if existing_user_aggregate:
+            existing_user_aggregate.total_links_added = total_added_by_user
+            existing_user_aggregate.total_links_removed = total_removed_by_user
+            existing_user_aggregate.save()
+        else:
+            # Create a new user aggregate
+            UserAggregate.objects.create(
+                organisation_id=collection.organisation.id,
+                collection_id=collection.id,
+                username=user_retrieved.username,
+                full_date=last_day_of_month,
+                day=0,
+                total_links_added=total_added_by_user,
+                total_links_removed=total_removed_by_user,
+                on_user_list=on_user_list_flag,
+            )
+
+    def _get_link_events_for_day(
+        self, collection_url: str, events_split_by_url_pattern, day: int
+    ):
+        """
+        This function filters the parsed JSON objects (LinkEvents) for a collection URL pattern down to a single day.
+        Parameters
+        ----------
+        collection_url : str
+
+        events_split_by_url_pattern : a dict mapping URL patterns to arrays of link event JSON objects
+
+        day : int
+
+        Returns
+        -------
+        link_events_for_day : an array of link event JSON objects filtered by day
+        """
+        link_events_for_day = [
+            j
+            for j in events_split_by_url_pattern[collection_url]
+            if datetime.fromisoformat(j["fields"]["timestamp"]).date().day == day
+        ]
+        return link_events_for_day
+
+    def _get_last_day_of_month(self, first_day_of_month: date) -> date:
+        """
+        This function gets the last day of the month from the first day of the input month.
+        Parameters
+        ----------
+        first_day_of_month : date
+
+        Returns
+        -------
+        date
+        """
+        if first_day_of_month.month == 12:
+            return first_day_of_month.replace(day=31)
+        replace = first_day_of_month.replace(month=first_day_of_month.month + 1)
+        return replace - timedelta(days=1)
+
+    def _get_first_day_of_month(self, month_to_fix: str) -> date:
+        """
+        This function gets the first day of the month from the input month.
+        Parameters
+        ----------
+        month_to_fix : str
+
+        Returns
+        -------
+        date
+        """
+        return datetime.strptime(month_to_fix, "%Y%m").date().replace(day=1)
+
+    def _load_events_from_archives(
+        self, directory: str, month_to_fix: str, url_pattern_strings
+    ) -> dict:
+        """Parse archived .json.gz files and split the link events by URL pattern.
+ Parameters + ---------- + directory : str + + month_to_fix : str + + url_pattern_strings : an array of str + + Returns + ------- + parsed JSON link event objects + """ + events_split_by_url_pattern = {url: [] for url in url_pattern_strings} + for filename in os.listdir(directory): + if ( + filename.endswith(".json.gz") + and filename.startswith("links_linkevent_") + and month_to_fix in filename + ): + try: + file_path = os.path.join(directory, filename) + with gzip.open(file_path, "rt", encoding="utf-8") as f: + data = json.load(f) + for event in data: + link = event["fields"]["link"] + for url_pattern in url_pattern_strings: + if url_pattern in link: + events_split_by_url_pattern[url_pattern].append( + event + ) + except Exception as e: + logger.info( + f"Unexpected exception occurred loading events from archive: {e}" + ) + return events_split_by_url_pattern diff --git a/extlinks/aggregates/tests.py b/extlinks/aggregates/tests.py index 44ed323f..00d7d4d2 100644 --- a/extlinks/aggregates/tests.py +++ b/extlinks/aggregates/tests.py @@ -11,6 +11,7 @@ from dateutil.relativedelta import relativedelta from unittest import mock +from django.contrib.contenttypes.models import ContentType from django.core.management import call_command, CommandError from django.test import TransactionTestCase @@ -33,6 +34,7 @@ UserFactory, ) from extlinks.organisations.models import Organisation +from ..links.models import URLPattern, LinkEvent class BaseTransactionTest(TransactionTestCase): @@ -2128,3 +2130,2641 @@ def test_uploads_all_files_successfully(self, mock_swift_connection): for file in glob.glob(pattern): os.remove(file) + + +class FixAggregatesForOrganisationAndMonthCommandTest(BaseTransactionTest): + + def setUp(self): + # Creating one Collection, Organisation, and URLPattern + self.organisation = OrganisationFactory(name="ACME Org") + self.collection = CollectionFactory(organisation=self.organisation) + self.user = UserFactory() + self.user2 = UserFactory() + self.url = URLPatternFactory(url="www.test.com") + self.url.collection = self.collection + self.url.save() + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + 
"object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates = PageProjectAggregate.objects.all().first() + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, monthly_page_project_aggregates.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_multiple_projects(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + 
archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "cy.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "de.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates_en = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first() + monthly_page_project_aggregates_de = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first() + monthly_page_project_aggregates_cy = PageProjectAggregate.objects.filter(project_name="cy.wikipedia.org").first() + + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(3, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, 
UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_de.total_links_removed) + self.assertEqual(0, monthly_page_project_aggregates_de.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_en.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_en.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_cy.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_cy.total_links_added) + + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_multiple_pages(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test2", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": 
self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates = UserAggregate.objects.all().first() + monthly_page_project_aggregates_page_1 = PageProjectAggregate.objects.filter(page_name="test").first() + monthly_page_project_aggregates_page_2 = PageProjectAggregate.objects.filter(page_name="test2").first() + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(2, PageProjectAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(2, monthly_user_aggregates.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates_page_1.total_links_removed) + self.assertEqual(1, monthly_page_project_aggregates_page_2.total_links_added) + self.assertEqual(0, monthly_page_project_aggregates_page_2.total_links_removed) + + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_multiple_users(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": 
"en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user2.username, + "rev_id": 485489, + "user_id": self.user2.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_link_aggregate = LinkAggregate.objects.all().first() + monthly_user_aggregates_1 = UserAggregate.objects.filter(username=self.user.username).first() + monthly_user_aggregates_2 = UserAggregate.objects.filter(username=self.user2.username).first() + monthly_page_project_aggregates = PageProjectAggregate.objects.all().first() + # assert only one monthly aggregate created for on_user_list=True + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + self.assertEqual(2, UserAggregate.objects.count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=15).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_link_aggregate.total_links_added) + self.assertEqual(1, monthly_link_aggregate.total_links_removed) + self.assertEqual(1, monthly_user_aggregates_1.total_links_added) + self.assertEqual(0, monthly_user_aggregates_1.total_links_removed) + self.assertEqual(1, monthly_user_aggregates_2.total_links_added) + self.assertEqual(1, monthly_user_aggregates_2.total_links_removed) + self.assertEqual(2, monthly_page_project_aggregates.total_links_added) + self.assertEqual(1, monthly_page_project_aggregates.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + 
@mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_uploaded_link_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_linkaggregate_100_10_2024-12-22"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_uploaded_user_aggregates(self, mock_swift_connection): + mock_conn = 
mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_useraggregate_100_10__2024-12-22"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_uploaded_pageproject_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + 
mock_conn.get_container.return_value = ({},[{"name": "aggregates_pageprojectaggregate_2024-12-22"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_skips_if_linkevents_for_month(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [{"name": "archive-aggregates-backup-2024-12-22"}], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = 
os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + + # create link events + call_command("loaddata", archive_path) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + # assert no daily or monthly aggregates created + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": 
"en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + daily_link_aggregate = LinkAggregate.objects.all().first() + daily_user_aggregate = UserAggregate.objects.all().first() + daily_pageproject_aggregate = PageProjectAggregate.objects.all().first() + # assert no monthly aggregates created + self.assertEqual(1, LinkAggregate.objects.count()) + self.assertEqual(1, UserAggregate.objects.count()) + self.assertEqual(1, PageProjectAggregate.objects.count()) + # assert daily aggregates were created for the correct day + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, UserAggregate.objects.filter(day=16).count()) + self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count()) + self.assertEqual(1, LinkAggregate.objects.filter(day=15).count()) + self.assertEqual(1, UserAggregate.objects.filter(day=15).count()) + self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(1, daily_link_aggregate.total_links_added) + self.assertEqual(1, daily_link_aggregate.total_links_removed) + self.assertEqual(1, daily_user_aggregate.total_links_added) + self.assertEqual(1, daily_user_aggregate.total_links_removed) + self.assertEqual(1, daily_pageproject_aggregate.total_links_added) + self.assertEqual(1, daily_pageproject_aggregate.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + 
+
+    @mock.patch.dict(
+        os.environ,
+        {
+            "OPENSTACK_AUTH_URL": "fakeurl",
+            "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid",
+            "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret",
+        },
+    )
+    @mock.patch("swiftclient.Connection")
+    def test_reaggregate_link_archives_daily_multiple_projects(self, mock_swift_connection):
+        mock_conn = mock_swift_connection.return_value
+        mock_conn.get_account.return_value = (
+            {},
+            [],
+        )
+        mock_conn.get_container.return_value = ({},[])
+        temp_dir = tempfile.gettempdir()
+        archive_filename = "links_linkevent_20241222_0.json.gz"
+        archive_path = os.path.join(temp_dir, archive_filename)
+        json_data = [
+            {
+                "model": "links.linkevent",
+                "pk": 1,
+                "fields": {
+                    "link": "https://www.another_domain.com/articles/",
+                    "timestamp": "2024-12-16T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.id,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 2,
+                "fields": {
+                    "link": "https://www.test.com/",
+                    "timestamp": "2024-12-16T09:15:27.363Z",
+                    "domain": "cy.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.id,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 3,
+                "fields": {
+                    "link": "https://www.test.com/3",
+                    "timestamp": "2024-12-15T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.id,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 4,
+                "fields": {
+                    "link": "https://www.test.com/4",
+                    "timestamp": "2024-12-15T09:15:27.363Z",
+                    "domain": "de.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.id,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 0,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+        ]
+
+        with gzip.open(archive_path, "wt", encoding="utf-8") as f:
+            json.dump(json_data, f)
+
+        try:
+            call_command(
+                "reaggregate_link_archives",
+                "--day",
+                "20241215",
+                "--organisation",
+                self.organisation.id,
+                "--dir",
+                temp_dir,
+            )
+            daily_link_aggregate = LinkAggregate.objects.all().first()
+            daily_user_aggregate = UserAggregate.objects.all().first()
+            daily_pageproject_aggregate1 = PageProjectAggregate.objects.filter(project_name="en.wikipedia.org").first()
+            daily_pageproject_aggregate2 = PageProjectAggregate.objects.filter(project_name="de.wikipedia.org").first()
+            # assert no monthly aggregates created
+            self.assertEqual(1, LinkAggregate.objects.count())
+            self.assertEqual(1, UserAggregate.objects.count())
+            self.assertEqual(2, PageProjectAggregate.objects.count())
+            # assert daily aggregates were created for the correct day
+            self.assertEqual(0, LinkAggregate.objects.filter(day=16).count())
+            self.assertEqual(0, UserAggregate.objects.filter(day=16).count())
+            self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count())
+            self.assertEqual(1, LinkAggregate.objects.filter(day=15).count())
+            self.assertEqual(1, UserAggregate.objects.filter(day=15).count())
+            self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count())
+            # assert totals match expected totals
+            self.assertEqual(1, daily_link_aggregate.total_links_added)
+            self.assertEqual(1, daily_link_aggregate.total_links_removed)
+            self.assertEqual(1, daily_user_aggregate.total_links_added)
+            self.assertEqual(1, daily_user_aggregate.total_links_removed)
+            self.assertEqual(1, daily_pageproject_aggregate1.total_links_added)
+            self.assertEqual(0, daily_pageproject_aggregate1.total_links_removed)
+            self.assertEqual(0, daily_pageproject_aggregate2.total_links_added)
+            self.assertEqual(1, daily_pageproject_aggregate2.total_links_removed)
+        finally:
+            for file in glob.glob(archive_path):
+                os.remove(file)
+
+    @mock.patch.dict(
+        os.environ,
+        {
+            "OPENSTACK_AUTH_URL": "fakeurl",
+            "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid",
+            "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret",
+        },
+    )
+    @mock.patch("swiftclient.Connection")
+    def test_reaggregate_link_archives_daily_multiple_pages(self, mock_swift_connection):
+        mock_conn = mock_swift_connection.return_value
+        mock_conn.get_account.return_value = (
+            {},
+            [],
+        )
+        mock_conn.get_container.return_value = ({},[])
+        temp_dir = tempfile.gettempdir()
+        archive_filename = "links_linkevent_20241222_0.json.gz"
+        archive_path = os.path.join(temp_dir, archive_filename)
+        json_data = [
+            {
+                "model": "links.linkevent",
+                "pk": 1,
+                "fields": {
+                    "link": "https://www.another_domain.com/articles/",
+                    "timestamp": "2024-12-16T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.id,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 2,
+                "fields": {
+                    "link": "https://www.test.com/",
+                    "timestamp": "2024-12-16T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.id,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 3,
+                "fields": {
+                    "link": "https://www.test.com/3",
+                    "timestamp": "2024-12-15T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.id,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test2",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 4,
+                "fields": {
+                    "link": "https://www.test.com/4",
+                    "timestamp": "2024-12-15T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.id,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 0,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+        ]
+
+        with gzip.open(archive_path, "wt", encoding="utf-8") as f:
+            json.dump(json_data, f)
+
+        try:
+            call_command(
+                "reaggregate_link_archives",
+                "--day",
+                "20241215",
+                "--organisation",
+                self.organisation.id,
+                "--dir",
+                temp_dir,
+            )
+            daily_link_aggregate = LinkAggregate.objects.all().first()
+            daily_user_aggregate = UserAggregate.objects.all().first()
+            daily_pageproject_aggregate_page1 = PageProjectAggregate.objects.filter(page_name="test").first()
+            daily_pageproject_aggregate_page2 = PageProjectAggregate.objects.filter(page_name="test2").first()
+            # assert no monthly aggregates created
+            self.assertEqual(1, LinkAggregate.objects.count())
+            self.assertEqual(1, UserAggregate.objects.count())
+            self.assertEqual(2, PageProjectAggregate.objects.count())
+            # assert daily aggregates were created for the correct day
+            self.assertEqual(0, LinkAggregate.objects.filter(day=16).count())
+            self.assertEqual(0, UserAggregate.objects.filter(day=16).count())
+            self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count())
+            self.assertEqual(1, LinkAggregate.objects.filter(day=15).count())
+            self.assertEqual(1, UserAggregate.objects.filter(day=15).count())
+            self.assertEqual(2, PageProjectAggregate.objects.filter(day=15).count())
+            # assert totals match expected totals
+            self.assertEqual(1, daily_link_aggregate.total_links_added)
+            self.assertEqual(1, daily_link_aggregate.total_links_removed)
+            self.assertEqual(1, daily_user_aggregate.total_links_added)
+            self.assertEqual(1, daily_user_aggregate.total_links_removed)
+            self.assertEqual(0, daily_pageproject_aggregate_page1.total_links_added)
+            self.assertEqual(1, daily_pageproject_aggregate_page1.total_links_removed)
+            self.assertEqual(1, daily_pageproject_aggregate_page2.total_links_added)
+            self.assertEqual(0, daily_pageproject_aggregate_page2.total_links_removed)
+        finally:
+            for file in glob.glob(archive_path):
+                os.remove(file)
+
+    @mock.patch.dict(
+        os.environ,
+        {
+            "OPENSTACK_AUTH_URL": "fakeurl",
+            "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid",
+            "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret",
+        },
+    )
+    @mock.patch("swiftclient.Connection")
+    def test_reaggregate_link_archives_daily_multiple_users(self, mock_swift_connection):
+        mock_conn = mock_swift_connection.return_value
+        mock_conn.get_account.return_value = (
+            {},
+            [],
+        )
+        mock_conn.get_container.return_value = ({},[])
+        temp_dir = tempfile.gettempdir()
+        archive_filename = "links_linkevent_20241222_0.json.gz"
+        archive_path = os.path.join(temp_dir, archive_filename)
+        json_data = [
+            {
+                "model": "links.linkevent",
+                "pk": 1,
+                "fields": {
+                    "link": "https://www.another_domain.com/articles/",
+                    "timestamp": "2024-12-16T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.username,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 2,
+                "fields": {
+                    "link": "https://www.test.com/",
+                    "timestamp": "2024-12-15T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user.username,
+                    "rev_id": 485489,
+                    "user_id": self.user.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 3,
+                "fields": {
+                    "link": "https://www.test.com/3",
+                    "timestamp": "2024-12-15T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user2.username,
+                    "rev_id": 485489,
+                    "user_id": self.user2.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 1,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+            {
+                "model": "links.linkevent",
+                "pk": 4,
+                "fields": {
+                    "link": "https://www.test.com/4",
+                    "timestamp": "2024-12-15T09:15:27.363Z",
+                    "domain": "en.wikipedia.org",
+                    "content_type": ContentType.objects.get_for_model(URLPattern).id,
+                    "object_id": self.url.id,
+                    "username": self.user2.username,
+                    "rev_id": 485489,
+                    "user_id": self.user2.id,
+                    "page_title": "test",
+                    "page_namespace": 0,
+                    "event_id": "",
+                    "user_is_bot": False,
+                    "hash_link_event_id": "",
+                    "change": 0,
+                    "on_user_list": True,
+                    "url": []
+                }
+            },
+        ]
+
+        with gzip.open(archive_path, "wt", encoding="utf-8") as f:
+            json.dump(json_data, f)
+
+        try:
+            call_command(
+                "reaggregate_link_archives",
+                "--day",
+                "20241215",
+                "--organisation",
+                self.organisation.id,
+                "--dir",
+                temp_dir,
+            )
+            daily_link_aggregate = LinkAggregate.objects.all().first()
+            daily_user_aggregate = UserAggregate.objects.filter(username=self.user.username).first()
+            daily_user_aggregate2 = UserAggregate.objects.filter(username=self.user2.username).first()
+            # assert no monthly aggregates created
+            self.assertEqual(1, LinkAggregate.objects.count())
+            self.assertEqual(2, UserAggregate.objects.count())
+            self.assertEqual(1, PageProjectAggregate.objects.count())
+            # assert daily aggregates were created for the correct day
+            self.assertEqual(0, LinkAggregate.objects.filter(day=16).count())
+            self.assertEqual(0, UserAggregate.objects.filter(day=16).count())
+            self.assertEqual(0, PageProjectAggregate.objects.filter(day=16).count())
+            self.assertEqual(1, LinkAggregate.objects.filter(day=15).count())
+            self.assertEqual(2, UserAggregate.objects.filter(day=15).count())
+            self.assertEqual(1, PageProjectAggregate.objects.filter(day=15).count())
+            # assert totals match expected totals
+            self.assertEqual(2, daily_link_aggregate.total_links_added)
+            self.assertEqual(1, daily_link_aggregate.total_links_removed)
+            self.assertEqual(1, daily_user_aggregate.total_links_added)
+            self.assertEqual(0, daily_user_aggregate.total_links_removed)
+            self.assertEqual(1, daily_user_aggregate2.total_links_added)
+            self.assertEqual(1, daily_user_aggregate2.total_links_removed)
+        finally:
+            for file in glob.glob(archive_path):
+                os.remove(file)
+
+    @mock.patch.dict(
+        os.environ,
+        {
+            "OPENSTACK_AUTH_URL": "fakeurl",
+            "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid",
+            "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret",
+        },
+    )
+    @mock.patch("swiftclient.Connection")
+    def test_reaggregate_link_archives_daily_skips_if_uploaded_link_aggregates(self, mock_swift_connection):
+        mock_conn =
mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_linkaggregate_100_10_2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded_user_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_useraggregate_100_10__2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = 
"links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_uploaded_pageproject_aggregates(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[{"name": "aggregates_pageprojectaggregate_100_10__2024-12-15"}]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": 
"2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_daily_skips_if_linkevents_for_day(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({},[]) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + 
"hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + # create link events + call_command("loaddata", archive_path) + + try: + call_command( + "reaggregate_link_archives", + "--day", + "20241215", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_monthly_on_and_off_user_list(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + 
"content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 4, + "fields": { + "link": "https://www.test.com/4", + "timestamp": "2024-12-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 0, + "on_user_list": False, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + monthly_aggregate_on_user_list = LinkAggregate.objects.filter(on_user_list=True).first() + monthly_aggregate_not_on_user_list = LinkAggregate.objects.filter(on_user_list=False).first() + # assert two monthly aggregates were created for on_user_list=True and on_user_list=False + self.assertEqual(2, LinkAggregate.objects.count()) + self.assertEqual(2, LinkAggregate.objects.filter(day=0).count()) + # assert daily aggregates were not created + self.assertEqual(0, LinkAggregate.objects.filter(day=16).count()) + self.assertEqual(0, LinkAggregate.objects.filter(day=15).count()) + # assert totals match expected totals + self.assertEqual(2, monthly_aggregate_on_user_list.total_links_added) + self.assertEqual(0, monthly_aggregate_on_user_list.total_links_removed) + self.assertEqual(0, monthly_aggregate_not_on_user_list.total_links_added) + self.assertEqual(1, monthly_aggregate_not_on_user_list.total_links_removed) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_only_link_event_archives(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "aggregates_20241222_0.json.gz" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + 
"username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": "https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file) + + @mock.patch.dict( + os.environ, + { + "OPENSTACK_AUTH_URL": "fakeurl", + "SWIFT_APPLICATION_CREDENTIAL_ID": "fakecredid", + "SWIFT_APPLICATION_CREDENTIAL_SECRET": "fakecredsecret", + }, + ) + @mock.patch("swiftclient.Connection") + def test_reaggregate_link_archives_only_in_correct_zipped_format(self, mock_swift_connection): + mock_conn = mock_swift_connection.return_value + mock_conn.get_account.return_value = ( + {}, + [], + ) + mock_conn.get_container.return_value = ({}, []) + temp_dir = tempfile.gettempdir() + archive_filename = "links_linkevent_20241222_0.json" + archive_path = os.path.join(temp_dir, archive_filename) + json_data = [ + { + "model": "links.linkevent", + "pk": 1, + "fields": { + "link": "https://www.another_domain.com/articles/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 2, + "fields": { + "link": "https://www.test.com/", + "timestamp": "2024-12-16T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + { + "model": "links.linkevent", + "pk": 3, + "fields": { + "link": 
"https://www.test.com/3", + "timestamp": "2024-01-15T09:15:27.363Z", + "domain": "en.wikipedia.org", + "content_type": ContentType.objects.get_for_model(URLPattern).id, + "object_id": self.url.id, + "username": self.user.id, + "rev_id": 485489, + "user_id": self.user.id, + "page_title": "test", + "page_namespace": 0, + "event_id": "", + "user_is_bot": False, + "hash_link_event_id": "", + "change": 1, + "on_user_list": True, + "url": [] + } + }, + ] + + with gzip.open(archive_path, "wt", encoding="utf-8") as f: + json.dump(json_data, f) + + try: + call_command( + "reaggregate_link_archives", + "--month", + "202412", + "--organisation", + self.organisation.id, + "--dir", + temp_dir, + ) + self.assertEqual(0, LinkAggregate.objects.count()) + self.assertEqual(0, UserAggregate.objects.count()) + self.assertEqual(0, PageProjectAggregate.objects.count()) + finally: + for file in glob.glob(archive_path): + os.remove(file)