From 965c56318ae9026bfa305c8cbe2c095aea5b00a9 Mon Sep 17 00:00:00 2001 From: Kevin Brown Date: Fri, 1 May 2026 06:35:37 +0300 Subject: [PATCH] Add source diversity analysis WP --- ..._diversity_snapshot_changelist_widget.html | 64 ++++ ingestion/tasks.py | 18 +- trends/admin.py | 185 ++++++++++- trends/api.py | 56 ++++ trends/api_urls.py | 6 + .../0004_source_diversity_snapshot.py | 53 +++ trends/models.py | 32 ++ trends/serializers.py | 30 ++ trends/tasks.py | 306 +++++++++++++++++- trends/tests/test_admin.py | 98 +++++- trends/tests/test_api.py | 67 ++++ trends/tests/test_tasks.py | 111 ++++++- 12 files changed, 1018 insertions(+), 8 deletions(-) create mode 100644 core/templates/admin/source_diversity_snapshot_changelist_widget.html create mode 100644 trends/migrations/0004_source_diversity_snapshot.py diff --git a/core/templates/admin/source_diversity_snapshot_changelist_widget.html b/core/templates/admin/source_diversity_snapshot_changelist_widget.html new file mode 100644 index 00000000..962b00c9 --- /dev/null +++ b/core/templates/admin/source_diversity_snapshot_changelist_widget.html @@ -0,0 +1,64 @@ +{% if dashboard_stats or source_diversity_alerts or source_diversity_project_drilldowns %} +
+ {% if dashboard_stats %} +
+ {% for stat in dashboard_stats %} +
+
+
+

{{ stat.title }}

+

{{ stat.value }}

+
+ {{ stat.icon }} +
+
+ {% endfor %} +
+ {% endif %} + {% if source_diversity_alerts %} +
+

Concentration Alerts

+
+ {% for alert in source_diversity_alerts %} +
+ {{ alert.message }} +
+ {% endfor %} +
+
+ {% endif %} + {% if source_diversity_project_drilldowns %} + + {% endif %} +
+{% endif %} diff --git a/ingestion/tasks.py b/ingestion/tasks.py index dfdeb198..c229ad4a 100644 --- a/ingestion/tasks.py +++ b/ingestion/tasks.py @@ -4,7 +4,7 @@ from celery import shared_task from django.conf import settings -from django.db.models import Q +from django.db.models import Model, Q from django.utils import timezone from content.deduplication import canonicalize_url @@ -16,6 +16,17 @@ logger = logging.getLogger(__name__) +def _require_pk(instance: Model) -> int: + """Return a saved model primary key as an ``int``.""" + + instance_pk = instance.pk + if instance_pk is None: + raise ValueError( + f"{instance.__class__.__name__} must be saved before task dispatch" + ) + return int(instance_pk) + + @shared_task(name="core.tasks.run_ingestion") def run_ingestion(source_config_id: int): """Fetch new content for one source config and record an ingestion run.""" @@ -74,7 +85,10 @@ def _ingest_source_config(source_config: SourceConfig) -> tuple[int, int]: for item in fetched_items: if _content_exists_for_item(source_config, item): continue - source_metadata = getattr(item, "source_metadata", None) or {} + source_metadata = { + **(getattr(item, "source_metadata", None) or {}), + "source_config_id": _require_pk(source_config), + } content = Content.objects.create( project=source_config.project, entity=_match_entity_for_item(plugin, item), diff --git a/trends/admin.py b/trends/admin.py index afbd6357..2b3de58c 100644 --- a/trends/admin.py +++ b/trends/admin.py @@ -8,7 +8,7 @@ from django.urls import reverse from django.utils import timezone -from trends.models import TopicCentroidSnapshot +from trends.models import SourceDiversitySnapshot, TopicCentroidSnapshot def _project_pk(snapshot: TopicCentroidSnapshot) -> int: @@ -44,6 +44,32 @@ def _drift_card_color(value) -> str: return "danger" +def _diversity_card_color(value) -> str: + """Return an admin card severity for normalized diversity scores.""" + + if value is None: + return "info" + numeric_value = float(value) + if numeric_value >= 0.75: + return "success" + if numeric_value >= 0.4: + return "warning" + return "danger" + + +def _share_card_color(value) -> str: + """Return an admin card severity for concentration share metrics.""" + + if value is None: + return "info" + numeric_value = float(value) + if numeric_value <= 0.4: + return "success" + if numeric_value <= 0.7: + return "warning" + return "danger" + + def _format_snapshot_freshness(computed_at) -> str: """Return a compact human-readable age for the latest snapshot.""" @@ -115,6 +141,46 @@ def _build_topic_centroid_project_drilldowns(queryset, changelist_url: str): return project_drilldowns +def _build_source_diversity_project_drilldowns(queryset, changelist_url: str): + """Build one filtered-history drilldown row per project.""" + + latest_by_project: dict[int, SourceDiversitySnapshot] = {} + snapshot_counts: dict[int, int] = {} + ordered_snapshots = queryset.select_related("project").order_by( + "project_id", "-computed_at" + ) + + for snapshot in ordered_snapshots: + project_id = int(snapshot.project_id) + snapshot_counts[project_id] = snapshot_counts.get(project_id, 0) + 1 + latest_by_project.setdefault(project_id, snapshot) + + project_drilldowns = [] + for snapshot in sorted( + latest_by_project.values(), + key=lambda value: value.project.name.lower(), + ): + project_id = int(snapshot.project_id) + alerts = cast( + list[dict[str, Any]], (snapshot.breakdown or {}).get("alerts", []) + ) + project_drilldowns.append( + { + "project_id": project_id, + "project_name": snapshot.project.name, + "snapshot_count": snapshot_counts[project_id], + "latest_snapshot": _format_snapshot_freshness(snapshot.computed_at), + "plugin_entropy": f"{_score_to_percent(snapshot.plugin_entropy):.1f}%", + "source_entropy": f"{_score_to_percent(snapshot.source_entropy):.1f}%", + "top_plugin_share": f"{_score_to_percent(snapshot.top_plugin_share):.1f}%", + "alert_count": len(alerts), + "href": f"{changelist_url}?{urlencode({'project__id__exact': project_id})}", + } + ) + + return project_drilldowns + + @admin.register(TopicCentroidSnapshot) class TopicCentroidSnapshotAdmin(admin.ModelAdmin): """Admin view for persisted topic-centroid history and drift.""" @@ -219,3 +285,120 @@ def changelist_view(self, request, extra_context=None): _build_topic_centroid_project_drilldowns(queryset, changelist_url) ) return super().changelist_view(request, extra_context=extra_context) + + +@admin.register(SourceDiversitySnapshot) +class SourceDiversitySnapshotAdmin(admin.ModelAdmin): + """Admin view for persisted source-diversity history and concentration alerts.""" + + list_before_template = "admin/source_diversity_snapshot_changelist_widget.html" + list_display = ( + "project", + "display_plugin_entropy", + "display_source_entropy", + "display_author_entropy", + "display_top_plugin_share", + "computed_at", + ) + list_filter = ( + "window_days", + ("project", admin.RelatedOnlyFieldListFilter), + "computed_at", + ) + search_fields = ("project__name",) + autocomplete_fields = ("project",) + + @admin.display(description="Plugin Diversity", ordering="plugin_entropy") + def display_plugin_entropy(self, obj): + """Render normalized plugin diversity as a percentage.""" + + return f"{_score_to_percent(obj.plugin_entropy):.1f}%" + + @admin.display(description="Source Diversity", ordering="source_entropy") + def display_source_entropy(self, obj): + """Render normalized source diversity as a percentage.""" + + return f"{_score_to_percent(obj.source_entropy):.1f}%" + + @admin.display(description="Author Diversity", ordering="author_entropy") + def display_author_entropy(self, obj): + """Render normalized author diversity as a percentage.""" + + return f"{_score_to_percent(obj.author_entropy):.1f}%" + + @admin.display(description="Top Plugin Share", ordering="top_plugin_share") + def display_top_plugin_share(self, obj): + """Render the largest plugin share as a percentage.""" + + return f"{_score_to_percent(obj.top_plugin_share):.1f}%" + + def changelist_view(self, request, extra_context=None): + """Augment the changelist with diversity summaries and alert callouts.""" + + queryset = self.get_queryset(request) + changelist_url = reverse( + f"{self.admin_site.name}:{self.model._meta.app_label}_{self.model._meta.model_name}_changelist" + ) + metrics = queryset.aggregate( + avg_plugin_entropy=Avg("plugin_entropy"), + avg_source_entropy=Avg("source_entropy"), + avg_author_entropy=Avg("author_entropy"), + avg_top_plugin_share=Avg("top_plugin_share"), + latest_snapshot_at=Max("computed_at"), + ) + alerts = [ + alert + for snapshot in queryset.order_by("project_id", "-computed_at") + for alert in cast( + list[dict[str, Any]], (snapshot.breakdown or {}).get("alerts", []) + ) + ] + + extra_context = cast(dict[str, Any], extra_context or {}) + extra_context["dashboard_stats"] = [ + { + "title": "Plugin Diversity", + "value": ( + f"{_score_to_percent(metrics['avg_plugin_entropy']):.1f}%" + if metrics["avg_plugin_entropy"] is not None + else "-" + ), + "icon": "hub", + "color": _diversity_card_color(metrics["avg_plugin_entropy"]), + }, + { + "title": "Source Diversity", + "value": ( + f"{_score_to_percent(metrics['avg_source_entropy']):.1f}%" + if metrics["avg_source_entropy"] is not None + else "-" + ), + "icon": "lan", + "color": _diversity_card_color(metrics["avg_source_entropy"]), + }, + { + "title": "Author Diversity", + "value": ( + f"{_score_to_percent(metrics['avg_author_entropy']):.1f}%" + if metrics["avg_author_entropy"] is not None + else "-" + ), + "icon": "group", + "color": _diversity_card_color(metrics["avg_author_entropy"]), + }, + { + "title": "Top Plugin Share", + "value": ( + f"{_score_to_percent(metrics['avg_top_plugin_share']):.1f}%" + if metrics["avg_top_plugin_share"] is not None + else "-" + ), + "icon": "pie_chart", + "color": _share_card_color(metrics["avg_top_plugin_share"]), + }, + ] + extra_context["source_diversity_alerts"] = alerts + extra_context["source_diversity_project_drilldowns"] = ( + _build_source_diversity_project_drilldowns(queryset, changelist_url) + ) + return super().changelist_view(request, extra_context=extra_context) diff --git a/trends/api.py b/trends/api.py index 31beadaa..d7aea4f1 100644 --- a/trends/api.py +++ b/trends/api.py @@ -19,6 +19,7 @@ from core.permissions import IsProjectContributor, IsProjectMember from trends.models import ( ContentClusterMembership, + SourceDiversitySnapshot, ThemeSuggestion, ThemeSuggestionStatus, TopicCentroidSnapshot, @@ -26,6 +27,8 @@ TopicVelocitySnapshot, ) from trends.serializers import ( + SourceDiversityObservabilitySummarySerializer, + SourceDiversitySnapshotSerializer, ThemeSuggestionDismissSerializer, ThemeSuggestionSerializer, TopicClusterDetailSerializer, @@ -334,3 +337,56 @@ def summary(self, request, *args, **kwargs): context=self.get_serializer_context(), ) return Response(serializer.data) + + +@document_project_owned_viewset( + resource_plural="source diversity snapshots", + resource_singular="source diversity snapshot", + create_description="Source diversity snapshots are pipeline-managed observability rows and are exposed read-only for health analysis.", + tag="Observability", + action_overrides=build_crud_action_overrides( + SourceDiversitySnapshotSerializer, + resource_plural="source diversity snapshots for the selected project", + resource_singular="source diversity snapshot", + ), +) +class SourceDiversitySnapshotViewSet( + ProjectOwnedQuerysetMixin, viewsets.ReadOnlyModelViewSet +): + """Inspect persisted source-diversity history for a project.""" + + serializer_class = SourceDiversitySnapshotSerializer + queryset = SourceDiversitySnapshot.objects.select_related("project") + + def get_permissions(self): + """Restrict source-diversity observability to project contributors.""" + + return [IsProjectContributor()] + + @extend_schema( + summary="Get source diversity summary", + description=( + "Return the latest persisted source-diversity snapshot for the selected project " + "along with the number of stored snapshots." + ), + request=None, + responses={ + 200: SourceDiversityObservabilitySummarySerializer, + 403: AUTHENTICATION_REQUIRED_RESPONSE, + }, + tags=["Observability"], + ) + @action(detail=False, methods=["get"], url_path="summary") + def summary(self, request, *args, **kwargs): + """Return source-diversity summary metrics for the current project.""" + + queryset = self.get_queryset() + serializer = SourceDiversityObservabilitySummarySerializer( + { + "project": _require_pk(self.get_project()), + "snapshot_count": queryset.count(), + "latest_snapshot": queryset.order_by("-computed_at").first(), + }, + context=self.get_serializer_context(), + ) + return Response(serializer.data) diff --git a/trends/api_urls.py b/trends/api_urls.py index 97d69cba..54f50b00 100644 --- a/trends/api_urls.py +++ b/trends/api_urls.py @@ -3,6 +3,7 @@ from rest_framework_nested.routers import NestedSimpleRouter from trends.api import ( + SourceDiversitySnapshotViewSet, ThemeSuggestionViewSet, TopicCentroidSnapshotViewSet, TopicClusterViewSet, @@ -27,3 +28,8 @@ def register_project_routes(project_router: NestedSimpleRouter) -> None: TopicCentroidSnapshotViewSet, basename="project-topic-centroid-snapshot", ) + project_router.register( + r"source-diversity-snapshots", + SourceDiversitySnapshotViewSet, + basename="project-source-diversity-snapshot", + ) diff --git a/trends/migrations/0004_source_diversity_snapshot.py b/trends/migrations/0004_source_diversity_snapshot.py new file mode 100644 index 00000000..076b1a62 --- /dev/null +++ b/trends/migrations/0004_source_diversity_snapshot.py @@ -0,0 +1,53 @@ +import django.db.models.deletion + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("trends", "0003_theme_suggestion"), + ] + + operations = [ + migrations.CreateModel( + name="SourceDiversitySnapshot", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("computed_at", models.DateTimeField(auto_now_add=True)), + ("window_days", models.PositiveIntegerField(default=14)), + ("plugin_entropy", models.FloatField()), + ("source_entropy", models.FloatField()), + ("author_entropy", models.FloatField()), + ("cluster_entropy", models.FloatField()), + ("top_plugin_share", models.FloatField()), + ("top_source_share", models.FloatField()), + ("breakdown", models.JSONField(blank=True, default=dict)), + ( + "project", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="source_diversity_snapshots", + to="projects.project", + ), + ), + ], + options={ + "ordering": ["-computed_at", "id"], + "db_table": "core_sourcediversitysnapshot", + "indexes": [ + models.Index( + fields=["project", "-computed_at"], + name="core_sourced_project_4bf5_idx", + ), + ], + }, + ), + ] diff --git a/trends/models.py b/trends/models.py index 7f09b03c..95cdf45d 100644 --- a/trends/models.py +++ b/trends/models.py @@ -205,3 +205,35 @@ class Meta: def __str__(self) -> str: return self.title + + +class SourceDiversitySnapshot(models.Model): + """Capture one project-level source diversity reading for a rolling window.""" + + project = models.ForeignKey( + "projects.Project", + on_delete=models.CASCADE, + related_name="source_diversity_snapshots", + ) + computed_at = models.DateTimeField(auto_now_add=True) + window_days = models.PositiveIntegerField(default=14) + plugin_entropy = models.FloatField() + source_entropy = models.FloatField() + author_entropy = models.FloatField() + cluster_entropy = models.FloatField() + top_plugin_share = models.FloatField() + top_source_share = models.FloatField() + breakdown = models.JSONField(default=dict, blank=True) + + class Meta: + ordering = ["-computed_at", "id"] + db_table = "core_sourcediversitysnapshot" + indexes = [ + models.Index( + fields=["project", "-computed_at"], + name="core_sourced_project_4bf5_idx", + ), + ] + + def __str__(self) -> str: + return f"Source diversity snapshot for {self.project.name}" diff --git a/trends/serializers.py b/trends/serializers.py index f55f72c3..36a1cf1f 100644 --- a/trends/serializers.py +++ b/trends/serializers.py @@ -6,6 +6,7 @@ from core.serializer_mixins import ProjectScopedSerializerMixin from trends.models import ( ContentClusterMembership, + SourceDiversitySnapshot, ThemeSuggestion, ThemeSuggestionStatus, TopicCentroidSnapshot, @@ -213,3 +214,32 @@ class TopicCentroidObservabilitySummarySerializer(serializers.Serializer): avg_drift_from_previous = serializers.FloatField(allow_null=True) avg_drift_from_week_ago = serializers.FloatField(allow_null=True) latest_snapshot = TopicCentroidSnapshotSerializer(allow_null=True) + + +class SourceDiversitySnapshotSerializer(serializers.ModelSerializer): + """Serialize one persisted source-diversity snapshot for a project.""" + + class Meta: + model = SourceDiversitySnapshot + fields = [ + "id", + "project", + "computed_at", + "window_days", + "plugin_entropy", + "source_entropy", + "author_entropy", + "cluster_entropy", + "top_plugin_share", + "top_source_share", + "breakdown", + ] + read_only_fields = fields + + +class SourceDiversityObservabilitySummarySerializer(serializers.Serializer): + """Serialize project-level source-diversity observability metrics.""" + + project = serializers.IntegerField() + snapshot_count = serializers.IntegerField() + latest_snapshot = SourceDiversitySnapshotSerializer(allow_null=True) diff --git a/trends/tasks.py b/trends/tasks.py index f6c589ca..592e7ef7 100644 --- a/trends/tasks.py +++ b/trends/tasks.py @@ -21,11 +21,12 @@ ) from core.llm import build_skill_user_prompt, get_skill_definition, openrouter_chat_json from content.models import Content, FeedbackType, UserFeedback -from entities.models import Entity, EntityMention -from projects.models import Project +from entities.models import Entity, EntityMention, EntityMentionRole +from projects.models import Project, SourceConfig from .models import ( ContentClusterMembership, + SourceDiversitySnapshot, ThemeSuggestion, ThemeSuggestionStatus, TopicCentroidSnapshot, @@ -43,6 +44,9 @@ TOPIC_CLUSTER_MIN_MEMBERS = 3 TOPIC_VELOCITY_TRAILING_DAYS = 7 TOPIC_VELOCITY_EMA_ALPHA = 0.5 +SOURCE_DIVERSITY_WINDOW_DAYS = 14 +SOURCE_DIVERSITY_TOP_PLUGIN_ALERT_THRESHOLD = 0.7 +SOURCE_DIVERSITY_LOW_AUTHOR_ENTROPY_THRESHOLD = 0.4 THEME_DETECTION_SKILL_NAME = "theme_detection" THEME_SUGGESTION_DAILY_CAP = 5 THEME_NOVELTY_LOOKBACK_DAYS = 30 @@ -357,8 +361,10 @@ def recompute_topic_velocity(project_id: int) -> dict[str, int]: ) snapshot_count += 1 if settings.CELERY_TASK_ALWAYS_EAGER: + recompute_source_diversity(project_id) generate_theme_suggestions(project_id) else: + _enqueue_task(recompute_source_diversity, project_id) _enqueue_task(generate_theme_suggestions, project_id) return { "project_id": project_id, @@ -367,6 +373,144 @@ def recompute_topic_velocity(project_id: int) -> dict[str, int]: } +@shared_task(name="core.tasks.recompute_source_diversity") +def recompute_source_diversity(project_id: int) -> dict[str, object]: + """Persist one fresh source-diversity snapshot for a project's recent content.""" + + computed_at = timezone.now() + window_start = computed_at - timedelta(days=SOURCE_DIVERSITY_WINDOW_DAYS) + recent_contents = list( + Content.objects.filter( + project_id=project_id, + is_active=True, + published_date__gte=window_start, + ) + .select_related("entity") + .only( + "id", + "project_id", + "entity_id", + "source_plugin", + "source_metadata", + "published_date", + ) + .order_by("published_date", "id") + ) + content_ids = [_require_pk(content) for content in recent_contents] + author_entity_ids = _author_entities_for_contents(recent_contents) + cluster_labels = { + str(cluster_id): label + for cluster_id, label in TopicCluster.objects.filter( + memberships__content_id__in=content_ids, + project_id=project_id, + ) + .distinct() + .values_list("id", "label") + } + + plugin_counts: Counter[str] = Counter() + source_counts: Counter[str] = Counter() + author_counts: Counter[str] = Counter() + for content in recent_contents: + plugin_key = content.source_plugin or "unknown" + plugin_counts[plugin_key] += 1 + source_counts[_source_bucket_key(content)] += 1 + author_entity_id = author_entity_ids.get(_require_pk(content)) + if author_entity_id is not None: + author_counts[str(author_entity_id)] += 1 + + cluster_counts: Counter[str] = Counter( + str(cluster_id) + for cluster_id in ContentClusterMembership.objects.filter( + project_id=project_id, + content_id__in=content_ids, + ).values_list("cluster_id", flat=True) + ) + + source_config_labels = { + f"source_config:{source_id}": f"{plugin_name} #{source_id}" + for source_id, plugin_name in SourceConfig.objects.filter( + pk__in=[ + int(key.split(":", 1)[1]) + for key in source_counts + if key.startswith("source_config:") + ] + ).values_list("id", "plugin_name") + } + author_labels = { + str(entity_id): name + for entity_id, name in Entity.objects.filter( + pk__in=[int(key) for key in author_counts] + ).values_list("id", "name") + } + cluster_label_map = { + cluster_id: label or f"Cluster {cluster_id}" + for cluster_id, label in cluster_labels.items() + } + + top_plugin_item = _top_count_item(plugin_counts) + top_source_item = _top_count_item(source_counts) + plugin_entropy = _normalized_shannon_entropy(plugin_counts) + source_entropy = _normalized_shannon_entropy(source_counts) + author_entropy = _normalized_shannon_entropy(author_counts) + cluster_entropy = _normalized_shannon_entropy(cluster_counts) + top_plugin_share = _top_share(plugin_counts) + top_source_share = _top_share(source_counts) + + alerts = _build_source_diversity_alerts( + top_plugin_item=top_plugin_item, + top_plugin_share=top_plugin_share, + author_counts=author_counts, + author_entropy=author_entropy, + ) + breakdown = { + "total_content_count": len(recent_contents), + "plugin_counts": _serialize_breakdown_counts( + plugin_counts, + label_resolver=lambda key: key, + ), + "source_counts": _serialize_breakdown_counts( + source_counts, + label_resolver=lambda key: source_config_labels.get( + key, + _fallback_source_label(key), + ), + ), + "author_counts": _serialize_breakdown_counts( + author_counts, + label_resolver=lambda key: author_labels.get(key, f"Entity {key}"), + ), + "cluster_counts": _serialize_breakdown_counts( + cluster_counts, + label_resolver=lambda key: cluster_label_map.get(key, f"Cluster {key}"), + ), + "alerts": alerts, + } + + snapshot = SourceDiversitySnapshot.objects.create( + project_id=project_id, + window_days=SOURCE_DIVERSITY_WINDOW_DAYS, + plugin_entropy=plugin_entropy, + source_entropy=source_entropy, + author_entropy=author_entropy, + cluster_entropy=cluster_entropy, + top_plugin_share=top_plugin_share, + top_source_share=top_source_share, + breakdown=breakdown, + ) + if snapshot.computed_at != computed_at: + SourceDiversitySnapshot.objects.filter(pk=snapshot.pk).update( + computed_at=computed_at + ) + snapshot.computed_at = computed_at + return { + "project_id": project_id, + "snapshot_id": _require_pk(snapshot), + "content_count": len(recent_contents), + "alert_count": len(alerts), + } + + @shared_task(name="core.tasks.generate_theme_suggestions") def generate_theme_suggestions(project_id: int) -> dict[str, int]: """Generate pending editor-facing theme suggestions for one project.""" @@ -929,6 +1073,164 @@ def dismiss_theme_suggestion( return theme_suggestion +def _author_entities_for_contents(contents: list[Content]) -> dict[int, int | None]: + """Resolve one best-effort author entity bucket per content row.""" + + content_ids = [_require_pk(content) for content in contents] + author_mentions: dict[int, int] = {} + for content_id, entity_id in ( + EntityMention.objects.filter( + content_id__in=content_ids, + role=EntityMentionRole.AUTHOR, + ) + .order_by("content_id", "entity_id") + .values_list("content_id", "entity_id") + ): + author_mentions.setdefault(int(content_id), int(entity_id)) + return { + _require_pk(content): author_mentions.get(_require_pk(content)) + or (_require_pk(content.entity) if content.entity is not None else None) + for content in contents + } + + +def _source_bucket_key(content: Content) -> str: + """Return the source bucket key used for source diversity counts.""" + + source_metadata = content.source_metadata or {} + source_config_id = source_metadata.get("source_config_id") + if isinstance(source_config_id, bool): + source_config_id = None + if isinstance(source_config_id, (int, str)) and source_config_id != "": + try: + return f"source_config:{int(source_config_id)}" + except ValueError: + pass + sender_email = str(source_metadata.get("sender_email", "")).strip().lower() + if sender_email: + return f"sender_email:{sender_email}" + for metadata_key in ( + "feed_url", + "subreddit", + "author_handle", + "account_acct", + "instance_url", + ): + metadata_value = str(source_metadata.get(metadata_key, "")).strip().lower() + if metadata_value: + return f"{metadata_key}:{metadata_value}" + if content.source_plugin: + return f"plugin:{content.source_plugin}" + return "unknown" + + +def _fallback_source_label(source_key: str) -> str: + """Render a human-readable source label for non-SourceConfig buckets.""" + + prefix, _, value = source_key.partition(":") + if not value: + return source_key + if prefix == "sender_email": + return value + if prefix == "plugin": + return f"{value} (unattributed)" + return value + + +def _serialize_breakdown_counts( + counts: Counter[str], + *, + label_resolver, +) -> list[dict[str, object]]: + """Serialize one counter into a stable JSON-friendly breakdown list.""" + + total_count = sum(counts.values()) + if total_count <= 0: + return [] + return [ + { + "key": key, + "label": label_resolver(key), + "count": count, + "share": count / total_count, + } + for key, count in sorted(counts.items(), key=lambda item: (-item[1], item[0])) + ] + + +def _normalized_shannon_entropy(counts: Counter[str]) -> float: + """Return Shannon entropy normalized into the inclusive [0, 1] interval.""" + + positive_counts = [count for count in counts.values() if count > 0] + if len(positive_counts) <= 1: + return 0.0 + total_count = sum(positive_counts) + entropy = -sum( + (count / total_count) * math.log2(count / total_count) + for count in positive_counts + ) + max_entropy = math.log2(len(positive_counts)) + if max_entropy <= 0: + return 0.0 + return max(0.0, min(1.0, entropy / max_entropy)) + + +def _top_share(counts: Counter[str]) -> float: + """Return the share owned by the largest bucket in one distribution.""" + + total_count = sum(counts.values()) + if total_count <= 0: + return 0.0 + return max(counts.values()) / total_count + + +def _top_count_item(counts: Counter[str]) -> tuple[str, int] | None: + """Return the largest bucket key/count pair using a stable tie-breaker.""" + + if not counts: + return None + return max(counts.items(), key=lambda item: (item[1], item[0])) + + +def _build_source_diversity_alerts( + *, + top_plugin_item: tuple[str, int] | None, + top_plugin_share: float, + author_counts: Counter[str], + author_entropy: float, +) -> list[dict[str, str]]: + """Build advisory cards for concentrated source diversity snapshots.""" + + alerts: list[dict[str, str]] = [] + if ( + top_plugin_item is not None + and top_plugin_share > SOURCE_DIVERSITY_TOP_PLUGIN_ALERT_THRESHOLD + ): + alerts.append( + { + "code": "top_plugin_share", + "severity": "warning", + "message": ( + f"Your stream is {top_plugin_share:.0%} from {top_plugin_item[0]} this week." + ), + } + ) + if author_counts and author_entropy < SOURCE_DIVERSITY_LOW_AUTHOR_ENTROPY_THRESHOLD: + author_bucket_count = len(author_counts) + alerts.append( + { + "code": "author_entropy", + "severity": "warning", + "message": ( + "Three authors account for most of your content." + if author_bucket_count <= 3 + else "A small set of authors accounts for most of your content." + ), + } + ) + return alerts + + def _resolve_dominant_entity(contents: list[Content]) -> Entity | None: """Return the most frequently referenced entity across clustered content.""" diff --git a/trends/tests/test_admin.py b/trends/tests/test_admin.py index 18ce818f..1dee2be0 100644 --- a/trends/tests/test_admin.py +++ b/trends/tests/test_admin.py @@ -10,8 +10,8 @@ from django.utils import timezone from projects.models import Project -from trends.admin import TopicCentroidSnapshotAdmin -from trends.models import TopicCentroidSnapshot +from trends.admin import SourceDiversitySnapshotAdmin, TopicCentroidSnapshotAdmin +from trends.models import SourceDiversitySnapshot, TopicCentroidSnapshot pytestmark = pytest.mark.django_db @@ -55,6 +55,21 @@ def _drilldowns(response: object) -> list[dict[str, Any]]: return cast(list[dict[str, Any]], _context(response)["centroid_project_drilldowns"]) +def _source_diversity_alerts(response: object) -> list[dict[str, Any]]: + """Return typed source diversity alerts from a changelist payload.""" + + return cast(list[dict[str, Any]], _context(response)["source_diversity_alerts"]) + + +def _source_diversity_drilldowns(response: object) -> list[dict[str, Any]]: + """Return typed source diversity drilldowns from a changelist payload.""" + + return cast( + list[dict[str, Any]], + _context(response)["source_diversity_project_drilldowns"], + ) + + @pytest.fixture def source_admin_context(django_user_model): user = _create_user( @@ -151,3 +166,82 @@ def test_topic_centroid_snapshot_admin_changelist_view_builds_dashboard_stats( f"{_require_pk(source_admin_context.project)}" ) assert centroid_project_drilldowns[0]["drift_from_previous"] == "10.0%" + + +def test_source_diversity_snapshot_admin_changelist_view_builds_dashboard_stats( + source_admin_context, mocker +): + second_project = Project.objects.create( + name="Second Diversity Project", + topic_description="Analytics", + ) + first_snapshot = SourceDiversitySnapshot.objects.create( + project=source_admin_context.project, + window_days=14, + plugin_entropy=0.8, + source_entropy=0.7, + author_entropy=0.3, + cluster_entropy=0.6, + top_plugin_share=0.75, + top_source_share=0.7, + breakdown={ + "alerts": [ + { + "code": "top_plugin_share", + "severity": "warning", + "message": "Your stream is 75% from rss this week.", + } + ] + }, + ) + second_snapshot = SourceDiversitySnapshot.objects.create( + project=second_project, + window_days=14, + plugin_entropy=0.2, + source_entropy=0.4, + author_entropy=0.1, + cluster_entropy=0.3, + top_plugin_share=0.9, + top_source_share=0.85, + breakdown={"alerts": []}, + ) + admin_instance = SourceDiversitySnapshotAdmin(SourceDiversitySnapshot, AdminSite()) + mocker.patch.object( + admin_instance, + "get_queryset", + return_value=SourceDiversitySnapshot.objects.all(), + ) + super_changelist_view = mocker.patch( + "django.contrib.admin.options.ModelAdmin.changelist_view", + side_effect=lambda request, extra_context=None: extra_context, + ) + + response = admin_instance.changelist_view(request=_request()) + dashboard_stats = _dashboard_stats(response) + alerts = _source_diversity_alerts(response) + drilldowns = _source_diversity_drilldowns(response) + + super_changelist_view.assert_called_once() + assert ( + admin_instance.list_before_template + == "admin/source_diversity_snapshot_changelist_widget.html" + ) + assert dashboard_stats[0]["value"] == "50.0%" + assert dashboard_stats[0]["color"] == "warning" + assert dashboard_stats[1]["value"] == "55.0%" + assert dashboard_stats[2]["value"] == "20.0%" + assert dashboard_stats[2]["color"] == "danger" + assert dashboard_stats[3]["value"] == "82.5%" + assert dashboard_stats[3]["color"] == "danger" + assert alerts == [ + { + "code": "top_plugin_share", + "severity": "warning", + "message": "Your stream is 75% from rss this week.", + } + ] + assert len(drilldowns) == 2 + assert drilldowns[0]["project_name"] == "Admin Project" + assert drilldowns[0]["plugin_entropy"] == "80.0%" + assert drilldowns[0]["top_plugin_share"] == "75.0%" + assert drilldowns[0]["alert_count"] == 1 diff --git a/trends/tests/test_api.py b/trends/tests/test_api.py index 531eaedd..e6f3a05a 100644 --- a/trends/tests/test_api.py +++ b/trends/tests/test_api.py @@ -12,6 +12,7 @@ from projects.models import Project, ProjectConfig, ProjectMembership, ProjectRole from trends.models import ( ContentClusterMembership, + SourceDiversitySnapshot, ThemeSuggestion, ThemeSuggestionStatus, TopicCentroidSnapshot, @@ -392,3 +393,69 @@ def test_theme_suggestion_accept_and_dismiss_actions_update_workflow_fields(self self.assertEqual(dismiss_suggestion.status, ThemeSuggestionStatus.DISMISSED) self.assertEqual(dismiss_suggestion.dismissal_reason, "already covered") self.assertEqual(dismiss_suggestion.decided_by, self.owner) + + def test_source_diversity_snapshot_list_and_summary_are_scoped_to_project(self): + owner_snapshot = SourceDiversitySnapshot.objects.create( + project=self.owner_project, + window_days=14, + plugin_entropy=0.8, + source_entropy=0.75, + author_entropy=0.5, + cluster_entropy=0.6, + top_plugin_share=0.7, + top_source_share=0.65, + breakdown={ + "total_content_count": 4, + "plugin_counts": [], + "source_counts": [], + "author_counts": [], + "cluster_counts": [], + "alerts": [], + }, + ) + other_snapshot = SourceDiversitySnapshot.objects.create( + project=self.other_project, + window_days=14, + plugin_entropy=0.2, + source_entropy=0.3, + author_entropy=0.1, + cluster_entropy=0.4, + top_plugin_share=0.95, + top_source_share=0.95, + breakdown={ + "total_content_count": 8, + "plugin_counts": [], + "source_counts": [], + "author_counts": [], + "cluster_counts": [], + "alerts": [], + }, + ) + + list_response = self.client.get( + reverse( + "v1:project-source-diversity-snapshot-list", + kwargs={"project_id": _require_pk(self.owner_project)}, + ) + ) + summary_response = self.client.get( + reverse( + "v1:project-source-diversity-snapshot-summary", + kwargs={"project_id": _require_pk(self.owner_project)}, + ) + ) + + self.assertEqual(list_response.status_code, status.HTTP_200_OK) + self.assertEqual(len(list_response.json()), 1) + self.assertEqual(list_response.json()[0]["id"], _require_pk(owner_snapshot)) + self.assertNotEqual(_require_pk(owner_snapshot), _require_pk(other_snapshot)) + + self.assertEqual(summary_response.status_code, status.HTTP_200_OK) + self.assertEqual( + summary_response.json()["project"], _require_pk(self.owner_project) + ) + self.assertEqual(summary_response.json()["snapshot_count"], 1) + self.assertEqual( + summary_response.json()["latest_snapshot"]["id"], + _require_pk(owner_snapshot), + ) diff --git a/trends/tests/test_tasks.py b/trends/tests/test_tasks.py index dbc77696..ee9f71ce 100644 --- a/trends/tests/test_tasks.py +++ b/trends/tests/test_tasks.py @@ -7,9 +7,10 @@ from content.models import Content, FeedbackType, UserFeedback from entities.models import Entity, EntityMention, EntityMentionRole from projects.model_support import SourcePluginName -from projects.models import Project +from projects.models import Project, SourceConfig from trends.models import ( ContentClusterMembership, + SourceDiversitySnapshot, ThemeSuggestion, ThemeSuggestionStatus, TopicCentroidSnapshot, @@ -22,6 +23,7 @@ assign_content_to_topic_cluster, generate_theme_suggestions, queue_topic_centroid_recompute, + recompute_source_diversity, recompute_topic_centroid, recompute_topic_clusters, recompute_topic_velocity, @@ -280,6 +282,113 @@ def test_run_all_topic_cluster_recomputations_enqueues_all_projects( assert delay_mock.call_count == 2 +def test_recompute_source_diversity_persists_entropy_breakdown_and_alerts( + source_plugin_context, +): + project = source_plugin_context.project + rss_source = SourceConfig.objects.create( + project=project, + plugin_name=SourcePluginName.RSS, + config={"feed_url": "https://example.com/feed.xml"}, + ) + reddit_source = SourceConfig.objects.create( + project=project, + plugin_name=SourcePluginName.REDDIT, + config={"subreddit": "MachineLearning"}, + ) + first_cluster = TopicCluster.objects.create( + project=project, + first_seen_at="2026-04-20T00:00:00Z", + last_seen_at="2026-04-24T00:00:00Z", + is_active=True, + member_count=3, + dominant_entity=source_plugin_context.entity, + label="Platform Signals", + ) + second_cluster = TopicCluster.objects.create( + project=project, + first_seen_at="2026-04-20T00:00:00Z", + last_seen_at="2026-04-24T00:00:00Z", + is_active=True, + member_count=1, + dominant_entity=source_plugin_context.entity, + label="Community Chatter", + ) + + contents = [] + for index in range(3): + contents.append( + Content.objects.create( + project=project, + entity=source_plugin_context.entity, + url=f"https://example.com/rss-{index}", + title=f"RSS {index}", + author="Author", + source_plugin=SourcePluginName.RSS, + published_date="2026-04-24T12:00:00Z", + content_text="Manual content body", + source_metadata={"source_config_id": _require_pk(rss_source)}, + ) + ) + contents.append( + Content.objects.create( + project=project, + entity=source_plugin_context.entity, + url="https://example.com/reddit-0", + title="Reddit 0", + author="Author", + source_plugin=SourcePluginName.REDDIT, + published_date="2026-04-24T12:00:00Z", + content_text="Manual content body", + source_metadata={"source_config_id": _require_pk(reddit_source)}, + ) + ) + for content in contents[:3]: + ContentClusterMembership.objects.create( + content=content, + cluster=first_cluster, + project=project, + similarity=0.95, + ) + ContentClusterMembership.objects.create( + content=contents[3], + cluster=second_cluster, + project=project, + similarity=0.91, + ) + + result = recompute_source_diversity(_require_pk(project)) + snapshot = SourceDiversitySnapshot.objects.get(project=project) + + assert result["project_id"] == _require_pk(project) + assert result["content_count"] == 4 + assert snapshot.window_days == 14 + assert snapshot.plugin_entropy == pytest.approx(0.811278, rel=1e-4) + assert snapshot.source_entropy == pytest.approx(0.811278, rel=1e-4) + assert snapshot.author_entropy == 0.0 + assert snapshot.cluster_entropy == pytest.approx(0.811278, rel=1e-4) + assert snapshot.top_plugin_share == pytest.approx(0.75) + assert snapshot.top_source_share == pytest.approx(0.75) + assert snapshot.breakdown["plugin_counts"][0]["key"] == SourcePluginName.RSS + assert snapshot.breakdown["plugin_counts"][0]["count"] == 3 + assert snapshot.breakdown["source_counts"][0]["label"] == ( + f"rss #{_require_pk(rss_source)}" + ) + assert snapshot.breakdown["cluster_counts"][0]["label"] == "Platform Signals" + assert snapshot.breakdown["alerts"] == [ + { + "code": "top_plugin_share", + "severity": "warning", + "message": "Your stream is 75% from rss this week.", + }, + { + "code": "author_entropy", + "severity": "warning", + "message": "Three authors account for most of your content.", + }, + ] + + def test_queue_topic_centroid_recompute_enqueues_background_task( source_plugin_context, mocker ):