diff --git a/SESSION.md b/SESSION.md index 7ab82f25..6a5ea158 100644 --- a/SESSION.md +++ b/SESSION.md @@ -1,5 +1,23 @@ # Session Restore Point +## 2026-04-29 End Of Day + +- WP4 entity extraction is implemented end-to-end. +- Backend added `EntityMention` and `EntityCandidate`, migration `core/migrations/0006_entitycandidate_entitymention.py`, new `core/entity_extraction.py`, and an `extract_entities` pipeline node between classification and relevance. +- Admin now supports reviewing entity candidates and mentions; candidate accept/reject/merge actions are wired in `core/admin.py`. +- API/frontend work is in place for entity mention summaries, project-scoped entity candidate review actions, and the new entity detail page at `frontend/src/app/entities/[id]/page.tsx`. +- `/entities` now shows pending candidates plus recent mention summaries, and links into the entity detail page. +- Focused validation that passed today: + - `pytest core/tests/test_pipeline.py core/tests/test_admin.py -q` + - `pytest core/tests/test_api.py -q` + - `python manage.py check` + - `python manage.py makemigrations --check --dry-run` + - `python3 -m mypy core/pipeline.py core/entity_extraction.py core/embeddings.py core/models.py core/admin.py core/tests/test_pipeline.py core/tests/test_admin.py` + - `cd frontend && npm run typecheck` + - `cd frontend && npm run lint` + - `cd frontend && npx vitest run src/app/entities/__tests__/page.test.tsx src/app/api/entity-candidates/[id]/__tests__/route.test.ts src/app/entities/[id]/__tests__/page.test.tsx` +- Repo-wide `just lint` was rerun after fixing `frontend/src/lib/api.ts` import ordering; backend lint fully passed and direct frontend lint now passes with `FRONTEND_LINT_OK`. + ## Useful Commands From Today diff --git a/core/admin.py b/core/admin.py index 32baea21..e40f71b3 100644 --- a/core/admin.py +++ b/core/admin.py @@ -16,10 +16,18 @@ from import_export.admin import ExportActionMixin from unfold.admin import ModelAdmin +from core.entity_extraction import ( + accept_entity_candidate, + merge_entity_candidate, + reject_entity_candidate, +) from core.models import ( BlueskyCredentials, Content, Entity, + EntityCandidate, + EntityCandidateStatus, + EntityMention, IngestionRun, Project, ProjectConfig, @@ -199,6 +207,7 @@ class EntityAdmin(admin.ModelAdmin): # Replace 'authority_score' with your new method name list_display = ("name", "project", "type", "colored_score", "created_at") + search_fields = ("name", "project__name") @admin.display(description="Authority Score", ordering="authority_score") def colored_score(self, obj): @@ -219,6 +228,116 @@ def colored_score(self, obj): ) +@admin.register(EntityMention) +class EntityMentionAdmin(admin.ModelAdmin): + """Admin view for extracted tracked-entity mentions.""" + + list_display = ( + "entity", + "project", + "content", + "role", + "sentiment", + "confidence", + "created_at", + ) + list_filter = ("role", "sentiment", ("project", admin.RelatedOnlyFieldListFilter)) + search_fields = ("entity__name", "content__title", "span") + autocomplete_fields = ("entity", "content", "project") + + +@admin.register(EntityCandidate) +class EntityCandidateAdmin(admin.ModelAdmin): + """Admin view for candidate entities awaiting human review.""" + + actions = [ + "accept_selected_candidates", + "reject_selected_candidates", + "merge_into_existing_entities", + ] + list_display = ( + "name", + "project", + "suggested_type", + "occurrence_count", + "status", + "merged_into", + "first_seen_in", + "created_at", + ) + list_filter = ( + "status", + "suggested_type", + ("project", admin.RelatedOnlyFieldListFilter), + ) + search_fields = ("name", "project__name", "merged_into__name") + autocomplete_fields = ("project", "first_seen_in", "merged_into") + ordering = ("-occurrence_count", "name") + + @admin.action(description="Accept selected candidates") + def accept_selected_candidates(self, request, queryset): + """Promote selected candidates into tracked entities.""" + + accepted_count = 0 + for candidate in queryset.select_related("project"): + if candidate.status == EntityCandidateStatus.ACCEPTED: + continue + accept_entity_candidate(candidate) + accepted_count += 1 + self.message_user( + request, + f"Accepted {accepted_count} entity candidate(s).", + messages.SUCCESS, + ) + + @admin.action(description="Reject selected candidates") + def reject_selected_candidates(self, request, queryset): + """Mark selected candidates as rejected.""" + + rejected_count = 0 + for candidate in queryset: + if candidate.status == EntityCandidateStatus.REJECTED: + continue + reject_entity_candidate(candidate) + rejected_count += 1 + self.message_user( + request, + f"Rejected {rejected_count} entity candidate(s).", + messages.SUCCESS, + ) + + @admin.action(description="Merge selected candidates into existing entities") + def merge_into_existing_entities(self, request, queryset): + """Merge candidates when a same-name entity already exists in the project.""" + + merged_count = 0 + unresolved_names: list[str] = [] + for candidate in queryset.select_related("project"): + matching_entities = Entity.objects.filter( + project=candidate.project, + name__iexact=candidate.name, + ) + if matching_entities.count() != 1: + unresolved_names.append(candidate.name) + continue + merge_entity_candidate(candidate, matching_entities.get()) + merged_count += 1 + + if merged_count: + self.message_user( + request, + f"Merged {merged_count} entity candidate(s) into existing entities.", + messages.SUCCESS, + ) + if unresolved_names: + self.message_user( + request, + "No unique same-name entity match was available for: " + + ", ".join(sorted(unresolved_names)), + messages.WARNING, + ) + + class HighValueFilter(admin.SimpleListFilter): """Filter content down to high-value reference items.""" diff --git a/core/api.py b/core/api.py index 5a32aa8c..ba894f7f 100644 --- a/core/api.py +++ b/core/api.py @@ -8,6 +8,7 @@ import logging from typing import Any +from django.db.models import Count, Prefetch from drf_spectacular.utils import ( OpenApiExample, OpenApiParameter, @@ -21,10 +22,17 @@ from rest_framework.exceptions import NotFound from rest_framework.response import Response +from core.entity_extraction import ( + accept_entity_candidate, + merge_entity_candidate, + reject_entity_candidate, +) from core.models import ( BlueskyCredentials, Content, Entity, + EntityCandidate, + EntityMention, IngestionRun, Project, ProjectConfig, @@ -35,6 +43,9 @@ ) from core.serializers import ( ContentSerializer, + EntityCandidateMergeSerializer, + EntityCandidateSerializer, + EntityMentionSummarySerializer, EntitySerializer, IngestionRunSerializer, ProjectConfigSerializer, @@ -693,7 +704,111 @@ class EntityViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet): """Manage tracked entities associated with a project.""" serializer_class = EntitySerializer - queryset = Entity.objects.select_related("project") + queryset = ( + Entity.objects.select_related("project") + .annotate(mention_count=Count("mentions", distinct=True)) + .prefetch_related( + Prefetch( + "mentions", + queryset=EntityMention.objects.select_related("content").order_by( + "-created_at" + ), + to_attr="prefetched_mentions", + ) + ) + ) + + @extend_schema( + summary="List entity mentions", + description="Return the extracted mention history for one tracked entity inside the selected project.", + request=None, + responses={200: EntityMentionSummarySerializer(many=True), 403: AUTHENTICATION_REQUIRED_RESPONSE}, + tags=["Entity Catalog"], + ) + @action(detail=True, methods=["get"], url_path="mentions") + def mentions(self, request, *args, **kwargs): + """Return the extracted mentions for the selected entity.""" + + entity = self.get_object() + mentions = entity.mentions.select_related("content").order_by("-created_at") + serializer = EntityMentionSummarySerializer(mentions, many=True) + return Response(serializer.data) + + +@document_project_owned_viewset( + resource_plural="entity candidates", + resource_singular="entity candidate", + create_description="Entity candidates are created by the pipeline and can be reviewed through dedicated actions.", + tag="Entity Catalog", + action_overrides=build_crud_action_overrides( + EntityCandidateSerializer, + resource_plural="entity candidates for the selected project", + resource_singular="entity candidate", + ), +) +class EntityCandidateViewSet(ProjectOwnedQuerysetMixin, viewsets.ReadOnlyModelViewSet): + """Inspect and resolve entity candidates surfaced by entity extraction.""" + + serializer_class = EntityCandidateSerializer + queryset = EntityCandidate.objects.select_related( + "project", "first_seen_in", "merged_into" + ) + + @extend_schema( + summary="Accept entity candidate", + description="Promote a pending entity candidate into a tracked entity and backfill recent mentions.", + request=None, + responses={200: EntityCandidateSerializer, 403: AUTHENTICATION_REQUIRED_RESPONSE}, + tags=["Entity Catalog"], + ) + @action(detail=True, methods=["post"], url_path="accept") + def accept(self, request, *args, **kwargs): + """Accept an entity candidate and return its updated representation.""" + + candidate = self.get_object() + accept_entity_candidate(candidate) + candidate.refresh_from_db() + serializer = self.get_serializer(candidate) + return Response(serializer.data) + + @extend_schema( + summary="Reject entity candidate", + description="Mark a pending entity candidate as rejected without creating a tracked entity.", + request=None, + responses={200: EntityCandidateSerializer, 403: AUTHENTICATION_REQUIRED_RESPONSE}, + tags=["Entity Catalog"], + ) + @action(detail=True, methods=["post"], url_path="reject") + def reject(self, request, *args, **kwargs): + """Reject an entity candidate and return its updated representation.""" + + candidate = self.get_object() + reject_entity_candidate(candidate) + candidate.refresh_from_db() + serializer = self.get_serializer(candidate) + return Response(serializer.data) + + @extend_schema( + summary="Merge entity candidate", + description="Merge a pending entity candidate into an existing tracked entity from the same project.", + request=EntityCandidateMergeSerializer, + responses={200: EntityCandidateSerializer, 400: EntityCandidateMergeSerializer, 403: AUTHENTICATION_REQUIRED_RESPONSE}, + tags=["Entity Catalog"], + ) + @action(detail=True, methods=["post"], url_path="merge") + def merge(self, request, *args, **kwargs): + """Merge an entity candidate into an existing tracked entity.""" + + candidate = self.get_object() + serializer = EntityCandidateMergeSerializer( + data=request.data, + context=self.get_serializer_context(), + ) + serializer.is_valid(raise_exception=True) + merge_entity_candidate(candidate, serializer.validated_data["merged_into"]) + candidate.refresh_from_db() + response_serializer = self.get_serializer(candidate) + return Response(response_serializer.data) @document_project_owned_viewset( diff --git a/core/api_urls.py b/core/api_urls.py index 2c6184e3..ae325066 100644 --- a/core/api_urls.py +++ b/core/api_urls.py @@ -3,6 +3,7 @@ from core.api import ( ContentViewSet, + EntityCandidateViewSet, EntityViewSet, IngestionRunViewSet, ProjectConfigViewSet, @@ -23,6 +24,11 @@ r"project-configs", ProjectConfigViewSet, basename="project-config" ) project_router.register(r"entities", EntityViewSet, basename="project-entity") +project_router.register( + r"entity-candidates", + EntityCandidateViewSet, + basename="project-entity-candidate", +) project_router.register(r"contents", ContentViewSet, basename="project-content") project_router.register( r"skill-results", SkillResultViewSet, basename="project-skill-result" diff --git a/core/embeddings.py b/core/embeddings.py index 051adb25..b87b418a 100644 --- a/core/embeddings.py +++ b/core/embeddings.py @@ -26,7 +26,7 @@ VectorParams, ) -from core.models import Content +from core.models import Content, Entity from core.settings_types import CoreSettings SentenceTransformer = None @@ -158,6 +158,12 @@ def collection_name_for_project(project_id: int) -> str: return f"project_{project_id}_content" +def entity_collection_name_for_project(project_id: int) -> str: + """Return the Qdrant collection name for a project's tracked entities.""" + + return f"project_{project_id}_entities" + + @lru_cache(maxsize=1) def get_qdrant_client() -> QdrantClient: """Create and cache the shared Qdrant client instance.""" @@ -237,6 +243,55 @@ def upsert_content_embedding(content: Content) -> str: return embedding_id +def upsert_entity_embedding(entity: Entity) -> str: + """Write or update an entity embedding in the project's entity collection.""" + + client = get_qdrant_client() + ensure_project_entity_collection(entity.project_id) + vector = embed_text(build_entity_embedding_text(entity)) + embedding_id = f"entity-{entity.id}" + client.upsert( + collection_name=entity_collection_name_for_project(entity.project_id), + points=[ + PointStruct( + id=embedding_id, + vector=vector, + payload={ + "entity_id": entity.id, + "project_id": entity.project_id, + "name": entity.name, + "type": entity.type, + }, + ) + ], + wait=True, + ) + return embedding_id + + +def sync_project_entity_embeddings(project_id: int) -> None: + """Ensure all tracked entities for a project are present in Qdrant.""" + + entities = Entity.objects.filter(project_id=project_id).only( + "id", + "project_id", + "name", + "type", + "description", + "website_url", + "github_url", + "linkedin_url", + "bluesky_handle", + "mastodon_handle", + "twitter_handle", + ) + if not entities.exists(): + return + ensure_project_entity_collection(project_id) + for entity in entities: + upsert_entity_embedding(entity) + + def search_similar( project_id: int, query_vector: list[float], @@ -289,6 +344,31 @@ def search_similar_content( ) +def search_similar_entities(project_id: int, query_vector: list[float], limit: int = 10): + """Search the tracked-entity collection for nearest matches.""" + + if not project_entity_collection_exists(project_id): + return [] + client = cast(Any, get_qdrant_client()) + return client.search( + collection_name=entity_collection_name_for_project(project_id), + query_vector=query_vector, + limit=limit, + with_payload=True, + ) + + +def search_similar_entities_for_content(content: Content, limit: int = 8): + """Find tracked entities whose embeddings are close to a content item.""" + + sync_project_entity_embeddings(content.project_id) + return search_similar_entities( + content.project_id, + embed_text(build_content_embedding_text(content)), + limit=limit, + ) + + def get_reference_similarity( project_id: int, vector: list[float], limit: int = 5 ) -> float: @@ -325,6 +405,21 @@ def ensure_project_collection(project_id: int) -> None: ) +def ensure_project_entity_collection(project_id: int) -> None: + """Create the per-project entity collection when it does not yet exist.""" + + client = get_qdrant_client() + collection_name = entity_collection_name_for_project(project_id) + if project_entity_collection_exists(project_id): + return + client.create_collection( + collection_name=collection_name, + vectors_config=VectorParams( + size=get_embedding_dimension(), distance=Distance.COSINE + ), + ) + + def project_collection_exists(project_id: int) -> bool: """Return whether the project's Qdrant collection already exists.""" @@ -335,12 +430,40 @@ def project_collection_exists(project_id: int) -> bool: return True +def project_entity_collection_exists(project_id: int) -> bool: + """Return whether the project's entity collection already exists.""" + + try: + get_qdrant_client().get_collection(entity_collection_name_for_project(project_id)) + except Exception: + return False + return True + + def build_content_embedding_text(content: Content) -> str: """Build the text blob used to generate content embeddings.""" return "\n\n".join(part for part in [content.title, content.content_text] if part) +def build_entity_embedding_text(entity: Entity) -> str: + """Build the text blob used to generate entity embeddings.""" + + aliases = [ + entity.bluesky_handle, + entity.mastodon_handle, + entity.twitter_handle, + entity.website_url, + entity.github_url, + entity.linkedin_url, + ] + return "\n\n".join( + part + for part in [entity.name, entity.type, entity.description, *aliases] + if part + ) + + def normalize_text(text: str) -> str: """Trim input text and replace empty input with a stable placeholder.""" diff --git a/core/entity_extraction.py b/core/entity_extraction.py new file mode 100644 index 00000000..4e3935b7 --- /dev/null +++ b/core/entity_extraction.py @@ -0,0 +1,735 @@ +"""Entity extraction helpers for tracked mentions and review candidates.""" + +from __future__ import annotations + +import re +from datetime import timedelta +from typing import Any +from urllib.parse import urlsplit + +from django.conf import settings +from django.db import transaction +from django.utils import timezone + +from core.embeddings import search_similar_entities_for_content +from core.llm import build_skill_user_prompt, get_skill_definition, openrouter_chat_json +from core.models import ( + Content, + Entity, + EntityCandidate, + EntityCandidateStatus, + EntityMention, + EntityMentionRole, + EntityMentionSentiment, + EntityType, + SkillResult, + SkillStatus, +) + +ENTITY_EXTRACTION_SKILL_NAME = "entity_extraction" +ENTITY_RETRIEVAL_LIMIT = 8 +ENTITY_RETRIEVAL_THRESHOLD = 0.35 +RETROACTIVE_MENTION_WINDOW_DAYS = 30 + +PROPER_NOUN_PATTERN = re.compile( + r"\b(?:[A-Z][a-z0-9&+.-]+|[A-Z]{2,})(?:\s+(?:[A-Z][a-z0-9&+.-]+|[A-Z]{2,})){0,3}\b" +) +COMPANY_SUFFIXES = { + "ai", + "corp", + "corporation", + "co", + "company", + "group", + "inc", + "labs", + "systems", + "technologies", + "technology", +} +ORGANIZATION_SUFFIXES = { + "association", + "committee", + "consortium", + "foundation", + "institute", + "university", +} +NOISE_CANDIDATE_NAMES = { + "The", + "This", + "That", + "These", + "Platform", + "Engineering", + "Release Notes", +} +POSITIVE_TOKENS = {"improved", "improves", "strong", "launch", "launched", "good"} +NEGATIVE_TOKENS = {"breach", "bug", "failed", "failure", "outage", "risk"} + + +def run_entity_extraction(content: Content) -> dict[str, Any]: + """Extract tracked-entity mentions and surface unknown candidates.""" + + tracked_entities = list( + Entity.objects.filter(project_id=content.project_id).order_by("name") + ) + extraction = _run_entity_extraction_with_fallback(content, tracked_entities) + normalized_mentions, unresolved_names = _normalize_mentions( + extraction.get("mentions", []), tracked_entities + ) + if not normalized_mentions and tracked_entities: + heuristic_result = _heuristic_entity_extraction(content, tracked_entities) + normalized_mentions, unresolved_names = _normalize_mentions( + heuristic_result["mentions"], tracked_entities + ) + extraction = { + **heuristic_result, + "model_used": extraction.get("model_used", heuristic_result["model_used"]), + "latency_ms": extraction.get("latency_ms", heuristic_result["latency_ms"]), + } + + normalized_candidates = _normalize_candidates( + extraction.get("candidate_entities", []), tracked_entities + ) + for unresolved_name in unresolved_names: + normalized_candidates.append( + { + "name": unresolved_name, + "suggested_type": _guess_candidate_type(unresolved_name), + } + ) + if not normalized_candidates: + normalized_candidates = _normalize_candidates( + _discover_candidates(content, tracked_entities), tracked_entities + ) + + is_rerun = SkillResult.objects.filter( + content=content, + skill_name=ENTITY_EXTRACTION_SKILL_NAME, + status=SkillStatus.COMPLETED, + ).exists() + mentions = replace_entity_mentions(content, normalized_mentions) + candidates = persist_entity_candidates( + content, normalized_candidates, is_rerun=is_rerun + ) + primary_entity = _select_primary_entity(mentions) + if primary_entity is not None and content.entity_id is None: + content.entity = primary_entity + content.save(update_fields=["entity"]) + + confidence = max((mention.confidence for mention in mentions), default=0.0) + return { + "mentions": [_serialize_mention(mention) for mention in mentions], + "candidate_entities": [_serialize_candidate(candidate) for candidate in candidates], + "primary_entity_id": primary_entity.id if primary_entity is not None else None, + "confidence": confidence, + "explanation": extraction.get( + "explanation", + "Entity extraction matched tracked entities and proposed new candidate names.", + ), + "model_used": extraction.get("model_used", "heuristic"), + "latency_ms": int(extraction.get("latency_ms", 0) or 0), + } + + +def replace_entity_mentions( + content: Content, mention_payloads: list[dict[str, Any]] +) -> list[EntityMention]: + """Replace the extracted mentions stored for a content item.""" + + EntityMention.objects.filter(content=content).delete() + return upsert_entity_mentions(content, mention_payloads) + + +def upsert_entity_mentions( + content: Content, mention_payloads: list[dict[str, Any]] +) -> list[EntityMention]: + """Upsert mention rows for a content item without clearing other data first.""" + + mentions: list[EntityMention] = [] + seen_keys: set[tuple[int, str]] = set() + for mention_payload in mention_payloads: + entity = mention_payload["entity"] + role = mention_payload["role"] + key = (entity.id, role) + if key in seen_keys: + continue + seen_keys.add(key) + mention, _ = EntityMention.objects.update_or_create( + content=content, + entity=entity, + role=role, + defaults={ + "project": content.project, + "sentiment": mention_payload["sentiment"], + "span": mention_payload["span"], + "confidence": mention_payload["confidence"], + }, + ) + mentions.append(mention) + return mentions + + +def persist_entity_candidates( + content: Content, + candidate_payloads: list[dict[str, str]], + *, + is_rerun: bool, +) -> list[EntityCandidate]: + """Create or update pending entity candidates discovered in content.""" + + persisted: list[EntityCandidate] = [] + tracked_names = { + _normalize_name(entity.name) + for entity in Entity.objects.filter(project_id=content.project_id).only("name") + } + seen_names: set[str] = set() + for candidate_payload in candidate_payloads: + raw_name = candidate_payload.get("name", "") + name = _clean_candidate_name(raw_name) + normalized_name = _normalize_name(name) + if ( + not name + or normalized_name in seen_names + or normalized_name in tracked_names + ): + continue + seen_names.add(normalized_name) + candidate, created = EntityCandidate.objects.get_or_create( + project=content.project, + name=name, + defaults={ + "suggested_type": candidate_payload.get( + "suggested_type", _guess_candidate_type(name) + ), + "first_seen_in": content, + "occurrence_count": 1, + "status": EntityCandidateStatus.PENDING, + }, + ) + if not created: + update_fields: list[str] = [] + suggested_type = candidate_payload.get( + "suggested_type", candidate.suggested_type or _guess_candidate_type(name) + ) + if candidate.suggested_type != suggested_type: + candidate.suggested_type = suggested_type + update_fields.append("suggested_type") + if candidate.first_seen_in_id is None: + candidate.first_seen_in = content + update_fields.append("first_seen_in") + if not is_rerun: + candidate.occurrence_count += 1 + update_fields.append("occurrence_count") + if update_fields: + candidate.save(update_fields=update_fields + ["updated_at"]) + persisted.append(candidate) + return persisted + + +@transaction.atomic +def accept_entity_candidate(candidate: EntityCandidate) -> Entity: + """Accept a candidate, create the tracked entity, and backfill recent mentions.""" + + entity, _ = Entity.objects.get_or_create( + project=candidate.project, + name=candidate.name, + defaults={ + "type": candidate.suggested_type, + }, + ) + candidate.status = EntityCandidateStatus.ACCEPTED + candidate.merged_into = entity + candidate.save(update_fields=["status", "merged_into", "updated_at"]) + backfill_entity_mentions(entity, candidate_name=candidate.name) + return entity + + +@transaction.atomic +def merge_entity_candidate(candidate: EntityCandidate, entity: Entity) -> Entity: + """Merge a candidate into an existing tracked entity and backfill mentions.""" + + candidate.status = EntityCandidateStatus.MERGED + candidate.merged_into = entity + candidate.save(update_fields=["status", "merged_into", "updated_at"]) + backfill_entity_mentions(entity, candidate_name=candidate.name) + return entity + + +def reject_entity_candidate(candidate: EntityCandidate) -> None: + """Reject an extracted candidate without creating a tracked entity.""" + + candidate.status = EntityCandidateStatus.REJECTED + candidate.save(update_fields=["status", "updated_at"]) + + +def backfill_entity_mentions(entity: Entity, *, candidate_name: str | None = None) -> None: + """Retroactively attach recent content rows to an accepted or merged entity.""" + + cutoff = timezone.now() - timedelta(days=RETROACTIVE_MENTION_WINDOW_DAYS) + recent_content = Content.objects.filter( + project=entity.project, + published_date__gte=cutoff, + ).order_by("-published_date") + labels = _entity_labels(entity) + if candidate_name: + labels.append(candidate_name) + labels = [label for label in labels if label] + for content in recent_content: + mention_payloads = _heuristic_mentions_for_entities( + content, + [entity], + extra_labels={entity.id: labels}, + ) + mentions = upsert_entity_mentions(content, mention_payloads) + if content.entity_id is None and any( + mention.role in {EntityMentionRole.SUBJECT, EntityMentionRole.AUTHOR} + for mention in mentions + ): + content.entity = entity + content.save(update_fields=["entity"]) + + +def _run_entity_extraction_with_fallback( + content: Content, tracked_entities: list[Entity] +) -> dict[str, Any]: + """Run the LLM extraction step when configured, else use heuristics.""" + + if not settings.OPENROUTER_API_KEY: + return _heuristic_entity_extraction(content, tracked_entities) + + candidate_entities = _retrieve_candidate_entities(content, tracked_entities) + try: + response = openrouter_chat_json( + model=settings.AI_CLASSIFICATION_MODEL, + system_prompt=get_skill_definition( + ENTITY_EXTRACTION_SKILL_NAME + ).instructions_markdown, + user_prompt=build_skill_user_prompt( + ENTITY_EXTRACTION_SKILL_NAME, + { + "title": content.title, + "content_text": content.content_text[:5000], + "project_id": content.project_id, + "tracked_entities": [_serialize_tracked_entity(entity) for entity in candidate_entities], + }, + ), + ) + except Exception: + return _heuristic_entity_extraction(content, tracked_entities) + + payload = response.payload + return { + "mentions": payload.get("mentions", []), + "candidate_entities": payload.get("candidate_entities", []), + "explanation": str( + payload.get( + "explanation", + "LLM verified which tracked entities were present in the content.", + ) + ), + "model_used": response.model, + "latency_ms": response.latency_ms, + } + + +def _heuristic_entity_extraction( + content: Content, tracked_entities: list[Entity] +) -> dict[str, Any]: + """Fallback extractor that relies on exact label matches and title heuristics.""" + + candidate_entities = _retrieve_candidate_entities(content, tracked_entities) + mention_payloads = _heuristic_mentions_for_entities(content, candidate_entities) + return { + "mentions": [ + { + "entity_name": mention_payload["entity"].name, + "span": mention_payload["span"], + "sentiment": mention_payload["sentiment"], + "role": mention_payload["role"], + "confidence": mention_payload["confidence"], + } + for mention_payload in mention_payloads + ], + "candidate_entities": _discover_candidates(content, tracked_entities), + "explanation": "Heuristic extraction matched exact entity labels in the title, author, or body.", + "model_used": "heuristic", + "latency_ms": 0, + } + + +def _retrieve_candidate_entities( + content: Content, tracked_entities: list[Entity] +) -> list[Entity]: + """Retrieve likely tracked entities using Qdrant plus exact label matches.""" + + if not tracked_entities: + return [] + + entities_by_id = {entity.id: entity for entity in tracked_entities} + ordered_ids: list[int] = [] + try: + matches = search_similar_entities_for_content( + content, limit=ENTITY_RETRIEVAL_LIMIT + ) + except Exception: + matches = [] + + for match in matches: + if float(getattr(match, "score", 0.0)) < ENTITY_RETRIEVAL_THRESHOLD: + continue + entity_id = getattr(match, "payload", {}).get("entity_id") + if isinstance(entity_id, int) and entity_id in entities_by_id: + ordered_ids.append(entity_id) + + exact_match_ids = { + entity.id + for entity in tracked_entities + if _find_entity_span(content, entity, extra_labels=None) is not None + } + for entity_id in sorted(exact_match_ids): + if entity_id not in ordered_ids: + ordered_ids.append(entity_id) + if not ordered_ids: + return tracked_entities + return [entities_by_id[entity_id] for entity_id in ordered_ids if entity_id in entities_by_id] + + +def _normalize_mentions( + raw_mentions: Any, tracked_entities: list[Entity] +) -> tuple[list[dict[str, Any]], list[str]]: + """Resolve extracted mentions to tracked entities and collect unknown names.""" + + entity_lookup = _entity_lookup(tracked_entities) + normalized_mentions: list[dict[str, Any]] = [] + unresolved_names: list[str] = [] + if not isinstance(raw_mentions, list): + return normalized_mentions, unresolved_names + + for raw_mention in raw_mentions: + if not isinstance(raw_mention, dict): + continue + entity_name = _clean_candidate_name(str(raw_mention.get("entity_name", ""))) + if not entity_name: + continue + entity = entity_lookup.get(_normalize_name(entity_name)) + if entity is None: + unresolved_names.append(entity_name) + continue + normalized_mentions.append( + { + "entity": entity, + "role": _normalize_role(raw_mention.get("role")), + "sentiment": _normalize_sentiment(raw_mention.get("sentiment")), + "span": str(raw_mention.get("span", entity_name)).strip(), + "confidence": _normalize_confidence(raw_mention.get("confidence", 0.75)), + } + ) + return normalized_mentions, unresolved_names + + +def _normalize_candidates( + raw_candidates: Any, tracked_entities: list[Entity] +) -> list[dict[str, str]]: + """Normalize candidate payloads returned by the extractor.""" + + tracked_names = {_normalize_name(entity.name) for entity in tracked_entities} + normalized_candidates: list[dict[str, str]] = [] + seen_names: set[str] = set() + if not isinstance(raw_candidates, list): + return normalized_candidates + + for raw_candidate in raw_candidates: + if isinstance(raw_candidate, str): + candidate_name = _clean_candidate_name(raw_candidate) + suggested_type = _guess_candidate_type(candidate_name) + elif isinstance(raw_candidate, dict): + candidate_name = _clean_candidate_name(str(raw_candidate.get("name", ""))) + suggested_type = str( + raw_candidate.get("suggested_type", _guess_candidate_type(candidate_name)) + ) + else: + continue + normalized_name = _normalize_name(candidate_name) + if ( + not candidate_name + or normalized_name in tracked_names + or normalized_name in seen_names + ): + continue + seen_names.add(normalized_name) + normalized_candidates.append( + { + "name": candidate_name, + "suggested_type": _normalize_entity_type(suggested_type), + } + ) + return normalized_candidates + + +def _discover_candidates( + content: Content, tracked_entities: list[Entity] +) -> list[dict[str, str]]: + """Heuristically surface named entities that are not yet tracked.""" + + tracked_labels = set(_entity_lookup(tracked_entities).keys()) + discovered: list[dict[str, str]] = [] + seen_names: set[str] = set() + candidate_text = "\n".join( + part for part in [content.author, content.title, content.content_text[:2000]] if part + ) + for match in PROPER_NOUN_PATTERN.findall(candidate_text): + name = _clean_candidate_name(match) + normalized_name = _normalize_name(name) + if ( + not name + or name in NOISE_CANDIDATE_NAMES + or normalized_name in tracked_labels + or normalized_name in seen_names + ): + continue + seen_names.add(normalized_name) + discovered.append( + {"name": name, "suggested_type": _guess_candidate_type(name)} + ) + return discovered + + +def _heuristic_mentions_for_entities( + content: Content, + entities: list[Entity], + *, + extra_labels: dict[int, list[str]] | None = None, +) -> list[dict[str, Any]]: + """Build mention payloads from exact label matches in the content text.""" + + mention_payloads: list[dict[str, Any]] = [] + for entity in entities: + span = _find_entity_span(content, entity, extra_labels=extra_labels) + if span is None: + continue + mention_payloads.append( + { + "entity": entity, + "role": _detect_role(content, span), + "sentiment": _detect_sentiment(content, span), + "span": span, + "confidence": _heuristic_confidence(content, span), + } + ) + return mention_payloads + + +def _find_entity_span( + content: Content, + entity: Entity, + *, + extra_labels: dict[int, list[str]] | None, +) -> str | None: + """Return the first matched label for an entity inside the content.""" + + labels = extra_labels.get(entity.id, []) if extra_labels is not None else [] + labels = [*labels, *_entity_labels(entity)] + haystacks = [content.author or "", content.title or "", content.content_text or ""] + for label in labels: + stripped_label = label.strip() + if not stripped_label: + continue + pattern = re.compile(rf"(? list[str]: + """Return the names and handle-like aliases that can refer to an entity.""" + + labels = [entity.name] + for handle in ( + entity.bluesky_handle, + entity.mastodon_handle, + entity.twitter_handle, + ): + cleaned_handle = handle.strip().removeprefix("@") + if cleaned_handle: + labels.extend([cleaned_handle, f"@{cleaned_handle}"]) + labels.append(cleaned_handle.split(".")[0]) + for url in (entity.website_url, entity.github_url, entity.linkedin_url): + hostname = urlsplit(url).netloc.lower().removeprefix("www.") + if hostname: + labels.append(hostname) + labels.append(hostname.split(".")[0]) + deduped_labels: list[str] = [] + seen_labels: set[str] = set() + for label in labels: + normalized_label = _normalize_name(label) + if not normalized_label or normalized_label in seen_labels: + continue + seen_labels.add(normalized_label) + deduped_labels.append(label) + return deduped_labels + + +def _entity_lookup(entities: list[Entity]) -> dict[str, Entity]: + """Map normalized names and aliases to their tracked entity rows.""" + + lookup: dict[str, Entity] = {} + for entity in entities: + for label in _entity_labels(entity): + lookup[_normalize_name(label)] = entity + return lookup + + +def _detect_role(content: Content, span: str) -> str: + """Infer an entity mention role from where the match appeared.""" + + span_lower = span.lower() + if content.author and span_lower in content.author.lower(): + return EntityMentionRole.AUTHOR + if content.title and span_lower in content.title.lower(): + return EntityMentionRole.SUBJECT + if re.search(rf'"[^\n]{{0,120}}{re.escape(span)}[^\n]{{0,120}}"', content.content_text, re.IGNORECASE): + return EntityMentionRole.QUOTED + return EntityMentionRole.MENTIONED + + +def _detect_sentiment(content: Content, span: str) -> str: + """Infer a coarse sentiment label from nearby context around the span.""" + + text = f"{content.title}\n{content.content_text}" + match = re.search(re.escape(span), text, re.IGNORECASE) + if match is None: + return EntityMentionSentiment.NEUTRAL + start = max(0, match.start() - 80) + end = min(len(text), match.end() + 80) + window = text[start:end].lower() + if any(token in window for token in NEGATIVE_TOKENS): + return EntityMentionSentiment.NEGATIVE + if any(token in window for token in POSITIVE_TOKENS): + return EntityMentionSentiment.POSITIVE + return EntityMentionSentiment.NEUTRAL + + +def _heuristic_confidence(content: Content, span: str) -> float: + """Assign a confidence score for heuristic mention matches.""" + + span_lower = span.lower() + if content.author and span_lower in content.author.lower(): + return 0.9 + if content.title and span_lower in content.title.lower(): + return 0.85 + return 0.72 + + +def _select_primary_entity(mentions: list[EntityMention]) -> Entity | None: + """Choose the best single entity to attach directly to the content row.""" + + for preferred_role in (EntityMentionRole.SUBJECT, EntityMentionRole.AUTHOR): + for mention in mentions: + if mention.role == preferred_role: + return mention.entity + return mentions[0].entity if mentions else None + + +def _serialize_tracked_entity(entity: Entity) -> dict[str, Any]: + """Serialize tracked entity context for the entity-extraction skill prompt.""" + + return { + "name": entity.name, + "type": entity.type, + "aliases": _entity_labels(entity), + "description": entity.description, + } + + +def _serialize_mention(mention: EntityMention) -> dict[str, Any]: + """Serialize a persisted mention for the skill result payload.""" + + return { + "entity_id": mention.entity_id, + "entity_name": mention.entity.name, + "role": mention.role, + "sentiment": mention.sentiment, + "span": mention.span, + "confidence": mention.confidence, + } + + +def _serialize_candidate(candidate: EntityCandidate) -> dict[str, Any]: + """Serialize a persisted candidate for the skill result payload.""" + + return { + "id": candidate.id, + "name": candidate.name, + "suggested_type": candidate.suggested_type, + "occurrence_count": candidate.occurrence_count, + "status": candidate.status, + } + + +def _normalize_role(value: Any) -> str: + """Normalize free-form role strings into the supported enum values.""" + + role = str(value or "").strip().lower() + if role in EntityMentionRole.values: + return role + return EntityMentionRole.MENTIONED + + +def _normalize_sentiment(value: Any) -> str: + """Normalize free-form sentiment strings into the supported enum values.""" + + sentiment = str(value or "").strip().lower() + if sentiment in EntityMentionSentiment.values: + return sentiment + return EntityMentionSentiment.NEUTRAL + + +def _normalize_entity_type(value: Any) -> str: + """Normalize free-form entity-type strings into the supported enum values.""" + + entity_type = str(value or "").strip().lower() + if entity_type in EntityType.values: + return entity_type + return EntityType.ORGANIZATION + + +def _normalize_confidence(value: Any) -> float: + """Clamp arbitrary confidence inputs into the [0, 1] range.""" + + try: + confidence = float(value) + except (TypeError, ValueError): + confidence = 0.0 + return max(0.0, min(1.0, confidence)) + + +def _guess_candidate_type(name: str) -> str: + """Infer a plausible entity type for a newly discovered candidate.""" + + tokens = [token.strip(".,") for token in name.lower().split() if token] + if any(token in COMPANY_SUFFIXES for token in tokens): + return EntityType.VENDOR + if any(token in ORGANIZATION_SUFFIXES for token in tokens): + return EntityType.ORGANIZATION + title_case_tokens = [token for token in name.split() if token and token[:1].isupper()] + if 2 <= len(title_case_tokens) <= 3: + return EntityType.INDIVIDUAL + return EntityType.ORGANIZATION + + +def _clean_candidate_name(value: str) -> str: + """Normalize candidate names while preserving user-facing capitalization.""" + + cleaned_value = re.sub(r"\s+", " ", value).strip(" ,.;:-") + if not cleaned_value or len(cleaned_value) < 3: + return "" + return cleaned_value + + +def _normalize_name(value: str) -> str: + """Case-fold and collapse whitespace for entity-name comparisons.""" + + return re.sub(r"\s+", " ", value).strip().casefold() diff --git a/core/migrations/0006_entitycandidate_entitymention.py b/core/migrations/0006_entitycandidate_entitymention.py new file mode 100644 index 00000000..a9a5e49e --- /dev/null +++ b/core/migrations/0006_entitycandidate_entitymention.py @@ -0,0 +1,184 @@ +# Generated by Django 6.0.4 on 2026-04-29 02:25 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("core", "0005_content_canonical_url_content_duplicate_of_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="EntityCandidate", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("name", models.CharField(max_length=255)), + ( + "suggested_type", + models.CharField( + choices=[ + ("individual", "Individual"), + ("vendor", "Vendor"), + ("organization", "Organization"), + ], + max_length=32, + ), + ), + ("occurrence_count", models.IntegerField(default=1)), + ( + "status", + models.CharField( + choices=[ + ("pending", "Pending"), + ("accepted", "Accepted"), + ("rejected", "Rejected"), + ("merged", "Merged"), + ], + default="pending", + max_length=16, + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "first_seen_in", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="entity_candidates", + to="core.content", + ), + ), + ( + "merged_into", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="merged_entity_candidates", + to="core.entity", + ), + ), + ( + "project", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="entity_candidates", + to="core.project", + ), + ), + ], + options={ + "ordering": ["-occurrence_count", "name"], + "indexes": [ + models.Index( + fields=["project", "status", "occurrence_count"], + name="core_entity_project_4c32ec_idx", + ) + ], + "constraints": [ + models.UniqueConstraint( + fields=("project", "name"), + name="core_entitycandidate_unique_project_name", + ) + ], + }, + ), + migrations.CreateModel( + name="EntityMention", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "role", + models.CharField( + choices=[ + ("author", "Author"), + ("subject", "Subject"), + ("quoted", "Quoted"), + ("mentioned", "Mentioned"), + ], + max_length=16, + ), + ), + ( + "sentiment", + models.CharField( + blank=True, + choices=[ + ("positive", "Positive"), + ("neutral", "Neutral"), + ("negative", "Negative"), + ], + default="", + max_length=16, + ), + ), + ("span", models.TextField(blank=True)), + ("confidence", models.FloatField(default=0.0)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ( + "content", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="entity_mentions", + to="core.content", + ), + ), + ( + "entity", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="mentions", + to="core.entity", + ), + ), + ( + "project", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="entity_mentions", + to="core.project", + ), + ), + ], + options={ + "ordering": ["-created_at"], + "indexes": [ + models.Index( + fields=["entity", "created_at"], + name="core_entity_entity__8ba01e_idx", + ), + models.Index( + fields=["project", "created_at"], + name="core_entity_project_dabde7_idx", + ), + ], + "constraints": [ + models.UniqueConstraint( + fields=("content", "entity", "role"), + name="core_entitymention_unique_content_entity_role", + ) + ], + }, + ), + ] diff --git a/core/models.py b/core/models.py index 42fe5abd..a159b29d 100644 --- a/core/models.py +++ b/core/models.py @@ -80,6 +80,32 @@ class EntityType(models.TextChoices): ORGANIZATION = "organization", "Organization" +class EntityMentionRole(models.TextChoices): + """Supported roles for how an entity appears inside content.""" + + AUTHOR = "author", "Author" + SUBJECT = "subject", "Subject" + QUOTED = "quoted", "Quoted" + MENTIONED = "mentioned", "Mentioned" + + +class EntityMentionSentiment(models.TextChoices): + """Supported editorial sentiment labels for entity mentions.""" + + POSITIVE = "positive", "Positive" + NEUTRAL = "neutral", "Neutral" + NEGATIVE = "negative", "Negative" + + +class EntityCandidateStatus(models.TextChoices): + """Review workflow states for extracted entity candidates.""" + + PENDING = "pending", "Pending" + ACCEPTED = "accepted", "Accepted" + REJECTED = "rejected", "Rejected" + MERGED = "merged", "Merged" + + class SkillStatus(models.TextChoices): """Execution states recorded for AI skill runs.""" @@ -358,6 +384,93 @@ def __str__(self) -> str: return self.title +class EntityMention(models.Model): + """Represents one tracked-entity mention detected in a content item.""" + + content = models.ForeignKey( + Content, on_delete=models.CASCADE, related_name="entity_mentions" + ) + entity = models.ForeignKey( + Entity, on_delete=models.CASCADE, related_name="mentions" + ) + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="entity_mentions" + ) + role = models.CharField(max_length=16, choices=EntityMentionRole.choices) + sentiment = models.CharField( + max_length=16, + choices=EntityMentionSentiment.choices, + blank=True, + default="", + ) + span = models.TextField(blank=True) + confidence = models.FloatField(default=0.0) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + ordering = ["-created_at"] + constraints = [ + models.UniqueConstraint( + fields=["content", "entity", "role"], + name="core_entitymention_unique_content_entity_role", + ) + ] + indexes = [ + models.Index(fields=["entity", "created_at"]), + models.Index(fields=["project", "created_at"]), + ] + + def __str__(self) -> str: + return f"{self.entity.name} in {self.content.title}" + + +class EntityCandidate(models.Model): + """Stores an extracted named entity awaiting human confirmation.""" + + project = models.ForeignKey( + Project, on_delete=models.CASCADE, related_name="entity_candidates" + ) + name = models.CharField(max_length=255) + suggested_type = models.CharField(max_length=32, choices=EntityType.choices) + first_seen_in = models.ForeignKey( + Content, + null=True, + blank=True, + on_delete=models.SET_NULL, + related_name="entity_candidates", + ) + occurrence_count = models.IntegerField(default=1) + status = models.CharField( + max_length=16, + choices=EntityCandidateStatus.choices, + default=EntityCandidateStatus.PENDING, + ) + merged_into = models.ForeignKey( + Entity, + null=True, + blank=True, + on_delete=models.SET_NULL, + related_name="merged_entity_candidates", + ) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-occurrence_count", "name"] + constraints = [ + models.UniqueConstraint( + fields=["project", "name"], + name="core_entitycandidate_unique_project_name", + ) + ] + indexes = [ + models.Index(fields=["project", "status", "occurrence_count"]), + ] + + def __str__(self) -> str: + return self.name + + class IntakeAllowlist(models.Model): """Tracks who is allowed to send newsletters into a project inbox. diff --git a/core/pipeline.py b/core/pipeline.py index a9e3d720..854115bb 100644 --- a/core/pipeline.py +++ b/core/pipeline.py @@ -25,6 +25,7 @@ get_reference_similarity, search_similar_content, ) +from core.entity_extraction import run_entity_extraction from core.llm import build_skill_user_prompt, get_skill_definition, openrouter_chat_json from core.models import Content, ReviewQueue, ReviewReason, SkillResult, SkillStatus @@ -32,6 +33,7 @@ DEDUPLICATION_SKILL_NAME = "deduplication" CLASSIFICATION_SKILL_NAME = "content_classification" +ENTITY_EXTRACTION_SKILL_NAME = "entity_extraction" RELEVANCE_SKILL_NAME = "relevance_scoring" SUMMARIZATION_SKILL_NAME = "summarization" RELATED_CONTENT_SKILL_NAME = "find_related" @@ -60,6 +62,7 @@ class PipelineState(TypedDict, total=False): project_id: int dedup: dict[str, Any] | None classification: dict[str, Any] | None + entity_extraction: dict[str, Any] | None relevance: dict[str, Any] | None summary: dict[str, Any] | None status: str @@ -77,6 +80,7 @@ def get_ingestion_graph(): graph = StateGraph(PipelineState) graph.add_node("deduplicate", deduplicate_node) graph.add_node("classify", classify_node) + graph.add_node("extract_entities", extract_entities_node) graph.add_node("score_relevance", relevance_node) graph.add_node("summarize", summarize_node) graph.add_node("archive", archive_node) @@ -90,7 +94,8 @@ def get_ingestion_graph(): "unique": "classify", }, ) - graph.add_edge("classify", "score_relevance") + graph.add_edge("classify", "extract_entities") + graph.add_edge("extract_entities", "score_relevance") graph.add_conditional_edges( "score_relevance", route_by_relevance, @@ -188,6 +193,25 @@ def classify_node(state: PipelineState) -> PipelineState: return {"classification": classification} +def extract_entities_node(state: PipelineState) -> PipelineState: + """Extract tracked-entity mentions before relevance scoring.""" + + content = _get_content(state) + extraction = _execute_with_retries( + ENTITY_EXTRACTION_SKILL_NAME, lambda: run_entity_extraction(content) + ) + _create_skill_result( + content, + skill_name=ENTITY_EXTRACTION_SKILL_NAME, + status=SkillStatus.COMPLETED, + result_data=extraction, + model_used=extraction["model_used"], + latency_ms=extraction["latency_ms"], + confidence=extraction["confidence"], + ) + return {"entity_extraction": extraction} + + def relevance_node(state: PipelineState) -> PipelineState: """Score content relevance, persist the score, and keep the item active.""" @@ -383,8 +407,6 @@ def run_deduplication(content: Content) -> dict[str, Any]: "model_used": f"embedding:{settings.EMBEDDING_MODEL}", "latency_ms": 0, } - - def run_content_classification(content: Content) -> dict[str, Any]: """Classify a content item into a newsletter-oriented content type. diff --git a/core/serializers.py b/core/serializers.py index 20d8565d..60915daf 100644 --- a/core/serializers.py +++ b/core/serializers.py @@ -11,6 +11,8 @@ from core.models import ( Content, Entity, + EntityCandidate, + EntityMention, IngestionRun, IntakeAllowlist, NewsletterIntake, @@ -45,6 +47,13 @@ def _filter_related_queryset(self, request): else Entity.objects.filter(project__group__user=user) ) self.fields["entity"].queryset = entity_queryset + if "merged_into" in self.fields: + merged_into_queryset = ( + Entity.objects.filter(project=project) + if project + else Entity.objects.filter(project__group__user=user) + ) + self.fields["merged_into"].queryset = merged_into_queryset if "content" in self.fields: content_queryset = ( Content.objects.filter(project=project) @@ -107,6 +116,9 @@ class Meta: class EntitySerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): """Serialize tracked entities for a project.""" + mention_count = serializers.IntegerField(read_only=True) + latest_mentions = serializers.SerializerMethodField() + class Meta: model = Entity fields = [ @@ -122,10 +134,72 @@ class Meta: "bluesky_handle", "mastodon_handle", "twitter_handle", + "mention_count", + "latest_mentions", "created_at", ] read_only_fields = ["id", "project", "created_at"] + def get_latest_mentions(self, obj): + """Return a compact summary of the most recent mentions for an entity.""" + + mentions = getattr(obj, "prefetched_mentions", None) + if mentions is None: + mentions = obj.mentions.select_related("content").order_by("-created_at") + return EntityMentionSummarySerializer(mentions[:3], many=True).data + + +class EntityMentionSummarySerializer(serializers.ModelSerializer): + """Serialize a compact entity-mention summary for frontend display.""" + + content_id = serializers.IntegerField(read_only=True) + content_title = serializers.CharField(source="content.title", read_only=True) + + class Meta: + model = EntityMention + fields = [ + "id", + "content_id", + "content_title", + "role", + "sentiment", + "span", + "confidence", + "created_at", + ] + read_only_fields = fields + + +class EntityCandidateSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): + """Serialize extracted entity candidates awaiting editorial review.""" + + first_seen_title = serializers.CharField(source="first_seen_in.title", read_only=True) + merged_into_name = serializers.CharField(source="merged_into.name", read_only=True) + + class Meta: + model = EntityCandidate + fields = [ + "id", + "project", + "name", + "suggested_type", + "first_seen_in", + "first_seen_title", + "occurrence_count", + "status", + "merged_into", + "merged_into_name", + "created_at", + "updated_at", + ] + read_only_fields = fields + + +class EntityCandidateMergeSerializer(ProjectScopedSerializerMixin, serializers.Serializer): + """Validate merge requests for entity candidates.""" + + merged_into = serializers.PrimaryKeyRelatedField(queryset=Entity.objects.none()) + class ContentSerializer(ProjectScopedSerializerMixin, serializers.ModelSerializer): """Serialize ingested content items and enforce project/entity consistency.""" diff --git a/core/tests/test_admin.py b/core/tests/test_admin.py index 29877639..c4105a8d 100644 --- a/core/tests/test_admin.py +++ b/core/tests/test_admin.py @@ -13,6 +13,7 @@ ContentAdmin, DuplicateStateFilter, EntityAdmin, + EntityCandidateAdmin, HighValueFilter, IngestionRunAdmin, ReviewQueueAdmin, @@ -24,6 +25,9 @@ BlueskyCredentials, Content, Entity, + EntityCandidate, + EntityCandidateStatus, + EntityMention, IngestionRun, Project, ReviewQueue, @@ -497,6 +501,105 @@ def test_entity_colored_score_uses_expected_color( assert str(authority_score) in rendered +def test_accept_selected_entity_candidates_creates_entity_and_backfills_mentions( + source_admin_context, mocker +): + content = Content.objects.create( + project=source_admin_context.project, + url="https://example.com/river-labs-launch", + title="River Labs ships a new platform release", + author="Editor", + source_plugin=SourcePluginName.RSS, + published_date=timezone.now(), + content_text="River Labs announced a new hosted control plane.", + ) + candidate = EntityCandidate.objects.create( + project=source_admin_context.project, + name="River Labs", + suggested_type="vendor", + first_seen_in=content, + occurrence_count=2, + ) + admin_instance = EntityCandidateAdmin(EntityCandidate, AdminSite()) + admin_instance.message_user = mocker.Mock() + + admin_instance.accept_selected_candidates( + request=SimpleNamespace(), + queryset=EntityCandidate.objects.filter(pk=candidate.pk), + ) + + candidate.refresh_from_db() + content.refresh_from_db() + entity = Entity.objects.get( + project=source_admin_context.project, + name="River Labs", + ) + mention = EntityMention.objects.get(content=content, entity=entity) + + assert candidate.status == EntityCandidateStatus.ACCEPTED + assert candidate.merged_into_id == entity.id + assert mention.role == "subject" + assert content.entity_id == entity.id + + +def test_reject_selected_entity_candidates_marks_candidates_rejected( + source_admin_context, mocker +): + candidate = EntityCandidate.objects.create( + project=source_admin_context.project, + name="Rejected Vendor", + suggested_type="vendor", + ) + admin_instance = EntityCandidateAdmin(EntityCandidate, AdminSite()) + admin_instance.message_user = mocker.Mock() + + admin_instance.reject_selected_candidates( + request=SimpleNamespace(), + queryset=EntityCandidate.objects.filter(pk=candidate.pk), + ) + + candidate.refresh_from_db() + + assert candidate.status == EntityCandidateStatus.REJECTED + + +def test_merge_selected_entity_candidates_uses_existing_same_name_entity( + source_admin_context, mocker +): + content = Content.objects.create( + project=source_admin_context.project, + url="https://example.com/acme-merge", + title="Acme ships a new platform feature", + author="Editor", + source_plugin=SourcePluginName.RSS, + published_date=timezone.now(), + content_text="Acme expanded its hosted platform product.", + ) + entity = Entity.objects.create( + project=source_admin_context.project, + name="Acme", + type="vendor", + ) + candidate = EntityCandidate.objects.create( + project=source_admin_context.project, + name="Acme", + suggested_type="vendor", + first_seen_in=content, + ) + admin_instance = EntityCandidateAdmin(EntityCandidate, AdminSite()) + admin_instance.message_user = mocker.Mock() + + admin_instance.merge_into_existing_entities( + request=SimpleNamespace(), + queryset=EntityCandidate.objects.filter(pk=candidate.pk), + ) + + candidate.refresh_from_db() + + assert candidate.status == EntityCandidateStatus.MERGED + assert candidate.merged_into_id == entity.id + + def test_high_value_filter_only_returns_high_value_reference_content( source_admin_context, ): diff --git a/core/tests/test_api.py b/core/tests/test_api.py index d5d7c0a1..86cd85f5 100644 --- a/core/tests/test_api.py +++ b/core/tests/test_api.py @@ -11,6 +11,9 @@ BlueskyCredentials, Content, Entity, + EntityCandidate, + EntityCandidateStatus, + EntityMention, FeedbackType, IngestionRun, Project, @@ -149,6 +152,74 @@ def test_entity_list_is_scoped_to_request_user_project(self): self.assertEqual(len(response.json()), 1) self.assertEqual(response.json()[0]["id"], self.owner_entity.id) + def test_entity_list_includes_recent_mentions(self): + mention = EntityMention.objects.create( + project=self.owner_project, + content=self.owner_content, + entity=self.owner_entity, + role="subject", + sentiment="neutral", + span="Owner Entity", + confidence=0.88, + ) + + response = self.client.get( + reverse( + "v1:project-entity-list", kwargs={"project_id": self.owner_project.id} + ) + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json()[0]["mention_count"], 1) + self.assertEqual(response.json()[0]["latest_mentions"][0]["id"], mention.id) + self.assertEqual( + response.json()[0]["latest_mentions"][0]["content_title"], + self.owner_content.title, + ) + + def test_entity_mentions_action_returns_full_mention_history(self): + first_mention = EntityMention.objects.create( + project=self.owner_project, + content=self.owner_content, + entity=self.owner_entity, + role="subject", + sentiment="neutral", + span="Owner Entity", + confidence=0.88, + ) + second_content = Content.objects.create( + project=self.owner_project, + url="https://example.com/owner-second", + title="Second Owner Content", + author="Owner Author", + entity=self.owner_entity, + source_plugin="rss", + published_date="2026-04-22T00:00:00Z", + content_text="Another owner content text", + ) + second_mention = EntityMention.objects.create( + project=self.owner_project, + content=second_content, + entity=self.owner_entity, + role="mentioned", + sentiment="positive", + span="Owner Entity", + confidence=0.67, + ) + + response = self.client.get( + reverse( + "v1:project-entity-mentions", + kwargs={"project_id": self.owner_project.id, "pk": self.owner_entity.id}, + ) + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.json()), 2) + self.assertEqual(response.json()[0]["id"], second_mention.id) + self.assertEqual(response.json()[1]["id"], first_mention.id) + self.assertEqual(response.json()[0]["content_title"], second_content.title) + def test_content_detail_includes_duplicate_state(self): canonical = self.owner_content canonical.canonical_url = "https://example.com/owner" @@ -189,6 +260,117 @@ def test_nested_entity_list_rejects_other_users_project(self): self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) + def test_entity_candidate_list_is_scoped_to_request_user_project(self): + owner_candidate = EntityCandidate.objects.create( + project=self.owner_project, + name="Owner Candidate", + suggested_type="vendor", + first_seen_in=self.owner_content, + ) + EntityCandidate.objects.create( + project=self.other_project, + name="Other Candidate", + suggested_type="organization", + first_seen_in=self.other_content, + ) + + response = self.client.get( + reverse( + "v1:project-entity-candidate-list", + kwargs={"project_id": self.owner_project.id}, + ) + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.json()), 1) + self.assertEqual(response.json()[0]["id"], owner_candidate.id) + + def test_entity_candidate_accept_action_returns_updated_candidate(self): + candidate = EntityCandidate.objects.create( + project=self.owner_project, + name="River Labs", + suggested_type="vendor", + first_seen_in=self.owner_content, + ) + + response = self.client.post( + reverse( + "v1:project-entity-candidate-accept", + kwargs={"project_id": self.owner_project.id, "pk": candidate.id}, + ), + format="json", + ) + + candidate.refresh_from_db() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(candidate.status, EntityCandidateStatus.ACCEPTED) + self.assertIsNotNone(candidate.merged_into_id) + self.assertEqual(response.json()["status"], EntityCandidateStatus.ACCEPTED) + + def test_entity_candidate_reject_action_returns_updated_candidate(self): + candidate = EntityCandidate.objects.create( + project=self.owner_project, + name="Rejected Candidate", + suggested_type="organization", + first_seen_in=self.owner_content, + ) + + response = self.client.post( + reverse( + "v1:project-entity-candidate-reject", + kwargs={"project_id": self.owner_project.id, "pk": candidate.id}, + ), + format="json", + ) + + candidate.refresh_from_db() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(candidate.status, EntityCandidateStatus.REJECTED) + self.assertEqual(response.json()["status"], EntityCandidateStatus.REJECTED) + + def test_entity_candidate_merge_rejects_cross_project_entity(self): + candidate = EntityCandidate.objects.create( + project=self.owner_project, + name="Merge Candidate", + suggested_type="vendor", + first_seen_in=self.owner_content, + ) + + response = self.client.post( + reverse( + "v1:project-entity-candidate-merge", + kwargs={"project_id": self.owner_project.id, "pk": candidate.id}, + ), + {"merged_into": self.other_entity.id}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assert_standardized_validation_error(response.json(), "merged_into") + + def test_entity_candidate_merge_action_returns_updated_candidate(self): + candidate = EntityCandidate.objects.create( + project=self.owner_project, + name="Owner Entity Alias", + suggested_type="vendor", + first_seen_in=self.owner_content, + ) + + response = self.client.post( + reverse( + "v1:project-entity-candidate-merge", + kwargs={"project_id": self.owner_project.id, "pk": candidate.id}, + ), + {"merged_into": self.owner_entity.id}, + format="json", + ) + + candidate.refresh_from_db() + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(candidate.status, EntityCandidateStatus.MERGED) + self.assertEqual(candidate.merged_into_id, self.owner_entity.id) + self.assertEqual(response.json()["merged_into"], self.owner_entity.id) + def test_verify_bluesky_credentials_requires_project_credentials(self): response = self.client.post( reverse( @@ -389,6 +571,10 @@ def test_authenticated_nested_list_endpoints_smoke(self): reverse( "v1:project-entity-list", kwargs={"project_id": self.owner_project.id} ), + reverse( + "v1:project-entity-candidate-list", + kwargs={"project_id": self.owner_project.id}, + ), reverse( "v1:project-content-list", kwargs={"project_id": self.owner_project.id} ), @@ -471,6 +657,19 @@ def test_authenticated_nested_detail_endpoints_smoke(self): ), ] + candidate = EntityCandidate.objects.create( + project=self.owner_project, + name="Smoke Candidate", + suggested_type="organization", + first_seen_in=self.owner_content, + ) + detail_endpoints.append( + reverse( + "v1:project-entity-candidate-detail", + kwargs={"project_id": self.owner_project.id, "pk": candidate.id}, + ) + ) + feedback = UserFeedback.objects.create( project=self.owner_project, content=self.owner_content, diff --git a/core/tests/test_pipeline.py b/core/tests/test_pipeline.py index 33cc2acc..e55fad96 100644 --- a/core/tests/test_pipeline.py +++ b/core/tests/test_pipeline.py @@ -6,6 +6,9 @@ from core.deduplication import canonicalize_url from core.models import ( Content, + Entity, + EntityCandidate, + EntityMention, Project, ReviewQueue, ReviewReason, @@ -15,6 +18,7 @@ from core.pipeline import ( CLASSIFICATION_SKILL_NAME, DEDUPLICATION_SKILL_NAME, + ENTITY_EXTRACTION_SKILL_NAME, RELATED_CONTENT_SKILL_NAME, RELEVANCE_SKILL_NAME, SUMMARIZATION_SKILL_NAME, @@ -31,6 +35,7 @@ route_by_relevance, run_content_classification, run_deduplication, + run_entity_extraction, run_relevance_scoring, run_summarization, ) @@ -813,3 +818,109 @@ def test_pipeline_helper_utilities_cover_serialization_and_summary_edges( assert _clamp_score("bad") == 0.0 assert _clamp_score(2) == 1.0 assert _clamp_score(-1) == 0.0 + + +def test_run_entity_extraction_persists_mentions_and_candidates( + pipeline_context, mocker +): + entity = Entity.objects.create( + project=pipeline_context.project, + name="Acme Cloud", + type="vendor", + website_url="https://acme.example.com", + ) + pipeline_context.content.title = "Acme Cloud expands platform team tooling" + pipeline_context.content.content_text = ( + "Acme Cloud announced a new runtime while River Labs joined the launch." + ) + pipeline_context.content.save(update_fields=["title", "content_text"]) + mocker.patch( + "core.entity_extraction.search_similar_entities_for_content", + return_value=[SimpleNamespace(score=0.91, payload={"entity_id": entity.id})], + ) + + result = run_entity_extraction(pipeline_context.content) + + mention = EntityMention.objects.get(content=pipeline_context.content, entity=entity) + candidate = EntityCandidate.objects.get( + project=pipeline_context.project, + name="River Labs", + ) + + assert mention.role == "subject" + assert mention.span == "Acme Cloud" + assert result["primary_entity_id"] == entity.id + assert pipeline_context.content.entity_id == entity.id + assert candidate.suggested_type == "vendor" + assert candidate.occurrence_count == 1 + + +def test_process_content_records_entity_extraction_skill_result( + pipeline_context, mocker +): + entity = Entity.objects.create( + project=pipeline_context.project, + name="Acme Cloud", + type="vendor", + ) + mocker.patch( + "core.pipeline.run_content_classification", + return_value={ + "content_type": "technical_article", + "confidence": 0.92, + "explanation": "Confident classification.", + "model_used": "heuristic", + "latency_ms": 0, + }, + ) + mocker.patch( + "core.pipeline.run_entity_extraction", + return_value={ + "mentions": [ + { + "entity_id": entity.id, + "entity_name": entity.name, + "role": "subject", + "sentiment": "neutral", + "span": entity.name, + "confidence": 0.88, + } + ], + "candidate_entities": [], + "primary_entity_id": entity.id, + "confidence": 0.88, + "explanation": "Tracked entity matched in the title.", + "model_used": "heuristic", + "latency_ms": 0, + }, + ) + mocker.patch( + "core.pipeline.run_relevance_scoring", + return_value={ + "relevance_score": 0.9, + "explanation": "Highly relevant.", + "used_llm": False, + "model_used": "embedding:test", + "latency_ms": 0, + }, + ) + mocker.patch( + "core.pipeline.run_summarization", + return_value={ + "summary": "Summary for editors.", + "model_used": "heuristic", + "latency_ms": 0, + }, + ) + + result = process_content(pipeline_context.content.id) + + skill_result = SkillResult.objects.get( + content=pipeline_context.content, + skill_name=ENTITY_EXTRACTION_SKILL_NAME, + ) + + assert result["status"] == "completed" + assert skill_result.status == SkillStatus.COMPLETED + assert skill_result.confidence == pytest.approx(0.88) + assert skill_result.result_data["mentions"][0]["entity_name"] == entity.name diff --git a/frontend/src/app/__tests__/page.test.tsx b/frontend/src/app/__tests__/page.test.tsx index 68ef9bc2..b524b72c 100644 --- a/frontend/src/app/__tests__/page.test.tsx +++ b/frontend/src/app/__tests__/page.test.tsx @@ -153,6 +153,8 @@ function createEntity(overrides: Partial = {}): Entity { bluesky_handle: "", mastodon_handle: "", twitter_handle: "openai", + mention_count: 0, + latest_mentions: [], created_at: "2026-04-28T09:30:00Z", ...overrides, } diff --git a/frontend/src/app/api/entity-candidates/[id]/__tests__/route.test.ts b/frontend/src/app/api/entity-candidates/[id]/__tests__/route.test.ts new file mode 100644 index 00000000..ac3b707f --- /dev/null +++ b/frontend/src/app/api/entity-candidates/[id]/__tests__/route.test.ts @@ -0,0 +1,146 @@ +import { beforeEach, describe, expect, it, vi } from "vitest" + +import { + acceptEntityCandidate, + mergeEntityCandidate, + rejectEntityCandidate, +} from "@/lib/api" +import type { EntityCandidate } from "@/lib/types" + +import { POST } from "../route" + +vi.mock("@/lib/api", () => ({ + acceptEntityCandidate: vi.fn(), + mergeEntityCandidate: vi.fn(), + rejectEntityCandidate: vi.fn(), +})) + +function buildRequest(formData: FormData) { + return new Request("http://localhost/api/entity-candidates/9", { + method: "POST", + body: formData, + }) +} + +async function getLocation(response: Response) { + return response.headers.get("location") +} + +function createCandidate(overrides: Partial = {}): EntityCandidate { + return { + id: 9, + project: 4, + name: "River Labs", + suggested_type: "vendor", + first_seen_in: 21, + first_seen_title: "River Labs launches hosted platform", + occurrence_count: 2, + status: "pending", + merged_into: null, + merged_into_name: "", + created_at: "2026-04-28T10:00:00Z", + updated_at: "2026-04-28T11:00:00Z", + ...overrides, + } +} + +describe("POST /api/entity-candidates/[id]", () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + it("accepts a candidate and redirects with a success message", async () => { + vi.mocked(acceptEntityCandidate).mockResolvedValue(createCandidate()) + + const formData = new FormData() + formData.set("projectId", "4") + formData.set("redirectTo", "/entities?project=4") + formData.set("intent", "accept") + + const response = await POST(buildRequest(formData), { + params: Promise.resolve({ id: "9" }), + }) + + expect(acceptEntityCandidate).toHaveBeenCalledWith(9, 4) + expect(response.status).toBe(307) + await expect(getLocation(response)).resolves.toBe( + "http://localhost/entities?project=4&message=Candidate+accepted.", + ) + }) + + it("rejects a candidate and redirects with a success message", async () => { + vi.mocked(rejectEntityCandidate).mockResolvedValue( + createCandidate({ status: "rejected" }), + ) + + const formData = new FormData() + formData.set("projectId", "4") + formData.set("redirectTo", "/entities?project=4") + formData.set("intent", "reject") + + const response = await POST(buildRequest(formData), { + params: Promise.resolve({ id: "9" }), + }) + + expect(rejectEntityCandidate).toHaveBeenCalledWith(9, 4) + expect(response.status).toBe(307) + await expect(getLocation(response)).resolves.toBe( + "http://localhost/entities?project=4&message=Candidate+rejected.", + ) + }) + + it("merges a candidate and redirects with a success message", async () => { + vi.mocked(mergeEntityCandidate).mockResolvedValue( + createCandidate({ status: "merged", merged_into: 15, merged_into_name: "Acme" }), + ) + + const formData = new FormData() + formData.set("projectId", "4") + formData.set("redirectTo", "/entities?project=4") + formData.set("intent", "merge") + formData.set("mergedInto", "15") + + const response = await POST(buildRequest(formData), { + params: Promise.resolve({ id: "9" }), + }) + + expect(mergeEntityCandidate).toHaveBeenCalledWith(9, 4, 15) + expect(response.status).toBe(307) + await expect(getLocation(response)).resolves.toBe( + "http://localhost/entities?project=4&message=Candidate+merged.", + ) + }) + + it("redirects with a validation error when merge target is missing", async () => { + const formData = new FormData() + formData.set("projectId", "4") + formData.set("redirectTo", "/entities?project=4") + formData.set("intent", "merge") + + const response = await POST(buildRequest(formData), { + params: Promise.resolve({ id: "9" }), + }) + + expect(mergeEntityCandidate).not.toHaveBeenCalled() + expect(response.status).toBe(307) + await expect(getLocation(response)).resolves.toBe( + "http://localhost/entities?project=4&error=Select+an+entity+to+merge+into.", + ) + }) + + it("redirects with a fallback error when the helper throws a non-Error value", async () => { + vi.mocked(acceptEntityCandidate).mockRejectedValue("boom") + + const formData = new FormData() + formData.set("projectId", "4") + + const response = await POST(buildRequest(formData), { + params: Promise.resolve({ id: "9" }), + }) + + expect(response.status).toBe(307) + await expect(getLocation(response)).resolves.toBe( + "http://localhost/entities?error=Unable+to+update+entity+candidate.", + ) + }) +}) diff --git a/frontend/src/app/api/entity-candidates/[id]/route.ts b/frontend/src/app/api/entity-candidates/[id]/route.ts new file mode 100644 index 00000000..8c10d519 --- /dev/null +++ b/frontend/src/app/api/entity-candidates/[id]/route.ts @@ -0,0 +1,83 @@ +import { NextResponse } from "next/server" + +import { + acceptEntityCandidate, + mergeEntityCandidate, + rejectEntityCandidate, +} from "@/lib/api" + +/** + * Build a redirect target for the entity-candidate form handlers. + * + * @param request - Incoming request used as the base URL for relative redirects. + * @param redirectTo - Caller-provided redirect target, or a fallback path. + * @param params - Query params to append to the redirect target. + * @returns A redirect URL with the provided flash-message params. + */ +function buildRedirectUrl( + request: Request, + redirectTo: string, + params: Record, +) { + const url = new URL(redirectTo || "/entities", request.url) + for (const [key, value] of Object.entries(params)) { + url.searchParams.set(key, value) + } + return url +} + +/** + * Handle entity-candidate review form submissions. + * + * @param request - Incoming form submission request. + * @param context - Route params containing the entity-candidate id. + * @returns A redirect response pointing back to the entities UI. + */ +export async function POST( + request: Request, + context: { params: Promise<{ id: string }> }, +) { + const { id } = await context.params + const formData = await request.formData() + const redirectTo = String(formData.get("redirectTo") || "/entities") + + try { + const projectId = Number.parseInt(String(formData.get("projectId") || "0"), 10) + const candidateId = Number.parseInt(id, 10) + const intent = String(formData.get("intent") || "accept") + + if (intent === "reject") { + await rejectEntityCandidate(candidateId, projectId) + return NextResponse.redirect( + buildRedirectUrl(request, redirectTo, { message: "Candidate rejected." }), + ) + } + + if (intent === "merge") { + const mergedInto = Number.parseInt( + String(formData.get("mergedInto") || "0"), + 10, + ) + if (!Number.isInteger(mergedInto) || mergedInto <= 0) { + throw new Error("Select an entity to merge into.") + } + await mergeEntityCandidate(candidateId, projectId, mergedInto) + return NextResponse.redirect( + buildRedirectUrl(request, redirectTo, { message: "Candidate merged." }), + ) + } + + await acceptEntityCandidate(candidateId, projectId) + return NextResponse.redirect( + buildRedirectUrl(request, redirectTo, { message: "Candidate accepted." }), + ) + } catch (error) { + const message = + error instanceof Error + ? error.message + : "Unable to update entity candidate." + return NextResponse.redirect( + buildRedirectUrl(request, redirectTo, { error: message }), + ) + } +} diff --git a/frontend/src/app/entities/[id]/__tests__/page.test.tsx b/frontend/src/app/entities/[id]/__tests__/page.test.tsx new file mode 100644 index 00000000..6327b0de --- /dev/null +++ b/frontend/src/app/entities/[id]/__tests__/page.test.tsx @@ -0,0 +1,239 @@ +import { render, screen } from "@testing-library/react" +import type { ReactNode } from "react" +import { beforeEach, describe, expect, it, vi } from "vitest" + +import type { Entity, Project } from "@/lib/types" + +const { + getProjectEntitiesMock, + getProjectEntityMentionsMock, + getProjectEntityMock, + getProjectsMock, + selectProjectMock, +} = vi.hoisted(() => ({ + getProjectEntitiesMock: vi.fn(), + getProjectEntityMentionsMock: vi.fn(), + getProjectEntityMock: vi.fn(), + getProjectsMock: vi.fn(), + selectProjectMock: vi.fn(), +})) + +vi.mock("@/components/app-shell", () => ({ + AppShell: ({ + children, + description, + title, + }: { + children: ReactNode + description: string + title: string + }) => ( +
+

{title}

+

{description}

+ {children} +
+ ), +})) + +vi.mock("@/components/status-badge", () => ({ + StatusBadge: ({ + children, + tone, + }: { + children: ReactNode + tone: string + }) => ( + + {children} + + ), +})) + +vi.mock("@/lib/api", () => ({ + getProjectEntities: getProjectEntitiesMock, + getProjectEntity: getProjectEntityMock, + getProjectEntityMentions: getProjectEntityMentionsMock, + getProjects: getProjectsMock, +})) + +vi.mock("@/lib/view-helpers", async () => { + const actual = await vi.importActual( + "@/lib/view-helpers", + ) + + return { + ...actual, + selectProject: selectProjectMock, + } +}) + +function createProject(overrides: Partial = {}): Project { + return { + id: 1, + name: "AI Weekly", + group: 10, + topic_description: "AI news", + content_retention_days: 30, + created_at: "2026-04-01T00:00:00Z", + ...overrides, + } +} + +function createEntity(overrides: Partial = {}): Entity { + return { + id: 7, + project: 1, + name: "OpenAI", + type: "vendor", + description: "LLM provider", + authority_score: 0.82, + website_url: "https://openai.com", + github_url: "https://github.com/openai", + linkedin_url: "", + bluesky_handle: "openai.bsky.social", + mastodon_handle: "", + twitter_handle: "openai", + mention_count: 2, + latest_mentions: [], + created_at: "2026-04-28T10:00:00Z", + ...overrides, + } +} + +async function loadEntityDetailPageModule() { + return import("../page") +} + +async function renderEntityDetailPage( + searchParams: Record = { + project: "1", + }, + params: { id: string } = { id: "7" }, +) { + const { default: EntityDetailPage } = await loadEntityDetailPageModule() + + return render( + await EntityDetailPage({ + params: Promise.resolve(params), + searchParams: Promise.resolve(searchParams), + }), + ) +} + +describe("EntityDetailPage", () => { + beforeEach(() => { + const defaultProject = createProject() + const defaultEntity = createEntity() + + getProjectsMock.mockReset() + getProjectEntityMock.mockReset() + getProjectEntityMentionsMock.mockReset() + getProjectEntitiesMock.mockReset() + selectProjectMock.mockReset() + + getProjectsMock.mockResolvedValue([defaultProject]) + getProjectEntityMock.mockResolvedValue(defaultEntity) + getProjectEntityMentionsMock.mockResolvedValue([]) + getProjectEntitiesMock.mockResolvedValue([defaultEntity]) + selectProjectMock.mockImplementation((projects: Project[]) => { + return projects[0] ?? null + }) + }) + + it("renders the no-project empty state and skips entity lookups", async () => { + getProjectsMock.mockResolvedValue([]) + selectProjectMock.mockReturnValue(null) + + await renderEntityDetailPage({}, { id: "7" }) + + expect(selectProjectMock).toHaveBeenCalledWith([], {}) + expect( + screen.getByText("No project is available for the configured API user."), + ).toBeInTheDocument() + expect(getProjectEntityMock).not.toHaveBeenCalled() + expect(getProjectEntityMentionsMock).not.toHaveBeenCalled() + }) + + it("renders entity metadata, identity links, and mention history", async () => { + const selectedProject = createProject({ id: 3, name: "Data Signals" }) + const entity = createEntity({ + id: 11, + project: 3, + name: "Anthropic", + type: "organization", + authority_score: 0.91, + description: "Safety-focused AI company", + website_url: "https://anthropic.com", + github_url: "", + linkedin_url: "", + bluesky_handle: "", + mastodon_handle: "", + twitter_handle: "anthropicai", + mention_count: 2, + }) + getProjectsMock.mockResolvedValue([selectedProject]) + selectProjectMock.mockReturnValue(selectedProject) + getProjectEntityMock.mockResolvedValue(entity) + getProjectEntityMentionsMock.mockResolvedValue([ + { + id: 31, + content_id: 22, + content_title: "Anthropic ships a safety update", + role: "subject", + sentiment: "positive", + span: "Anthropic", + confidence: 0.94, + created_at: "2026-04-28T12:00:00Z", + }, + { + id: 32, + content_id: 23, + content_title: "Platform teams discuss Anthropic", + role: "mentioned", + sentiment: "neutral", + span: "Anthropic", + confidence: 0.76, + created_at: "2026-04-28T13:00:00Z", + }, + ]) + getProjectEntitiesMock.mockResolvedValue([ + entity, + createEntity({ + id: 12, + project: 3, + name: "OpenAI", + mention_count: 1, + }), + ]) + + await renderEntityDetailPage({ project: "3" }, { id: "11" }) + + expect(getProjectEntityMock).toHaveBeenCalledWith(3, 11) + expect(getProjectEntityMentionsMock).toHaveBeenCalledWith(3, 11) + expect(screen.getByRole("heading", { name: "Anthropic" })).toBeInTheDocument() + expect(screen.getByText("2 mentions")).toBeInTheDocument() + expect(screen.getByText("Authority 0.91")).toBeInTheDocument() + expect(screen.getByText("Safety-focused AI company")).toBeInTheDocument() + expect(screen.getByText("Website")).toBeInTheDocument() + expect(screen.getByText("Twitter anthropicai")).toBeInTheDocument() + expect( + screen.getByText("Anthropic ships a safety update"), + ).toBeInTheDocument() + expect(screen.getByText("94% confidence")).toBeInTheDocument() + expect(screen.getByText("Back to entities")).toBeInTheDocument() + expect(screen.getByText("OpenAI")).toBeInTheDocument() + + const badge = screen.getByTestId("status-badge") + expect(badge).toHaveAttribute("data-tone", "neutral") + expect(badge).toHaveTextContent("organization") + }) + + it("renders an empty mention state when no mentions exist", async () => { + await renderEntityDetailPage({ project: "1" }, { id: "7" }) + + expect( + screen.getByText("No extracted mentions exist for this entity yet."), + ).toBeInTheDocument() + }) +}) diff --git a/frontend/src/app/entities/[id]/page.tsx b/frontend/src/app/entities/[id]/page.tsx new file mode 100644 index 00000000..35c40b64 --- /dev/null +++ b/frontend/src/app/entities/[id]/page.tsx @@ -0,0 +1,243 @@ +import Link from "next/link" + +import { AppShell } from "@/components/app-shell" +import { StatusBadge } from "@/components/status-badge" +import { + getProjectEntities, + getProjectEntity, + getProjectEntityMentions, + getProjects, +} from "@/lib/api" +import { + formatDate, + getErrorMessage, + getSuccessMessage, + selectProject, +} from "@/lib/view-helpers" + +type EntityDetailPageProps = { + params: Promise<{ id: string }> + searchParams: Promise> +} + +/** + * Render the detail view for one tracked entity in the selected project. + * + * The page joins the entity record with its extracted mention history so editors can + * inspect how the pipeline is linking content to the entity over time. + * + * @param props - Async server component props from the App Router. + * @param props.params - Route params promise containing the entity id. + * @param props.searchParams - Search params promise containing the optional `project`, `error`, and `message` values. + * @returns The rendered entity detail page or the no-project empty state. + */ +export default async function EntityDetailPage({ + params, + searchParams, +}: EntityDetailPageProps) { + const [{ id }, resolvedSearchParams] = await Promise.all([params, searchParams]) + const projects = await getProjects() + const selectedProject = selectProject(projects, resolvedSearchParams) + + if (!selectedProject) { + return ( + +
+ Create a project first in Django admin. +
+
+ ) + } + + const entityId = Number.parseInt(id, 10) + const [entity, mentions, projectEntities] = await Promise.all([ + getProjectEntity(selectedProject.id, entityId), + getProjectEntityMentions(selectedProject.id, entityId), + getProjectEntities(selectedProject.id), + ]) + const errorMessage = getErrorMessage(resolvedSearchParams) + const successMessage = getSuccessMessage(resolvedSearchParams) + const siblingEntities = projectEntities.filter((candidate) => candidate.id !== entity.id) + + return ( + + {errorMessage ? ( +
{errorMessage}
+ ) : null} + {successMessage ? ( +
{successMessage}
+ ) : null} + +
+
+
+
+
+

Tracked entity

+

+ {entity.name} +

+
+ Created {formatDate(entity.created_at)} + {entity.mention_count} mention{entity.mention_count === 1 ? "" : "s"} + Authority {entity.authority_score.toFixed(2)} +
+
+ {entity.type} +
+ +
+
+

+ Description +

+

+ {entity.description || "No description is set for this entity yet."} +

+
+
+

+ Identity links +

+
    + {entity.website_url ? ( +
  • + + Website + +
  • + ) : null} + {entity.github_url ? ( +
  • + + GitHub + +
  • + ) : null} + {entity.linkedin_url ? ( +
  • + + LinkedIn + +
  • + ) : null} + {entity.bluesky_handle ?
  • Bluesky {entity.bluesky_handle}
  • : null} + {entity.mastodon_handle ?
  • Mastodon {entity.mastodon_handle}
  • : null} + {entity.twitter_handle ?
  • Twitter {entity.twitter_handle}
  • : null} + {!entity.website_url && + !entity.github_url && + !entity.linkedin_url && + !entity.bluesky_handle && + !entity.mastodon_handle && + !entity.twitter_handle ? ( +
  • No external identity links are set yet.
  • + ) : null} +
+
+
+
+ +
+
+
+

Mention history

+

+ Extracted mentions linked to this entity +

+
+ {mentions.length} total mention{mentions.length === 1 ? "" : "s"} +
+ {mentions.length === 0 ? ( +
+ No extracted mentions exist for this entity yet. +
+ ) : ( +
    + {mentions.map((mention) => ( +
  • +
    +
    + + {mention.content_title} + +
    + {mention.role} + {mention.sentiment ? {mention.sentiment} : null} + {Math.round(mention.confidence * 100)}% confidence + {formatDate(mention.created_at)} +
    +
    + {mention.span ? ( + + {mention.span} + + ) : null} +
    +
  • + ))} +
+ )} +
+
+ +
+
+

Navigation

+
+ + Back to entities + +
+
+ +
+
+

Related entities

+

+ Same-project entities +

+
+ {siblingEntities.length === 0 ? ( +

+ No other entities exist in this project yet. +

+ ) : ( +
    + {siblingEntities.slice(0, 6).map((siblingEntity) => ( +
  • + + {siblingEntity.name} + +
    + {siblingEntity.type} + {siblingEntity.mention_count} mention{siblingEntity.mention_count === 1 ? "" : "s"} +
    +
  • + ))} +
+ )} +
+
+
+
+ ) +} diff --git a/frontend/src/app/entities/__tests__/page.test.tsx b/frontend/src/app/entities/__tests__/page.test.tsx index 4e719e11..bc325b7c 100644 --- a/frontend/src/app/entities/__tests__/page.test.tsx +++ b/frontend/src/app/entities/__tests__/page.test.tsx @@ -2,15 +2,19 @@ import { render, screen } from "@testing-library/react" import type { ReactNode } from "react" import { beforeEach, describe, expect, it, vi } from "vitest" -import type { Entity, Project } from "@/lib/types" +import type { Entity, EntityCandidate, Project } from "@/lib/types" -const { getProjectEntitiesMock, getProjectsMock, selectProjectMock } = vi.hoisted( - () => ({ - getProjectEntitiesMock: vi.fn(), - getProjectsMock: vi.fn(), - selectProjectMock: vi.fn(), - }), -) +const { + getProjectEntitiesMock, + getProjectEntityCandidatesMock, + getProjectsMock, + selectProjectMock, +} = vi.hoisted(() => ({ + getProjectEntitiesMock: vi.fn(), + getProjectEntityCandidatesMock: vi.fn(), + getProjectsMock: vi.fn(), + selectProjectMock: vi.fn(), +})) vi.mock("@/components/app-shell", () => ({ AppShell: ({ @@ -46,6 +50,7 @@ vi.mock("@/components/status-badge", () => ({ vi.mock("@/lib/api", () => ({ getProjectEntities: getProjectEntitiesMock, + getProjectEntityCandidates: getProjectEntityCandidatesMock, getProjects: getProjectsMock, })) @@ -86,11 +91,33 @@ function createEntity(overrides: Partial = {}): Entity { bluesky_handle: "openai.bsky.social", mastodon_handle: "@openai@mastodon.social", twitter_handle: "openai", + mention_count: 0, + latest_mentions: [], created_at: "2026-04-28T10:00:00Z", ...overrides, } } +function createEntityCandidate( + overrides: Partial = {}, +): EntityCandidate { + return { + id: 14, + project: 1, + name: "River Labs", + suggested_type: "vendor", + first_seen_in: 21, + first_seen_title: "River Labs launches hosted platform", + occurrence_count: 2, + status: "pending", + merged_into: null, + merged_into_name: "", + created_at: "2026-04-28T10:00:00Z", + updated_at: "2026-04-28T11:00:00Z", + ...overrides, + } +} + async function loadEntitiesPageModule() { return import("../page") } @@ -115,10 +142,12 @@ describe("EntitiesPage", () => { getProjectsMock.mockReset() getProjectEntitiesMock.mockReset() + getProjectEntityCandidatesMock.mockReset() selectProjectMock.mockReset() getProjectsMock.mockResolvedValue([defaultProject]) getProjectEntitiesMock.mockResolvedValue([]) + getProjectEntityCandidatesMock.mockResolvedValue([]) selectProjectMock.mockImplementation((projects: Project[]) => { return projects[0] ?? null }) @@ -138,6 +167,7 @@ describe("EntitiesPage", () => { screen.getByText("Create a project first in Django admin."), ).toBeInTheDocument() expect(getProjectEntitiesMock).not.toHaveBeenCalled() + expect(getProjectEntityCandidatesMock).not.toHaveBeenCalled() }) it("renders flash messages and the empty entity state", async () => { @@ -160,10 +190,14 @@ describe("EntitiesPage", () => { expect( screen.getByText("No entities exist for this project yet."), ).toBeInTheDocument() + expect( + screen.getByText("No pending entity candidates right now."), + ).toBeInTheDocument() expect(getProjectEntitiesMock).toHaveBeenCalledWith(1) + expect(getProjectEntityCandidatesMock).toHaveBeenCalledWith(1) }) - it("renders entity cards, badge tone, and edit form defaults", async () => { + it("renders entity cards, mention summaries, and the candidate queue", async () => { const selectedProject = createProject({ id: 3, name: "Data Signals" }) getProjectsMock.mockResolvedValue([selectedProject]) selectProjectMock.mockReturnValue(selectedProject) @@ -181,6 +215,25 @@ describe("EntitiesPage", () => { bluesky_handle: "", mastodon_handle: "", twitter_handle: "anthropicai", + mention_count: 2, + latest_mentions: [ + { + id: 31, + content_id: 22, + content_title: "Anthropic ships a safety update", + role: "subject", + sentiment: "positive", + span: "Anthropic", + confidence: 0.94, + created_at: "2026-04-28T12:00:00Z", + }, + ], + }), + ]) + getProjectEntityCandidatesMock.mockResolvedValue([ + createEntityCandidate({ + project: 3, + occurrence_count: 3, }), ]) @@ -192,16 +245,29 @@ describe("EntitiesPage", () => { ) expect(screen.getByRole("heading", { name: "Anthropic" })).toBeInTheDocument() expect(screen.getByText("Authority 0.91")).toBeInTheDocument() + expect(screen.getByText("2 mentions")).toBeInTheDocument() + expect( + screen.getByText("Anthropic ships a safety update"), + ).toBeInTheDocument() + expect(screen.getByText("94% confidence")).toBeInTheDocument() + expect(screen.getByText("Pending entity candidates")).toBeInTheDocument() + expect(screen.getByText("River Labs")).toBeInTheDocument() + expect(screen.getByText("3 occurrences")).toBeInTheDocument() + expect( + screen.getByText("First seen in River Labs launches hosted platform"), + ).toBeInTheDocument() - const badge = screen.getByTestId("status-badge") - expect(badge).toHaveAttribute("data-tone", "neutral") - expect(badge).toHaveTextContent("organization") + const badges = screen.getAllByTestId("status-badge") + expect(badges[0]).toHaveAttribute("data-tone", "warning") + expect(badges[0]).toHaveTextContent("pending") + expect(badges[1]).toHaveAttribute("data-tone", "neutral") + expect(badges[1]).toHaveTextContent("organization") expect( screen.getByDisplayValue("Safety-focused AI company"), ).toBeInTheDocument() expect(screen.getByDisplayValue("https://anthropic.com")).toBeInTheDocument() expect(screen.getByDisplayValue("anthropicai")).toBeInTheDocument() - expect(screen.getAllByDisplayValue("/entities?project=3")).toHaveLength(3) + expect(screen.getAllByDisplayValue("/entities?project=3")).toHaveLength(6) }) }) diff --git a/frontend/src/app/entities/page.tsx b/frontend/src/app/entities/page.tsx index 6e952df9..32f0eb4c 100644 --- a/frontend/src/app/entities/page.tsx +++ b/frontend/src/app/entities/page.tsx @@ -1,6 +1,12 @@ +import Link from "next/link" + import { AppShell } from "@/components/app-shell" import { StatusBadge } from "@/components/status-badge" -import { getProjectEntities, getProjects } from "@/lib/api" +import { + getProjectEntities, + getProjectEntityCandidates, + getProjects, +} from "@/lib/api" import { formatDate, getErrorMessage, @@ -47,6 +53,7 @@ export default async function EntitiesPage({ } const entities = await getProjectEntities(selectedProject.id) + const entityCandidates = await getProjectEntityCandidates(selectedProject.id) const errorMessage = getErrorMessage(resolvedSearchParams) const successMessage = getSuccessMessage(resolvedSearchParams) @@ -64,72 +71,170 @@ export default async function EntitiesPage({
{successMessage}
) : null} -
-
-

Create entity

-
- - -
- - -
-