Skip to content

Commit 284ebd0

Browse files
committed
Implement entity extraction
1 parent 4279351 commit 284ebd0

23 files changed

Lines changed: 3075 additions & 83 deletions

File tree

core/admin.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,18 @@
1616
from import_export.admin import ExportActionMixin
1717
from unfold.admin import ModelAdmin
1818

19+
from core.entity_extraction import (
20+
accept_entity_candidate,
21+
merge_entity_candidate,
22+
reject_entity_candidate,
23+
)
1924
from core.models import (
2025
BlueskyCredentials,
2126
Content,
2227
Entity,
28+
EntityCandidate,
29+
EntityCandidateStatus,
30+
EntityMention,
2331
IngestionRun,
2432
Project,
2533
ProjectConfig,
@@ -199,6 +207,7 @@ class EntityAdmin(admin.ModelAdmin):
199207

200208
# 'colored_score' is a display method shown in place of the raw 'authority_score' column
201209
list_display = ("name", "project", "type", "colored_score", "created_at")
210+
search_fields = ("name", "project__name")
202211

203212
@admin.display(description="Authority Score", ordering="authority_score")
204213
def colored_score(self, obj):
@@ -219,6 +228,116 @@ def colored_score(self, obj):
219228
)
220229

221230

231+
@admin.register(EntityMention)
class EntityMentionAdmin(admin.ModelAdmin):
    """Changelist configuration for entity mentions produced by extraction.

    The class is purely declarative: it selects the visible columns, the
    sidebar filters, the searchable fields, and which foreign keys are
    edited through autocomplete widgets.
    """

    # Foreign keys edited through autocomplete widgets on the change form.
    autocomplete_fields = ("entity", "content", "project")

    # Search matches the related entity name, the related content title,
    # and the mention's own "span" field.
    search_fields = ("entity__name", "content__title", "span")

    list_filter = (
        "role",
        "sentiment",
        # Restrict the project filter to projects that are actually referenced.
        ("project", admin.RelatedOnlyFieldListFilter),
    )

    list_display = (
        "entity",
        "project",
        "content",
        "role",
        "sentiment",
        "confidence",
        "created_at",
    )
248+
249+
@admin.register(EntityCandidate)
class EntityCandidateAdmin(admin.ModelAdmin):
    """Admin view for candidate entities awaiting human review.

    In addition to the changelist configuration, three bulk actions are
    offered: accept, reject, and merge into an existing same-name entity.
    """

    actions = [
        "accept_selected_candidates",
        "reject_selected_candidates",
        "merge_into_existing_entities",
    ]
    list_display = (
        "name",
        "project",
        "suggested_type",
        "occurrence_count",
        "status",
        "merged_into",
        "first_seen_in",
        "created_at",
    )
    list_filter = (
        "status",
        "suggested_type",
        ("project", admin.RelatedOnlyFieldListFilter),
    )
    search_fields = ("name", "project__name", "merged_into__name")
    autocomplete_fields = ("project", "first_seen_in", "merged_into")
    # Highest occurrence count first; name breaks ties.
    ordering = ("-occurrence_count", "name")

    @admin.action(description="Accept selected candidates")
    def accept_selected_candidates(self, request, queryset):
        """Promote selected candidates into tracked entities.

        Candidates whose status is already ACCEPTED are skipped and are not
        included in the reported count.
        """

        accepted_count = 0
        # presumably accept_entity_candidate reads candidate.project, hence the
        # select_related -- verify against core.entity_extraction.
        for candidate in queryset.select_related("project"):
            if candidate.status == EntityCandidateStatus.ACCEPTED:
                continue
            accept_entity_candidate(candidate)
            accepted_count += 1
        self.message_user(
            request,
            f"Accepted {accepted_count} entity candidate(s).",
            messages.SUCCESS,
        )

    @admin.action(description="Reject selected candidates")
    def reject_selected_candidates(self, request, queryset):
        """Mark selected candidates as rejected.

        Candidates whose status is already REJECTED are skipped and are not
        included in the reported count.
        """

        rejected_count = 0
        for candidate in queryset:
            if candidate.status == EntityCandidateStatus.REJECTED:
                continue
            reject_entity_candidate(candidate)
            rejected_count += 1
        self.message_user(
            request,
            f"Rejected {rejected_count} entity candidate(s).",
            messages.SUCCESS,
        )

    @admin.action(description="Merge selected candidates into existing entities")
    def merge_into_existing_entities(self, request, queryset):
        """Merge candidates when a same-name entity already exists in the project.

        A candidate is merged only when exactly one entity in its project
        matches its name case-insensitively. Candidates with zero or several
        matches are left untouched and their names are listed in a warning.
        """

        # NOTE(review): unlike the accept/reject actions, candidates are not
        # skipped based on their current status here -- confirm that
        # merge_entity_candidate tolerates already-merged candidates.
        merged_count = 0
        unresolved_names: list[str] = []
        for candidate in queryset.select_related("project"):
            # Fetch at most two rows in one query. That is enough to tell
            # "exactly one match" from "none" or "several", and it avoids the
            # previous count()-then-get() pair, which cost two queries and
            # could raise if matching rows changed between the two calls.
            matching_entities = list(
                Entity.objects.filter(
                    project=candidate.project,
                    name__iexact=candidate.name,
                )[:2]
            )
            if len(matching_entities) != 1:
                unresolved_names.append(candidate.name)
                continue
            merge_entity_candidate(candidate, matching_entities[0])
            merged_count += 1

        if merged_count:
            self.message_user(
                request,
                f"Merged {merged_count} entity candidate(s) into existing entities.",
                messages.SUCCESS,
            )
        if unresolved_names:
            self.message_user(
                request,
                "No unique same-name entity match was available for: "
                + ", ".join(sorted(unresolved_names)),
                messages.WARNING,
            )
339+
340+
222341
class HighValueFilter(admin.SimpleListFilter):
223342
"""Filter content down to high-value reference items."""
224343

core/api.py

Lines changed: 116 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import logging
99
from typing import Any
1010

11+
from django.db.models import Count, Prefetch
1112
from drf_spectacular.utils import (
1213
OpenApiExample,
1314
OpenApiParameter,
@@ -21,10 +22,17 @@
2122
from rest_framework.exceptions import NotFound
2223
from rest_framework.response import Response
2324

25+
from core.entity_extraction import (
26+
accept_entity_candidate,
27+
merge_entity_candidate,
28+
reject_entity_candidate,
29+
)
2430
from core.models import (
2531
BlueskyCredentials,
2632
Content,
2733
Entity,
34+
EntityCandidate,
35+
EntityMention,
2836
IngestionRun,
2937
Project,
3038
ProjectConfig,
@@ -35,6 +43,9 @@
3543
)
3644
from core.serializers import (
3745
ContentSerializer,
46+
EntityCandidateMergeSerializer,
47+
EntityCandidateSerializer,
48+
EntityMentionSummarySerializer,
3849
EntitySerializer,
3950
IngestionRunSerializer,
4051
ProjectConfigSerializer,
@@ -693,7 +704,111 @@ class EntityViewSet(ProjectOwnedQuerysetMixin, viewsets.ModelViewSet):
693704
"""Manage tracked entities associated with a project."""
694705

695706
serializer_class = EntitySerializer
696-
queryset = Entity.objects.select_related("project")
707+
queryset = (
708+
Entity.objects.select_related("project")
709+
.annotate(mention_count=Count("mentions", distinct=True))
710+
.prefetch_related(
711+
Prefetch(
712+
"mentions",
713+
queryset=EntityMention.objects.select_related("content").order_by(
714+
"-created_at"
715+
),
716+
to_attr="prefetched_mentions",
717+
)
718+
)
719+
)
720+
721+
@extend_schema(
    summary="List entity mentions",
    description="Return the extracted mention history for one tracked entity inside the selected project.",
    request=None,
    responses={200: EntityMentionSummarySerializer(many=True), 403: AUTHENTICATION_REQUIRED_RESPONSE},
    tags=["Entity Catalog"],
)
@action(detail=True, methods=["get"], url_path="mentions")
def mentions(self, request, *args, **kwargs):
    """Return the extracted mentions for the selected entity."""

    entity = self.get_object()
    # The viewset queryset prefetches mentions (with their content, newest
    # first) into "prefetched_mentions", so get_object() has normally loaded
    # them already. Reuse that list instead of issuing the same query again.
    mentions = getattr(entity, "prefetched_mentions", None)
    if mentions is None:
        # Fallback for instances that were loaded without the prefetch.
        mentions = entity.mentions.select_related("content").order_by("-created_at")
    serializer = EntityMentionSummarySerializer(mentions, many=True)
    return Response(serializer.data)
736+
737+
738+
@document_project_owned_viewset(
    resource_plural="entity candidates",
    resource_singular="entity candidate",
    create_description="Entity candidates are created by the pipeline and can be reviewed through dedicated actions.",
    tag="Entity Catalog",
    action_overrides=build_crud_action_overrides(
        EntityCandidateSerializer,
        resource_plural="entity candidates for the selected project",
        resource_singular="entity candidate",
    ),
)
class EntityCandidateViewSet(ProjectOwnedQuerysetMixin, viewsets.ReadOnlyModelViewSet):
    """Inspect and resolve entity candidates surfaced by entity extraction."""

    # The base viewset is read-only; candidates change state only through the
    # accept / reject / merge actions defined below.
    queryset = EntityCandidate.objects.select_related("project", "first_seen_in", "merged_into")
    serializer_class = EntityCandidateSerializer

    def _candidate_response(self, candidate):
        """Reload ``candidate`` from the database and return it serialized."""

        candidate.refresh_from_db()
        return Response(self.get_serializer(candidate).data)

    @extend_schema(
        summary="Accept entity candidate",
        description="Promote a pending entity candidate into a tracked entity and backfill recent mentions.",
        request=None,
        responses={
            200: EntityCandidateSerializer,
            403: AUTHENTICATION_REQUIRED_RESPONSE,
        },
        tags=["Entity Catalog"],
    )
    @action(detail=True, methods=["post"], url_path="accept")
    def accept(self, request, *args, **kwargs):
        """Accept an entity candidate and return its updated representation."""

        candidate = self.get_object()
        accept_entity_candidate(candidate)
        return self._candidate_response(candidate)

    @extend_schema(
        summary="Reject entity candidate",
        description="Mark a pending entity candidate as rejected without creating a tracked entity.",
        request=None,
        responses={
            200: EntityCandidateSerializer,
            403: AUTHENTICATION_REQUIRED_RESPONSE,
        },
        tags=["Entity Catalog"],
    )
    @action(detail=True, methods=["post"], url_path="reject")
    def reject(self, request, *args, **kwargs):
        """Reject an entity candidate and return its updated representation."""

        candidate = self.get_object()
        reject_entity_candidate(candidate)
        return self._candidate_response(candidate)

    @extend_schema(
        summary="Merge entity candidate",
        description="Merge a pending entity candidate into an existing tracked entity from the same project.",
        request=EntityCandidateMergeSerializer,
        responses={
            200: EntityCandidateSerializer,
            400: EntityCandidateMergeSerializer,
            403: AUTHENTICATION_REQUIRED_RESPONSE,
        },
        tags=["Entity Catalog"],
    )
    @action(detail=True, methods=["post"], url_path="merge")
    def merge(self, request, *args, **kwargs):
        """Merge an entity candidate into an existing tracked entity."""

        # Resolve the candidate first so an unknown id fails before the
        # request body is validated, matching the original ordering.
        candidate = self.get_object()
        payload = EntityCandidateMergeSerializer(
            data=request.data,
            context=self.get_serializer_context(),
        )
        payload.is_valid(raise_exception=True)
        target_entity = payload.validated_data["merged_into"]
        merge_entity_candidate(candidate, target_entity)
        return self._candidate_response(candidate)
697812

698813

699814
@document_project_owned_viewset(

core/api_urls.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from core.api import (
55
ContentViewSet,
6+
EntityCandidateViewSet,
67
EntityViewSet,
78
IngestionRunViewSet,
89
ProjectConfigViewSet,
@@ -23,6 +24,11 @@
2324
r"project-configs", ProjectConfigViewSet, basename="project-config"
2425
)
2526
project_router.register(r"entities", EntityViewSet, basename="project-entity")
27+
project_router.register(
28+
r"entity-candidates",
29+
EntityCandidateViewSet,
30+
basename="project-entity-candidate",
31+
)
2632
project_router.register(r"contents", ContentViewSet, basename="project-content")
2733
project_router.register(
2834
r"skill-results", SkillResultViewSet, basename="project-skill-result"

0 commit comments

Comments
 (0)