Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions core/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,12 +238,38 @@ def queryset(self, request, queryset):
return queryset


class DuplicateStateFilter(admin.SimpleListFilter):
    """Sidebar filter that narrows content rows by their duplicate state."""

    title = "Duplicate State"
    parameter_name = "duplicate_state"

    def lookups(self, request, model_admin):
        """List the selectable duplicate states as ``(value, label)`` pairs."""

        canonical_option = (
            "canonical_with_duplicates",
            "Canonical rows with duplicate signals",
        )
        suppressed_option = ("suppressed_duplicates", "Suppressed duplicate rows")
        return (canonical_option, suppressed_option)

    def queryset(self, request, queryset):
        """Narrow ``queryset`` to the chosen duplicate state, if one is chosen."""

        # Map each sidebar value to the ORM lookup that implements it.
        lookups_by_state = {
            "canonical_with_duplicates": {"duplicate_signal_count__gt": 0},
            "suppressed_duplicates": {"duplicate_of__isnull": False},
        }
        conditions = lookups_by_state.get(self.value())
        if conditions is None:
            # No selection (or an unknown value): leave the changelist untouched.
            return queryset
        return queryset.filter(**conditions)


@admin.register(Content)
class ContentAdmin(admin.ModelAdmin):
"""Admin view for curated content plus trace and score context."""

list_display = (
"display_relevance",
"duplicate_badge",
"duplicate_parent",
"is_active",
"is_reference",
"preview_content",
Expand All @@ -255,6 +281,7 @@ class ContentAdmin(admin.ModelAdmin):
list_editable = ("is_reference", "is_active")
list_filter = (
HighValueFilter,
DuplicateStateFilter,
("project", admin.RelatedOnlyFieldListFilter),
"source_plugin",
"is_active",
Expand Down Expand Up @@ -377,6 +404,25 @@ def display_relevance(self, obj):
)
return format_html('<b style="color: {};">{}%</b>', color, obj.relevance_score)

@admin.display(description="Duplicates", ordering="duplicate_signal_count")
def duplicate_badge(self, obj):
    """Render a badge with the row's duplicate sighting count, or "-" if none."""

    sightings = obj.duplicate_signal_count
    if sightings > 0:
        return format_html(
            '<span style="font-weight: bold; color: #0f766e;">Also seen in {} source(s)</span>',
            sightings,
        )
    # Zero (or negative) counts carry no signal worth highlighting.
    return "-"

@admin.display(description="Duplicate Of", ordering="duplicate_of")
def duplicate_parent(self, obj):
    """Return the title of the row this item duplicates, or "-" when it has none."""

    canonical_row = obj.duplicate_of
    return "-" if canonical_row is None else canonical_row.title

def changelist_view(self, request, extra_context=None):
"""Augment the changelist with content dashboard statistics."""

Expand Down
3 changes: 3 additions & 0 deletions core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,14 @@
"entity": 4,
"source_plugin": "rss",
"content_type": "article",
"canonical_url": "https://example.com/posts/agent-memory-patterns",
"published_date": "2026-04-25T14:00:00Z",
"ingested_at": "2026-04-26T12:05:00Z",
"content_text": "A walkthrough of short-term and long-term memory patterns for production agents.",
"relevance_score": 0.92,
"embedding_id": "emb_01jabcxyz",
"duplicate_of": None,
"duplicate_signal_count": 2,
"is_reference": False,
"is_active": True,
},
Expand Down
78 changes: 78 additions & 0 deletions core/deduplication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Helpers for canonical URL normalization used by content deduplication."""

from __future__ import annotations

from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

import httpx

# Query-parameter names that are stripped during canonicalization. Keys are
# compared after lower-casing (see _should_drop_query_parameter); any key with
# the "utm_" prefix is dropped separately and does not need to be listed here.
# NOTE(review): "s", "t" and "ref" are presumably share/referrer markers, but
# they are generic names — confirm no ingested source uses them as meaningful
# parameters (e.g. a search query), since dropping them would merge such URLs.
TRACKING_QUERY_KEYS = frozenset(
    {
        "fbclid",
        "gclid",
        "mc_cid",
        "mc_eid",
        "ref",
        "ref_src",
        "s",
        "t",
    }
)
# Hostnames treated as link shorteners; URLs on these hosts are expanded over
# the network by _resolve_known_shortener before normalization. Matching is an
# exact, lower-cased hostname comparison.
KNOWN_SHORTENER_HOSTS = frozenset({"bit.ly", "buff.ly", "lnkd.in", "t.co"})


def canonicalize_url(raw_url: str) -> str:
    """Normalize a URL into a stable canonical form for deduplication.

    The normalization expands known short links, lower-cases the scheme and
    host, defaults a missing scheme to ``https``, strips a leading ``www.``,
    drops default ports, userinfo, fragments and trailing slashes, and removes
    tracking query parameters while keeping the remaining ones in order.

    Args:
        raw_url: The URL as received from a source; may be empty, padded with
            whitespace, or missing its scheme.

    Returns:
        The canonical URL, ``""`` for empty/whitespace-only input, or the
        stripped input unchanged when it cannot be parsed as a URL.
    """

    cleaned_url = raw_url.strip() if raw_url else ""
    if not cleaned_url:
        # Covers whitespace-only input, which would otherwise become "https:///".
        return ""

    try:
        resolved_url = _resolve_known_shortener(cleaned_url)
        parsed_url = urlsplit(resolved_url)
        if (
            not parsed_url.scheme
            and not parsed_url.netloc
            and not resolved_url.startswith("/")
        ):
            # Scheme-less input such as "example.com/post": re-parse it as a
            # network location so the host does not end up inside the path.
            parsed_url = urlsplit(f"//{resolved_url}")
        # Accessing .port validates it and raises ValueError when malformed.
        port = parsed_url.port
    except ValueError:
        # Unparseable authority or port: best effort, keep the input as-is.
        return cleaned_url

    scheme = (parsed_url.scheme or "https").lower()
    hostname = (parsed_url.hostname or "").lower().removeprefix("www.")

    # .hostname strips the brackets from IPv6 literals; restore them so the
    # rebuilt URL stays valid and the port separator remains unambiguous.
    netloc = f"[{hostname}]" if ":" in hostname else hostname
    if port and not _is_default_port(scheme, port):
        netloc = f"{netloc}:{port}"

    # An empty path and any run of trailing slashes both collapse to "/".
    path = parsed_url.path.rstrip("/") or "/"

    filtered_query = urlencode(
        [
            (key, value)
            for key, value in parse_qsl(parsed_url.query, keep_blank_values=True)
            if not _should_drop_query_parameter(key)
        ],
        doseq=True,
    )

    # The fragment is always discarded; it never identifies a different document.
    return urlunsplit((scheme, netloc, path, filtered_query, ""))


def _resolve_known_shortener(raw_url: str) -> str:
    """Expand a supported short URL, falling back to the input on failure.

    Only hosts listed in ``KNOWN_SHORTENER_HOSTS`` trigger a network request.
    The request is a ``HEAD`` that follows redirects with a 5 second timeout.

    Args:
        raw_url: The URL to inspect and possibly expand.

    Returns:
        The final URL after redirects, or ``raw_url`` unchanged when the host
        is not a known shortener, the URL is malformed, the request fails, or
        an error status is returned before any redirect was followed.
    """

    try:
        hostname = (urlsplit(raw_url).hostname or "").lower()
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 bracket): nothing to expand.
        return raw_url
    if hostname not in KNOWN_SHORTENER_HOSTS:
        return raw_url

    try:
        response = httpx.head(raw_url, follow_redirects=True, timeout=5.0)
    except (httpx.HTTPError, httpx.InvalidURL):
        # InvalidURL is not part of the HTTPError hierarchy, so name it too.
        return raw_url

    if response.is_error and not response.history:
        # The shortener itself answered with 4xx/5xx; there is no expansion.
        return raw_url
    # Once at least one redirect was followed the destination is known, even
    # if the target site rejects HEAD requests (403/405) or is currently down.
    return str(response.url)


def _should_drop_query_parameter(key: str) -> bool:
normalized_key = key.lower()
return normalized_key.startswith("utm_") or normalized_key in TRACKING_QUERY_KEYS


def _is_default_port(scheme: str, port: int) -> bool:
return (scheme == "http" and port == 80) or (scheme == "https" and port == 443)
82 changes: 82 additions & 0 deletions core/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
import re
import time
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any

import httpx
import markdown # type: ignore[import-untyped]
from django.conf import settings

JSON_OBJECT_PATTERN = re.compile(r"\{.*\}", re.DOTALL)
Expand All @@ -19,6 +22,17 @@ class OpenRouterJSONResponse:
latency_ms: int


@dataclass(slots=True)
class SkillDefinition:
    """Represents one Claude-style skill markdown document.

    Instances are produced by ``get_skill_definition`` from a ``SKILL.md``
    file: the frontmatter supplies the name and field lists, and the remaining
    body supplies the instructions.
    """

    # Frontmatter "name" value, or the requested skill name when absent/blank.
    name: str
    # Field names parsed from the comma-separated frontmatter "input" value.
    input_fields: tuple[str, ...]
    # Field names parsed from the comma-separated frontmatter "output" value.
    output_fields: tuple[str, ...]
    # Document body after the frontmatter, with surrounding whitespace stripped.
    instructions_markdown: str
    # The same instructions rendered to HTML with ``markdown.markdown``.
    instructions_html: str


def openrouter_chat_json(
*, model: str, system_prompt: str, user_prompt: str
) -> OpenRouterJSONResponse:
Expand Down Expand Up @@ -62,6 +76,42 @@ def openrouter_chat_json(
)


@lru_cache(maxsize=16)
def get_skill_definition(skill_name: str) -> SkillDefinition:
    """Read and parse ``skills/<skill_name>/SKILL.md`` from the repository.

    Results are memoized per skill name (up to 16 entries), so the file is read
    only on the first request for a given skill.
    """

    repository_root = Path(__file__).resolve().parent.parent
    document = (repository_root / "skills" / skill_name / "SKILL.md").read_text(
        encoding="utf-8"
    )
    metadata, body_text = _split_frontmatter(document)
    instructions = body_text.strip()
    declared_name = metadata.get("name", skill_name).strip()
    return SkillDefinition(
        # A blank frontmatter name falls back to the directory name.
        name=declared_name or skill_name,
        input_fields=_csv_field_list(metadata.get("input", "")),
        output_fields=_csv_field_list(metadata.get("output", "")),
        instructions_markdown=instructions,
        instructions_html=markdown.markdown(instructions),
    )


def build_skill_user_prompt(skill_name: str, inputs: dict[str, Any]) -> str:
    """Assemble the user prompt for a skill from its declared input fields.

    Each declared input becomes a ``name:`` heading followed by its serialized
    value (missing inputs render as empty text). When the skill declares output
    fields, a closing instruction lists them. Sections are separated by a blank
    line.
    """

    definition = get_skill_definition(skill_name)
    parts = [
        "{}:\n{}".format(declared, _stringify_skill_input(inputs.get(declared, "")))
        for declared in definition.input_fields
    ]
    if definition.output_fields:
        expected = ", ".join(definition.output_fields)
        parts.append(f"Return only a JSON object with these fields: {expected}")
    return "\n\n".join(parts)


def _extract_json_object(message_content: str) -> dict[str, Any]:
try:
payload = json.loads(message_content)
Expand All @@ -73,3 +123,35 @@ def _extract_json_object(message_content: str) -> dict[str, Any]:
if not isinstance(payload, dict):
raise ValueError("Model response JSON must be an object.")
return payload


def _split_frontmatter(raw_text: str) -> tuple[dict[str, str], str]:
"""Split a skill markdown document into simple frontmatter and body."""

if not raw_text.startswith("---\n"):
return {}, raw_text
_, _, remainder = raw_text.partition("\n")
frontmatter_block, separator, body = remainder.partition("\n---\n")
if not separator:
return {}, raw_text
frontmatter: dict[str, str] = {}
for line in frontmatter_block.splitlines():
if not line.strip() or ":" not in line:
continue
key, value = line.split(":", 1)
frontmatter[key.strip()] = value.strip()
return frontmatter, body


def _csv_field_list(raw_value: str) -> tuple[str, ...]:
"""Parse a comma-separated frontmatter field list."""

return tuple(part.strip() for part in raw_value.split(",") if part.strip())


def _stringify_skill_input(value: Any) -> str:
"""Serialize skill input values into prompt-safe text."""

if isinstance(value, (dict, list, tuple)):
return json.dumps(value, ensure_ascii=True, indent=2, sort_keys=True)
return str(value)
32 changes: 30 additions & 2 deletions core/management/commands/seed_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from httpx import HTTPError
from qdrant_client.http.exceptions import ResponseHandlingException

from core.deduplication import canonicalize_url
from core.embeddings import upsert_content_embedding
from core.models import (
Content,
Expand Down Expand Up @@ -612,6 +613,7 @@ def _seed_articles(
"author": article["author"],
"entity": entities_by_name.get(article.get("entity_name", "")),
"source_plugin": source_plugin or article["source_plugin"],
"canonical_url": canonicalize_url(article["url"]),
"published_date": now - timedelta(days=article["days_ago"]),
"content_text": article["content_text"],
"is_reference": is_reference,
Expand Down Expand Up @@ -887,10 +889,36 @@ def _build_reference_articles(self) -> list[dict[str, Any]]:

def _build_demo_content(self) -> list[dict[str, Any]]:
    """Assemble the demo article list: legacy samples plus generated items.

    The generated RSS and Reddit lists are built once and passed through
    ``_inject_duplicate_variants`` before being appended, so the rewritten
    duplicate entries are the ones that end up in the result. Each generated
    list is appended exactly once.
    """

    articles = list(LEGACY_SAMPLE_CONTENT)
    generated_rss = self._build_generated_rss_content()
    generated_reddit = self._build_generated_reddit_content()
    # Must run before the extends below: it rewrites generated items in place.
    self._inject_duplicate_variants(articles, generated_rss, generated_reddit)
    articles.extend(generated_rss)
    articles.extend(generated_reddit)
    return articles

@staticmethod
def _inject_duplicate_variants(
legacy_articles: list[dict[str, Any]],
generated_rss: list[dict[str, Any]],
generated_reddit: list[dict[str, Any]],
) -> None:
duplicate_pairs = [
(legacy_articles[0], generated_reddit[0], "reddit"),
(generated_rss[2], generated_reddit[1], "community"),
(generated_rss[9], generated_reddit[2], "social"),
]
for base_article, duplicate_article, source_tag in duplicate_pairs:
duplicate_article["url"] = (
f"{base_article['url']}?utm_source={source_tag}&ref=seed-demo"
)
duplicate_article["title"] = (
f"{duplicate_article['title']} linking to {base_article['title']}"
)
duplicate_article["content_text"] = (
f"This seeded item points readers to the same underlying article as '{base_article['title']}'. "
f"{duplicate_article['content_text']}"
)

def _build_generated_rss_content(self) -> list[dict[str, Any]]:
articles = []
for index in range(147):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Generated by Django 6.0.4 on 2026-04-29 01:33

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the three deduplication columns to the ``content`` model.

    dependencies = [
        ("core", "0004_blueskycredentials"),
    ]

    operations = [
        # Normalized URL used as the deduplication key; indexed for lookups and
        # defaulting to "" so existing rows need no backfill to satisfy the column.
        # NOTE(review): no max_length is set, so Django's URLField default
        # applies — confirm long canonical URLs fit within it.
        migrations.AddField(
            model_name="content",
            name="canonical_url",
            field=models.URLField(blank=True, db_index=True, default=""),
        ),
        # Optional self-reference to another content row. SET_NULL clears the
        # link instead of deleting this row when the referenced row is deleted;
        # the reverse accessor on the referenced row is ``duplicates``.
        migrations.AddField(
            model_name="content",
            name="duplicate_of",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name="duplicates",
                to="core.content",
            ),
        ),
        # Integer counter starting at 0 for every existing and new row.
        migrations.AddField(
            model_name="content",
            name="duplicate_signal_count",
            field=models.IntegerField(default=0),
        ),
    ]
Loading
Loading