webstackdev · webstackdev · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/core/admin.py b/core/admin.py
@@ -238,12 +238,38 @@ def queryset(self, request, queryset):
         return queryset
 
 
+class DuplicateStateFilter(admin.SimpleListFilter):
+    """Sidebar filter that narrows content rows by their duplicate bookkeeping.
+
+    Offers two choices: rows with a positive ``duplicate_signal_count`` and rows
+    whose ``duplicate_of`` reference is set.
+    """
+
+    title = "Duplicate State"
+    parameter_name = "duplicate_state"
+
+    def lookups(self, request, model_admin):
+        """Return the ``(value, label)`` pairs offered for this filter."""
+
+        return (
+            ("canonical_with_duplicates", "Canonical rows with duplicate signals"),
+            ("suppressed_duplicates", "Suppressed duplicate rows"),
+        )
+
+    def queryset(self, request, queryset):
+        """Filter ``queryset`` by the selected choice; otherwise return it as is."""
+
+        # Each recognised choice maps to the ORM lookup that selects its rows.
+        lookup_by_choice = {
+            "canonical_with_duplicates": {"duplicate_signal_count__gt": 0},
+            "suppressed_duplicates": {"duplicate_of__isnull": False},
+        }
+        lookup = lookup_by_choice.get(self.value())
+        if lookup is None:
+            return queryset
+        return queryset.filter(**lookup)
+
+
 @admin.register(Content)
 class ContentAdmin(admin.ModelAdmin):
     """Admin view for curated content plus trace and score context."""
 
     list_display = (
         "display_relevance",
+        "duplicate_badge",
+        "duplicate_parent",
         "is_active",
         "is_reference",
         "preview_content",
@@ -255,6 +281,7 @@ class ContentAdmin(admin.ModelAdmin):
     list_editable = ("is_reference", "is_active")
     list_filter = (
         HighValueFilter,
+        DuplicateStateFilter,
         ("project", admin.RelatedOnlyFieldListFilter),
         "source_plugin",
         "is_active",
@@ -377,6 +404,25 @@ def display_relevance(self, obj):
         )
         return format_html('<b style="color: {};">{}%</b>', color, obj.relevance_score)
 
+    @admin.display(description="Duplicates", ordering="duplicate_signal_count")
+    def duplicate_badge(self, obj):
+        """Render the duplicate-sighting count as a bold badge, or a dash when there are none."""
+
+        # Only positive counts get a badge; zero or negative values show a dash.
+        if obj.duplicate_signal_count > 0:
+            return format_html(
+                '<span style="font-weight: bold; color: #0f766e;">Also seen in {} source(s)</span>',
+                obj.duplicate_signal_count,
+            )
+        return "-"
+
+    @admin.display(description="Duplicate Of", ordering="duplicate_of")
+    def duplicate_parent(self, obj):
+        """Return the title of the row this item duplicates, or a dash when it has none."""
+
+        # NOTE(review): reading ``obj.duplicate_of`` may issue one query per
+        # listed row unless the changelist queryset already joins it — confirm
+        # whether ``list_select_related`` covers this relation.
+        canonical = obj.duplicate_of
+        return "-" if canonical is None else canonical.title
+
     def changelist_view(self, request, extra_context=None):
         """Augment the changelist with content dashboard statistics."""
 

diff --git a/core/api.py b/core/api.py
@@ -177,11 +177,14 @@
         "entity": 4,
         "source_plugin": "rss",
         "content_type": "article",
+        "canonical_url": "https://example.com/posts/agent-memory-patterns",
         "published_date": "2026-04-25T14:00:00Z",
         "ingested_at": "2026-04-26T12:05:00Z",
         "content_text": "A walkthrough of short-term and long-term memory patterns for production agents.",
         "relevance_score": 0.92,
         "embedding_id": "emb_01jabcxyz",
+        "duplicate_of": None,
+        "duplicate_signal_count": 2,
         "is_reference": False,
         "is_active": True,
     },

diff --git a/core/deduplication.py b/core/deduplication.py
@@ -0,0 +1,78 @@
+"""Helpers for canonical URL normalization used by content deduplication."""
+
+from __future__ import annotations
+
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+import httpx
+
+# Query-string keys stripped from canonical URLs. Keys are compared after
+# lower-casing, and any key starting with "utm_" is stripped in addition to
+# the ones listed here.
+# NOTE(review): "s" and "t" are very short keys that some sites may use for
+# meaningful parameters — confirm they are safe to drop for every source.
+TRACKING_QUERY_KEYS = frozenset(
+    ("fbclid", "gclid", "mc_cid", "mc_eid", "ref", "ref_src", "s", "t")
+)
+
+# Hosts whose links are expanded with a network request before normalization.
+# The lower-cased hostname must equal an entry exactly, so subdomains such as
+# "www.bit.ly" are not matched.
+KNOWN_SHORTENER_HOSTS = frozenset(("bit.ly", "buff.ly", "lnkd.in", "t.co"))
+
+
+def canonicalize_url(raw_url: str) -> str:
+    """Normalize a URL into a stable canonical form for deduplication.
+
+    Supported short links are expanded first. The result lower-cases the
+    scheme and host (defaulting the scheme to ``https``), and drops a leading
+    ``www.``, userinfo, default ports, trailing slashes, the fragment, and
+    tracking query parameters.
+
+    Args:
+        raw_url: URL as received from a source; may be empty or padded with
+            whitespace.
+
+    Returns:
+        The canonical URL, ``""`` for blank input, or the stripped input
+        unchanged when it cannot be parsed as a URL.
+    """
+
+    # Strip before the emptiness check so whitespace-only input does not
+    # collapse into a bogus host-less URL shared by every blank row.
+    stripped_url = (raw_url or "").strip()
+    if not stripped_url:
+        return ""
+
+    try:
+        resolved_url = _resolve_known_shortener(stripped_url)
+        parsed_url = urlsplit(resolved_url)
+        # ``.port`` validates lazily and raises ValueError for non-numeric or
+        # out-of-range ports, so read it inside the guard.
+        port = parsed_url.port
+    except ValueError:
+        # Best effort: an unparseable URL is kept as-is instead of aborting
+        # the caller's ingestion run.
+        return stripped_url
+
+    scheme = (parsed_url.scheme or "https").lower()
+    hostname = (parsed_url.hostname or "").lower()
+    if hostname.startswith("www."):
+        hostname = hostname[4:]
+    if ":" in hostname:
+        # ``.hostname`` strips the brackets from IPv6 literals; restore them so
+        # the rebuilt netloc stays unambiguous next to an optional port.
+        hostname = f"[{hostname}]"
+
+    netloc = hostname
+    if port and not _is_default_port(scheme, port):
+        netloc = f"{hostname}:{port}"
+
+    # Drop trailing slashes but keep the bare root path.
+    path = parsed_url.path.rstrip("/") or "/"
+
+    filtered_query = urlencode(
+        [
+            (key, value)
+            for key, value in parse_qsl(parsed_url.query, keep_blank_values=True)
+            if not _should_drop_query_parameter(key)
+        ],
+        doseq=True,
+    )
+
+    # The empty last component discards any fragment.
+    return urlunsplit((scheme, netloc, path, filtered_query, ""))
+
+
+def _resolve_known_shortener(raw_url: str) -> str:
+    """Expand a supported short URL, falling back to the input on any failure.
+
+    Args:
+        raw_url: URL to inspect; only hosts listed in ``KNOWN_SHORTENER_HOSTS``
+            trigger a network request.
+
+    Returns:
+        The final URL reached after following redirects when the HEAD request
+        completes without an error status; otherwise ``raw_url`` unchanged.
+    """
+
+    try:
+        hostname = (urlsplit(raw_url).hostname or "").lower()
+    except ValueError:
+        # Unparseable input cannot be matched against the shortener list.
+        return raw_url
+    if hostname not in KNOWN_SHORTENER_HOSTS:
+        return raw_url
+
+    try:
+        response = httpx.head(raw_url, follow_redirects=True, timeout=5.0)
+        response.raise_for_status()
+    except (httpx.HTTPError, httpx.InvalidURL):
+        # Expansion is best-effort. InvalidURL is listed explicitly because it
+        # is not part of the HTTPError hierarchy.
+        return raw_url
+    return str(response.url)
+
+
+def _should_drop_query_parameter(key: str) -> bool:
+    """Return True when ``key`` is a ``utm_*`` or listed tracking parameter.
+
+    The comparison is case-insensitive.
+    """
+
+    lowered = key.lower()
+    if lowered.startswith("utm_"):
+        return True
+    return lowered in TRACKING_QUERY_KEYS
+
+
+def _is_default_port(scheme: str, port: int) -> bool:
+    """Return True when ``port`` is 80 for ``http`` or 443 for ``https``."""
+
+    default_pairs = (("http", 80), ("https", 443))
+    return (scheme, port) in default_pairs
diff --git a/core/llm.py b/core/llm.py
@@ -4,9 +4,12 @@
 import re
 import time
 from dataclasses import dataclass
+from functools import lru_cache
+from pathlib import Path
 from typing import Any
 
 import httpx
+import markdown  # type: ignore[import-untyped]
 from django.conf import settings
 
 JSON_OBJECT_PATTERN = re.compile(r"\{.*\}", re.DOTALL)
@@ -19,6 +22,17 @@ class OpenRouterJSONResponse:
     latency_ms: int
 
 
+@dataclass(slots=True)
+class SkillDefinition:
+    """Represents one Claude-style skill markdown document.
+
+    Instances are built by ``get_skill_definition`` from a ``SKILL.md`` file.
+    """
+
+    # Frontmatter ``name`` value, or the requested skill name when absent/blank.
+    name: str
+    # Field names parsed from the comma-separated ``input`` frontmatter entry.
+    input_fields: tuple[str, ...]
+    # Field names parsed from the comma-separated ``output`` frontmatter entry.
+    output_fields: tuple[str, ...]
+    # Document body after the frontmatter, with surrounding whitespace stripped.
+    instructions_markdown: str
+    # ``instructions_markdown`` rendered through ``markdown.markdown``.
+    instructions_html: str
+
+
 def openrouter_chat_json(
     *, model: str, system_prompt: str, user_prompt: str
 ) -> OpenRouterJSONResponse:
@@ -62,6 +76,42 @@ def openrouter_chat_json(
     )
 
 
+@lru_cache(maxsize=16)
+def get_skill_definition(skill_name: str) -> SkillDefinition:
+    """Read and parse ``skills/<skill_name>/SKILL.md`` into a ``SkillDefinition``.
+
+    The ``skills`` directory is looked up next to this module's parent
+    directory. Results are cached per skill name (up to 16 entries), so later
+    edits to the file are not seen until the cache is cleared. Errors raised
+    while reading the file propagate to the caller.
+    """
+
+    base_dir = Path(__file__).resolve().parent.parent
+    document = (base_dir / "skills" / skill_name / "SKILL.md").read_text(
+        encoding="utf-8"
+    )
+    metadata, instructions = _split_frontmatter(document)
+    instructions = instructions.strip()
+    return SkillDefinition(
+        # A missing or blank frontmatter name falls back to the requested name.
+        name=metadata.get("name", skill_name).strip() or skill_name,
+        input_fields=_csv_field_list(metadata.get("input", "")),
+        output_fields=_csv_field_list(metadata.get("output", "")),
+        instructions_markdown=instructions,
+        instructions_html=markdown.markdown(instructions),
+    )
+
+
+def build_skill_user_prompt(skill_name: str, inputs: dict[str, Any]) -> str:
+    """Build the user prompt for a skill from its declared input fields.
+
+    Each declared input field becomes a ``name:`` heading followed by its
+    serialized value; fields missing from ``inputs`` are rendered as empty
+    text. When the skill declares output fields, a closing instruction lists
+    them. Sections are separated by blank lines.
+    """
+
+    skill = get_skill_definition(skill_name)
+    sections = [
+        f"{field_name}:\n{_stringify_skill_input(inputs.get(field_name, ''))}"
+        for field_name in skill.input_fields
+    ]
+    if skill.output_fields:
+        field_list = ", ".join(skill.output_fields)
+        sections.append(f"Return only a JSON object with these fields: {field_list}")
+    return "\n\n".join(sections)
+
+
 def _extract_json_object(message_content: str) -> dict[str, Any]:
     try:
         payload = json.loads(message_content)
@@ -73,3 +123,35 @@ def _extract_json_object(message_content: str) -> dict[str, Any]:
     if not isinstance(payload, dict):
         raise ValueError("Model response JSON must be an object.")
     return payload
+
+
+def _split_frontmatter(raw_text: str) -> tuple[dict[str, str], str]:
+    """Split a skill markdown document into simple frontmatter and body.
+
+    Frontmatter is recognised only when the text starts with a ``---`` line
+    and a later line consisting solely of ``---`` closes it. The block may be
+    empty, and the closing fence may be the last line without a trailing
+    newline. Inside the block each ``key: value`` line is split on the first
+    colon and both sides are stripped; lines without a colon are ignored.
+
+    Args:
+        raw_text: Full document text.
+
+    Returns:
+        ``(frontmatter, body)``. Without a complete frontmatter block the
+        result is ``({}, raw_text)``.
+    """
+
+    if not raw_text.startswith("---\n"):
+        return {}, raw_text
+
+    lines = raw_text.split("\n")
+    try:
+        # Start at 1 to skip the opening fence at index 0.
+        closing_index = lines.index("---", 1)
+    except ValueError:
+        return {}, raw_text
+
+    frontmatter: dict[str, str] = {}
+    for line in lines[1:closing_index]:
+        key, separator, value = line.partition(":")
+        if separator:
+            frontmatter[key.strip()] = value.strip()
+    return frontmatter, "\n".join(lines[closing_index + 1 :])
+
+
+def _csv_field_list(raw_value: str) -> tuple[str, ...]:
+    """Split ``raw_value`` on commas into stripped, non-empty field names."""
+
+    stripped_parts = map(str.strip, raw_value.split(","))
+    # ``filter(None, ...)`` discards entries that are empty after stripping.
+    return tuple(filter(None, stripped_parts))
+
+
+def _stringify_skill_input(value: Any) -> str:
+    """Turn one skill input value into text for the prompt.
+
+    Dicts, lists and tuples become indented, key-sorted, ASCII-only JSON;
+    every other value is passed through ``str``.
+    """
+
+    if not isinstance(value, (dict, list, tuple)):
+        return str(value)
+    return json.dumps(value, ensure_ascii=True, indent=2, sort_keys=True)
diff --git a/core/management/commands/seed_demo.py b/core/management/commands/seed_demo.py
@@ -12,6 +12,7 @@
 from httpx import HTTPError
 from qdrant_client.http.exceptions import ResponseHandlingException
 
+from core.deduplication import canonicalize_url
 from core.embeddings import upsert_content_embedding
 from core.models import (
     Content,
@@ -612,6 +613,7 @@ def _seed_articles(
                 "author": article["author"],
                 "entity": entities_by_name.get(article.get("entity_name", "")),
                 "source_plugin": source_plugin or article["source_plugin"],
+                "canonical_url": canonicalize_url(article["url"]),
                 "published_date": now - timedelta(days=article["days_ago"]),
                 "content_text": article["content_text"],
                 "is_reference": is_reference,
@@ -887,10 +889,36 @@ def _build_reference_articles(self) -> list[dict[str, Any]]:
 
     def _build_demo_content(self) -> list[dict[str, Any]]:
+        """Return legacy sample articles followed by generated RSS and Reddit items.
+
+        The generated lists are built first so ``_inject_duplicate_variants`` can
+        rewrite some generated Reddit entries in place before they are appended.
+        """
         articles = list(LEGACY_SAMPLE_CONTENT)
-        articles.extend(self._build_generated_rss_content())
-        articles.extend(self._build_generated_reddit_content())
+        generated_rss = self._build_generated_rss_content()
+        generated_reddit = self._build_generated_reddit_content()
+        self._inject_duplicate_variants(articles, generated_rss, generated_reddit)
+        articles.extend(generated_rss)
+        articles.extend(generated_reddit)
         return articles
 
+    @staticmethod
+    def _inject_duplicate_variants(
+        legacy_articles: list[dict[str, Any]],
+        generated_rss: list[dict[str, Any]],
+        generated_reddit: list[dict[str, Any]],
+    ) -> None:
+        """Rewrite three generated Reddit items in place to echo existing articles.
+
+        The first three ``generated_reddit`` entries get a URL made of a base
+        article's URL plus ``utm_source``/``ref`` query parameters, and their
+        title and text are extended to mention that base article. The bases
+        are ``legacy_articles[0]``, ``generated_rss[2]`` and ``generated_rss[9]``.
+        """
+
+        base_articles = (legacy_articles[0], generated_rss[2], generated_rss[9])
+        variants = (generated_reddit[0], generated_reddit[1], generated_reddit[2])
+        source_tags = ("reddit", "community", "social")
+
+        # NOTE(review): the query string is appended with "?" — assumes the base
+        # URLs carry no query string of their own; confirm against the seed data.
+        for base, variant, tag in zip(base_articles, variants, source_tags):
+            variant["url"] = f"{base['url']}?utm_source={tag}&ref=seed-demo"
+            variant["title"] = f"{variant['title']} linking to {base['title']}"
+            variant["content_text"] = (
+                f"This seeded item points readers to the same underlying article as '{base['title']}'. "
+                f"{variant['content_text']}"
+            )
+
     def _build_generated_rss_content(self) -> list[dict[str, Any]]:
         articles = []
         for index in range(147):

diff --git a/core/migrations/0005_content_canonical_url_content_duplicate_of_and_more.py b/core/migrations/0005_content_canonical_url_content_duplicate_of_and_more.py
@@ -0,0 +1,35 @@
+# Generated by Django 6.0.4 on 2026-04-29 01:33
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add the three duplicate-tracking fields to the ``content`` model."""
+
+    dependencies = [
+        ("core", "0004_blueskycredentials"),
+    ]
+
+    operations = [
+        # Indexed URL column that may be blank and defaults to the empty string.
+        # NOTE(review): no max_length is given, so Django's URLField default
+        # applies — confirm canonical URLs always fit within it.
+        migrations.AddField(
+            model_name="content",
+            name="canonical_url",
+            field=models.URLField(blank=True, db_index=True, default=""),
+        ),
+        # Optional self-reference to another content row. Deleting the
+        # referenced row sets this column to NULL, and the reverse accessor
+        # on the referenced row is ``duplicates``.
+        migrations.AddField(
+            model_name="content",
+            name="duplicate_of",
+            field=models.ForeignKey(
+                blank=True,
+                null=True,
+                on_delete=django.db.models.deletion.SET_NULL,
+                related_name="duplicates",
+                to="core.content",
+            ),
+        ),
+        # Integer counter that defaults to 0.
+        migrations.AddField(
+            model_name="content",
+            name="duplicate_signal_count",
+            field=models.IntegerField(default=0),
+        ),
+    ]