diff --git a/SESSION.md b/SESSION.md new file mode 100644 index 00000000..e3b5508e --- /dev/null +++ b/SESSION.md @@ -0,0 +1,16 @@ +# Session Restore Point + + + +## Useful Commands From Today + +```bash +docker run --rm newsletter-maker-app:dev python -c "import drf_standardized_errors; print('ok')" +docker compose exec django python -c "import drf_standardized_errors; print('ok')" +docker compose exec django pip show drf-standardized-errors +docker inspect newsletter-maker-django-1 --format '{{.Id}} {{.Image}} {{.Config.Image}}' +docker inspect newsletter-maker-django-1 --format '{{json .Mounts}}' +pytest core/tests/test_embeddings.py -q +ruff check core/management/commands/seed_demo.py core/tests/test_embeddings.py +``` + diff --git a/celerybeat-schedule b/celerybeat-schedule index 023c1aed..6c433076 100644 Binary files a/celerybeat-schedule and b/celerybeat-schedule differ diff --git a/core/admin.py b/core/admin.py index 33b66f8a..9b16c9fb 100644 --- a/core/admin.py +++ b/core/admin.py @@ -107,12 +107,13 @@ class ContentAdmin(admin.ModelAdmin): @admin.display(description="Preview") def preview_content(self, obj): - """Adds a 'Quick Look' icon that shows the description in an Unfold modal.""" - if not obj.description: + """Adds a quick preview based on the stored content text.""" + preview_text = (obj.content_text or "").strip() + if not preview_text: return "-" return format_html( '🔍 View', - obj.description[:500] + preview_text[:500] ) @admin.display(description="AI Trace") @@ -343,8 +344,8 @@ class UserFeedbackAdmin(ModelAdmin): @admin.display(description="Type") def display_feedback(self, obj): if obj.feedback_type == "UPVOTE": - return format_html('👍') - return format_html('👎') + return format_html('{}', "1.2rem", "👍") + return format_html('{}', "1.2rem", "👎") @admin.display(description="Content Title") def get_content_title(self, obj): @@ -426,9 +427,10 @@ def display_efficiency(self, obj): return "0/0" percent = (obj.items_ingested / obj.items_fetched) * 100 color = "green" if percent > 90 else "orange" if percent > 50 else "red" + percent_label = f"({percent:.0f}%)" return format_html( - '{} / {} ({:.0f}%)', - obj.items_ingested, obj.items_fetched, color, percent + '{} / {} {}', + obj.items_ingested, obj.items_fetched, color, percent_label ) @admin.display(description="Duration") @@ -491,15 +493,15 @@ class SourceConfigAdmin(ModelAdmin): @admin.display(description="Status") def display_health(self, obj): if not obj.is_active: - return format_html('● Paused') + return format_html('{}', "gray", "● Paused") if obj.last_fetched_at: hours_since = (timezone.now() - obj.last_fetched_at).total_seconds() / 3600 if hours_since > 24: - return format_html('● Stale') - return format_html('● Healthy') + return format_html('{}', "red", "● Stale") + return format_html('{}', "green", "● Healthy") - return format_html('● Never Run') + return format_html('{}', "orange", "● Never Run") @admin.display(description="Config Preview") def pretty_config(self, obj): @@ -589,7 +591,8 @@ def get_content_title(self, obj): @admin.display(description="Confidence") def display_confidence(self, obj): color = "red" if obj.confidence < 0.3 else "orange" if obj.confidence < 0.6 else "green" - return format_html('{:.0f}%', color, obj.confidence * 100) + confidence_label = f"{obj.confidence * 100:.0f}%" + return format_html('{}', color, confidence_label) @admin.action(description="Approve selected items") def mark_as_approved(self, request, queryset): diff --git a/core/management/commands/seed_demo.py b/core/management/commands/seed_demo.py index e34e9651..86319ba0 100644 --- a/core/management/commands/seed_demo.py +++ b/core/management/commands/seed_demo.py @@ -1,16 +1,221 @@ from __future__ import annotations from datetime import timedelta +from typing import Any +from django.conf import settings from django.contrib.auth import get_user_model from django.core.management.base import BaseCommand from django.db import transaction from django.utils import timezone +from httpx import HTTPError +from qdrant_client.http.exceptions import ResponseHandlingException from core.embeddings import upsert_content_embedding -from core.models import Content, SourcePluginName, Tenant, TenantConfig +from core.models import ( + Content, + Entity, + EntityType, + FeedbackType, + IngestionRun, + ReviewQueue, + ReviewReason, + ReviewResolution, + RunStatus, + SkillResult, + SkillStatus, + SourceConfig, + SourcePluginName, + Tenant, + TenantConfig, + UserFeedback, +) +from core.pipeline import ( + CLASSIFICATION_SKILL_NAME, + RELEVANCE_SKILL_NAME, + SUMMARIZATION_SKILL_NAME, +) -REFERENCE_ARTICLES = [ +DEMO_TENANT_NAME = "Platform Engineering Weekly" +DEMO_TOPIC_DESCRIPTION = ( + "Platform engineering, DevOps, cloud infrastructure, reliability, and " + "developer experience." +) +REFERENCE_SOURCE_PLUGIN = "reference_seed" + +ENTITY_SPECS = [ + { + "name": "Kelsey Hightower", + "type": EntityType.INDIVIDUAL, + "description": "Cloud infrastructure educator and platform engineering voice.", + "authority_score": 0.94, + "website_url": "https://example.com/entities/kelsey-hightower", + "twitter_handle": "kelseyhightower", + }, + { + "name": "Charity Majors", + "type": EntityType.INDIVIDUAL, + "description": "Observability and DevOps leadership commentator.", + "authority_score": 0.91, + "website_url": "https://example.com/entities/charity-majors", + "twitter_handle": "mipsytipsy", + }, + { + "name": "Liz Rice", + "type": EntityType.INDIVIDUAL, + "description": "Container security and Kubernetes runtime expert.", + "authority_score": 0.89, + "website_url": "https://example.com/entities/liz-rice", + "twitter_handle": "lizrice", + }, + { + "name": "Mitchell Hashimoto", + "type": EntityType.INDIVIDUAL, + "description": "Infrastructure workflow builder and platform tooling founder.", + "authority_score": 0.9, + "website_url": "https://example.com/entities/mitchell-hashimoto", + "twitter_handle": "mitchellh", + }, + { + "name": "Solomon Hykes", + "type": EntityType.INDIVIDUAL, + "description": "Container ecosystem pioneer with platform automation perspective.", + "authority_score": 0.86, + "website_url": "https://example.com/entities/solomon-hykes", + "twitter_handle": "solomonstre", + }, + { + "name": "Abby Bangser", + "type": EntityType.INDIVIDUAL, + "description": "Progressive delivery and reliable release operations advocate.", + "authority_score": 0.83, + "website_url": "https://example.com/entities/abby-bangser", + "twitter_handle": "abangser", + }, + { + "name": "Viktor Farcic", + "type": EntityType.INDIVIDUAL, + "description": "Kubernetes automation educator and GitOps practitioner.", + "authority_score": 0.82, + "website_url": "https://example.com/entities/viktor-farcic", + "twitter_handle": "vfarcic", + }, + { + "name": "Paula Kennedy", + "type": EntityType.INDIVIDUAL, + "description": "Platform operations leader focused on delivery systems and DX.", + "authority_score": 0.8, + "website_url": "https://example.com/entities/paula-kennedy", + "twitter_handle": "paulapkennedy", + }, + { + "name": "HashiCorp", + "type": EntityType.VENDOR, + "description": "Infrastructure lifecycle tooling vendor.", + "authority_score": 0.9, + "website_url": "https://example.com/vendors/hashicorp", + "github_url": "https://github.com/hashicorp", + }, + { + "name": "Datadog", + "type": EntityType.VENDOR, + "description": "Observability platform vendor.", + "authority_score": 0.88, + "website_url": "https://example.com/vendors/datadog", + "github_url": "https://github.com/DataDog", + }, + { + "name": "Grafana Labs", + "type": EntityType.VENDOR, + "description": "Monitoring and observability tooling vendor.", + "authority_score": 0.87, + "website_url": "https://example.com/vendors/grafana-labs", + "github_url": "https://github.com/grafana", + }, + { + "name": "Vercel", + "type": EntityType.VENDOR, + "description": "Deployment platform vendor with developer productivity focus.", + "authority_score": 0.76, + "website_url": "https://example.com/vendors/vercel", + "github_url": "https://github.com/vercel", + }, + { + "name": "Jetstack", + "type": EntityType.VENDOR, + "description": "Cloud native platform security and certificate tooling vendor.", + "authority_score": 0.79, + "website_url": "https://example.com/vendors/jetstack", + "github_url": "https://github.com/jetstack", + }, + { + "name": "CNCF", + "type": EntityType.ORGANIZATION, + "description": "Cloud Native Computing Foundation ecosystem steward.", + "authority_score": 0.95, + "website_url": "https://example.com/orgs/cncf", + }, + { + "name": "Linux Foundation", + "type": EntityType.ORGANIZATION, + "description": "Open source foundation behind major infrastructure ecosystems.", + "authority_score": 0.92, + "website_url": "https://example.com/orgs/linux-foundation", + }, +] + +SOURCE_CONFIG_SPECS = [ + { + "plugin_name": SourcePluginName.RSS, + "config": {"feed_url": "https://platformweekly.example.com/feed.xml"}, + "is_active": True, + "hours_ago": 3, + }, + { + "plugin_name": SourcePluginName.RSS, + "config": {"feed_url": "https://engineering.hashicorp.example.com/feed.xml"}, + "is_active": True, + "hours_ago": 5, + }, + { + "plugin_name": SourcePluginName.RSS, + "config": {"feed_url": "https://observability.datadog.example.com/feed.xml"}, + "is_active": True, + "hours_ago": 7, + }, + { + "plugin_name": SourcePluginName.RSS, + "config": {"feed_url": "https://grafana.example.com/feed.xml"}, + "is_active": True, + "hours_ago": 11, + }, + { + "plugin_name": SourcePluginName.RSS, + "config": {"feed_url": "https://linuxfoundation.example.com/feed.xml"}, + "is_active": False, + "hours_ago": None, + }, + { + "plugin_name": SourcePluginName.RSS, + "config": {"feed_url": "https://jetstack.example.com/feed.xml"}, + "is_active": True, + "hours_ago": 15, + }, + { + "plugin_name": SourcePluginName.REDDIT, + "config": {"subreddit": "devops", "listing": "hot", "limit": 25}, + "is_active": True, + "hours_ago": 2, + }, + { + "plugin_name": SourcePluginName.REDDIT, + "config": {"subreddit": "kubernetes", "listing": "new", "limit": 25}, + "is_active": True, + "hours_ago": 4, + }, +] + +LEGACY_REFERENCE_ARTICLES = [ { "url": "https://example.com/reference/platform-engineering-golden-paths", "title": "Golden Paths for Platform Engineering Teams", @@ -48,8 +253,55 @@ }, ] +REFERENCE_TOPICS = [ + { + "slug": "golden-path-adoption", + "title": "Golden Path Adoption in Growing Engineering Orgs", + "content": "Teams get more value from platform programs when paved-road workflows, self-service actions, and clear ownership boundaries are introduced together.", + }, + { + "slug": "cluster-upgrades", + "title": "How Mature Teams De-Risk Cluster Upgrades", + "content": "Healthy platform groups stage Kubernetes upgrades with preflight checks, workload canaries, and explicit rollback steps that application teams can trust.", + }, + { + "slug": "delivery-metrics", + "title": "Delivery Metrics That Actually Change Engineering Behavior", + "content": "The most useful delivery scorecards connect lead time and incident recovery to engineering systems, not vanity measures or ticket volume.", + }, + { + "slug": "developer-portals", + "title": "Developer Portals Need Service Ownership to Work", + "content": "Catalog quality improves when developer portals tie service metadata, docs, and alert ownership into one operating model.", + }, + { + "slug": "incident-command", + "title": "Incident Command Patterns for Platform Teams", + "content": "Clear command roles, live timelines, and action-item follow-through help platform teams improve reliability under pressure.", + }, + { + "slug": "runbook-quality", + "title": "Operational Runbooks That Stay Useful", + "content": "Runbooks remain valuable when they are tested during drills, written for the pager path, and updated alongside the services they describe.", + }, + { + "slug": "cost-visibility", + "title": "Shared Infrastructure Needs Cost Visibility", + "content": "FinOps discipline improves when platform teams expose ownership tags, service-by-service usage, and visible cost trade-offs for engineering choices.", + }, + { + "slug": "release-policy", + "title": "Release Policies That Scale Without Bureaucracy", + "content": "Release systems stay fast when guardrails are automated, exceptions are visible, and change risk is measured instead of guessed.", + }, + { + "slug": "platform-adoption", + "title": "Measuring Internal Platform Adoption Beyond Login Counts", + "content": "Adoption signals are strongest when teams can complete real workflows faster, with fewer support tickets and less cognitive overhead.", + }, +] -SAMPLE_CONTENT = [ +LEGACY_SAMPLE_CONTENT = [ { "url": "https://example.com/content/backstage-adoption", "title": "Backstage Adoption Patterns in Mid-Size Platform Teams", @@ -57,7 +309,10 @@ "source_plugin": SourcePluginName.RSS, "content_text": "Teams adopting Backstage often start with service catalogs, software templates, and docs ownership to improve discoverability and reduce friction.", "days_ago": 5, - "is_reference": False, + "content_type": "technical_article", + "classification_confidence": 0.88, + "relevance_score": 0.91, + "entity_name": "CNCF", }, { "url": "https://example.com/content/argo-rollouts", @@ -66,7 +321,10 @@ "source_plugin": SourcePluginName.RSS, "content_text": "Progressive delivery helps platform teams validate rollouts with canaries, automated analysis, and safer release policies across Kubernetes workloads.", "days_ago": 4, - "is_reference": False, + "content_type": "tutorial", + "classification_confidence": 0.9, + "relevance_score": 0.87, + "entity_name": "CNCF", }, { "url": "https://example.com/content/cost-observability", @@ -75,7 +333,10 @@ "source_plugin": SourcePluginName.REDDIT, "content_text": "Shared platform teams need cost observability, ownership tagging, and usage feedback loops so product teams understand the cost of infrastructure choices.", "days_ago": 3, - "is_reference": False, + "content_type": "technical_article", + "classification_confidence": 0.81, + "relevance_score": 0.82, + "entity_name": None, }, { "url": "https://example.com/content/runbooks", @@ -84,74 +345,727 @@ "source_plugin": SourcePluginName.RSS, "content_text": "Runbooks remain valuable when they are short, current, and tied to real incident response patterns instead of static documentation nobody trusts.", "days_ago": 2, - "is_reference": False, + "content_type": "opinion", + "classification_confidence": 0.67, + "relevance_score": 0.79, + "entity_name": "Linux Foundation", + }, +] + +RSS_PUBLICATIONS = [ + { + "label": "Platform Weekly", + "host": "platformweekly.example.com", + "entity_name": "CNCF", + }, + { + "label": "HashiCorp Engineering", + "host": "engineering.hashicorp.example.com", + "entity_name": "HashiCorp", + }, + { + "label": "Datadog Observability", + "host": "observability.datadog.example.com", + "entity_name": "Datadog", + }, + { + "label": "Grafana Labs Blog", + "host": "grafana.example.com", + "entity_name": "Grafana Labs", + }, + { + "label": "Linux Foundation Engineering", + "host": "linuxfoundation.example.com", + "entity_name": "Linux Foundation", + }, + { + "label": "Jetstack Updates", + "host": "jetstack.example.com", + "entity_name": "Jetstack", + }, +] + +RSS_TOPIC_BLUEPRINTS = [ + { + "slug": "golden-path-templates", + "headline": "Golden-path templates reduce setup time for platform teams", + "content_type": "technical_article", + "body": "The piece explains how reusable templates, service metadata, and sensible defaults give teams a faster path to production.", + }, + { + "slug": "progressive-delivery", + "headline": "Progressive delivery patterns for shared Kubernetes platforms", + "content_type": "tutorial", + "body": "It walks through canary rollouts, automated checks, and ownership boundaries that make releases safer without slowing delivery.", + }, + { + "slug": "cost-guardrails", + "headline": "Cost guardrails for internal developer platforms", + "content_type": "technical_article", + "body": "The article focuses on usage visibility, quota design, and the small control loops teams need to keep infrastructure spend understandable.", + }, + { + "slug": "runbook-culture", + "headline": "Runbook maintenance is still a platform engineering advantage", + "content_type": "opinion", + "body": "It argues that concise runbooks and on-call practice still outperform sprawling internal wikis when incidents are unfolding quickly.", + }, + { + "slug": "backstage-ownership", + "headline": "Developer portals work best when ownership data is real", + "content_type": "technical_article", + "body": "The story connects service catalogs, docs ownership, and scorecards to actual adoption rather than passive documentation projects.", + }, + { + "slug": "cluster-lifecycle", + "headline": "Cluster lifecycle policies for platform teams under growth", + "content_type": "release_notes", + "body": "The author covers upgrade pacing, deprecation messaging, and health gates that help teams standardize their operating model.", + }, + { + "slug": "internal-developer-platform", + "headline": "Internal developer platform scope is easier to manage with paved roads", + "content_type": "technical_article", + "body": "It emphasizes clear boundaries, supported workflows, and fast documentation paths so teams can adopt the platform intentionally.", + }, + { + "slug": "incident-learning", + "headline": "Incident review loops that strengthen platform reliability", + "content_type": "technical_article", + "body": "This write-up connects post-incident review habits to architecture decisions, ownership, and change-risk management.", + }, +] + +REDDIT_COMMUNITIES = ["devops", "kubernetes"] +REDDIT_TOPIC_BLUEPRINTS = [ + { + "slug": "helm-ownership", + "headline": "Teams debate who should own Helm charts for shared services", + "content_type": "opinion", + "body": "Commenters compare central platform ownership with service-team autonomy and trade stories about chart maintenance drift.", + }, + { + "slug": "cost-visibility", + "headline": "Practitioners share how they expose platform costs to app teams", + "content_type": "technical_article", + "body": "The thread surfaces tactics for chargeback dashboards, tagging, and budget guardrails that teams actually act on.", + }, + { + "slug": "cluster-upgrades", + "headline": "What broke during your last cluster upgrade window?", + "content_type": "other", + "body": "Respondents compare controller drift, workload surprises, and the safeguards that made the next upgrade less painful.", + }, + { + "slug": "runbooks", + "headline": "Engineers compare runbook formats that still help under pressure", + "content_type": "other", + "body": "The discussion covers how short, actionable runbooks outperform static documentation in real incident response.", + }, + { + "slug": "platform-roadmaps", + "headline": "How do you keep platform roadmaps aligned with developer pain?", + "content_type": "opinion", + "body": "People discuss surveys, support queues, and advisory groups as ways to keep platform work grounded in real friction.", + }, + { + "slug": "delivery-speed", + "headline": "Where do you measure deployment speed without gaming the metric?", + "content_type": "technical_article", + "body": "The conversation compares lead time, failure rate, and service ownership as stronger measures than ticket throughput.", }, ] class Command(BaseCommand): - help = "Seed a demo tenant with a reference corpus and sample content for embeddings and Qdrant flows." + help = "Seed a deterministic demo tenant with entities, content, pipeline outputs, feedback, and ingestion history." def handle(self, *args, **options): + reference_articles = self._build_reference_articles() + sample_articles = self._build_demo_content() + with transaction.atomic(): tenant = self._ensure_demo_tenant() - reference_count = self._seed_articles(tenant, REFERENCE_ARTICLES, is_reference=True, source_plugin="reference_seed") - sample_count = self._seed_articles(tenant, SAMPLE_CONTENT, is_reference=False) - embedded_count = self._sync_embeddings(tenant) + self._reset_demo_runtime_state(tenant) + entity_map = self._seed_entities(tenant) + source_config_count = self._seed_source_configs(tenant) + reference_contents = self._seed_articles( + tenant, + reference_articles, + entity_map, + is_reference=True, + source_plugin=REFERENCE_SOURCE_PLUGIN, + ) + sample_contents = self._seed_articles( + tenant, + sample_articles, + entity_map, + is_reference=False, + ) + skill_result_count, review_count = self._seed_pipeline_state( + tenant, + sample_articles, + sample_contents, + ) + feedback_count = self._seed_feedback(tenant, sample_contents) + ingestion_run_count = self._seed_ingestion_runs(tenant) + embedded_count = self._sync_embeddings(reference_contents + sample_contents) self.stdout.write(self.style.SUCCESS(f"Seeded demo tenant: {tenant.name}")) - self.stdout.write(f"Reference corpus items: {reference_count}") - self.stdout.write(f"Sample content items: {sample_count}") - self.stdout.write(self.style.SUCCESS(f"Upserted embeddings for {embedded_count} content item(s).")) + self.stdout.write(f"Entities: {len(entity_map)}") + self.stdout.write(f"Source configs: {source_config_count}") + self.stdout.write(f"Reference corpus items: {len(reference_contents)}") + self.stdout.write(f"Demo content items: {len(sample_contents)}") + self.stdout.write(f"Skill results: {skill_result_count}") + self.stdout.write(f"Review queue items: {review_count}") + self.stdout.write(f"Feedback items: {feedback_count}") + self.stdout.write(f"Ingestion runs: {ingestion_run_count}") + self.stdout.write( + self.style.SUCCESS( + f"Upserted embeddings for {embedded_count} seeded content item(s)." + ) + ) def _ensure_demo_tenant(self) -> Tenant: user_model = get_user_model() user, _ = user_model.objects.get_or_create( username="demo_editor", - defaults={ - "email": "demo@example.com", - }, + defaults={"email": "demo@example.com"}, ) user.set_password("demo-password") user.save(update_fields=["password"]) tenant, created = Tenant.objects.get_or_create( user=user, - name="Platform Engineering Weekly", - defaults={ - "topic_description": "Platform engineering, DevOps, cloud infrastructure, reliability, and developer experience.", - }, + name=DEMO_TENANT_NAME, + defaults={"topic_description": DEMO_TOPIC_DESCRIPTION}, ) - if not created and not tenant.topic_description: - tenant.topic_description = "Platform engineering, DevOps, cloud infrastructure, reliability, and developer experience." + if not created and tenant.topic_description != DEMO_TOPIC_DESCRIPTION: + tenant.topic_description = DEMO_TOPIC_DESCRIPTION tenant.save(update_fields=["topic_description"]) TenantConfig.objects.get_or_create(tenant=tenant) return tenant - def _seed_articles(self, tenant: Tenant, articles: list[dict], *, is_reference: bool, source_plugin: str | None = None) -> int: - seeded_count = 0 + def _reset_demo_runtime_state(self, tenant: Tenant) -> None: + SkillResult.objects.filter(tenant=tenant).delete() + ReviewQueue.objects.filter(tenant=tenant).delete() + UserFeedback.objects.filter(tenant=tenant).delete() + IngestionRun.objects.filter(tenant=tenant).delete() + SourceConfig.objects.filter(tenant=tenant).delete() + + def _seed_entities(self, tenant: Tenant) -> dict[str, Entity]: + entities_by_name: dict[str, Entity] = {} + for spec in ENTITY_SPECS: + defaults = { + "type": spec["type"], + "description": spec["description"], + "authority_score": spec["authority_score"], + "website_url": spec.get("website_url", ""), + "github_url": spec.get("github_url", ""), + "linkedin_url": spec.get("linkedin_url", ""), + "bluesky_handle": spec.get("bluesky_handle", ""), + "mastodon_handle": spec.get("mastodon_handle", ""), + "twitter_handle": spec.get("twitter_handle", ""), + } + entity, _ = Entity.objects.update_or_create( + tenant=tenant, + name=spec["name"], + defaults=defaults, + ) + entities_by_name[entity.name] = entity + return entities_by_name + + def _seed_source_configs(self, tenant: Tenant) -> int: + now = timezone.now() + for spec in SOURCE_CONFIG_SPECS: + last_fetched_at = None + if spec["hours_ago"] is not None: + last_fetched_at = now - timedelta(hours=spec["hours_ago"]) + SourceConfig.objects.create( + tenant=tenant, + plugin_name=spec["plugin_name"], + config=spec["config"], + is_active=spec["is_active"], + last_fetched_at=last_fetched_at, + ) + return len(SOURCE_CONFIG_SPECS) + + def _seed_articles( + self, + tenant: Tenant, + articles: list[dict[str, Any]], + entities_by_name: dict[str, Entity], + *, + is_reference: bool, + source_plugin: str | None = None, + ) -> list[Content]: now = timezone.now() + seeded_contents: list[Content] = [] for article in articles: defaults = { "title": article["title"], "author": article["author"], + "entity": entities_by_name.get(article.get("entity_name", "")), "source_plugin": source_plugin or article["source_plugin"], "published_date": now - timedelta(days=article["days_ago"]), "content_text": article["content_text"], "is_reference": is_reference, "is_active": True, } - Content.objects.update_or_create( + content, _ = Content.objects.update_or_create( tenant=tenant, url=article["url"], defaults=defaults, ) - seeded_count += 1 - return seeded_count + seeded_contents.append(content) + return seeded_contents + + def _seed_pipeline_state( + self, + tenant: Tenant, + article_specs: list[dict[str, Any]], + contents: list[Content], + ) -> tuple[int, int]: + content_by_url = {content.url: content for content in contents} + content_updates: list[Content] = [] + skill_results: list[SkillResult] = [] + review_items: list[ReviewQueue] = [] + + for index, article in enumerate(article_specs): + content = content_by_url[article["url"]] + classification_confidence = float(article["classification_confidence"]) + relevance_score = float(article["relevance_score"]) + review_reason = self._review_reason_for_article( + classification_confidence, + relevance_score, + ) + content.content_type = article["content_type"] + content.relevance_score = relevance_score + content.is_active = relevance_score >= settings.AI_RELEVANCE_REVIEW_THRESHOLD + content_updates.append(content) + + skill_results.append( + SkillResult( + content=content, + tenant=tenant, + skill_name=CLASSIFICATION_SKILL_NAME, + status=SkillStatus.COMPLETED, + result_data={ + "content_type": article["content_type"], + "confidence": classification_confidence, + "explanation": self._classification_explanation(article), + }, + model_used=settings.AI_CLASSIFICATION_MODEL, + latency_ms=240 + (index % 5) * 35, + confidence=classification_confidence, + ) + ) + skill_results.append( + SkillResult( + content=content, + tenant=tenant, + skill_name=RELEVANCE_SKILL_NAME, + status=SkillStatus.COMPLETED, + result_data={ + "relevance_score": relevance_score, + "explanation": self._relevance_explanation(article), + "used_llm": bool(article.get("used_llm", False)), + }, + model_used=self._relevance_model_used(article), + latency_ms=( + 0 + if not article.get("used_llm", False) + else 900 + (index % 4) * 120 + ), + confidence=relevance_score, + ) + ) + if relevance_score >= settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD: + skill_results.append( + SkillResult( + content=content, + tenant=tenant, + skill_name=SUMMARIZATION_SKILL_NAME, + status=SkillStatus.COMPLETED, + result_data={ + "summary": self._summary_for_article(article), + }, + model_used=settings.AI_SUMMARIZATION_MODEL, + latency_ms=640 + (index % 6) * 40, + ) + ) + if review_reason is not None: + resolved = index % 6 == 0 + resolution = "" + if resolved: + resolution = ( + ReviewResolution.HUMAN_APPROVED + if relevance_score >= settings.AI_RELEVANCE_REVIEW_THRESHOLD + else ReviewResolution.HUMAN_REJECTED + ) + confidence = ( + classification_confidence + if review_reason == ReviewReason.LOW_CONFIDENCE_CLASSIFICATION + else relevance_score + ) + review_items.append( + ReviewQueue( + tenant=tenant, + content=content, + reason=review_reason, + confidence=confidence, + resolved=resolved, + resolution=resolution, + ) + ) + + Content.objects.bulk_update( + content_updates, + ["content_type", "relevance_score", "is_active"], + ) + SkillResult.objects.bulk_create(skill_results) + ReviewQueue.objects.bulk_create(review_items) + return len(skill_results), len(review_items) + + def _seed_feedback(self, tenant: Tenant, contents: list[Content]) -> int: + user_model = get_user_model() + voters = [] + for index in range(1, 7): + user, _ = user_model.objects.get_or_create( + username=f"demo_reader_{index}", + defaults={"email": f"demo-reader-{index}@example.com"}, + ) + user.set_password("demo-password") + user.save(update_fields=["password"]) + voters.append(user) + + active_contents = sorted( + [content for content in contents if content.is_active], + key=lambda content: (content.relevance_score or 0.0, content.published_date), + reverse=True, + ) + feedback_count = 0 + + for index, content in enumerate(active_contents[:30]): + UserFeedback.objects.update_or_create( + content=content, + user=voters[index % len(voters)], + defaults={ + "tenant": tenant, + "feedback_type": FeedbackType.UPVOTE, + }, + ) + feedback_count += 1 + + for index, content in enumerate(active_contents[-15:]): + UserFeedback.objects.update_or_create( + content=content, + user=voters[(index + 2) % len(voters)], + defaults={ + "tenant": tenant, + "feedback_type": FeedbackType.DOWNVOTE, + }, + ) + feedback_count += 1 + + return feedback_count + + def _seed_ingestion_runs(self, tenant: Tenant) -> int: + run_specs = [ + { + "plugin_name": SourcePluginName.RSS, + "status": RunStatus.SUCCESS, + "items_fetched": 92, + "items_ingested": 57, + "error_message": "", + "started_hours_ago": 6, + "duration_minutes": 14, + }, + { + "plugin_name": SourcePluginName.REDDIT, + "status": RunStatus.SUCCESS, + "items_fetched": 28, + "items_ingested": 18, + "error_message": "", + "started_hours_ago": 4, + "duration_minutes": 6, + }, + { + "plugin_name": SourcePluginName.RSS, + "status": RunStatus.FAILED, + "items_fetched": 0, + "items_ingested": 0, + "error_message": "Timed out while polling one disabled feed mirror.", + "started_hours_ago": 30, + "duration_minutes": 5, + }, + { + "plugin_name": SourcePluginName.REDDIT, + "status": RunStatus.SUCCESS, + "items_fetched": 24, + "items_ingested": 16, + "error_message": "", + "started_hours_ago": 32, + "duration_minutes": 7, + }, + { + "plugin_name": SourcePluginName.RSS, + "status": RunStatus.SUCCESS, + "items_fetched": 88, + "items_ingested": 54, + "error_message": "", + "started_hours_ago": 54, + "duration_minutes": 13, + }, + { + "plugin_name": SourcePluginName.REDDIT, + "status": RunStatus.FAILED, + "items_fetched": 0, + "items_ingested": 0, + "error_message": "Community listing temporarily unavailable during sync.", + "started_hours_ago": 80, + "duration_minutes": 4, + }, + ] + now = timezone.now() + for spec in run_specs: + run = IngestionRun.objects.create( + tenant=tenant, + plugin_name=spec["plugin_name"], + status=spec["status"], + items_fetched=spec["items_fetched"], + items_ingested=spec["items_ingested"], + error_message=spec["error_message"], + ) + run.started_at = now - timedelta(hours=spec["started_hours_ago"]) + run.completed_at = run.started_at + timedelta(minutes=spec["duration_minutes"]) + run.save(update_fields=["started_at", "completed_at"]) + return len(run_specs) - def _sync_embeddings(self, tenant: Tenant) -> int: + def _sync_embeddings(self, contents: list[Content]) -> int: embedded_count = 0 - for content in Content.objects.filter(tenant=tenant).order_by("id"): - upsert_content_embedding(content) + for content in sorted(contents, key=lambda item: item.id): + try: + upsert_content_embedding(content) + except (HTTPError, ResponseHandlingException) as exc: + self.stderr.write( + self.style.WARNING( + "Skipping remaining embedding sync because the embedding or " + f"vector service is unavailable: {exc}" + ) + ) + break embedded_count += 1 return embedded_count + + def _build_reference_articles(self) -> list[dict[str, Any]]: + articles = list(LEGACY_REFERENCE_ARTICLES) + for round_index in range(5): + for topic_index, topic in enumerate(REFERENCE_TOPICS): + articles.append( + { + "url": f"https://example.com/reference/{topic['slug']}-{round_index + 1}", + "title": topic["title"], + "author": "Reference Corpus", + "content_text": topic["content"], + "days_ago": 32 + round_index * 7 + topic_index, + } + ) + return articles + + def _build_demo_content(self) -> list[dict[str, Any]]: + articles = list(LEGACY_SAMPLE_CONTENT) + articles.extend(self._build_generated_rss_content()) + articles.extend(self._build_generated_reddit_content()) + return articles + + def _build_generated_rss_content(self) -> list[dict[str, Any]]: + articles = [] + for index in range(147): + band = self._band_for_index(index, relevant_cutoff=87, borderline_cutoff=122) + publication = RSS_PUBLICATIONS[index % len(RSS_PUBLICATIONS)] + topic = RSS_TOPIC_BLUEPRINTS[index % len(RSS_TOPIC_BLUEPRINTS)] + relevance_score = self._relevance_score(index, band) + articles.append( + { + "url": ( + f"https://{publication['host']}/2026/04/" + f"{topic['slug']}-{index + 1:03d}" + ), + "title": self._rss_title(publication["label"], topic["headline"], band), + "author": f"{publication['label']} Editorial", + "source_plugin": SourcePluginName.RSS, + "content_text": self._rss_body(publication["label"], topic["body"], band), + "days_ago": 1 + (index % 30), + "content_type": self._content_type_for_band(topic["content_type"], band), + "classification_confidence": self._classification_confidence(index), + "relevance_score": relevance_score, + "entity_name": publication["entity_name"], + "used_llm": band == "borderline", + } + ) + return articles + + def _build_generated_reddit_content(self) -> list[dict[str, Any]]: + articles = [] + for index in range(49): + band = self._band_for_index(index, relevant_cutoff=24, borderline_cutoff=34) + subreddit = REDDIT_COMMUNITIES[index % len(REDDIT_COMMUNITIES)] + topic = REDDIT_TOPIC_BLUEPRINTS[index % len(REDDIT_TOPIC_BLUEPRINTS)] + articles.append( + { + "url": ( + f"https://www.reddit.com/r/{subreddit}/comments/" + f"demo{index + 1:03d}/{topic['slug']}/" + ), + "title": self._reddit_title(subreddit, topic["headline"], band), + "author": f"u/demo_{subreddit}_{index + 1:03d}", + "source_plugin": SourcePluginName.REDDIT, + "content_text": self._reddit_body(subreddit, topic["body"], band, index), + "days_ago": 1 + ((index * 2) % 30), + "content_type": self._content_type_for_band(topic["content_type"], band), + "classification_confidence": self._classification_confidence(index + 200), + "relevance_score": self._relevance_score(index + 200, band), + "entity_name": None, + "used_llm": band == "borderline", + } + ) + return articles + + @staticmethod + def _band_for_index(index: int, *, relevant_cutoff: int, borderline_cutoff: int) -> str: + if index < relevant_cutoff: + return "relevant" + if index < borderline_cutoff: + return "borderline" + return "irrelevant" + + @staticmethod + def _classification_confidence(index: int) -> float: + if index % 11 == 0: + return 0.55 + return round(0.66 + (index % 8) * 0.03, 2) + + @staticmethod + def _relevance_score(index: int, band: str) -> float: + if band == "relevant": + return round(0.74 + (index % 18) * 0.012, 2) + if band == "borderline": + return round(0.44 + (index % 15) * 0.015, 2) + return round(0.12 + (index % 12) * 0.015, 2) + + @staticmethod + def _content_type_for_band(base_content_type: str, band: str) -> str: + if band == "irrelevant" and base_content_type == "technical_article": + return "other" + return base_content_type + + @staticmethod + def _rss_title(source_label: str, headline: str, band: str) -> str: + if band == "relevant": + return f"{headline}" + if band == "borderline": + return f"{headline} for teams still shaping their platform charter" + return f"{source_label}: {headline.lower().capitalize()} outside the core platform loop" + + @staticmethod + def _reddit_title(subreddit: str, headline: str, band: str) -> str: + if band == "relevant": + return f"r/{subreddit}: {headline}" + if band == "borderline": + return f"r/{subreddit}: {headline} and where teams disagree" + return f"r/{subreddit}: {headline} with limited editorial fit" + + @staticmethod + def _rss_body(source_label: str, body: str, band: str) -> str: + if band == "relevant": + return ( + f"{source_label} reports a concrete platform engineering practice. " + f"{body} The example is directly applicable to infrastructure, " + "developer experience, and reliability workflows." + ) + if band == "borderline": + return ( + f"{source_label} touches on a topic adjacent to platform engineering. " + f"{body} Editors would probably want to review whether the angle is " + "specific enough for the newsletter audience." + ) + return ( + f"{source_label} covers a topic that only partially overlaps with the " + f"newsletter focus. {body} It is still technical, but the connection to " + "platform engineering is weak compared with stronger candidates." + ) + + @staticmethod + def _reddit_body(subreddit: str, body: str, band: str, index: int) -> str: + score = 18 + (index % 35) + if band == "relevant": + return ( + f"A discussion in r/{subreddit} highlights platform operations trade-offs. " + f"{body} The post picked up roughly {score} upvotes and several replies " + "from practitioners sharing first-hand implementation details." + ) + if band == "borderline": + return ( + f"A thread in r/{subreddit} raises a useful but mixed operational question. " + f"{body} The discussion is practical, yet it needs editorial judgment to " + "decide whether it is specific enough for platform readers." + ) + return ( + f"A thread in r/{subreddit} is only loosely connected to the tenant topic. " + f"{body} The conversation is interesting, but it is more peripheral than the " + "other seeded stories." + ) + + @staticmethod + def _review_reason_for_article( + classification_confidence: float, + relevance_score: float, + ) -> str | None: + if classification_confidence < settings.AI_CLASSIFICATION_REVIEW_THRESHOLD: + return ReviewReason.LOW_CONFIDENCE_CLASSIFICATION + if relevance_score < settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD and ( + relevance_score >= settings.AI_RELEVANCE_REVIEW_THRESHOLD + ): + return ReviewReason.BORDERLINE_RELEVANCE + return None + + @staticmethod + def _classification_explanation(article: dict[str, Any]) -> str: + return ( + f"The seeded classifier maps this item to {article['content_type']} based " + "on its language, operating context, and editorial angle." + ) + + @staticmethod + def _relevance_explanation(article: dict[str, Any]) -> str: + relevance_score = float(article["relevance_score"]) + if article.get("used_llm", False): + return ( + f"Borderline similarity of {relevance_score:.2f} required editorial " + "adjudication, so the seeded result keeps this item in the review band." + ) + if relevance_score >= settings.AI_RELEVANCE_SUMMARIZE_THRESHOLD: + return ( + f"Reference corpus similarity is strong at {relevance_score:.2f}, so the " + "item is ready to surface without additional review." + ) + return ( + f"Reference corpus similarity is weak at {relevance_score:.2f}, so the " + "seed marks the item as archived rather than surfaced." + ) + + @staticmethod + def _relevance_model_used(article: dict[str, Any]) -> str: + if article.get("used_llm", False): + return settings.AI_RELEVANCE_MODEL + return f"embedding:{settings.EMBEDDING_MODEL}" + + @staticmethod + def _summary_for_article(article: dict[str, Any]) -> str: + source_name = article["source_plugin"].upper() + return ( + f"{article['title']} gives platform teams a concrete angle on delivery, " + f"reliability, or developer experience. The seeded summary keeps the focus " + f"on why this {source_name} item is worth surfacing in the newsletter." + ) diff --git a/core/tests/test_admin.py b/core/tests/test_admin.py index 31756aec..0aede5b5 100644 --- a/core/tests/test_admin.py +++ b/core/tests/test_admin.py @@ -6,8 +6,8 @@ from django.contrib.admin.sites import AdminSite from django.utils import timezone -from core.admin import ReviewQueueAdmin, SourceConfigAdmin -from core.models import Content, ReviewQueue, ReviewReason, SourceConfig, SourcePluginName, Tenant +from core.admin import ContentAdmin, IngestionRunAdmin, ReviewQueueAdmin, SourceConfigAdmin +from core.models import Content, IngestionRun, ReviewQueue, ReviewReason, RunStatus, SourceConfig, SourcePluginName, Tenant pytestmark = pytest.mark.django_db @@ -75,6 +75,21 @@ def test_test_source_connection_reports_failures(source_admin_context, mocker): ) +def test_source_config_display_health_renders_without_django6_format_html_error(source_admin_context): + source_config = SourceConfig.objects.create( + tenant=source_admin_context.tenant, + plugin_name=SourcePluginName.RSS, + config={"feed_url": "https://example.com/feed.xml"}, + is_active=True, + last_fetched_at=timezone.now(), + ) + admin_instance = SourceConfigAdmin(SourceConfig, AdminSite()) + + rendered = admin_instance.display_health(source_config) + + assert "Healthy" in rendered + + def test_review_queue_changelist_view_builds_dashboard_stats(source_admin_context, mocker): content = Content.objects.create( tenant=source_admin_context.tenant, @@ -104,3 +119,74 @@ def test_review_queue_changelist_view_builds_dashboard_stats(source_admin_contex super_changelist_view.assert_called_once() assert response["dashboard_stats"][0]["value"] == 1 assert response["dashboard_stats"][1]["value"] == "42%" + + +def test_review_queue_display_confidence_renders_without_django6_format_error(source_admin_context): + content = Content.objects.create( + tenant=source_admin_context.tenant, + url="https://example.com/review-confidence", + title="Review Confidence", + author="Reviewer", + source_plugin=SourcePluginName.RSS, + published_date=timezone.now(), + content_text="Review queue content", + ) + review_item = ReviewQueue.objects.create( + tenant=source_admin_context.tenant, + content=content, + reason=ReviewReason.BORDERLINE_RELEVANCE, + confidence=0.42, + resolved=False, + ) + admin_instance = ReviewQueueAdmin(ReviewQueue, AdminSite()) + + rendered = admin_instance.display_confidence(review_item) + + assert "42%" in rendered + + +def test_ingestion_run_display_efficiency_renders_without_django6_format_error(source_admin_context): + run = IngestionRun.objects.create( + tenant=source_admin_context.tenant, + plugin_name=SourcePluginName.RSS, + status=RunStatus.SUCCESS, + items_fetched=12, + items_ingested=9, + ) + admin_instance = IngestionRunAdmin(IngestionRun, AdminSite()) + + rendered = admin_instance.display_efficiency(run) + + assert "75%" in rendered + + +def test_content_preview_uses_content_text(source_admin_context): + content = Content.objects.create( + tenant=source_admin_context.tenant, + url="https://example.com/admin-preview", + title="Admin Preview", + author="Editor", + source_plugin=SourcePluginName.RSS, + published_date=timezone.now(), + content_text="A short preview from the content body.", + ) + admin_instance = ContentAdmin(Content, AdminSite()) + + preview = admin_instance.preview_content(content) + + assert 'title="A short preview from the content body."' in preview + + +def test_content_preview_returns_dash_when_content_text_blank(source_admin_context): + content = Content.objects.create( + tenant=source_admin_context.tenant, + url="https://example.com/admin-preview-empty", + title="Admin Preview Empty", + author="Editor", + source_plugin=SourcePluginName.RSS, + published_date=timezone.now(), + content_text=" ", + ) + admin_instance = ContentAdmin(Content, AdminSite()) + + assert admin_instance.preview_content(content) == "-" diff --git a/core/tests/test_embeddings.py b/core/tests/test_embeddings.py index 231980a9..a1fba3d6 100644 --- a/core/tests/test_embeddings.py +++ b/core/tests/test_embeddings.py @@ -1,7 +1,9 @@ from types import SimpleNamespace +import httpx import pytest from django.core.management import call_command +from qdrant_client.http.exceptions import ResponseHandlingException from core import embeddings from core.embeddings import ( @@ -11,7 +13,22 @@ search_similar_content, upsert_content_embedding, ) -from core.models import Content, SourcePluginName, Tenant +from core.models import ( + Content, + Entity, + IngestionRun, + ReviewQueue, + SkillResult, + SourceConfig, + SourcePluginName, + Tenant, + UserFeedback, +) +from core.pipeline import ( + CLASSIFICATION_SKILL_NAME, + RELEVANCE_SKILL_NAME, + SUMMARIZATION_SKILL_NAME, +) pytestmark = pytest.mark.django_db @@ -173,7 +190,52 @@ def test_seed_demo_creates_reference_corpus_and_embeds_demo_content(mocker, caps call_command("seed_demo") tenant = Tenant.objects.get(name="Platform Engineering Weekly") - assert Content.objects.filter(tenant=tenant, is_reference=True).exists() - assert Content.objects.filter(tenant=tenant, is_reference=False).exists() - assert upsert_mock.call_count == Content.objects.filter(tenant=tenant).count() - assert "Reference corpus items" in capsys.readouterr().out + assert Entity.objects.filter(tenant=tenant).count() == 15 + assert SourceConfig.objects.filter(tenant=tenant).count() == 8 + assert Content.objects.filter(tenant=tenant, is_reference=True).count() == 50 + assert Content.objects.filter(tenant=tenant, is_reference=False).count() == 200 + assert SkillResult.objects.filter(tenant=tenant, skill_name=CLASSIFICATION_SKILL_NAME).count() == 200 + assert SkillResult.objects.filter(tenant=tenant, skill_name=RELEVANCE_SKILL_NAME).count() == 200 + assert SkillResult.objects.filter(tenant=tenant, skill_name=SUMMARIZATION_SKILL_NAME).count() == 115 + assert ReviewQueue.objects.filter(tenant=tenant).exists() + assert UserFeedback.objects.filter(tenant=tenant).count() == 45 + assert IngestionRun.objects.filter(tenant=tenant).count() == 6 + assert upsert_mock.call_count == 250 + output = capsys.readouterr().out + assert "Reference corpus items: 50" in output + assert "Demo content items: 200" in output + + +def test_seed_demo_is_stable_on_rerun(mocker): + mocker.patch("core.management.commands.seed_demo.upsert_content_embedding") + + call_command("seed_demo") + call_command("seed_demo") + + tenant = Tenant.objects.get(name="Platform Engineering Weekly") + assert Entity.objects.filter(tenant=tenant).count() == 15 + assert SourceConfig.objects.filter(tenant=tenant).count() == 8 + assert Content.objects.filter(tenant=tenant, is_reference=True).count() == 50 + assert Content.objects.filter(tenant=tenant, is_reference=False).count() == 200 + assert SkillResult.objects.filter(tenant=tenant).count() == 515 + assert ReviewQueue.objects.filter(tenant=tenant).count() > 0 + assert UserFeedback.objects.filter(tenant=tenant).count() == 45 + assert IngestionRun.objects.filter(tenant=tenant).count() == 6 + + +def test_seed_demo_skips_embeddings_when_vector_stack_is_unavailable(mocker, capsys): + upsert_mock = mocker.patch( + "core.management.commands.seed_demo.upsert_content_embedding", + side_effect=ResponseHandlingException(httpx.ConnectError("connection refused")), + ) + + call_command("seed_demo") + + tenant = Tenant.objects.get(name="Platform Engineering Weekly") + assert Content.objects.filter(tenant=tenant, is_reference=True).count() == 50 + assert Content.objects.filter(tenant=tenant, is_reference=False).count() == 200 + assert SkillResult.objects.filter(tenant=tenant).count() == 515 + assert upsert_mock.call_count == 1 + combined_output = capsys.readouterr() + assert "Skipping remaining embedding sync" in combined_output.err + assert "Upserted embeddings for 0 seeded content item(s)." in combined_output.out diff --git a/justfile b/justfile index f78e2bc9..803c6cd8 100644 --- a/justfile +++ b/justfile @@ -135,7 +135,7 @@ migrate: seed: if [ ! -f .env ]; then cp .env.example .env; fi - python3 manage.py seed_demo + {{compose}} exec django python manage.py seed_demo embed-all: if [ ! -f .env ]; then cp .env.example .env; fi